In [1]:
import torch
import numpy as np
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split

In [2]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

In [9]:
def get_codebert_embeddings(texts, tokenizer, model):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return outputs
    return embeddings.numpy()


In [4]:
# Charger les données
import pandas as pd
file_path = 'GoodBadVariableNames_JS.json'
data = pd.read_json(file_path)

# Fusionner les données 'bonnes' et 'mauvaises' en un seul DataFrame
bad, good = data['bad'].tolist(), data['good'].tolist()
dataset = [[code, 0] for code in bad] + [[code, 1] for code in good]

# Séparer les extraits de code et les étiquettes
X, y = zip(*dataset)

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [5]:
# Get embeddings for training data
X_train_embeddings = get_codebert_embeddings(X_train, tokenizer, model)

# Get embeddings for test data
X_test_embeddings = get_codebert_embeddings(X_test, tokenizer, model)

In [11]:
X_train_embeddings = get_codebert_embeddings(X_train, tokenizer, model)

In [16]:
X_train_embeddings.last_hidden_state.numpy().shape

(522, 109, 768)

In [18]:
522*109*768

400896

In [6]:
# Using PCA for dimensionality reduction
pca = PCA(n_components=50)  # Adjust n_components as needed
X_train_pca = pca.fit_transform(X_train_embeddings)
X_test_pca = pca.transform(X_test_embeddings)

In [7]:
# Initialize and train the SVM
svm_model = SVC(kernel='linear')  # Adjust kernel and other hyperparameters as needed
svm_model.fit(X_train_pca, y_train)

In [8]:
# Predict on test data
y_pred = svm_model.predict(X_test_pca)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}')

Accuracy: 0.7413793103448276, Precision: 0.6129032258064516, Recall: 0.8636363636363636, F1: 0.7169811320754716
