In [1]:
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split


2024-01-17 21:10:24.270396: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-17 21:10:24.751293: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-17 21:10:24.751343: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-17 21:10:24.835541: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-17 21:10:24.991641: I tensorflow/core/platform/cpu_feature_guar

In [2]:
import random

# Integer class labels used throughout the notebook: 0 marks a snippet with
# poorly named variables, 1 marks a snippet with well named variables.
BAD = 0
GOOD = 1

def group_by(value_key_list, groups):
    """Bucket the values of ``(value, key)`` pairs by key.

    Args:
        value_key_list: Iterable of two-item ``(value, key)`` pairs.
        groups: Iterable of keys to collect; it fixes the order of the result.

    Returns:
        A list with one inner list per entry of ``groups`` holding, in their
        original order, the values whose key equals that entry. A group that
        matches nothing yields an empty list.
    """
    buckets = []
    for wanted in groups:
        members = []
        for value, key in value_key_list:
            if key == wanted:
                members.append(value)
        buckets.append(members)
    return buckets

def augment_dataset(dataset, factor = 10, seed = None):
    """Extend a labelled snippet dataset with synthetic multi-snippet examples.

    Roughly ``len(dataset) * factor`` new examples are generated, half per
    class, by joining randomly drawn snippets with newlines:

    * a BAD example joins 1-3 bad snippets with 0-2 good snippets, shuffled;
    * a GOOD example joins 1-4 good snippets only.

    Args:
        dataset: List of ``[code, label]`` pairs, where label is BAD or GOOD.
        factor: Number of synthetic examples to create per original example.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the result is reproducible and the global ``random`` state
            is left untouched. When ``None`` the global ``random`` module is
            used, exactly as before.

    Returns:
        A new list: the original pairs followed by the synthetic BAD pairs and
        then the synthetic GOOD pairs. ``dataset`` itself is not modified.
    """
    rng = random if seed is None else random.Random(seed)

    num_examples = len(dataset) * factor
    bad_snippets, good_snippets = group_by(dataset, [BAD, GOOD])
    half_examples = num_examples // 2
    bad, good = [], []

    # random.choices raises IndexError on an empty population when k > 0, so a
    # class with no source snippets is skipped instead of crashing the call.
    if bad_snippets:
        for _ in range(half_examples):
            # Generate 'bad' examples
            num_bad = rng.randint(1, 3)  # At least one bad snippet
            num_good = rng.randint(0, 2)  # Random number of good snippets
            snippets = rng.choices(bad_snippets, k=num_bad)
            if good_snippets:
                snippets += rng.choices(good_snippets, k=num_good)
            rng.shuffle(snippets)
            bad.append("\n".join(snippets))

    if good_snippets:
        for _ in range(half_examples, num_examples):
            # Generate 'good' examples
            num_snippets = rng.randint(1, 4)
            snippets = rng.choices(good_snippets, k=num_snippets)
            good.append("\n".join(snippets))

    new_dataset = [[code, BAD] for code in bad] + [[code, GOOD] for code in good]

    return dataset + new_dataset

In [3]:
# Load data
file_path = 'GoodBadVariableNames_JS.json'
data = pd.read_json(file_path)

# Flatten the 'bad' and 'good' columns into a single list of [code, label]
# rows, bad rows first.
bad = data['bad'].tolist()
good = data['good'].tolist()
dataset = [[snippet, BAD] for snippet in bad]
dataset.extend([snippet, GOOD] for snippet in good)


In [4]:
# Fix every source of randomness in this cell so the split and the synthetic
# examples come out the same on each run.
SEED = 42
random.seed(SEED)

# Split first, then augment each side on its own, so synthetic examples are
# built only from snippets that belong to their side of the split.
train, test = train_test_split(dataset, test_size=0.1, random_state=SEED)
train = augment_dataset(train)
test = augment_dataset(test)

# Transpose the [code, label] rows into parallel lists of inputs and labels.
X_train, y_train = (list(column) for column in zip(*train))
X_test, y_test = (list(column) for column in zip(*test))


In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader

# The tokenizer and the classifier are loaded from the same checkpoint.
MODEL_NAME = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Two output labels (BAD / GOOD). The loader warns that the classifier head is
# newly initialised, so the model needs fine-tuning before its predictions
# mean anything.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)


class CodeDataset(Dataset):
    """Torch dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence (the
    notebook passes the tokenizer's output), and ``labels`` is a sequence with
    one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Tokenize all training snippets in one call, with padding and truncation on.
encodings = tokenizer(X_train, padding=True, truncation=True)
# NOTE(review): this rebinds `dataset`, which until now held the raw
# [code, label] rows from the load cell. Re-run the load cell before
# re-running the split cell.
dataset = CodeDataset(labels=y_train, encodings=encodings)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Display the module tree of the loaded classifier.
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [7]:
# Define training parameters
from tqdm import tqdm

NUM_EPOCHS = 1
loader = DataLoader(dataset, batch_size=16, shuffle=True)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# from_pretrained hands back the model in evaluation mode (dropout disabled),
# so training mode has to be switched on explicitly before fine-tuning.
model.train()

# Training loop
try:
    for epoch in range(NUM_EPOCHS):
        for batch in tqdm(loader):
            optimizer.zero_grad()
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            # Passing labels makes the model compute the classification loss.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
finally:
    # Always return to inference mode, even when training is interrupted by
    # hand, so later cells that call the model directly run without dropout.
    model.eval()


  4%|▍         | 16/359 [01:48<38:36,  6.75s/it]


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Tokenize the held-out snippets the same way as the training snippets.
val_encodings = tokenizer(X_test, truncation=True, padding=True)
val_dataset = CodeDataset(val_encodings, y_test)
# Evaluation does not depend on example order, so do not shuffle. This keeps
# `next(iter(val_loader))` reproducible when a single batch is inspected.
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

def evaluate(model, val_loader):
    """Score a sequence classifier on a validation loader.

    The model is put in evaluation mode for the duration of the pass and then
    returned to whatever mode it was in, so calling this between training runs
    does not silently leave dropout disabled.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=...)``
            that returns an object with a ``logits`` attribute.
        val_loader: Iterable of batches, each a dict with 'input_ids',
            'attention_mask' and 'labels' tensors.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Precision, recall and F1
        use ``average='binary'``, which with scikit-learn's default
        ``pos_label`` scores label 1.
    """
    was_training = model.training
    model.eval()
    predictions, true_labels = [], []
    try:
        with torch.no_grad():
            for batch in tqdm(val_loader):
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels']
                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                # The predicted class is the index of the largest logit.
                predictions.extend(torch.argmax(logits, dim=1).tolist())
                true_labels.extend(labels.tolist())
    finally:
        # Restore the caller's train/eval mode even if the pass is interrupted.
        model.train(was_training)

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')
    accuracy = accuracy_score(true_labels, predictions)
    return accuracy, precision, recall, f1

# Score the model on the held-out split and report the four metrics.
accuracy, precision, recall, f1 = evaluate(model=model, val_loader=val_loader)
print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}')


In [18]:
# Pull one validation batch and run a single forward pass for inspection.
batch = next(iter(val_loader))
input_ids, attention_mask = batch['input_ids'], batch['attention_mask']
outputs = model(input_ids, attention_mask=attention_mask)

In [7]:
# Inspect the token ids the tokenizer produces for a tiny snippet.
tokenizer(['var x = 4'], truncation=True, padding=True)['input_ids']

[[0, 10806, 3023, 5457, 204, 2]]

In [20]:
def predict_code_quality(model, tokenizer, code_snippet, verbose=True):
    """Classify one code snippet as "Good Quality" or "Bad Quality".

    Inference always runs in evaluation mode (dropout off), whatever mode the
    model was left in by earlier cells, and the previous mode is restored
    afterwards.

    Args:
        model: Two-label sequence classifier; called as ``model(**inputs)`` and
            expected to return an object with a ``logits`` attribute.
        tokenizer: Callable that turns ``code_snippet`` into the keyword
            inputs of ``model`` (called with ``return_tensors="pt"``).
        code_snippet: Source text to classify. Only the first row of the
            logits is read.
        verbose: When True (the default, matching the earlier behaviour) the
            raw model output is printed before the verdict is returned.

    Returns:
        "Good Quality" if the logit at index 1 is strictly greater than the
        logit at index 0, otherwise "Bad Quality".
    """
    # Tokenize the code snippet
    inputs = tokenizer(code_snippet, return_tensors="pt", truncation=True, padding=True)

    # Run the model without dropout and without tracking gradients.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    if verbose:
        print(outputs)

    # Interpret the output (index 0 is the bad-quality logit, index 1 the good one)
    bad, good = outputs.logits[0].tolist()
    quality = "Good Quality" if good > bad else "Bad Quality"
    return quality

# Example usage: the same statement with a one-letter and a descriptive name.
for code_snippet in ("let s = new Student();", "let student = new Student();"):
    quality = predict_code_quality(model=model, tokenizer=tokenizer, code_snippet=code_snippet)
    print(f"{code_snippet} is: {quality}")


SequenceClassifierOutput(loss=None, logits=tensor([[ 2.5786, -3.0655]]), hidden_states=None, attentions=None)
let s = new Student(); is: Bad Quality
SequenceClassifierOutput(loss=None, logits=tensor([[-1.1579,  1.4085]]), hidden_states=None, attentions=None)
let student = new Student(); is: Good Quality


In [14]:
# Example usage: probe the classifier with a bare keyword fragment.
for code_snippet in ("if ",):
    quality = predict_code_quality(model=model, tokenizer=tokenizer, code_snippet=code_snippet)
    print(f"{code_snippet} is: {quality}")

SequenceClassifierOutput(loss=None, logits=tensor([[0.0726, 0.1292]]), hidden_states=None, attentions=None)
if  is: Bad Quality


In [15]:
# Save only the model's state dict to 'tester1.pth', a path relative to the
# kernel's current working directory.
torch.save(model.state_dict(), 'tester1.pth')

In [16]:
# Show the kernel's working directory, which is where the relative path
# 'tester1.pth' was written.
pwd()

'/root/py/BADS'

In [25]:
# Probe the classifier with a meaningless identifier.
for code_snippet in ("faddiittektlsektlm;",):
    quality = predict_code_quality(model=model, tokenizer=tokenizer, code_snippet=code_snippet)
    print(f"{code_snippet} is: {quality}")

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.8628, -0.8056]]), hidden_states=None, attentions=None)
faddiittektlsektlm; is: Bad Quality
