In [1]:
import os
import json
from tqdm import tqdm

# Work from inside the dataset directory; later cells open files by relative name.
# Guarded so that re-running this cell from inside that directory is a no-op
# instead of failing on a path that no longer resolves.
_dataset_dir = os.path.normpath('dataset/Diff_Quality_Estimation')
if not os.getcwd().endswith(os.sep + _dataset_dir):
    os.chdir(_dataset_dir)


In [2]:
from ds import SQLiteCodeDataset

In [3]:
from filenames import train_files

In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Checkpoint name shared by the tokenizer and the classifier.
MODEL_NAME = "microsoft/codebert-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Prefer the GPU, but fall back to the CPU so the notebook still runs without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Two-label sequence classifier built on the pretrained checkpoint, moved to `device`.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)
# The optimizer is handed every parameter of the model.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Turn off gradient tracking for every parameter under model.roberta.
for frozen_weight in model.roberta.base_model.parameters():
    frozen_weight.requires_grad_(False)


In [18]:
# Re-enable gradients for encoder layers from index 10 onwards.
for encoder_layer in model.roberta.encoder.layer[10:]:
    for weight in encoder_layer.parameters():
        weight.requires_grad_(True)

In [19]:
# List every parameter with its trainable flag and shape.
for param_name, tensor in model.named_parameters():
    trainable = tensor.requires_grad
    dims = list(tensor.shape)
    print(f"{param_name}: {trainable} {dims}")


roberta.embeddings.word_embeddings.weight: False [50265, 768]
roberta.embeddings.position_embeddings.weight: False [514, 768]
roberta.embeddings.token_type_embeddings.weight: False [1, 768]
roberta.embeddings.LayerNorm.weight: False [768]
roberta.embeddings.LayerNorm.bias: False [768]
roberta.encoder.layer.0.attention.self.query.weight: False [768, 768]
roberta.encoder.layer.0.attention.self.query.bias: False [768]
roberta.encoder.layer.0.attention.self.key.weight: False [768, 768]
roberta.encoder.layer.0.attention.self.key.bias: False [768]
roberta.encoder.layer.0.attention.self.value.weight: False [768, 768]
roberta.encoder.layer.0.attention.self.value.bias: False [768]
roberta.encoder.layer.0.attention.output.dense.weight: False [768, 768]
roberta.encoder.layer.0.attention.output.dense.bias: False [768]
roberta.encoder.layer.0.attention.output.LayerNorm.weight: False [768]
roberta.encoder.layer.0.attention.output.LayerNorm.bias: False [768]
roberta.encoder.layer.0.intermediate.dense

In [7]:
# Handle for the CPU device; no other cell visible in this notebook references it.
cpu = torch.device('cpu')

In [8]:
# Sanity check: show the device that holds the model's first parameter.
next(model.parameters()).device

device(type='cuda', index=0)

In [21]:
from torch.utils.data import DataLoader

def _first_field(row):
    """Return element 0 of a row (fourth constructor argument below; presumably the code text — confirm in ds.py)."""
    return row[0]


def _two_way_label(row):
    """Return [1, 0] when element 1 of the row is truthy, otherwise [0, 1]."""
    if row[1]:
        return [1, 0]
    return [0, 1]


# Dataset named 'train_js' with a 512 limit (presumably max token length — confirm in ds.py).
dataset = SQLiteCodeDataset('train_js', tokenizer, 512, _first_field, _two_way_label)
# Shuffled loader yielding batches of 16 items.
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


In [None]:
# Pull a single batch from the loader and display it for inspection.
batch = next(iter(dataloader))
batch

In [6]:
# Force a garbage-collection pass; the cell output is gc.collect()'s return value.
import gc; gc.collect()

28

In [29]:
# Restore model and optimizer state from the saved checkpoint files.
# map_location keeps deserialization from failing when the tensors were saved
# from a device that is not available in the current session.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model.load_state_dict(torch.load('lang-classifier5.pt', map_location=device))
optimizer.load_state_dict(torch.load('lang-optimizer5.pt', map_location=device))

In [7]:
# Ask PyTorch to release cached GPU memory that is no longer in use.
torch.cuda.empty_cache()


In [27]:
# Fine-tune the classifier on the 'train_js' dataset.
model.train()
num_epochs = 10
batch_size = 64

for epoch in range(num_epochs):
    running_loss = 0.0
    # A fresh dataset is opened every epoch; close the previous epoch's one first
    # so open datasets do not accumulate. The last one stays bound to `dataset`
    # for the caller to close.
    if epoch > 0:
        dataset.close()
    dataset = SQLiteCodeDataset('train_js', tokenizer, 512, lambda x: x[0], lambda x: [1, 0] if x[1] else [0, 1])
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    # i counts samples seen so far in this epoch; n is the number of batches.
    i, n = 0, len(dataloader)
    # enumerate gives the true 1-based batch number; deriving it from the
    # already-incremented sample count reported every batch one too high.
    for batch_idx, batch in enumerate(dataloader, start=1):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        size = input_ids.size(0)
        i += size

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Weight by batch size so the running average is per sample.
        running_loss += loss.item() * size

        print(f"\rEpoch {epoch+1}/{num_epochs} - Batch {batch_idx}/{n}, Loss: {running_loss/i:.4f}", end="")

    epoch_loss = running_loss / len(dataset)
    print(f"\rEpoch {epoch+1}/{num_epochs}, Average Loss: {epoch_loss:.4f}          ")



Epoch 1/10 - Batch 100/10861, Loss: 0.0061

KeyboardInterrupt: 

In [22]:
# Close the dataset currently bound to `dataset`
# (presumably releases its underlying SQLite handle; confirm in ds.py).
dataset.close()

In [23]:
# Persist model and optimizer state side by side so training can be resumed.
for stateful, checkpoint_path in (
    (model, 'lang-classifier5.pt'),
    (optimizer, 'lang-optimizer5.pt'),
):
    torch.save(stateful.state_dict(), checkpoint_path)

In [28]:
from filenames import all_files
from torch.utils.data import Subset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from ds import CodeDataset

N = 1600  # Number of items you want in your DataLoader

# Validation data comes from the first entry of all_files with the '-testjs' suffix.
val_dataset = CodeDataset(all_files[0:1], '-testjs', tokenizer, 512, lambda x: x[0], lambda x: [1, 0] if x[1] else [0, 1])
# Only the first N items are evaluated.
# NOTE(review): assumes val_dataset holds at least N items; confirm.
limited_dataset = Subset(val_dataset, indices=range(N))
# Evaluation does not need shuffling; keeping dataset order makes the returned
# predictions line up with the subset indices and the run reproducible.
val_loader = DataLoader(limited_dataset, batch_size=16, shuffle=False)

def evaluate(model, val_loader):
    """Run `model` over `val_loader` and score its argmax predictions.

    Args:
        model: classifier called as ``model(input_ids, attention_mask=...)``
            whose output exposes ``.logits``.
        val_loader: iterable of batches containing 'input_ids',
            'attention_mask' and 'labels'; labels are reduced to a class
            index with argmax over dim 1.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, predictions, true_labels)``.
        precision, recall and f1 are computed with ``average='binary'``;
        predictions and true_labels are lists of class indices.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in tqdm(val_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            predictions.extend(torch.argmax(logits, dim=1).tolist())
            # Labels are only turned into a Python list, so they are not moved to `device`.
            true_labels.extend(torch.argmax(batch['labels'], dim=1).tolist())

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')
    accuracy = accuracy_score(true_labels, predictions)
    return accuracy, precision, recall, f1, predictions, true_labels

# Score the current model on the validation loader and report the metrics.
metrics = evaluate(model, val_loader)
accuracy, precision, recall, f1, predictions, true_labels = metrics
print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}')


100%|██████████| 100/100 [00:30<00:00,  3.30it/s]

Accuracy: 0.980625, Precision: 0.8848484848484849, Recall: 0.9240506329113924, F1: 0.9040247678018577





# Add the JS code samples recognized by the new model to the JS dataset

In [45]:
import json
from tqdm import tqdm
from filenames import train_files

def _model_keeps_sample(code):
    """Return True unless the classifier scores class 0 strictly above class 1 for `code`.

    The text is tokenized to a fixed length of 512 (truncated and padded) and
    scored in a single forward pass with gradient tracking disabled.
    """
    encoding = tokenizer.encode_plus(
        code,
        add_special_tokens=True,
        max_length=512,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(
            encoding['input_ids'].to(device),
            attention_mask=encoding['attention_mask'].to(device),
        )
    logits = outputs.logits[0]
    return not bool(logits[0] > logits[1])


# Make predictions deterministic regardless of which cell ran last
# (a training cell may have left the model in train mode).
model.eval()

for filename in train_files:
    savep = f"{filename}-extra2js.jsonl"
    if os.path.exists(savep):
        print(f"{savep} exists, skipping")
        continue
    # Write to a temporary file and rename at the end, so an interrupted or
    # failed run never leaves a partial file that the check above would skip.
    tmp_savep = f"{savep}.tmp"
    # The input is opened first: a missing input must not create an output file.
    with open(f"{filename}-ready.jsonl", "r") as fr, open(tmp_savep, "w") as fw:
        total, kept = 0, 0
        for line in tqdm(fr):
            code, _label, js0unk_neg = json.loads(line)
            total += 1
            if js0unk_neg > 0:
                continue  # not js
            # Negative values are decided by the classifier; zero is kept as is.
            if js0unk_neg < 0 and not _model_keeps_sample(code):
                continue
            kept += 1
            fw.write(line)
    os.replace(tmp_savep, savep)
    print(f"{kept}/{total} are saved to {savep}")



cls-train-chunk-1-extrajs.jsonl exists, skipping


66459it [11:47, 93.99it/s] 


7709/66459 are saved to cls-train-chunk-1-extrajs.jsonl
cls-train-chunk-2-extrajs.jsonl exists, skipping


66459it [11:49, 93.69it/s] 


7889/66459 are saved to cls-train-chunk-2-extrajs.jsonl


66459it [11:48, 93.81it/s] 

7733/66459 are saved to cls-train-chunk-3-extrajs.jsonl





In [1]:
# Leftover debugging probe: displays the last `line` read by the filtering loop.
# It raises NameError when that loop has not run in the current kernel.
line

NameError: name 'line' is not defined