In [1]:
import os
import json
from tqdm import tqdm

os.chdir('dataset/Diff_Quality_Estimation')
msg_file = f"comments.jsonl"


In [2]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.nn import BCELoss

# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')

# Load the configuration and modify it for multi-label classification
config = RobertaConfig.from_pretrained('microsoft/codebert-base')
config.num_labels = 4  # Set the number of labels

# Load model with modified configuration
model = RobertaForSequenceClassification.from_pretrained('microsoft/codebert-base', config=config)

# Replace the classifier head with a new one
model.classifier = nn.Sequential(
    nn.Linear(config.hidden_size, 4),  # Assuming '4' is the number of labels
    nn.Sigmoid()
)
device = torch.device('cuda')
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = BCELoss()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
model.load_state_dict(torch.load('classifier2.pt'))
optimizer.load_state_dict(torch.load('optimizer2.pt'))

In [4]:
for param in model.roberta.base_model.parameters():
    param.requires_grad = False


In [5]:
for param in model.roberta.encoder.layer[11].parameters():
    param.requires_grad = True


In [6]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
from filenames import all_files
from torch.utils.data import Subset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from ds import SQLiteCodeDataset
from torch.utils.data import DataLoader


model.train()
num_epochs = 20
batch_size = 64

for epoch in range(num_epochs):
    running_loss = 0.0
    dataset = SQLiteCodeDataset('ready_classes', tokenizer, 512, lambda x: x[0], lambda x: x[1:])
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    i, n = 0, len(dataloader)
    min_alpha = .2
    alpha = 1 - min_alpha
    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        size = input_ids.size(0)
        i += size

        outputs = model(input_ids, attention_mask=attention_mask)
        cls_logits = outputs.logits[:, 0, :]
        loss = loss_fn(cls_logits, labels)

        loss.backward()
        optimizer.step()
        current_alpha = min_alpha + alpha
        running_loss = (1 - current_alpha) * running_loss + current_alpha * loss.item()
        alpha *= .5
        pred = (cls_logits > .5).int().flatten().to('cpu')
        labels = labels.int().flatten().to('cpu')
        
        print(f"\rEpoch {epoch+1}/{num_epochs} - Batch {i//batch_size+1}/{n}, Loss: {running_loss:.4f} {loss.item():.4f}", end="")
        #precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='binary')
        #accuracy = accuracy_score(labels, pred)
        #print(f", Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}", end="")
        
    epoch_loss = running_loss
    print(f"\rEpoch {epoch+1}/{num_epochs}, Average Loss: {epoch_loss:.4f}          ")



Epoch 1/20, Average Loss: 0.1003          .1066
Epoch 2/20 - Batch 311/486, Loss: 0.1071 0.1021

In [14]:
print(f"\rEpoch {epoch+1}/{num_epochs}, Average Loss: {epoch_loss:.4f}          ")

Epoch 20/20, Average Loss: 0.0852          


In [44]:
torch.save(model.state_dict(), 'classifier3.pt')
torch.save(optimizer.state_dict(), 'optimizer3.pt')

In [42]:
from filenames import all_files
from torch.utils.data import Subset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from ds import CodeDataset

N = 160  # Number of items you want in your DataLoader

val_dataset = SQLiteCodeDataset('test_classes', tokenizer, 512, lambda x: x[0], lambda x: x[1:])
limited_dataset = Subset(val_dataset, indices=range(N))
val_loader = DataLoader(limited_dataset, batch_size=16, shuffle=True)

model.eval()
predictions, true_labels, losses = [], [], []
with torch.no_grad():
    for batch in tqdm(val_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)

        logits = outputs.logits[:, 0, :]
        loss = loss_fn(logits, labels)
        pred = (logits > .5).int().flatten().to('cpu')
        labels = labels.int().flatten().to('cpu')
        predictions.append(pred)
        true_labels.append(labels)
        losses.append(loss)
        #precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='binary')
        #accuracy = accuracy_score(labels, pred)
        #print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}')




100%|██████████| 10/10 [00:04<00:00,  2.15it/s]


In [43]:
predictionsarr = torch.tensor([x[0] for x in predictions]).flatten()
true_labelsarr = torch.tensor([x[0] for x in true_labels]).flatten()

precision, recall, f1, _ = precision_recall_fscore_support(true_labelsarr, predictionsarr, average='binary')
accuracy = accuracy_score(true_labelsarr, predictionsarr)

print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}')


Accuracy: 0.3, Precision: 0.0, Recall: 0.0, F1: 0.0


In [None]:
torch.mean(torch.tensor(losses))

In [15]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [27]:
from torchsummary import summary

# Assume 'model' is your PyTorch model and 'input_size' is the size of your input data
summary(model, input_size=(512,))


Layer (type:depth-idx)                   Param #
├─RobertaModel: 1-1                      --
|    └─RobertaEmbeddings: 2-1            --
|    |    └─Embedding: 3-1               (38,603,520)
|    |    └─Embedding: 3-2               (394,752)
|    |    └─Embedding: 3-3               (768)
|    |    └─LayerNorm: 3-4               (1,536)
|    |    └─Dropout: 3-5                 --
|    └─RobertaEncoder: 2-2               --
|    |    └─ModuleList: 3-6              (85,054,464)
├─Sequential: 1-2                        --
|    └─Linear: 2-3                       3,076
|    └─Sigmoid: 2-4                      --
Total params: 124,058,116
Trainable params: 3,076
Non-trainable params: 124,055,040


Layer (type:depth-idx)                   Param #
├─RobertaModel: 1-1                      --
|    └─RobertaEmbeddings: 2-1            --
|    |    └─Embedding: 3-1               (38,603,520)
|    |    └─Embedding: 3-2               (394,752)
|    |    └─Embedding: 3-3               (768)
|    |    └─LayerNorm: 3-4               (1,536)
|    |    └─Dropout: 3-5                 --
|    └─RobertaEncoder: 2-2               --
|    |    └─ModuleList: 3-6              (85,054,464)
├─Sequential: 1-2                        --
|    └─Linear: 2-3                       3,076
|    └─Sigmoid: 2-4                      --
Total params: 124,058,116
Trainable params: 3,076
Non-trainable params: 124,055,040

In [31]:
outt = model.roberta(input_ids, attention_mask=attention_mask)

In [36]:
outt.last_hidden_state.shape

torch.Size([16, 512, 768])

In [38]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             