In [20]:
import pandas as pd
from transformers import RobertaTokenizer
from sklearn.metrics import accuracy_score, classification_report
from transformers import AdamW
from transformers import RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split

In [2]:
import torch
print(torch.__version__)

2.3.0+cpu


## Load and Prepare Data

In [23]:
data = pd.read_csv('data_labeled.csv')

tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

# Combine columns and tokenize
def combine_and_tokenize(row):
    # Combine the text columns with some separators
    combined_text = f"{row['Question Title']} [SEP] {row['Description']} [SEP] {row['Accepted Answer']}"
    return tokenizer(combined_text, padding="max_length", truncation=True, max_length=512)

data['inputs'] = data.apply(combine_and_tokenize, axis=1)

# Split data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Extract inputs and labels for training and validation sets
train_inputs = {key: torch.tensor([val[key] for val in train_data['inputs'].values]) for key in ['input_ids', 'attention_mask']}
train_labels = torch.tensor(train_data['Label'].values)
val_inputs = {key: torch.tensor([val[key] for val in val_data['inputs'].values]) for key in ['input_ids', 'attention_mask']}
val_labels = torch.tensor(val_data['Label'].values)

## Create a Dataset and DataLoader

In [24]:
class StackOverflowDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx] for key in self.encodings}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = StackOverflowDataset(train_inputs, train_labels)
val_dataset = StackOverflowDataset(val_inputs, val_labels)

# Create DataLoaders for training and validation datasets
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

## Customize CodeBERT for Classification

In [25]:
model = RobertaForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train the Model

In [None]:
optimizer = AdamW(model.parameters(), lr=5e-5)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

accumulation_steps = 4  # Accumulate gradients over 4 mini-batches
optimizer.zero_grad()

model.train()
for epoch in range(3):  # Number of epochs
    for step, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss / accumulation_steps  # Normalize our loss (if averaging)
        loss.backward()

        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        print(f"Epoch {epoch}, Step {step}, Loss: {loss.item() * accumulation_steps}")

  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


Epoch 0, Step 0, Loss: 18.08500862121582


## Evaluate the Model

In [33]:
model.eval()
predictions, true_labels = [], []

for idx, batch in enumerate(val_loader):
    print(f"Processing batch {idx+1}/{len(val_loader)}")  # Track batch processing
    with torch.no_grad():
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_labels = logits.argmax(dim=1).cpu().numpy()
        predictions.extend(predicted_labels)
        true_labels.extend(batch['labels'].numpy())

print("Evaluation complete. Calculating metrics...")
print(accuracy_score(true_labels, predictions))

  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


Processing batch 1/30
Processing batch 2/30
Processing batch 3/30
Processing batch 4/30
Processing batch 5/30
Processing batch 6/30
Processing batch 7/30
Processing batch 8/30
Processing batch 9/30
Processing batch 10/30
Processing batch 11/30
Processing batch 12/30
Processing batch 13/30
Processing batch 14/30
Processing batch 15/30
Processing batch 16/30
Processing batch 17/30
Processing batch 18/30
Processing batch 19/30
Processing batch 20/30
Processing batch 21/30
Processing batch 22/30
Processing batch 23/30
Processing batch 24/30
Processing batch 25/30
Processing batch 26/30
Processing batch 27/30
Processing batch 28/30
Processing batch 29/30
Processing batch 30/30
Evaluation complete. Calculating metrics...
0.55
