In [20]:
import pandas as pd
from transformers import RobertaTokenizer
from sklearn.metrics import accuracy_score, classification_report
from transformers import AdamW
from transformers import RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split

In [2]:
import torch
print(torch.__version__)

2.3.0+cpu


## Load and Prepare Data

In [23]:
data = pd.read_csv('data_labeled.csv')

tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

# Combine columns and tokenize
def combine_and_tokenize(row):
    # Combine the text columns with some separators
    combined_text = f"{row['Question Title']} [SEP] {row['Description']} [SEP] {row['Accepted Answer']}"
    return tokenizer(combined_text, padding="max_length", truncation=True, max_length=512)

data['inputs'] = data.apply(combine_and_tokenize, axis=1)

# Split data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Extract inputs and labels for training and validation sets
train_inputs = {key: torch.tensor([val[key] for val in train_data['inputs'].values]) for key in ['input_ids', 'attention_mask']}
train_labels = torch.tensor(train_data['Label'].values)
val_inputs = {key: torch.tensor([val[key] for val in val_data['inputs'].values]) for key in ['input_ids', 'attention_mask']}
val_labels = torch.tensor(val_data['Label'].values)

## Create a Dataset and DataLoader

In [24]:
class StackOverflowDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx] for key in self.encodings}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = StackOverflowDataset(train_inputs, train_labels)
val_dataset = StackOverflowDataset(val_inputs, val_labels)

# Create DataLoaders for training and validation datasets
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

## Customize CodeBERT for Classification

In [25]:
model = RobertaForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train the Model

In [29]:
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define the training parameters
epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training loop
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        outputs = model(**batch)
        loss = outputs.loss
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")

  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


KeyboardInterrupt: 

## Evaluate the Model

In [19]:
model.eval()
predictions, true_labels = [], []

for idx, batch in enumerate(val_loader):
    print(f"Processing batch {idx+1}/{len(loader)}")  # Track batch processing
    with torch.no_grad():
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_labels = logits.argmax(dim=1).cpu().numpy()
        predictions.extend(predicted_labels)
        true_labels.extend(batch['labels'].numpy())

print("Evaluation complete. Calculating metrics...")
print(accuracy_score(true_labels, predictions))



  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


Processing batch 1/1
Evaluation complete. Calculating metrics...
1.0
