In [2]:
import pandas as pd
import random
import torch
import json
from torch.utils.data import Dataset, DataLoader
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, matthews_corrcoef, confusion_matrix
from sklearn.preprocessing import label_binarize
import torch.optim as optim
import numpy as np

# Load the data
with open('/content/casualnet.json', 'r') as f:
    casulnet_data = json.load(f)


random_selected_samples = random.sample(casulnet_data, 1000)

# Define split sizes for an 80-20 split
train_size = int(0.8 * len(random_selected_samples))
train_data = random_selected_samples[:train_size]
validation_data = random_selected_samples[train_size:]

class CasulnetDataset(Dataset):
    def __init__(self, casulnet_data, tokenizer):
        self.casulnet_data = casulnet_data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.casulnet_data)

    def __getitem__(self, idx):
        data = self.casulnet_data[idx]
        context_question = data['context']
        choices = [data[f'choice_id: {i}'] for i in range(3)]  # Assuming there are always 3 choices
        label = data['label']

        # Tokenizing context_question with each choice
        input_ids = []
        attention_masks = []
        for choice in choices:
            encoded_input = self.tokenizer(context_question, choice, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
            input_ids.append(encoded_input['input_ids'].squeeze(0))
            attention_masks.append(encoded_input['attention_mask'].squeeze(0))

        label = torch.tensor([label]*3)  # Replicate label for each choice
        return {
            'input_ids': torch.stack(input_ids),
            'attention_mask': torch.stack(attention_masks),
            'labels': label
        }

# Load tokenizer and model
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=3)  # Assuming 3 possible labels

casulnet_dataset = CasulnetDataset(train_data, tokenizer)
dataloader = DataLoader(casulnet_dataset, batch_size=8, shuffle=True)

optimizer = optim.Adam(model.parameters(), lr=1e-5)

# Training loop
model.train()
for epoch in range(3):
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].view(-1, 512)  # Flatten the input for processing
        attention_mask = batch['attention_mask'].view(-1, 512)  # Flatten the attention mask for processing
        labels = batch['labels'].view(-1)  # Flatten the labels to match the input batch size

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

validation_dataset = CasulnetDataset(validation_data, tokenizer)
validation_dataloader = DataLoader(validation_dataset, batch_size=64)

model.eval()
y_true, y_pred, y_scores = [], [], []

def compute_metrics(y_true, y_pred, y_prob):
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    roc_auc = roc_auc_score(label_binarize(y_true, classes=[0, 1, 2]), y_prob, multi_class='ovr', average='weighted')
    mcc = matthews_corrcoef(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)
    return accuracy, f1, precision, recall, roc_auc, mcc, conf_matrix

with torch.no_grad():
    for batch in validation_dataloader:
        input_ids = batch['input_ids'].view(-1, 512)
        attention_mask = batch['attention_mask'].view(-1, 512)
        labels = batch['labels'].view(-1)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)
        y_scores_batch = torch.softmax(outputs.logits, dim=1).cpu().numpy()

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predictions.cpu().numpy())
        y_scores.extend(y_scores_batch)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0, Loss: 1.1563962697982788
Epoch: 0, Loss: 1.2075973749160767
Epoch: 0, Loss: 1.12165367603302
Epoch: 0, Loss: 1.1191492080688477
Epoch: 0, Loss: 1.1691914796829224
Epoch: 0, Loss: 1.264025330543518
Epoch: 0, Loss: 1.1338104009628296
Epoch: 0, Loss: 1.1013582944869995
Epoch: 0, Loss: 1.0892025232315063
Epoch: 0, Loss: 1.100443720817566
Epoch: 0, Loss: 1.0875569581985474
Epoch: 0, Loss: 1.0420490503311157
Epoch: 0, Loss: 1.0447098016738892
Epoch: 0, Loss: 1.0327993631362915
Epoch: 0, Loss: 1.147185206413269
Epoch: 0, Loss: 0.9357431530952454
Epoch: 0, Loss: 1.11296808719635
Epoch: 0, Loss: 1.3347187042236328
Epoch: 0, Loss: 0.973828136920929
Epoch: 0, Loss: 1.2323087453842163
Epoch: 0, Loss: 0.9776803851127625
Epoch: 0, Loss: 1.0307824611663818
Epoch: 0, Loss: 1.168930172920227
Epoch: 0, Loss: 1.2726839780807495
Epoch: 0, Loss: 0.9637457728385925
Epoch: 0, Loss: 1.1517966985702515
Epoch: 0, Loss: 1.129158854484558
Epoch: 0, Loss: 0.9343517422676086
Epoch: 0, Loss: 0.975039184093

In [3]:
# Calculate metrics
y_scores = np.vstack(y_scores)  # Ensure y_scores is properly shaped for multiclass ROC-AUC calculation
accuracy, f1, precision, recall, roc_auc, mcc, conf_matrix = compute_metrics(y_true, y_pred, y_scores)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"MCC: {mcc:.4f}")



Accuracy: 0.3383
F1: 0.1934
Precision: 0.2721
Recall: 0.3383
ROC-AUC: 0.5125
MCC: -0.0036
Confusion Matrix:
[[  2 194  11]
 [  4 196   4]
 [  3 181   5]]
