In [None]:
import pandas as pd
import random
import torch
import json
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, matthews_corrcoef, confusion_matrix
from sklearn.preprocessing import label_binarize
import torch.optim as optim
import numpy as np

# Load the data
# NOTE(review): absolute Colab path; adjust when running outside Colab.
with open('/content/maybefinal_cause.json', 'r', encoding='utf-8') as f:
    casulnet_data = json.load(f)

# Seed Python's RNG so the same subset (and therefore the same train/validation
# split) is drawn on every Restart & Run All.
random.seed(42)

# Draw up to 1000 examples. random.sample raises ValueError when asked for more
# items than the population holds, so cap the request at the dataset size.
sample_size = min(1000, len(casulnet_data))
random_selected_samples = random.sample(casulnet_data, sample_size)

# Define split sizes for an 80-20 split
# random.sample already returns the items in random order, so slicing the list
# yields a random train/validation partition.
train_size = int(0.8 * len(random_selected_samples))
train_data = random_selected_samples[:train_size]
validation_data = random_selected_samples[train_size:]

class CasulnetDataset(Dataset):
    """Map-style dataset that pairs one context with each of its answer choices.

    Every record is expected to be a mapping with the keys ``'context'``,
    ``'label'`` and ``'choice_id: 0'`` ... ``'choice_id: {num_choices - 1}'``.

    Args:
        casulnet_data: Indexable collection of records (supports ``len`` and
            integer indexing).
        tokenizer: Callable invoked as
            ``tokenizer(context, choice, padding='max_length', truncation=True,
            max_length=..., return_tensors='pt')`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors whose leading
            dimension is 1.
        max_length: Length every encoded pair is padded/truncated to.
            Defaults to 512, the value previously hard-coded.
        num_choices: Number of ``'choice_id: i'`` entries read per record.
            Defaults to 3, the value previously hard-coded.
    """

    def __init__(self, casulnet_data, tokenizer, max_length=512, num_choices=3):
        self.casulnet_data = casulnet_data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.num_choices = num_choices

    def __len__(self):
        """Return the number of records (not the number of context/choice pairs)."""
        return len(self.casulnet_data)

    def __getitem__(self, idx):
        """Encode record ``idx`` as ``num_choices`` (context, choice) pairs.

        Returns:
            dict with
            ``'input_ids'``: the per-choice ``input_ids`` stacked along a new
            leading dimension of size ``num_choices``;
            ``'attention_mask'``: stacked the same way;
            ``'labels'``: 1-D tensor of length ``num_choices`` holding the
            record's label repeated once per choice.

        Raises:
            KeyError: if the record lacks ``'context'``, ``'label'`` or one of
                the expected ``'choice_id: i'`` keys.
        """
        data = self.casulnet_data[idx]
        context_question = data['context']
        choices = [data[f'choice_id: {i}'] for i in range(self.num_choices)]
        label = data['label']

        # Tokenizing context_question with each choice
        input_ids = []
        attention_masks = []
        for choice in choices:
            encoded_input = self.tokenizer(
                context_question,
                choice,
                padding='max_length',
                truncation=True,
                max_length=self.max_length,
                return_tensors='pt',
            )
            # return_tensors='pt' adds a leading batch dimension of 1; drop it
            # so the per-choice tensors can be stacked below.
            input_ids.append(encoded_input['input_ids'].squeeze(0))
            attention_masks.append(encoded_input['attention_mask'].squeeze(0))

        # Replicate label for each choice.
        # NOTE(review): every (context, choice) pair of a record receives the
        # same target, so a pair on its own carries no indication of which
        # choice index is the correct one. If 'label' is the index of the
        # correct choice, a multiple-choice head or a per-pair binary target
        # is probably what is wanted -- confirm against the data schema.
        label = torch.tensor([label] * self.num_choices)
        return {
            'input_ids': torch.stack(input_ids),
            'attention_mask': torch.stack(attention_masks),
            'labels': label
        }

# Seed PyTorch before the model is built: the classification head is newly
# initialised and the DataLoader shuffles, both of which draw from torch's RNG.
torch.manual_seed(42)

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # Assuming 3 possible labels

casulnet_dataset = CasulnetDataset(train_data, tokenizer)
dataloader = DataLoader(casulnet_dataset, batch_size=8, shuffle=True)

optimizer = optim.Adam(model.parameters(), lr=1e-5)

# NOTE(review): the model and batches are never moved to an accelerator in this
# cell, so training presumably runs on the CPU -- confirm this is intended.

# Training loop
model.train()
for epoch in range(3):
    for batch in dataloader:
        optimizer.zero_grad()

        # Each dataset item stacks one encoded sequence per choice, so a batch
        # carries an extra "choice" dimension. Fold it into the batch dimension,
        # reading the sequence length from the tensor instead of repeating the
        # tokenizer's padding length here.
        seq_len = batch['input_ids'].size(-1)
        input_ids = batch['input_ids'].view(-1, seq_len)  # Flatten the input for processing
        attention_mask = batch['attention_mask'].view(-1, seq_len)  # Flatten the attention mask for processing
        labels = batch['labels'].view(-1)  # Flatten the labels to match the input batch size

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

# Put the model in evaluation mode before any validation batch is processed.
model.eval()

# Validation pipeline: same dataset wrapper as training, larger batches, and
# no shuffling.
validation_dataset = CasulnetDataset(validation_data, tokenizer)
validation_dataloader = DataLoader(dataset=validation_dataset, batch_size=64)

# Accumulators filled by the evaluation loop: ground-truth labels, predicted
# labels, and per-class probability rows.
y_true = []
y_pred = []
y_scores = []

def compute_metrics(y_true, y_pred, y_prob, classes=(0, 1, 2)):
    """Compute a standard set of multiclass classification metrics.

    Args:
        y_true: Sequence of ground-truth class labels.
        y_pred: Sequence of predicted class labels, aligned with ``y_true``.
        y_prob: 2-D array of per-class scores, one row per sample and one
            column per entry of ``classes`` (in the same order).
        classes: Class labels, in column order of ``y_prob``. Defaults to
            ``(0, 1, 2)``, the value previously hard-coded. Intended for three
            or more classes: ``label_binarize`` yields a single column for a
            two-class problem, which would not line up with a two-column
            ``y_prob``.

    Returns:
        Tuple ``(accuracy, f1, precision, recall, roc_auc, mcc, conf_matrix)``.
        F1, precision, recall and ROC-AUC are support-weighted averages over
        the classes; ``conf_matrix`` is always ``len(classes)`` square.

    Raises:
        ValueError: propagated from ``roc_auc_score`` when a class in
            ``classes`` has no positive (or no negative) sample in ``y_true``.
    """
    class_labels = list(classes)

    accuracy = accuracy_score(y_true, y_pred)

    # zero_division=0 keeps the value scikit-learn already falls back to when a
    # class is never predicted, but states it explicitly so the
    # undefined-metric warning is not emitted.
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)

    # One-vs-rest ROC-AUC computed on the one-hot encoding of the true labels.
    roc_auc = roc_auc_score(
        label_binarize(y_true, classes=class_labels),
        y_prob,
        multi_class='ovr',
        average='weighted',
    )
    mcc = matthews_corrcoef(y_true, y_pred)

    # Fix the label set so the matrix keeps the same shape and row/column order
    # even when some class is absent from both y_true and y_pred.
    conf_matrix = confusion_matrix(y_true, y_pred, labels=class_labels)
    return accuracy, f1, precision, recall, roc_auc, mcc, conf_matrix

# Inference pass over the validation set; gradient tracking is disabled for
# the whole loop.
with torch.no_grad():
    for batch in validation_dataloader:
        # Fold the per-record choice dimension into the batch dimension so the
        # model receives plain (rows, 512) inputs.
        input_ids, attention_mask = (
            batch[field].view(-1, 512) for field in ('input_ids', 'attention_mask')
        )
        labels = batch['labels'].view(-1)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Class probabilities come from a softmax over the logits; the
        # predicted class is the arg-max of the same logits.
        y_scores_batch = torch.softmax(outputs.logits, dim=1).cpu().numpy()
        predictions = torch.argmax(outputs.logits, dim=1)

        # Append this batch's results to the notebook-level accumulators.
        y_scores.extend(y_scores_batch)
        y_pred.extend(predictions.cpu().numpy())
        y_true.extend(labels.cpu().numpy())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0, Loss: 1.6911330223083496
Epoch: 0, Loss: 1.4879297018051147
Epoch: 0, Loss: 1.4574567079544067
Epoch: 0, Loss: 1.179187297821045
Epoch: 0, Loss: 1.1515558958053589
Epoch: 0, Loss: 1.1265836954116821
Epoch: 0, Loss: 1.065677285194397
Epoch: 0, Loss: 1.1524714231491089
Epoch: 0, Loss: 1.1948078870773315
Epoch: 0, Loss: 1.3020275831222534
Epoch: 0, Loss: 1.1365009546279907
Epoch: 0, Loss: 1.1424843072891235
Epoch: 0, Loss: 1.1073347330093384
Epoch: 0, Loss: 1.1166387796401978
Epoch: 0, Loss: 1.0620492696762085
Epoch: 0, Loss: 1.161596417427063
Epoch: 0, Loss: 1.097861886024475
Epoch: 0, Loss: 1.0989760160446167
Epoch: 0, Loss: 1.0486780405044556
Epoch: 0, Loss: 1.0980428457260132
Epoch: 0, Loss: 1.132535696029663
Epoch: 0, Loss: 1.106766700744629
Epoch: 0, Loss: 1.1222506761550903
Epoch: 0, Loss: 1.1065969467163086
Epoch: 0, Loss: 1.1446980237960815
Epoch: 0, Loss: 1.0685259103775024
Epoch: 0, Loss: 1.1179622411727905
Epoch: 0, Loss: 1.0440248250961304
Epoch: 0, Loss: 1.02726745

In [None]:
# Calculate metrics
# Stack the collected per-row probability vectors into a single 2-D array, the
# layout compute_metrics hands to the ROC-AUC computation.
y_scores = np.vstack(y_scores)  # Ensure y_scores is properly shaped for multiclass ROC-AUC calculation
accuracy, f1, precision, recall, roc_auc, mcc, conf_matrix = compute_metrics(
    y_true, y_pred, y_scores
)

# Assemble the report once and emit it in a single write: scalar metrics are
# shown to four decimals, followed by the raw confusion matrix.
report_lines = [
    f"Accuracy: {accuracy:.4f}",
    f"F1: {f1:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"ROC-AUC: {roc_auc:.4f}",
    f"MCC: {mcc:.4f}",
    "Confusion Matrix:",
    str(conf_matrix),
]
print("\n".join(report_lines))

Accuracy: 0.3900
F1: 0.2188
Precision: 0.1521
Recall: 0.3900
ROC-AUC: 0.5094
MCC: 0.0000
Confusion Matrix:
[[  0  96   0]
 [  0 117   0]
 [  0  87   0]]


  _warn_prf(average, modifier, msg_start, len(result))
