__Objective__: Compute 10-fold cross-validation scores (accuracy and F1) on the CMV dataset using BERT

__Runtime__: GPU

In [None]:
!nvidia-smi

In [None]:
# Mount Google Drive at /content/gdrive; the dataset CSVs are read from there.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
!pip install transformers 
!pip install datasets

In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm 
import pickle as pkl
import matplotlib.pyplot as plt
from matplotlib import colors

import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_metric

# Loading CMV dataset and creating folds

In [None]:
def read_dataset(dir):
    """Read (text, label) pairs from a two-column ``label,text`` CSV file.

    The first line is treated as a header and skipped. Every other line is
    split at its *first* comma only, so commas inside the text column are
    kept as part of the text. Blank lines are ignored.

    Args:
        dir: Path of the CSV file to read (a file path, despite the name).

    Returns:
        A list of ``(text, label)`` tuples, where ``label`` is 1 when the
        first column equals ``'AH'`` and 0 otherwise.

    Raises:
        ValueError: If a non-blank data line contains no comma.
    """
    texts_labels = []
    with open(dir, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header row; harmless on an empty file
        for line_no, raw_line in enumerate(f, start=2):
            raw_line = raw_line.strip()
            if not raw_line:
                # tolerate blank/trailing lines instead of failing on them
                continue
            label, sep, text = raw_line.partition(",")
            if not sep:
                raise ValueError(
                    f"{dir}:{line_no}: expected 'label,text' but found no comma"
                )
            texts_labels.append((text.strip(), 1 if label.strip() == 'AH' else 0))
    return texts_labels

In [None]:
# Pool the train and test CSVs into one list; the folds are cut from the pooled data.
train_csv = '/content/gdrive/MyDrive/DL/dataset/pytorch/train.csv'
test_csv = '/content/gdrive/MyDrive/DL/dataset/pytorch/test.csv'
dataset = read_dataset(train_csv) + read_dataset(test_csv)

In [None]:
# Total number of (text, label) examples loaded
len(dataset)

In [None]:
# shuffling the dataset in place with a fixed seed, so the fold assignment
# (and therefore every per-fold score) is reproducible across runs
shuffle_seed = 42
rng = np.random.default_rng(shuffle_seed)
rng.shuffle(dataset)

# creating folds: fold i takes every n_folds-th example starting at offset i,
# so the fold sizes differ by at most one example
n_folds = 10
folds = [dataset[i::n_folds] for i in range(n_folds)]

In [None]:
# maximum number of tokens per example passed to the tokenizer as max_length
max_seq_length = 64

# tokenizer for the 'bert-base-uncased' checkpoint, used to produce the encodings
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Split every fold of (text, label) pairs into two parallel lists:
# folds_text[k][n] is the text and folds_label[k][n] the label of example n in fold k.
folds_text = [[text for text, _ in folds[k]] for k in range(n_folds)]
folds_label = [[label for _, label in folds[k]] for k in range(n_folds)]

In [None]:
# generating encodings: one tokenizer call per fold, each text truncated or
# padded to exactly max_seq_length tokens
folds_encoding = [
    tokenizer(
        folds_text[k],
        truncation=True,
        max_length=max_seq_length,
        padding="max_length",
    )
    for k in range(n_folds)
]

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is an indexable sequence aligned with those sequences.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many examples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a 'labels' tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Build one (train, test) dataset pair per fold: fold `held_out` is the test
# set and the remaining folds together form the training set.
train_dataset = []
test_dataset = []

for held_out in tqdm(range(n_folds)):
    train_folds = [j for j in range(n_folds) if j != held_out]
    train_labels = [label for j in train_folds for label in folds_label[j]]
    # Every fold was already tokenized with truncation and padding to the same
    # fixed max_seq_length, so its rows do not depend on which other texts are
    # in the batch. Concatenating the cached per-fold encodings is therefore
    # equivalent to re-tokenizing the training texts, without the repeated work.
    train_encoding = {
        key: [row for j in train_folds for row in folds_encoding[j][key]]
        for key in folds_encoding[held_out].keys()
    }
    train_dataset.append(CustomDataset(train_encoding, train_labels))
    test_dataset.append(CustomDataset(folds_encoding[held_out], folds_label[held_out]))

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def evaluate(fold_id, *, n_epochs=3, batch_size=64, lr=5e-5):
    """Fine-tune a fresh BERT classifier on one split and score the held-out fold.

    A new 'bert-base-uncased' sequence classifier is loaded, trained on
    ``train_dataset[fold_id]`` and evaluated on ``test_dataset[fold_id]``.
    The accuracy and F1 results are printed (as before) and also returned so
    callers can collect them instead of copying them from the output.

    Args:
        fold_id: Index into the module-level ``train_dataset`` / ``test_dataset`` lists.
        n_epochs: Number of passes over the training set.
        batch_size: Batch size used for both training and evaluation.
        lr: Learning rate passed to AdamW.

    Returns:
        A dict merging the results of the accuracy and F1 metrics' ``compute()``
        calls (presumably keys 'accuracy' and 'f1' -- confirm against the
        installed ``datasets`` version).
    """
    # NOTE(review): `transformers.AdamW` and `datasets.load_metric` are flagged
    # as deprecated in recent releases -- confirm the pinned library versions.
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    model.to(device)
    model.train()

    train_loader = DataLoader(train_dataset[fold_id], batch_size=batch_size, shuffle=True)
    optim = AdamW(model.parameters(), lr=lr)

    for epoch in range(n_epochs):
        for batch in tqdm(train_loader):
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            # with labels supplied, the first output element is the loss
            loss = outputs[0]
            loss.backward()
            optim.step()

    acc = load_metric("accuracy")
    f1 = load_metric("f1")
    model.eval()
    eval_loader = DataLoader(test_dataset[fold_id], batch_size=batch_size, shuffle=False)
    for batch in eval_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        acc.add_batch(predictions=predictions, references=batch["labels"])
        f1.add_batch(predictions=predictions, references=batch["labels"])

    # compute() is called once per metric and the result reused for both
    # printing and returning.
    acc_result = acc.compute()
    f1_result = f1.compute()
    print(acc_result)
    print(f1_result)
    return {**acc_result, **f1_result}

In [None]:
# Train and score a single split (fold 0 held out) -- presumably a sanity
# check before running every fold.
evaluate(0)

In [None]:
# Print a 100-dash separator line surrounded by blank lines.
print(f'\n\n\n{"-" * 100}\n\n\n')

In [None]:
# Run every fold in turn; iterating over n_folds (rather than a literal 10)
# keeps this loop in sync with how the folds were created.
for i in range(n_folds):
    evaluate(i)
    print(f'\n\n\n{"-" * 100}\n\n\n')

In [None]:
# Per-fold scores, one entry per fold in fold order -- presumably transcribed
# by hand from the output printed by the evaluation runs.
bert_acc = [
    0.8204419889502762,
    0.8342541436464088,
    0.835635359116022,
    0.8439226519337016,
    0.8425414364640884,
    0.835635359116022,
    0.8176795580110497,
    0.7983425414364641,
    0.8441379310344828,
    0.8041379310344827,
]
# NOTE(review): the 9th F1 value is identical to the 9th accuracy value --
# confirm it is not a copy-paste slip.
bert_f1 = [
    0.8059701492537313,
    0.8387096774193548,
    0.8199697428139183,
    0.844566712517194,
    0.8564231738035265,
    0.8287769784172662,
    0.8249336870026526,
    0.7614379084967321,
    0.8441379310344828,
    0.7717041800643087,
]

# One row per fold, one column per metric.
df = pd.DataFrame(dict(Accuracy=bert_acc, F1=bert_f1))

In [None]:
# Display the per-fold score table.
df

In [None]:
# Summary statistics of the per-fold accuracy and F1 scores.
df.describe()