In [None]:
! pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
import torch
import json
from torch.utils.data import Dataset, DataLoader
from transformers import AlbertTokenizer as tkz
from transformers import AlbertForMultipleChoice as mpc
import torch.optim as optim

In [None]:
# Reading the train dataset back.
# The encoding is spelled out so the JSON is decoded identically on every
# platform instead of depending on the locale's default encoding.
with open('train_dataset_final.json', 'r', encoding='utf-8') as train_file:
    train_dataset_back = json.load(train_file)

# Reading the test dataset back (same explicit encoding as above).
with open('test_dataset_final.json', 'r', encoding='utf-8') as test_file:
    test_dataset_back = json.load(test_file)

# Display the sizes of the two datasets
print(f"Size of the train dataset: {len(train_dataset_back)}")
print(f"Size of the test dataset: {len(test_dataset_back)}")

Size of the train dataset: 1338
Size of the test dataset: 599


In [None]:
class TTDataset(Dataset):
    """Two-choice dataset that pairs one premise with two candidate hypotheses.

    Every record in ``copa_data`` is read through the keys ``'premise'``,
    ``'initial'``, ``'counterfactual'`` and ``'label'``.

    Args:
        copa_data: Indexable, sized collection of records (e.g. a list of dicts).
        tokenizer: Callable invoked as ``tokenizer(first_texts, second_texts, **kwargs)``
            that returns a mutable mapping of tensors.
        max_length: Optional padding/truncation length forwarded to the tokenizer.
            ``None`` (the default) leaves the choice to the tokenizer, which is
            the behaviour this class had before the parameter existed.
    """

    def __init__(self, copa_data, tokenizer, max_length=None):
        self.copa_data = copa_data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.copa_data)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return the encoding with a ``'labels'`` entry.

        The premise is repeated once per hypothesis so the tokenizer receives
        one (premise, hypothesis) pair for each choice.
        """
        data = self.copa_data[idx]
        premise = data['premise']
        hypotheses = [data['initial'], data['counterfactual']]

        # Tokenize both (premise, hypothesis) pairs in a single call.
        encoded_input = self.tokenizer(
            [premise] * len(hypotheses),
            hypotheses,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Class indices must be integer typed for a cross-entropy loss; make
        # that explicit instead of relying on the JSON value's Python type.
        encoded_input['labels'] = torch.tensor(data['label'], dtype=torch.long)

        return encoded_input


In [None]:
model_base = 'albert-base-v2'

# Seed torch's global RNG before any randomness is consumed. The classifier
# head is not part of the checkpoint and gets freshly initialised, and the
# training DataLoader shuffles, so an unseeded "Restart & Run All" produces
# a different run every time.
SEED = 42
torch.manual_seed(SEED)

# Load tokenizer and model
tokenizer = tkz.from_pretrained(model_base)
model = mpc.from_pretrained(model_base)

# Prepare the DataLoader
TT_dataset = TTDataset(train_dataset_back, tokenizer)
dataloader = DataLoader(TT_dataset, batch_size=8, shuffle=True)

# Optimizer and Loss Function
optimizer = optim.Adam(model.parameters(), lr=1e-5)
# NOTE(review): the training loop in this notebook takes the loss from the
# model output (labels are passed to the forward call), so loss_fn is not
# used there; it is kept in case other cells rely on it.
loss_fn = torch.nn.CrossEntropyLoss()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForMultipleChoice were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training loop: three passes over the training batches, one optimizer step per batch.
model.train()
for epoch in range(3):  # Number of epochs
    for batch in dataloader:
        # Gradients accumulate by default, so clear them before each step.
        optimizer.zero_grad()

        # squeeze(1) only removes axis 1 when that axis has size 1;
        # otherwise the tensors are passed through unchanged.
        model_inputs = {
            'input_ids': batch['input_ids'].squeeze(1),
            'attention_mask': batch['attention_mask'].squeeze(1),
            'labels': batch['labels'],
        }

        # With labels supplied, the model output carries the loss itself.
        outputs = model(**model_inputs)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        print(f"Epoch: {epoch}, Loss: {loss.item()}")

print('Training done')

Epoch: 0, Loss: 0.7270599603652954
Epoch: 0, Loss: 0.6673921346664429
Epoch: 0, Loss: 0.667435884475708
Epoch: 0, Loss: 0.6605194807052612
Epoch: 0, Loss: 0.6014758944511414
Epoch: 0, Loss: 0.7094225883483887
Epoch: 0, Loss: 0.7609310150146484
Epoch: 0, Loss: 0.686905562877655
Epoch: 0, Loss: 0.6858932375907898
Epoch: 0, Loss: 0.6401957273483276
Epoch: 0, Loss: 0.7001824975013733
Epoch: 0, Loss: 0.655680775642395
Epoch: 0, Loss: 0.6678980588912964
Epoch: 0, Loss: 0.7118004560470581
Epoch: 0, Loss: 0.6762088537216187
Epoch: 0, Loss: 0.65179842710495
Epoch: 0, Loss: 0.6654494404792786
Epoch: 0, Loss: 0.7143689393997192
Epoch: 0, Loss: 0.6829431056976318
Epoch: 0, Loss: 0.7807716131210327
Epoch: 0, Loss: 0.6674270629882812
Epoch: 0, Loss: 0.6893079876899719
Epoch: 0, Loss: 0.6865692138671875
Epoch: 0, Loss: 0.6793285608291626
Epoch: 0, Loss: 0.6806218028068542
Epoch: 0, Loss: 0.6747983694076538
Epoch: 0, Loss: 0.6837022304534912
Epoch: 0, Loss: 0.6505004167556763
Epoch: 0, Loss: 0.7558925

In [None]:
# Build the evaluation pipeline over the held-out records.
# The tokenizer is reloaded from the same base checkpoint name used for training.
tokenizer = tkz.from_pretrained(model_base)
validation_dataset = TTDataset(copa_data=test_dataset_back, tokenizer=tokenizer)
# No shuffling here: evaluation order does not matter and stays fixed.
validation_dataloader = DataLoader(dataset=validation_dataset, batch_size=8)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def compute_metrics(y_true, y_pred):
    """Score predictions against ground truth.

    Args:
        y_true: Sequence of ground-truth class indices.
        y_pred: Sequence of predicted class indices, same length as ``y_true``.

    Returns:
        ``[accuracy, f1, precision, recall]`` as computed by scikit-learn with
        its default arguments (assumes binary 0/1 labels - TODO confirm against
        the dataset).
    """
    return [
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
    ]


model.eval()
correct_predictions = 0
total_predictions = 0
batches = 0

# Labels and predictions are gathered for the *whole* validation set and scored
# once at the end. Averaging per-batch F1/precision/recall is not the same as
# the dataset-level metric: the final batch can be smaller than the others, and
# a small batch may contain no positive labels or predictions at all.
all_labels = []
all_predictions = []

with torch.no_grad():
    for batch in validation_dataloader:
        input_ids = batch['input_ids'].squeeze(1)
        attention_mask = batch['attention_mask'].squeeze(1)
        labels = batch['labels']  # ground truth

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)  # predicted choice per example

        all_labels.extend(labels.tolist())
        all_predictions.extend(predictions.tolist())
        batches += 1

        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.size(0)

overall_accuracy = correct_predictions / total_predictions
print(f"Overall Accuracy: {overall_accuracy:.4f}")

# Dataset-level metrics, computed in one shot over every collected example.
accuracy, f1, precision, recall = compute_metrics(all_labels, all_predictions)
print(f"Accuracy: {accuracy:.4f}")
print(f"F1: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

print(f'Number of batches: {batches}')

In [None]:
# Accuracy over the validation set: count exact matches between the
# highest-scoring choice and the ground-truth label, batch by batch.
model.eval()
correct_predictions, total_predictions = 0, 0

with torch.no_grad():  # inference only, no gradients needed
    for batch in validation_dataloader:
        input_ids = batch['input_ids'].squeeze(1)
        attention_mask = batch['attention_mask'].squeeze(1)
        labels = batch['labels']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Index of the largest logit along the choice axis.
        predictions = outputs.logits.argmax(dim=1)

        correct_predictions += int(predictions.eq(labels).sum())
        total_predictions += labels.shape[0]

overall_accuracy = correct_predictions / total_predictions
print(f"Overall Accuracy: {overall_accuracy:.4f}")

Overall Accuracy: 0.6594
