In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m60.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m80.3 MB/s[0m eta [36m0:00:

In [2]:
# import the necessary libraries
import numpy as np
import pandas as pd
from google.colab import drive
import torch
import torch.nn as nn
import transformers
from transformers import BertForMultipleChoice, BertTokenizer, AdamW
from torch.utils.data import DataLoader, TensorDataset

In [3]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [22]:
# Load data from .npy file
data = np.load("/content/gdrive/MyDrive/sentence_puzzle.npy", allow_pickle=True)[()]
len(data)

627

In [12]:
# Split data into training and validation
split_ratio = 0.8

total_samples = len(data)
split_index = int(total_samples * split_ratio)

training_data = data[:split_index]
validation_data = data[split_index:]

# Extract questions, options, and correct_indices for training and validation sets
questions = [entry['question'] for entry in training_data]
options = [entry['choice_list'] for entry in training_data]
correct_indices = [entry['label'] for entry in training_data]

validation_questions = [entry['question'] for entry in validation_data]
validation_options = [entry['choice_list'] for entry in validation_data]
validation_correct_indices = [entry['label'] for entry in validation_data]

In [16]:
df = pd.DataFrame(training_data)
df1 = pd.DataFrame(validation_data)
df

Unnamed: 0,0
0,"{'id': 'SP-0', 'question': 'Mr. and Mrs. Musta..."
1,"{'id': 'SP-0_SR', 'question': 'The six daughte..."
2,"{'id': 'SP-0_CR', 'question': 'A chess team ha..."
3,"{'id': 'SP-1', 'question': 'A woman shoots her..."
4,"{'id': 'SP-1_SR', 'question': 'An individual s..."
...,...
496,"{'id': 'SP-165_SR', 'question': 'Cleopatra and..."
497,"{'id': 'SP-165_CR', 'question': 'Two chickens ..."
498,"{'id': 'SP-166', 'question': 'A man has 9 chil..."
499,"{'id': 'SP-166_SR', 'question': 'A man has nin..."


In [14]:
df1

Unnamed: 0,0
0,"{'id': 'SP-167', 'question': 'There is a bomb ..."
1,"{'id': 'SP-167_SR', 'question': 'A PC has a bo..."
2,"{'id': 'SP-167_CR', 'question': 'A grenade is ..."
3,"{'id': 'SP-168', 'question': 'Andy is put in a..."
4,"{'id': 'SP-168_SR', 'question': 'Andy is place..."
...,...
121,"{'id': 'SP-207_SR', 'question': 'Ten pears hun..."
122,"{'id': 'SP-207_CR', 'question': 'There were tw..."
123,"{'id': 'SP-208', 'question': 'The more you tak..."
124,"{'id': 'SP-208_SR', 'question': 'The more you ..."


In [18]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode the questions and options
input_ids = []
attention_masks = []

for question, option_list in zip(questions, options):
    final_input_ids = []
    final_attention_mask = []

    question_tokens = tokenizer.tokenize(question)
    max_question_length = tokenizer.model_max_length - len(option_list) - 3
    question_tokens = question_tokens[:max_question_length]

    for option in option_list:
        option_tokens = tokenizer.tokenize(option)
        max_option_length = tokenizer.model_max_length - len(question_tokens) - 3
        option_tokens = option_tokens[:max_option_length]

        tokens = ["[CLS]"] + question_tokens + ["[SEP]"] + option_tokens + ["[SEP]"]
        input_id = tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)

        max_length = tokenizer.model_max_length
        while len(input_id) < max_length:
            input_id.append(0)
            attention_mask.append(0)

        final_input_ids.append(input_id)
        final_attention_mask.append(attention_mask)

    input_ids.append(final_input_ids)
    attention_masks.append(final_attention_mask)

# Convert lists to PyTorch tensors
input_ids = torch.tensor(input_ids).to(device)
attention_masks = torch.tensor(attention_masks).to(device)
correct_indices = torch.tensor(correct_indices).to(device)

# Create a DataLoader for batching and shuffling the data
batch_size = 4
dataset = TensorDataset(input_ids, attention_masks, correct_indices)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

  correct_indices = torch.tensor(correct_indices).to(device)


In [19]:
# Encode the validation questions and options
validation_input_ids = []
validation_attention_masks = []

for question, option_list in zip(validation_questions, validation_options):
    final_input_ids = []
    final_attention_mask = []

    question_tokens = tokenizer.tokenize(question)
    max_question_length = tokenizer.model_max_length - len(option_list) - 3
    question_tokens = question_tokens[:max_question_length]

    for option in option_list:
        option_tokens = tokenizer.tokenize(option)
        max_option_length = tokenizer.model_max_length - len(question_tokens) - 3
        option_tokens = option_tokens[:max_option_length]

        tokens = ["[CLS]"] + question_tokens + ["[SEP]"] + option_tokens + ["[SEP]"]
        input_id = tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)

        max_length = tokenizer.model_max_length
        while len(input_id) < max_length:
            input_id.append(0)
            attention_mask.append(0)

        final_input_ids.append(input_id)
        final_attention_mask.append(attention_mask)

    validation_input_ids.append(final_input_ids)
    validation_attention_masks.append(final_attention_mask)

# Convert lists to PyTorch tensors
validation_input_ids = torch.tensor(validation_input_ids).to(device)
validation_attention_masks = torch.tensor(validation_attention_masks).to(device)
validation_correct_indices = torch.tensor(validation_correct_indices).to(device)

# Create a DataLoader for the validation dataset
validation_dataset = TensorDataset(validation_input_ids, validation_attention_masks, validation_correct_indices)
validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained model and tokenizer
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
model.to(device)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

criterion = nn.CrossEntropyLoss()

optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
num_epochs = 4
accumulation_steps = 2
total_steps = len(dataloader)
for epoch in range(num_epochs):
    model.train()
    for step, batch in enumerate(dataloader):
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        if (step + 1) % accumulation_steps == 0 or step == total_steps - 1:
            optimizer.step()
            optimizer.zero_grad()

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
model.eval()
correct_predictions = 0
total_predictions = 0

# Initialize a list to store predicted probabilities for each batch
all_probabilities = []

for step, batch in enumerate(validation_dataloader):
    input_ids, attention_mask, labels = batch
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    probabilities = torch.softmax(logits, dim=1)

    all_probabilities.append(probabilities.cpu().numpy())

    predicted_class = logits.argmax(dim=1)

    correct_predictions += (predicted_class == labels).sum().item()
    total_predictions += labels.size(0)

# Calculate accuracy
accuracy = correct_predictions / total_predictions
print(f"Accuracy: {accuracy}")

all_probabilities = np.concatenate(all_probabilities, axis=0)

# Print the probabilities for first 5 questions
for i, batch_probs in enumerate(all_probabilities[:5]):
    print(f"Batch {i + 1} Probabilities:")
    for j, probs in enumerate(batch_probs):
        print(f"Option {j + 1}: {probs}")


Accuracy: 0.6587301587301587
Batch 1 Probabilities:
Option 1: 0.006900370586663485
Option 2: 0.13737823069095612
Option 3: 0.8212735652923584
Option 4: 0.03444787859916687
Batch 2 Probabilities:
Option 1: 0.005919937044382095
Option 2: 0.14755307137966156
Option 3: 0.8137144446372986
Option 4: 0.03281249478459358
Batch 3 Probabilities:
Option 1: 0.3211188316345215
Option 2: 0.2412615865468979
Option 3: 0.4315137267112732
Option 4: 0.006105865351855755
Batch 4 Probabilities:
Option 1: 0.4321253001689911
Option 2: 0.0011234034318476915
Option 3: 0.5658923983573914
Option 4: 0.0008589131757616997
Batch 5 Probabilities:
Option 1: 0.0017188501078635454
Option 2: 0.42827606201171875
Option 3: 0.5686832666397095
Option 4: 0.0013217901578173041
