In [None]:
pip install accelerate -U



In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m105.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m96.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
In

In [None]:
# import the necessary libraries
import numpy as np
import pandas as pd
from google.colab import drive
import torch
import torch.nn as nn
import transformers
from transformers import RobertaForMultipleChoice, RobertaTokenizer, AdamW
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Mount Google Drive at /content/gdrive; the dataset is read from
# /content/gdrive/MyDrive in a later cell.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Load data from .npy file
# NOTE(review): allow_pickle=True unpickles the file contents on load, which
# can execute arbitrary code — only use this with files you trust.
# The trailing [()] indexes the loaded array with an empty tuple. Later cells
# slice `data` and read the keys 'question', 'choice_list' and 'label' from
# each entry.
data = np.load("/content/gdrive/MyDrive/sentence_puzzle.npy", allow_pickle=True)[()]

In [None]:
# Fix the random seeds of NumPy's global RNG and of PyTorch so repeated runs
# start from the same random state.
# NOTE(review): numpy and torch are already imported in the import cell above;
# the re-imports here are redundant but harmless.
import numpy as np
import torch

seed = 42  # You can choose any seed value
np.random.seed(seed)
# Last expression of the cell: its return value (a torch Generator) is what the
# notebook displays as this cell's output.
torch.manual_seed(seed)

<torch._C.Generator at 0x7dada73fa0b0>

In [None]:
# Hold out the tail of the dataset for validation: entries are taken in file
# order (no shuffling), the first 80% for training and the rest for validation.
split_ratio = 0.8

total_samples = len(data)
split_index = int(total_samples * split_ratio)

training_data, validation_data = data[:split_index], data[split_index:]


def _columns(entries):
    """Return (questions, choice lists, labels) of ``entries`` as three parallel lists."""
    return (
        [entry['question'] for entry in entries],
        [entry['choice_list'] for entry in entries],
        [entry['label'] for entry in entries],
    )


# Column-wise views of each split, consumed by the tokenization cells below
questions, options, correct_indices = _columns(training_data)
validation_questions, validation_options, validation_correct_indices = _columns(validation_data)

In [None]:
# Sanity check: show the question, the answer options and the label of the
# training example at index 1.
for column in (questions, options, correct_indices):
    print(column[1])

The six daughters of Mr. and Mrs. Mustard each have one brother. However, the family only consists of nine people; how is that possible?
['Some brothers were not loved by family and moved away.', 'Each daughter shares the same brother.', 'Some daughters get married and have their own family.', 'None of above.']
1


In [None]:
# Number of examples in the training split.
len(training_data)

501

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the "roberta-base" checkpoint; its
# vocabulary files are downloaded on first use.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
from transformers import AutoTokenizer
import numpy as np

# NOTE(review): a later cell rebuilds tokenized_training_data /
# tokenized_validation_data (and overwrites both .npy files) using fixed-length
# padding, so the results of this cell are superseded.
def _encode_examples(example_questions, example_options, example_labels):
    """Tokenize every question/choice combination of one dataset split.

    Each choice is appended to its question (separated by a single space) and
    encoded on its own, so the per-choice tensors keep their natural lengths.

    Returns a list with one dict per question holding 'input_ids' and
    'attention_mask' (lists with one tensor per choice) and 'correct_index'.
    """
    encoded_examples = []
    for question, choices, label in zip(example_questions, example_options, example_labels):
        per_choice = [
            tokenizer(question + " " + choice, return_tensors='pt', padding=True, truncation=True)
            for choice in choices
        ]
        encoded_examples.append({
            'input_ids': [encoding['input_ids'] for encoding in per_choice],
            'attention_mask': [encoding['attention_mask'] for encoding in per_choice],
            'correct_index': label
        })
    return encoded_examples


# Tokenize questions and options for training set
tokenized_training_data = _encode_examples(questions, options, correct_indices)

# Tokenize questions and options for validation set
tokenized_validation_data = _encode_examples(
    validation_questions, validation_options, validation_correct_indices
)

# Save tokenized data as numpy arrays
np.save('tokenized_training_data.npy', tokenized_training_data)
np.save('tokenized_validation_data.npy', tokenized_validation_data)

In [None]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch
import numpy as np

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Each feature must be a mapping with:

    * ``"input_ids"`` and ``"attention_mask"``: one token sequence per answer
      choice. A sequence may be a list, array or CPU tensor; any extra axes
      (e.g. a leading singleton batch axis) are flattened away.
    * ``"correct_index"``: the integer label of the example.

    Every feature in a batch must have the same number of choices. The choice
    sequences themselves may differ in length; they are padded by
    ``tokenizer.pad`` according to ``padding`` / ``max_length`` /
    ``pad_to_multiple_of``.

    Calling the collator returns a dict with ``"input_ids"`` and
    ``"attention_mask"`` tensors shaped ``(batch_size, num_choices, seq_len)``
    and an int64 ``"labels"`` tensor shaped ``(batch_size,)``.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # Flatten to one entry per (example, choice) as plain Python lists.
        # Unlike stacking everything into a single NumPy array first, this
        # also works when the choices have different lengths, which is the
        # case that actually needs padding.
        flattened_features = [
            {
                key: np.asarray(feature[key][choice]).reshape(-1).tolist()
                for key in ("input_ids", "attention_mask")
            }
            for feature in features
            for choice in range(num_choices)
        ]
        labels = np.array([feature["correct_index"] for feature in features])

        # Pad all flattened sequences to a common length
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Restore the (batch, choice, sequence) layout
        batch = {
            k: v.view(batch_size, num_choices, -1) for k, v in batch.items()
        }
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
from transformers import AutoTokenizer

max_sequence_length = 128  # Define your desired sequence length


def _tokenize_fixed_length(split_questions, split_options, split_labels):
    """Encode every question/choice combination of one split to a fixed length.

    The question and the choice are passed to the tokenizer as a sequence
    pair instead of one concatenated string. A single string is truncated from
    the right, so a long question would cut off the answer choice (the only
    part that differs between the options). With a pair, truncation removes
    tokens from the longer sequence first, which keeps the choice intact for
    typical inputs.

    Returns a list with one dict per question holding 'input_ids' and
    'attention_mask' (lists with one ``(1, max_sequence_length)`` tensor per
    choice) and 'correct_index'.
    """
    tokenized = []
    for question, choices, label in zip(split_questions, split_options, split_labels):
        input_ids = []
        attention_mask = []

        for choice in choices:
            # Tokenize and encode the (question, choice) pair with padding and truncation
            inputs = tokenizer(
                question,
                choice,
                return_tensors='pt',
                padding='max_length',  # Pad to the specified sequence length
                truncation=True,
                max_length=max_sequence_length,
            )

            input_ids.append(inputs['input_ids'])
            attention_mask.append(inputs['attention_mask'])

        tokenized.append({
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'correct_index': label
        })
    return tokenized


# Tokenize questions and options for training set
tokenized_training_data = _tokenize_fixed_length(questions, options, correct_indices)

# Tokenize questions and options for validation set
tokenized_validation_data = _tokenize_fixed_length(
    validation_questions, validation_options, validation_correct_indices
)

# Save tokenized data as numpy arrays
np.save('tokenized_training_data.npy', tokenized_training_data)
np.save('tokenized_validation_data.npy', tokenized_validation_data)

In [None]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.data.data_collator import DataCollatorWithPadding

# Load your tokenized training and validation data
# These are the files written by the tokenization cell above. They hold Python
# objects, so allow_pickle=True is required.
# NOTE(review): unpickling can execute arbitrary code — only load files this
# notebook produced itself.
tokenized_training_data = np.load('tokenized_training_data.npy', allow_pickle=True)
tokenized_validation_data = np.load('tokenized_validation_data.npy', allow_pickle=True)

# Dataset adapter over the pre-tokenized multiple-choice examples
class MultipleChoiceDataset(torch.utils.data.Dataset):
    """Index-based view over a sequence of pre-tokenized examples.

    ``data`` is any indexable, sized collection whose items provide the keys
    'input_ids', 'attention_mask' and 'correct_index'. Each item is returned
    as a new dict in which 'correct_index' is exposed under the key 'labels'.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data[idx]
        item = {name: example[name] for name in ('input_ids', 'attention_mask')}
        item['labels'] = example['correct_index']
        return item

# Create DataLoader for training and validation
# (No DataLoader is built here; the Trainer receives the datasets and the
# collator directly.)
batch_size = 16  # Adjust as needed
train_dataset = MultipleChoiceDataset(tokenized_training_data)
train_data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

validation_dataset = MultipleChoiceDataset(tokenized_validation_data)
# NOTE(review): configured identically to train_data_collator. It is not passed
# to the Trainer below; it is used by the manual accuracy DataLoader in a later cell.
validation_data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Start from the pretrained roberta-base weights. The multiple-choice head is
# newly initialised (see the warning in this cell's output) and learned here.
model = RobertaForMultipleChoice.from_pretrained("roberta-base")

# Define training arguments
# Evaluation and checkpointing both happen once per epoch, and the best
# checkpoint is reloaded when training finishes.
training_args = TrainingArguments(
    output_dir="fine_tune_roberta",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,  # Set to True if you want to push the model to the Hugging Face Model Hub
)

# Create Trainer
# Only one collator can be supplied, so train_data_collator is the one the
# Trainer has available for every batch it builds.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=train_data_collator,  # Use the data collator for training
    train_dataset=train_dataset,  # Use the dataset for training
    eval_dataset=validation_dataset,  # Use the dataset for validation
    tokenizer=tokenizer,
)

# Start training
# Last expression of the cell: the returned TrainOutput is displayed as output.
trainer.train()

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'classifier.bias', 'roberta.pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,1.22429
2,No log,1.238547
3,No log,1.320316


TrainOutput(global_step=96, training_loss=0.6610579490661621, metrics={'train_runtime': 158.7963, 'train_samples_per_second': 9.465, 'train_steps_per_second': 0.605, 'total_flos': 395452365566976.0, 'train_loss': 0.6610579490661621, 'epoch': 3.0})

In [None]:
pip install huggingface_hub



In [None]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face login widget. The token is entered in the
# widget, so no credential is stored in the notebook source.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the trained model to the Hugging Face Hub; the call returns the URL of
# the target repository, shown as this cell's output.
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

events.out.tfevents.1699215126.9c56d5895734.395.0:   0%|          | 0.00/5.36k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

'https://huggingface.co/yashikam19/fine_tune_roberta/tree/main/'

In [None]:
from transformers import AutoTokenizer, AutoModelForMultipleChoice

In [None]:
# Reload the fine-tuned model by the name "fine_tune_roberta", which matches the
# Trainer's output_dir — presumably resolved as that local directory; confirm it
# exists before running this cell on a fresh machine.
# Note that this rebinds the global `model` used by the evaluation cell below.
model = AutoModelForMultipleChoice.from_pretrained("fine_tune_roberta")

In [None]:
# Reload the tokenizer from the same "fine_tune_roberta" location as the model
# (presumably saved there because the tokenizer was passed to the Trainer — confirm).
# This rebinds the global `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("fine_tune_roberta")

In [None]:
def calculate_accuracy(model, dataloader):
    """Return the fraction of examples in ``dataloader`` that ``model`` labels correctly.

    Args:
        model: Callable model exposing ``eval()`` and a ``device`` attribute.
            Called as ``model(**inputs)``, it must return an object with a
            ``logits`` tensor whose last axis indexes the answer choices.
        dataloader: Iterable of batches. Each batch is a dict of tensors that
            contains a ``"labels"`` entry; every other entry is forwarded to
            the model as a keyword argument.

    Returns:
        Accuracy as a float in ``[0, 1]``. An empty ``dataloader`` yields
        ``0.0`` instead of raising ``ZeroDivisionError``.

    Side effects:
        Puts the model into evaluation mode and leaves it there.
    """
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():  # inference only, no gradients needed
        for batch in dataloader:
            inputs = {key: value.to(model.device) for key, value in batch.items()}
            # Labels are compared against predictions, not fed to the model,
            # so the forward pass does not compute a loss.
            labels = inputs.pop("labels")
            logits = model(**inputs).logits
            predicted_labels = torch.argmax(logits, dim=-1)
            correct += torch.sum(predicted_labels == labels).item()
            total += len(labels)
    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0
    return correct / total

In [None]:
from torch.utils.data import DataLoader

# Number of validation examples scored per forward pass
batch_size = 16  # Adjust according to your needs

# Batch the validation set with the collator prepared for it
validation_dataloader = DataLoader(
    dataset=validation_dataset,
    collate_fn=validation_data_collator,  # Use your validation_data_collator
    batch_size=batch_size,
)

# Score the reloaded model on the validation set and report the result
validation_accuracy = calculate_accuracy(dataloader=validation_dataloader, model=model)
print(f"Validation Accuracy: {validation_accuracy:.2%}")

Validation Accuracy: 53.17%
