In [None]:
"""
Author: tanishq-ids

This module loads a pre-trained Hugging Face model, defines a custom
dataset class, fine-tunes the model on the custom dataset, and saves
the fine-tuned model and tokenizer. It also includes code to load the
saved model locally.
 
"""

In [None]:
import pandas as pd
import torch
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset

### Load Dataset

In [None]:
# Location of the labelled question/context pairs, relative to the notebook's
# working directory.
DATA_PATH = "data/output_curator.csv"
# Columns that the dataset-building cell further down reads from the frame.
REQUIRED_COLUMNS = ("question", "context", "label")

# Load the dataset into a pandas DataFrame.
df = pd.read_csv(DATA_PATH)

# Fail here, with a clear message, rather than with a bare KeyError after the
# model has already been downloaded and the split has been made.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(f"{DATA_PATH} is missing required column(s): {missing_columns}")

In [None]:
class CustomDataset(Dataset):
    """
    Custom dataset class for handling question-answer pairs.

    Args:
        tokenizer (transformers.PreTrainedTokenizer): tokenizing input text.
        questions (list): List of questions.
        contexts (list): List of corresponding contexts.
        labels (list): List of labels.
        max_length (int): Maximum length of input sequences.

    Methods:
        __len__: Returns the length of the dataset.
        __getitem__: Returns an item from the dataset by index.

    Example:
        dataset = CustomDataset(tokenizer, questions,
                    contexts, labels, max_length)
    """

    def __init__(self, tokenizer, questions, contexts, labels, max_length):
        self.tokenizer = tokenizer
        self.questions = questions
        self.contexts = contexts
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = str(self.questions[idx])
        context = str(self.contexts[idx])
        label = self.labels[idx]

        inputs = self.tokenizer(
            question, context, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt"
        )

        input_ids = inputs["input_ids"].squeeze()
        attention_mask = inputs["attention_mask"].squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": torch.tensor(label, dtype=torch.long),
        }

### Load Model

In [None]:
# Checkpoint to start from and the number of output classes for the
# classification head.
NUM_LABELS = 2
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# The tokenizer and the model are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)

### Generate Train & Eval dataset

In [None]:
# Every example is padded/truncated to this many tokens.
MAX_LENGTH = 512

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
# Each part gets a fresh 0..n-1 index so that positional and label-based row
# lookups agree.
train_df, eval_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=42)
)

# Training split
train_dataset = CustomDataset(
    tokenizer=tokenizer,
    questions=train_df["question"],
    contexts=train_df["context"],
    labels=train_df["label"],
    max_length=MAX_LENGTH,
)

# Evaluation split
eval_dataset = CustomDataset(
    tokenizer=tokenizer,
    questions=eval_df["question"],
    contexts=eval_df["context"],
    labels=eval_df["label"],
    max_length=MAX_LENGTH,
)

### Training

In [None]:
# Number of full passes over the training split.
EPOCHS = 2

training_args = TrainingArguments(
    # Optimisation schedule
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=4,
    # Checkpointing: write a checkpoint every 500 steps, reusing the directory
    output_dir="./saved_models_during_training",
    overwrite_output_dir=True,
    save_steps=500,
    # save_total_limit=2,  # uncomment to cap how many checkpoints are kept
    # Logging
    logging_dir="./logs",
)

In [None]:
# Wire the model, hyper-parameters and both dataset splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning.
trainer.train()

### Evaluate

In [None]:
# Score the fine-tuned model on the held-out split.
eval_result = trainer.evaluate(eval_dataset)

# Report every returned metric, one "name: value" pair per line.
print(
    "Evaluation results:",
    *(f"{metric}: {score}" for metric, score in eval_result.items()),
    sep="\n",
)

In [None]:
# Run inference over the evaluation dataset.
predictions = trainer.predict(eval_dataset)

# Raw per-class scores, and the highest-scoring class for each example.
predicted_logits = predictions.predictions
predicted_labels = predicted_logits.argmax(axis=1)

# Ground-truth labels as collected by the Trainer during prediction.
# This avoids slicing the dataset (`eval_dataset[:]`), which would push a
# slice through __getitem__ and tokenize the string form of whole columns
# just to read the labels back.
true_labels = predictions.label_ids

# Fraction of evaluation examples whose predicted class matches the label.
accuracy = accuracy_score(true_labels, predicted_labels)
print("Accuracy:", accuracy)

In [None]:
# Print each evaluation input next to its true and predicted label.
# Index over range(len(...)) instead of iterating the dataset object: implicit
# iteration only stops when __getitem__ raises IndexError, and a pandas-backed
# lookup raises KeyError past the last row, which would end this cell with an
# error.
for i in range(len(eval_dataset)):
    input_ids = eval_dataset[i]["input_ids"]
    decoded_input = tokenizer.decode(input_ids, skip_special_tokens=True)
    print(f"Input: {decoded_input}")
    print(f"True Label: {true_labels[i]}, Predicted Label: {predicted_labels[i]}")
    print()

### Saving the model locally

In [None]:
# Persist the fine-tuned model and its tokenizer side by side.
# "saved_model" is the directory both are written to.
model.save_pretrained("saved_model")
tokenizer.save_pretrained("saved_model")

### Loading the Model

In [None]:
# Restore the tokenizer and the fine-tuned model from the "saved_model"
# directory; this rebinds `tokenizer` and `model` to the reloaded objects.
tokenizer = AutoTokenizer.from_pretrained("saved_model")
model = AutoModelForSequenceClassification.from_pretrained("saved_model")