In [1]:
# Imports and one-time setup for the notebook.
import pandas as pd
# Lemmatization: spaCy's small English pipeline, loaded once into `nlp`.
import spacy
# The model must be installed beforehand: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
# Pure stemming: NLTK Snowball stemmer configured for English.
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")

import matplotlib.pyplot as plt

# Data loading / splitting, and the Hugging Face fine-tuning stack.
from pandas import json_normalize
from sklearn.model_selection import train_test_split
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset

## Data Collection and Preprocessing:

In [None]:
# Read the raw JSON file into json_data (presumably the SQuAD v1.1 dev set,
# judging by the file name).
json_data = pd.read_json('data/dev-v1.1.json')

# Flatten to one row per entry of paragraphs -> qas, carrying the article
# title and the paragraph context along as metadata columns
# ('title' and 'paragraphs.context').
df = json_normalize(
    json_data['data'],
    record_path=['paragraphs', 'qas'],
    meta=['title', ['paragraphs', 'context']],
    errors='ignore'
)

# Keep the text of the first three answers of every question in their own columns.
answer_columns = ['answer1', 'answer2', 'answer3']
answer_texts = df['answers'].apply(
    lambda ans: [answer['text'] for answer in ans[:len(answer_columns)]]
)
# A ragged list-of-lists only yields as many columns as its longest row, so
# reindex to a fixed width: questions with fewer answers get missing values
# instead of the assignment failing when no row has three answers.
df[answer_columns] = (
    pd.DataFrame(answer_texts.to_list(), index=df.index)
    .reindex(columns=range(len(answer_columns)))
)

# Drop the original 'answers' column now that its texts are extracted
df = df.drop(columns=['answers'])

# Display the result
df.head()

In [3]:
# Build the prompt for fine-tuning: context, then question, then an "Answer:" cue.
# astype(str) reproduces f-string formatting for any non-string values.
df['input_text'] = (
    "Context: " + df['paragraphs.context'].astype(str)
    + " Question: " + df['question'].astype(str)
    + " Answer:"
)

# Split into training and validation sets. The seed is fixed so that
# Restart & Run All always yields the same split.
SPLIT_SEED = 42
train_frame, val_frame = train_test_split(
    df[['input_text', 'answer1']], test_size=0.2, random_state=SPLIT_SEED
)

# Convert to lists of {'input_text', 'target_text'} dictionaries for training
train_data = train_frame.rename(columns={'answer1': 'target_text'}).to_dict('records')
val_data = val_frame.rename(columns={'answer1': 'target_text'}).to_dict('records')

In [None]:
# Load pre-trained model and tokenizer (using GPT-2 here)
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# No pad token is configured after loading; reuse EOS so padding to a fixed length works.
tokenizer.pad_token = tokenizer.eos_token
# Each example ends with the question and its answer, so when an example is
# too long drop tokens from the start of the context rather than from the end.
tokenizer.truncation_side = "left"
model = AutoModelForCausalLM.from_pretrained(model_name)


def tokenize_function(examples, max_length=512):
    """Tokenize prompt/answer pairs and attach causal-LM labels.

    Args:
        examples: A batch (as passed by ``Dataset.map(..., batched=True)``)
            with an ``input_text`` list and, optionally, a ``target_text``
            list. A single un-batched example (plain strings) is accepted too.
        max_length: Fixed sequence length to pad/truncate every example to.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) plus
        ``labels``: a copy of ``input_ids`` where padding positions are -100
        so they are ignored by the loss. Without ``labels`` the model returns
        no loss and ``Trainer.train()`` cannot run.
    """
    prompts = examples["input_text"]
    answers = examples["target_text"] if "target_text" in examples else None

    unbatched = isinstance(prompts, str)
    if unbatched:
        prompts = [prompts]
        answers = None if answers is None else [answers]
    if answers is None:
        answers = [None] * len(prompts)

    # Train on "<prompt> <answer><eos>" so the model learns to produce the
    # answer and then stop; a missing answer leaves just the prompt.
    texts = [
        (f"{prompt} {answer}" if answer else prompt) + tokenizer.eos_token
        for prompt, answer in zip(prompts, answers)
    ]
    encoded = tokenizer(texts, padding='max_length', truncation=True, max_length=max_length)

    # pad_token == eos_token, so mask by attention_mask (not by token id) to
    # keep the genuine end-of-sequence token in the loss.
    encoded["labels"] = [
        [token if attended else -100 for token, attended in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]

    if unbatched:
        return {key: values[0] for key, values in encoded.items()}
    return encoded


# Wrap the record lists as Hugging Face datasets
train_raw = Dataset.from_pandas(pd.DataFrame(train_data))
val_raw = Dataset.from_pandas(pd.DataFrame(val_data))

# Tokenize and drop the raw text columns, leaving only model inputs and labels
train_dataset = train_raw.map(tokenize_function, batched=True, remove_columns=train_raw.column_names)
val_dataset = val_raw.map(tokenize_function, batched=True, remove_columns=val_raw.column_names)

# Set training arguments
# transformers renamed `evaluation_strategy` to `eval_strategy`; newer
# releases reject the old keyword and older ones do not know the new one, so
# use whichever field this installed version defines.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    **{eval_strategy_key: "epoch"},  # evaluate on val_dataset after every epoch
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Save the fine-tuned weights and the tokenizer side by side so the directory
# can be reloaded with from_pretrained("./fine_tuned_model")
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")