In [1]:
import pandas as pd
import spacy
# python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")
import matplotlib.pyplot as plt
from pandas import json_normalize
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset

## Data Collection and Preprocessing:

In [3]:
# read json data tp json_data
json_data = pd.read_json('data/dev-v1.1.json')

# Use json_normalize to flatten question and id, while keeping answers
df = json_normalize(
    json_data['data'], 
    record_path=['paragraphs', 'qas'], 
    meta=['title', ['paragraphs', 'context']], 
    errors='ignore'
)

# Extract answers and create separate columns for answer1, answer2, answer3
df[['answer1', 'answer2', 'answer3']] = pd.DataFrame(
    df['answers'].apply(lambda ans: [answer['text'] for answer in ans[:3]]).to_list(), index=df.index
)

# Drop the original 'answers' column
df = df.drop(columns=['answers'])

# Display the result
df.head()

Unnamed: 0,question,id,title,paragraphs.context,answer1,answer2,answer3
0,Which NFL team represented the AFC at Super Bo...,56be4db0acb8001400a502ec,Super_Bowl_50,Super Bowl 50 was an American football game to...,Denver Broncos,Denver Broncos,Denver Broncos
1,Which NFL team represented the NFC at Super Bo...,56be4db0acb8001400a502ed,Super_Bowl_50,Super Bowl 50 was an American football game to...,Carolina Panthers,Carolina Panthers,Carolina Panthers
2,Where did Super Bowl 50 take place?,56be4db0acb8001400a502ee,Super_Bowl_50,Super Bowl 50 was an American football game to...,"Santa Clara, California",Levi's Stadium,Levi's Stadium in the San Francisco Bay Area a...
3,Which NFL team won Super Bowl 50?,56be4db0acb8001400a502ef,Super_Bowl_50,Super Bowl 50 was an American football game to...,Denver Broncos,Denver Broncos,Denver Broncos
4,What color was used to emphasize the 50th anni...,56be4db0acb8001400a502f0,Super_Bowl_50,Super Bowl 50 was an American football game to...,gold,gold,gold


In [4]:

# Column that combines questions and context
df['input_text'] = df.apply(lambda row: f"Context: {row['paragraphs.context']} Question: {row['question']} Answer:", axis=1)

# Split the data into training and validation sets
train_data, val_data = train_test_split(df[['input_text', 'answer1']], test_size=0.2)

# Convert to list of dictionaries for training
train_data = [{'input_text': row['input_text'], 'target_text': row['answer1']} for idx, row in train_data.iterrows()]
val_data = [{'input_text': row['input_text'], 'target_text': row['answer1']} for idx, row in val_data.iterrows()]

## Model Training

In [None]:
# Initialize the model and tokenizer
model_name = 't5-small' 
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Using a dataset object
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

# Tokenizing the input and target
def preprocess_data(examples):
    inputs = examples['input_text']
    targets = examples['target_text']
    
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=64, truncation=True, padding="max_length").input_ids
    
    # Replacing padding token ids in labels with -100 to ignore them
    labels_with_padding = [-100 if token == tokenizer.pad_token_id else token for token in labels]
    model_inputs['labels'] = labels_with_padding
    return model_inputs

# Applying preprocessing
train_dataset = train_dataset.map(preprocess_data, batched=True)
val_dataset = val_dataset.map(preprocess_data, batched=True)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="steps",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    save_steps=10,
    eval_steps=10,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Start training
trainer.train(resume_from_checkpoint=True)

# Save the model
trainer.save_model('./trained_model')
tokenizer.save_pretrained('./trained_model')

## Model Evaluation

In [5]:
# Load the trained model and tokenizer
model_path = './app/trained_model'
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Prepare the validation dataset for evaluation (assuming `val_data` is already prepared)
val_dataset = Dataset.from_list(val_data)

# Tokenizing the input and target
def preprocess_data(examples):
    inputs = examples['input_text']
    targets = examples['target_text']
    
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=64, truncation=True, padding="max_length").input_ids
    
    # Replacing padding token ids in labels with -100 to ignore them during loss computation
    labels_with_padding = [-100 if token == tokenizer.pad_token_id else token for token in labels]
    model_inputs['labels'] = labels_with_padding
    return model_inputs

# Applying preprocessing on validation dataset
val_dataset = val_dataset.map(preprocess_data, batched=True)

# Define evaluation arguments
eval_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=4,
    logging_dir='./logs',
    logging_steps=10,
)

# Create a trainer instance
trainer = Trainer(
    model=model,
    args=eval_args,
    eval_dataset=val_dataset,
)

# Run evaluation
eval_results = trainer.evaluate()

# Print evaluation results
print(f"Evaluation Results: {eval_results}")

# Generate predictions for the validation set
def generate_answer(input_text):
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    input_ids = inputs.input_ids.to(model.device)
    
    # Generate output
    outputs = model.generate(input_ids, max_length=64, num_beams=4, early_stopping=True)
    
    # Decode the generated output
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Test the model on some validation samples
for i in range(5):  # Adjust to test more examples
    sample = val_data[i]
    input_text = sample['input_text']
    target_text = sample['target_text']
    
    predicted_answer = generate_answer(input_text)
    print(f"Input: {input_text}")
    print(f"Predicted Answer: {predicted_answer}")
    print(f"Actual Answer: {target_text}")
    print("-" * 50)



Map:   0%|          | 0/2114 [00:00<?, ? examples/s]

  0%|          | 0/529 [00:00<?, ?it/s]

Evaluation Results: {'eval_loss': 0.029073098674416542, 'eval_model_preparation_time': 0.0016, 'eval_runtime': 70.6523, 'eval_samples_per_second': 29.921, 'eval_steps_per_second': 7.487}
Input: Context: The Rankine cycle is the fundamental thermodynamic underpinning of the steam engine. The cycle is an arrangement of components as is typically used for simple power production, and utilizes the phase change of water (boiling water producing steam, condensing exhaust steam, producing liquid water)) to provide a practical heat/power conversion system. The heat is supplied externally to a closed loop with some of the heat added being converted to work and the waste heat being removed in a condenser. The Rankine cycle is used in virtually all steam power production applications. In the 1990s, Rankine steam cycles generated about 90% of all electric power used throughout the world, including virtually all solar, biomass, coal and nuclear power plants. It is named after William John Macquorn 

## Conclusion

### Methodology
The task involved fine-tuning a T5-based model for question answering using a dataset containing contexts, questions, and answers. The steps for training and evaluation were as follows:

#### Data Preparation:

- A JSON dataset was processed and flattened to extract contexts, questions, and answers.
- The data was further split into training and validation sets, where each row contained an input_text (combining context and question) and a target (answer1).


#### Model and Tokenization:

- The T5Tokenizer and T5ForConditionalGeneration model were used from the Hugging Face transformers library.
Both the training and validation datasets were tokenized, with inputs truncated to a maximum length of 512 tokens and target sequences to 64 tokens.

#### Model Setup and Training: 

- The Trainer class was employed for training, using a batch size of 4, with evaluation strategy set to run every 10 steps. Training was resumed from a checkpoint.
The model was trained for 1 epoch with num_train_epochs=1.

### Evaluation

- We evaluated the model was evaluated using the validation dataset after training, calculating the loss and checking predictions against actual answers.

### Results
- Evaluation Loss: The model achieved a very low evaluation loss of 0.029.
- Efficiency: The model processed 29.92 samples per second and 7.49 steps per second during evaluation.
- Example Prediction: On one sample from the validation set, the model was given the context of the Rankine cycle and asked, "What happens to waste heat in the Rankine cycle?"
  - Predicted Answer: "removed in a condenser"
  - Actual Answer: "removed in a condenser"

The model's performance on this example was highly accurate, demonstrating effective learning and prediction capabilities for T5 model for the question-answering task.
