# Introduction
This notebook demonstrates the implementation of a question-answering system using the T5ForConditionalGeneration model from the Hugging Face Transformers library. We'll process the SQuAD (Stanford Question Answering Dataset) to train and validate our model, and evaluate its performance using various metrics such as F1, BLEU, and ROUGE scores.

# 1. Setup and Imports

In [7]:
 # Import necessary libraries
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset, load_metric
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
import random


# 2. Load Model and Tokenizer


In [8]:
 # Choose the T5 checkpoint to evaluate. 't5-small' is the active choice;
# 't5-base' and 't5-large' are larger drop-in alternatives.
model_name = 't5-small'

# The tokenizer and the model are both loaded from the same checkpoint name.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# 3. Load and Prepare Datasets




In [9]:
 # Load SQuAD and keep only the leading slice of each split:
# the first 10,000 training rows and the first 1,000 validation rows.
dataset = load_dataset('squad')
train_dataset = dataset['train'].select(range(10000))
validation_dataset = dataset['validation'].select(range(1000))

# Materialise both subsets as plain Python lists of examples.
train_dataset_list = list(train_dataset)
validation_dataset_list = list(validation_dataset)


# 4. Load Metrics

In [10]:
 # Load the SQuAD metric; the evaluation function calls `metric.compute(...)`
# on it, which yields the `exact_match` and `f1` figures printed at the end.
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm that the
# installed `datasets` version still provides it.
metric = load_metric('squad')


# 5. Define Functions for Model Evaluation


In [11]:
 # Define function to generate answers from the model and evaluate them
def generate_answer_and_evaluate(dataset_list):
    """Generate an answer for every example and score the predictions.

    Each example is formatted as ``question: <question> context: <context>``,
    tokenized, and decoded with beam search (5 beams, at most 200 tokens).
    The function relies on the notebook-level ``tokenizer``, ``model`` and
    ``metric`` objects.

    Args:
        dataset_list: Iterable of SQuAD-style examples. Every example must
            provide ``id``, ``question``, ``context`` and ``answers``, where
            ``answers['text']`` holds at least one reference answer.

    Returns:
        A 4-tuple ``(squad_results, plain_text_predictions,
        plain_text_references, accuracy)``:

        * ``squad_results`` - whatever ``metric.compute`` returns for the
          predictions and the full reference answers.
        * ``plain_text_predictions`` - the decoded answer strings.
        * ``plain_text_references`` - the *first* reference answer of each
          example.
        * ``accuracy`` - percentage of predictions that are string-identical
          to that first reference. No normalisation is applied and the other
          references are ignored, so this is a strict figure that can be
          lower than the exact-match value reported by the metric.

    Raises:
        ValueError: If ``dataset_list`` yields no examples.
    """
    # Containers for the metric inputs and for the plain-text comparisons.
    predictions = []
    references = []
    plain_text_predictions = []
    plain_text_references = []

    for item in dataset_list:
        question = item['question']
        context = item['context']
        input_text = f"question: {question} context: {context}"

        # Put the prompt on the same device as the model so generation also
        # works after the model has been moved to an accelerator.
        input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(model.device)
        output_ids = model.generate(input_ids, max_length=200, num_beams=5, early_stopping=True)
        predicted_answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # The metric needs id-keyed records; BLEU/ROUGE/accuracy use plain text.
        predictions.append({'id': item['id'], 'prediction_text': predicted_answer})
        references.append({'id': item['id'], 'answers': item['answers']})
        plain_text_predictions.append(predicted_answer)
        plain_text_references.append(item['answers']['text'][0])

    # Fail with a clear message instead of a ZeroDivisionError further down.
    if not predictions:
        raise ValueError("dataset_list is empty: there is nothing to evaluate")

    squad_results = metric.compute(predictions=predictions, references=references)
    exact_matches = sum(
        1 for ref, pred in zip(plain_text_references, plain_text_predictions) if ref == pred
    )
    accuracy = exact_matches / len(plain_text_references) * 100
    return squad_results, plain_text_predictions, plain_text_references, accuracy

# Functions to calculate BLEU and ROUGE scores
def calculate_bleu(references, hypotheses):
    """Return the corpus-level BLEU of ``hypotheses`` against ``references``.

    Both arguments are sequences of strings. Every string is tokenised on
    whitespace, and each hypothesis is paired with exactly one reference.
    """
    # corpus_bleu takes a list of candidate references per hypothesis, so
    # each tokenised reference is wrapped in its own single-element list.
    reference_sets = []
    for reference in references:
        reference_sets.append([reference.split()])

    tokenised_hypotheses = []
    for hypothesis in hypotheses:
        tokenised_hypotheses.append(hypothesis.split())

    return corpus_bleu(reference_sets, tokenised_hypotheses)

def calculate_rouge(references, hypotheses):
    """Average the ROUGE-1 and ROUGE-L F-measures over all scored pairs.

    Args:
        references: Iterable of reference strings.
        hypotheses: Iterable of predicted strings, aligned with
            ``references``. Items are paired positionally with ``zip``, so
            any unpaired trailing items of the longer input are ignored.

    Returns:
        Dict with the keys ``'rouge1'`` and ``'rougeL'``, each mapped to the
        mean F-measure over the scored pairs. Both values are ``0.0`` when
        there is no pair to score.
    """
    rouge_types = ['rouge1', 'rougeL']

    # Materialise the pairs first: the average must be taken over the pairs
    # that are actually scored, and an empty input must not divide by zero.
    pairs = list(zip(references, hypotheses))
    if not pairs:
        return {key: 0.0 for key in rouge_types}

    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    rouge_scores = {key: 0.0 for key in rouge_types}
    for ref, hyp in pairs:
        scores = scorer.score(ref, hyp)
        for key in rouge_types:
            rouge_scores[key] += scores[key].fmeasure

    pair_count = len(pairs)
    return {key: total / pair_count for key, total in rouge_scores.items()}


# 6. Run Evaluation and Display Results


In [12]:
# Score the validation examples: SQuAD metric results, the raw prediction
# and reference strings, and the strict string-match accuracy.
squad_results, plain_text_predictions, plain_text_references, accuracy = (
    generate_answer_and_evaluate(validation_dataset_list)
)

# Text-overlap metrics between each prediction and its first reference answer.
bleu_score = calculate_bleu(plain_text_references, plain_text_predictions)
rouge_scores = calculate_rouge(plain_text_references, plain_text_predictions)

# Report every metric, one per line.
print(
    f"SQuAD Evaluation Results: {squad_results}",
    f"BLEU Score: {bleu_score}",
    f"ROUGE Scores: {rouge_scores}",
    f"Accuracy: {accuracy:.2f}%",
    sep="\n",
)


SQuAD Evaluation Results: {'exact_match': 76.4, 'f1': 83.8792542131071}
BLEU Score: 0.2900363598103316
ROUGE Scores: {'rouge1': 0.7989091516670457, 'rougeL': 0.7986014593593532}
Accuracy: 64.00%
