In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the whole '/kaggle/input' tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the bare file name.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
import nltk
# Download each NLTK data package, in the same order as before.
for resource in ('wordnet', 'wordnet_ic', 'plwn', 'gazetteers', 'stopwords', 'punkt'):
    nltk.download(resource)
import csv
import re
import string
import time
from collections import Counter

import torch
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import single_meteor_score
from rouge_score import rouge_scorer
from transformers import pipeline

In [7]:
%%time

def predict(context,model, query, tokenizer):
    """Answer ``query`` from ``context`` with a question-answering pipeline.

    A new 'question-answering' pipeline is created from ``model`` and
    ``tokenizer`` on every call.

    Returns:
        The value stored under the 'answer' key of the pipeline result.
    """
    answerer = pipeline('question-answering', model=model, tokenizer=tokenizer)
    return answerer(question=query, context=context)['answer']
  
def normalize_text(s):
    """Normalize text for comparison.

    Steps, applied in this order: lower-case, drop every character found in
    ``string.punctuation``, replace the standalone words "a", "an" and "the"
    with a space, then collapse all whitespace runs into single spaces.

    Args:
        s: The string to normalize.

    Returns:
        The normalized string (possibly empty).
    """
    punctuation = set(string.punctuation)
    lowered = s.lower()
    without_punctuation = ''.join(ch for ch in lowered if ch not in punctuation)
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation, flags=re.UNICODE)
    # split() with no argument also trims leading/trailing whitespace.
    return ' '.join(without_articles.split())
    
    
def compute_exact_match(prediction, truth):
    """Return 1 if ``prediction`` and ``truth`` are equal after normalize_text, else 0."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    return 1 if normalized_prediction == normalized_truth else 0
    

def compute_f1(prediction,truth):
    """Token-level F1 between a predicted answer and the ground truth.

    Both strings are normalized with ``normalize_text`` and split on
    whitespace. Overlap is counted as a multiset intersection, so a token
    that occurs k times in both strings contributes k matches. Precision and
    recall divide by the full token counts, so counting overlap with sets
    would score identical answers containing a repeated word below 1.0.

    Args:
        prediction: Predicted answer text.
        truth: Ground-truth answer text.

    Returns:
        1 or 0 when either side has no tokens after normalization (1 only if
        both are empty), 0 when no token is shared, otherwise the harmonic
        mean of precision and recall as a float.
    """
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    # If either side is a no-answer, F1 is 1 when they agree and 0 otherwise.
    if not pred_tokens or not truth_tokens:
        return int(pred_tokens == truth_tokens)
    # Counter & Counter keeps the minimum count per token (multiset overlap).
    overlap = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(truth_tokens)
    return 2 * precision * recall / (precision + recall)
    
def compute_bleu(prediction, truth):
    """Sentence-level BLEU of ``prediction`` against the single reference ``truth``.

    Both texts are normalized with ``normalize_text`` and whitespace-tokenized
    before scoring; ``SmoothingFunction().method1`` is used for smoothing.

    Returns:
        The score returned by ``sentence_bleu``.
    """
    hypothesis = normalize_text(prediction).split()
    reference = normalize_text(truth).split()
    return sentence_bleu(
        [reference],  # sentence_bleu takes a list of references
        hypothesis,
        smoothing_function=SmoothingFunction().method1,
    )

def compute_rouge(prediction, truth):
    """ROUGE-1 and ROUGE-L F-measures of ``prediction`` against ``truth``.

    A stemming ``RougeScorer`` is created on every call and applied to the
    raw (un-normalized) strings.

    Returns:
        A ``(rouge1_fmeasure, rougeL_fmeasure)`` tuple.
    """
    metrics = ('rouge1', 'rougeL')
    scorer = rouge_scorer.RougeScorer(list(metrics), use_stemmer=True)
    # score() takes the reference first and the prediction second.
    result = scorer.score(truth, prediction)
    return tuple(result[metric].fmeasure for metric in metrics)

def compute_meteor(prediction, truth):
    """METEOR score of ``prediction`` against the single reference ``truth``.

    Recent NLTK releases require ``single_meteor_score`` to receive
    pre-tokenized input and raise ``TypeError`` when handed plain strings, so
    both texts are whitespace-tokenized first. Older releases expect raw
    strings instead; for those the call is retried with the original strings.

    Args:
        prediction: Predicted answer text.
        truth: Ground-truth answer text.

    Returns:
        The score returned by ``single_meteor_score``.
    """
    reference_tokens = truth.split()
    hypothesis_tokens = prediction.split()
    try:
        return single_meteor_score(reference_tokens, hypothesis_tokens)
    except TypeError:
        # Fallback for NLTK versions whose API takes untokenized strings.
        return single_meteor_score(truth, prediction)

def give_an_answer(answer, context, model, query, tokenizer):
    """Predict an answer to ``query`` and score it against ``answer``.

    Returns:
        A 6-tuple ``(exact_match, f1, bleu, rouge_1, rouge_l, meteor)``.
    """
    predicted = predict(context, model, query, tokenizer)
    exact = compute_exact_match(predicted, answer)
    f1 = compute_f1(predicted, answer)
    bleu = compute_bleu(predicted, answer)
    rouge_1, rouge_l = compute_rouge(predicted, answer)
    meteor = compute_meteor(predicted, answer)
    return exact, f1, bleu, rouge_1, rouge_l, meteor

CPU times: user 36 µs, sys: 0 ns, total: 36 µs
Wall time: 42.7 µs


In [8]:
def evaluate_model(answers, context, model, model_name, queries, tokenizer):
    """Run a QA model over every (query, answer) pair and average its scores.

    The question-answering pipeline is built once per call and reused for all
    queries instead of being recreated for each question. The reported time
    therefore covers one pipeline construction plus all predictions and
    metric computations.

    Args:
        answers: Ground-truth answers, paired positionally with ``queries``.
        context: Passage the answers are extracted from.
        model: Model object handed to the pipeline.
        model_name: Unused here; kept so existing callers keep working.
        queries: Questions to ask.
        tokenizer: Tokenizer object handed to the pipeline.

    Returns:
        ``(avg_f1, avg_bleu, avg_rouge_1, avg_rouge_l, accuracy, total_time)``
        where ``accuracy`` is the mean exact-match score and ``total_time`` is
        wall-clock seconds. When there are no (query, answer) pairs, every
        average is 0.0 instead of raising ``ZeroDivisionError``.
    """
    start_time = time.time()
    # Loop-invariant: the pipeline only depends on the model and tokenizer.
    qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer)

    # One (f1, bleu, rouge_1, rouge_l, exact_match) row per question.
    per_question = []
    for query, answer in zip(queries, answers):
        prediction = qa_pipeline(question=query, context=context)['answer']
        rouge_1_score, rouge_l_score = compute_rouge(prediction, answer)
        per_question.append((
            compute_f1(prediction, answer),
            compute_bleu(prediction, answer),
            rouge_1_score,
            rouge_l_score,
            compute_exact_match(prediction, answer),
        ))

    total_time = time.time() - start_time

    if not per_question:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0, 0.0, 0.0, 0.0, 0.0, total_time

    count = len(per_question)
    # zip(*rows) transposes the rows into one column per metric.
    avg_f1, avg_bleu, avg_rouge_1, avg_rouge_l, accuracy = (
        sum(column) / count for column in zip(*per_question)
    )
    return avg_f1, avg_bleu, avg_rouge_1, avg_rouge_l, accuracy, total_time

In [9]:
%%time

# Reference passage; it is passed as `context` to evaluate_model for every question.
# The text (including its spacing) is model input, so it is kept exactly as written.
context = """ Harry Potter is a series of seven fantasy novels written by British author, J. K. Rowling. 
              The novels chronicle the lives of a young wizard, Harry Potter, and his friends Hermione Granger and 
              Ron Weasley, all of whom are students at Hogwarts School of Witchcraft and Wizardry. 
              The main story arc concerns Harry's struggle against Lord Voldemort, a dark wizard who 
              intends to become immortal, overthrow the wizard governing body known as the Ministry of 
              Magic and subjugate all wizards and Muggles (non-magical people). Since the release of 
              the first novel, Harry Potter and the Philosopher's Stone, on 26 June 1997, the books 
              have found immense popularity, positive reviews, and commercial success worldwide. They 
              have attracted a wide adult audience as well as younger readers and are often considered 
              cornerstones of modern young adult literature.As of February 2018, the books have 
              sold more than 500 million copies worldwide, making them the best-selling book series in 
              history, and have been translated into eighty languages.The last four books 
              consecutively set records as the fastest-selling books in history, with the final 
              installment selling roughly eleven million copies in the United States within twenty-four 
              hours of its release.  """

# Questions asked against `context`; evaluate_model pairs them by position with `answers`.
queries = ["Who wrote Harry Potter's novels?",
           "Who are Harry Potter's friends?",
           "Who is the enemy of Harry Potter?",
           "What are Muggles?",
           "Which is the name of Harry Poter's first novel?",
           "When did the first novel release?",
           "Who was attracted by Harry Potter novels?",
           "How many languages Harry Potter has been translated into? "]

# Ground-truth answers, one per entry of `queries` and in the same order.
answers = ["J. K. Rowling",
           "Hermione Granger and Ron Weasley",
           "Lord Voldemort",
           "non-magical people",
           "Harry Potter and the Philosopher's Stone",
           "26 June 1997",
           "a wide adult audience as well as younger readers",
           "eighty"]
   

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 10.5 µs


In [11]:
# Models to evaluate: add or change entries here for testing
from transformers import AlbertTokenizer, AlbertForQuestionAnswering
from transformers import BartTokenizer, BartForQuestionAnswering
from transformers import BertTokenizer, BertForQuestionAnswering
from transformers import BigBirdTokenizer, BigBirdForQuestionAnswering
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
from transformers import ElectraTokenizer, ElectraForQuestionAnswering

# One dict per checkpoint, built from (checkpoint name, tokenizer class, model class) rows.
models_info = [
    {'name': checkpoint, 'tokenizer': tokenizer_cls, 'model': model_cls}
    for checkpoint, tokenizer_cls, model_cls in (
        ('albert-base-v2', AlbertTokenizer, AlbertForQuestionAnswering),
        ('twmkn9/albert-base-v2-squad2', AlbertTokenizer, AlbertForQuestionAnswering),
        ('ahotrod/albert_xxlargev1_squad2_512', AlbertTokenizer, AlbertForQuestionAnswering),
        ('facebook/bart-large', BartTokenizer, BartForQuestionAnswering),
        ('a-ware/bart-squadv2', BartTokenizer, BartForQuestionAnswering),
        ('valhalla/bart-large-finetuned-squadv1', BartTokenizer, BartForQuestionAnswering),
        ('bert-base-uncased', BertTokenizer, BertForQuestionAnswering),
        ('bert-large-uncased', BertTokenizer, BertForQuestionAnswering),
        ('bert-base-cased', BertTokenizer, BertForQuestionAnswering),
        ('bert-large-uncased-whole-word-masking-finetuned-squad', BertTokenizer, BertForQuestionAnswering),
        ('google/bigbird-base-trivia-itc', BigBirdTokenizer, BigBirdForQuestionAnswering),
    )
]


In [12]:
# Column names of the CSV report; they are also the keys of every result row.
fieldnames = ['Model', 'Avg F1 Score', 'Avg BLEU Score', 'Avg ROUGE-1 Score', 'Avg ROUGE-L Score', 'Accuracy', 'Time (s)']
model_results = []

for model_info in models_info:
    model_name, tokenizer_class, model_class = (
        model_info[key] for key in ('name', 'tokenizer', 'model')
    )

    # Load the tokenizer and model for this checkpoint and switch the model to eval mode.
    tokenizer = tokenizer_class.from_pretrained(model_name)
    model = model_class.from_pretrained(model_name)
    model.eval()

    avg_f1, avg_bleu, avg_rouge_1, avg_rouge_l, accuracy, total_time = evaluate_model(
        answers, context, model, model_name, queries, tokenizer
    )
    # Pair each metric with its column name, in column order.
    model_results.append(dict(zip(
        fieldnames,
        (model_name, avg_f1, avg_bleu, avg_rouge_1, avg_rouge_l, accuracy, total_time),
    )))

# Save the results in a CSV file
csv_file = 'evaluation_results.csv'
with open(csv_file, mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(model_results)


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/716 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/890M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/846k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/943 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/527M [00:00<?, ?B/s]