# Library requirements

In [None]:
! pip install datasets
! pip install transformers
! pip install accelerate -U
! pip install evaluate
! pip install shap

# Reading and splitting data again

In [None]:
from datasets import load_dataset

In [None]:
# load the entire dataset: the "mlqa.en.en" configuration of the "mlqa" dataset
# NOTE(review): judging by the configuration name this is the English-question /
# English-context subset — confirm against the dataset card
full_dataset = load_dataset("mlqa", "mlqa.en.en")

In [None]:
# need to split into train and test set, same seed as other file
# Only the "test" split of the loaded dataset is used as the pool: 30% of it is
# held out as the new "test" part and the rest becomes "train". The fixed seed
# makes the shuffle reproducible.
split_dataset_traintest = full_dataset["test"].train_test_split(test_size=0.3, seed=42)
# bare expression: shows the resulting splits as the cell output
split_dataset_traintest

In [None]:
# split previous train set into train and val set
# Same 70/30 proportion and the same seed as the train/test split; the "test"
# part of this second split plays the role of the validation set.
split_dataset_trainval = split_dataset_traintest["train"].train_test_split(test_size=0.3, seed=42)
# bare expression: shows the resulting splits as the cell output
split_dataset_trainval

# Evaluate the model

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
import torch
from google.colab import files

In [None]:
# use the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
device

In [None]:
# getting our fine-tuned model and its tokenizer now
# Both are loaded from the same "radyad/diff_model" checkpoint so the vocabulary
# matches the weights.
tokenizer = AutoTokenizer.from_pretrained("radyad/diff_model")
model = AutoModelForQuestionAnswering.from_pretrained("radyad/diff_model")
# move the weights to the device selected earlier ('cuda' or 'cpu'); as the last
# expression of the cell this also shows the returned model as the cell output
model.to(device)

In [None]:
from tqdm import tqdm

# Contexts are cut to this many characters before being tokenised.
# NOTE(review): the cap is in characters, not tokens, so a capped context can
# still exceed the model's maximum sequence length — confirm for this checkpoint.
MAX_CONTEXT_CHARS = 1600

testset = split_dataset_traintest["test"]
all_predicted_answers = []
shap_data = []  # question + "[SEP]" + context strings, consumed later by the SHAP explainers
predictions = []  # model answers formatted for evaluation with the squad metric
truth = []  # reference answers formatted for evaluation with the squad metric
only_predictions = []  # plain predicted strings for bleu, meteor etc.
only_truth = []  # plain reference strings for bleu, meteor etc.
start_scores = []  # per-example start logits (kept on the CPU)
end_scores = []  # per-example end logits (kept on the CPU)
tokens = []  # per-example token strings, aligned with the logits

# Run the model over every test example and collect everything the metric,
# SHAP and plotting cells need. Predicted answers are gathered in
# `all_predicted_answers` instead of being printed one per line, which would
# bury the progress bar under thousands of lines of output.
for i in tqdm(range(len(testset))):
  example = testset[i]  # fetch the row once instead of re-indexing the dataset
  question = example['question']
  # slicing leaves contexts shorter than the cap unchanged
  context = example['context'][:MAX_CONTEXT_CHARS]

  inputs = tokenizer(question, context, return_tensors="pt").to(device)
  with torch.no_grad():
    outputs = model(**inputs)

  # reuse the ids that were actually fed to the model rather than tokenising again
  tokens.append(tokenizer.convert_ids_to_tokens(inputs.input_ids[0].tolist()))
  # move the logits off the accelerator so they do not pile up in GPU memory
  start_scores.append(outputs.start_logits.cpu())
  end_scores.append(outputs.end_logits.cpu())

  # greedy span: most likely start token up to and including most likely end token
  # (the slice is empty when the predicted end precedes the predicted start)
  answer_start_index = outputs.start_logits.argmax()
  answer_end_index = outputs.end_logits.argmax()
  predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
  each_predicted_answer = tokenizer.decode(predict_answer_tokens)
  all_predicted_answers.append(each_predicted_answer)

  # a list of questions+context in the shap format for using it later
  shap_data.append([question + "[SEP]" + context])

  # putting predictions and actual answers in the format for evaluating on squad dataset
  predictions.append({'prediction_text': each_predicted_answer, 'id': example['id']})
  truth.append({'answers': example['answers'], 'id': example['id']})

  # putting predictions and actual answers in the format for calculating bleu score
  # (only the first reference answer is used)
  only_predictions.append(each_predicted_answer)
  only_truth.append(example['answers']['text'][0])


In [None]:
# calculating squad metric scores: exact match and f1
from evaluate import load
squad_metric = load("squad")
results = squad_metric.compute(predictions=predictions, references=truth)
display(results)

# Start the results file afresh ('w'); the other metric cells append to it.
# The handle is deliberately not called `f`: that global name belongs to the
# SHAP prediction helper defined later in this notebook, and rebinding it to a
# file object would break f_start/f_end if this cell were re-run afterwards.
text = 'These are the exact-match and f1 scores for the test data'
with open('diff_evaluation_results.csv','w') as results_file:
    results_file.write(text)
    results_file.write("\n"+str(results)+"\n")
    results_file.write("\n")

In [None]:
# calculating meteor scores
meteor = load("meteor")
results = meteor.compute(predictions=only_predictions, references=only_truth)
display(results)

# Append to the shared results file.
# The handle is deliberately not called `f`: that global name belongs to the
# SHAP prediction helper defined later in this notebook, and rebinding it to a
# file object would break f_start/f_end if this cell were re-run afterwards.
text = 'This is the meteor score for the test data'
with open('diff_evaluation_results.csv','a') as results_file:
    results_file.write(text)
    results_file.write("\n"+str(results)+"\n")
    results_file.write("\n")

In [None]:
# calculating bleu scores
bleu = load("bleu")
results = bleu.compute(predictions=only_predictions, references=only_truth)
display(results)

# Append to the shared results file.
# The handle is deliberately not called `f`: that global name belongs to the
# SHAP prediction helper defined later in this notebook, and rebinding it to a
# file object would break f_start/f_end if this cell were re-run afterwards.
text = 'These are the bleu scores for the test data'
with open('diff_evaluation_results.csv','a') as results_file:
    results_file.write(text)
    results_file.write("\n"+str(results)+"\n")
    results_file.write("\n")

In [None]:
# calculating google bleu scores
google_bleu = load("google_bleu")
results = google_bleu.compute(predictions=only_predictions, references=only_truth)
display(results)

# Append to the shared results file (last entry, so no trailing blank line).
# The handle is deliberately not called `f`: that global name belongs to the
# SHAP prediction helper defined later in this notebook, and rebinding it to a
# file object would break f_start/f_end if this cell were re-run afterwards.
text = 'This is the google-bleu score for the test data'
with open('diff_evaluation_results.csv','a') as results_file:
    results_file.write(text)
    results_file.write("\n"+str(results)+"\n")

In [None]:
# hand the results file written by the metric cells to the browser for download
# (`files` is the google.colab helper, so this only works inside Colab)
files.download("diff_evaluation_results.csv")

# Answer explanations using SHAP

In [None]:
# draw 6 reproducible test-set positions whose SHAP plots will be generated
import random

random.seed(446)
# candidates start at 1, so position 0 of the test set is never drawn
candidate_positions = range(1, len(testset))
random_list = random.sample(candidate_positions, 6)
random_list

In [None]:
# Fixed test-set positions used by every SHAP and plotting cell that follows; this
# assignment replaces whatever the random draw in the previous cell produced.
# NOTE(review): presumably these are the values that draw returned in the
# original run — confirm. Every value must be a valid index into the test set.
random_list = [333, 3146, 2486, 3225, 1954, 3136]

In [None]:
# show the predicted answer followed by the reference answer for each chosen sample
for sample_position in random_list:
  display(only_predictions[sample_position])
  display(only_truth[sample_position])
  print("\n")

In [None]:
import transformers
import shap
import torch

# load the model
# Builds a question-answering pipeline from the same "radyad/diff_model"
# checkpoint that was loaded earlier as `model`/`tokenizer`. The SHAP helper
# functions use `pmodel.tokenizer` and `pmodel.model` directly rather than
# calling the pipeline itself.
pmodel = transformers.pipeline('question-answering', model="radyad/diff_model")

# define two prediction functions, one that outputs the logits for the range start,
# and the other for the range end
def f(questions, start):
    """Return per-token QA logits for each "question[SEP]context" string.

    Args:
        questions: iterable of strings, each holding a question and its context
            joined by the literal marker "[SEP]".
        start: truthy to return the answer-start logits, falsy to return the
            answer-end logits.

    Returns:
        A list with one 1-D numpy array of logits per input string, one value
        per token of the tokenised question/context pair.

    Raises:
        ValueError: if a string does not contain the "[SEP]" marker.
    """
    outs = []
    model_device = pmodel.model.device
    for q in questions:
        # Split on the first marker only, so a context that itself contains the
        # text "[SEP]" does not break the two-way unpacking.
        question, context = q.split("[SEP]", 1)
        d = pmodel.tokenizer(question, context)
        # one example -> batch of size 1, placed on the same device as the model
        batch = {k: torch.tensor(d[k]).reshape(1, -1).to(model_device) for k in d}
        # inference only: skip building the autograd graph, and call the module
        # itself (not .forward) so registered hooks still run
        with torch.no_grad():
            out = pmodel.model(**batch)
        logits = out.start_logits if start else out.end_logits
        # .cpu() keeps the numpy conversion valid when the model is on an accelerator
        outs.append(logits.reshape(-1).detach().cpu().numpy())
    return outs
def f_start(questions):
    """Answer-start logits for each "question[SEP]context" string (see `f`)."""
    return f(questions, True)
def f_end(questions):
    """Answer-end logits for each "question[SEP]context" string (see `f`)."""
    return f(questions, False)

# attach a dynamic output_names property to the models so we can plot the tokens at each output position
def out_names(inputs):
    """Return the decoded token string for every position of one input.

    Args:
        inputs: a single "question[SEP]context" string.

    Returns:
        A list of strings, one per token id produced by tokenising the
        question/context pair, each decoded individually.

    Raises:
        ValueError: if the string does not contain the "[SEP]" marker.
    """
    # Split on the first marker only, so a context that itself contains the text
    # "[SEP]" does not break the two-way unpacking.
    question, context = inputs.split("[SEP]", 1)
    d = pmodel.tokenizer(question, context)
    # `token_id` rather than `id`, to avoid shadowing the builtin
    return [pmodel.tokenizer.decode([token_id]) for token_id in d["input_ids"]]

# Both prediction functions share the same token-labelling callback, stored as a
# plain function attribute named `output_names`.
# NOTE(review): presumably shap reads this attribute to label output positions — confirm
f_start.output_names = out_names
f_end.output_names = out_names

In [None]:
# explainer objects for start and end positions
# Each explainer wraps one prediction function and is given the pipeline's
# tokenizer as its second argument.

explainer_start = shap.Explainer(f_start, pmodel.tokenizer)
explainer_end = shap.Explainer(f_end, pmodel.tokenizer)

## Start positions SHAP plot

In [None]:
all_shap_values_start = []

# Explain the start logits of every selected sample and save each text plot as
# sample<N>_start.html, where N is the 1-based position in random_list.
for i, sample_position in enumerate(random_list):
  shap_values_start = explainer_start(shap_data[sample_position])
  all_shap_values_start.append(shap_values_start)
  # display=False makes the plot call return the markup instead of rendering it
  plot = shap.plots.text(shap_values_start, display=False)
  filename = "sample" + str(i+1) +"_start"
  # The context manager closes the file even if the write fails. The explicit
  # encoding keeps non-ASCII tokens intact regardless of the platform default.
  with open(filename + ".html", 'w', encoding='utf-8') as html_file:
    html_file.write(plot)

In [None]:
# download the start-position SHAP plot saved for each selected sample
for sample_number in range(1, len(random_list) + 1):
  filename = f"sample{sample_number}_start"
  files.download(f"{filename}.html")

## End positions SHAP plot

In [None]:
# only for the answers with more than one word in the answer

all_shap_values_end = []
answers_with_end = []  # positions in random_list that received an end plot

# Explain the end logits of every selected sample whose predicted answer has
# more than one whitespace-separated word, and save each text plot as
# sample<N>_end.html, where N is the 1-based position in random_list.
for i, sample_position in enumerate(random_list):
  words = only_predictions[sample_position].split()
  if len(words) > 1:
    answers_with_end.append(i)
    shap_values_end = explainer_end(shap_data[sample_position])
    all_shap_values_end.append(shap_values_end)
    # display=False makes the plot call return the markup instead of rendering it
    plot = shap.plots.text(shap_values_end, display=False)
    filename = "sample" + str(i+1) +"_end"
    # The context manager closes the file even if the write fails. The explicit
    # encoding keeps non-ASCII tokens intact regardless of the platform default.
    with open(filename + ".html", 'w', encoding='utf-8') as html_file:
      html_file.write(plot)

In [None]:
# download the end-position SHAP plot of every sample that received one
for list_position in answers_with_end:
  filename = f"sample{list_position + 1}_end"
  files.download(f"{filename}.html")

## Start scores bar graph

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# seaborn theme first, then the default figure size for all bar graphs
sns.set_theme(style='darkgrid')
plt.rcParams.update({"figure.figsize": (10, 8)})

In [None]:
# one bar graph of the 20 highest start logits per selected sample
for sample_number, index in enumerate(random_list, start=1):
  # start logits of this sample as a flat numpy vector
  s_scores = start_scores[index].detach().cpu().numpy().flatten()

  # label every token with its position so repeated tokens stay distinguishable
  token_labels = [
    f'{token} - {position:>2}'
    for position, token in enumerate(tokens[index])
  ]

  # pair scores with labels in a dataframe and keep the 20 largest scores
  scored_tokens = list(zip(s_scores, token_labels))
  df = pd.DataFrame(scored_tokens, columns=['start_scores','token_labels'])
  df = df.nlargest(20, "start_scores")

  # draw the bars, save the figure, then clear it for the next sample
  ax = sns.barplot(data=df, x="token_labels", y="start_scores", errorbar=None)
  ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="center")
  ax.grid(True)
  ax.set_title('Start scores for top 20 tokens')
  filename = f"top20_start_sample{sample_number}.png"
  ax.figure.savefig(filename, dpi=300, bbox_inches='tight')
  plt.clf()

In [None]:
# download the start-score bar graph saved for each selected sample
for sample_number in range(1, len(random_list) + 1):
  filename = f"top20_start_sample{sample_number}.png"
  files.download(filename)

## End scores bar graph

In [None]:
# one bar graph of the 20 highest end logits per sample that has an end plot
for temp_index in answers_with_end:
  # temp_index is a position in random_list; index is the test-set position
  index = random_list[temp_index]
  e_scores = end_scores[index].detach().cpu().numpy().flatten()

  # label every token with its position so repeated tokens stay distinguishable
  token_labels = [
    f'{token} - {position:>2}'
    for position, token in enumerate(tokens[index])
  ]

  # pair scores with labels in a dataframe and keep the 20 largest scores
  scored_tokens = list(zip(e_scores, token_labels))
  df = pd.DataFrame(scored_tokens, columns=['end_scores','token_labels'])
  df = df.nlargest(20, "end_scores")

  # draw the bars, save the figure, then clear it for the next sample
  ax = sns.barplot(data=df, x="token_labels", y="end_scores", errorbar=None)
  ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="center")
  ax.grid(False)
  ax.set_title('End scores for top 20 tokens')
  filename = f"top20_end_sample{temp_index + 1}.png"
  ax.figure.savefig(filename, dpi=300, bbox_inches='tight')
  plt.clf()

In [None]:
# download the end-score bar graph of every sample that received one
for list_position in answers_with_end:
  filename = f"top20_end_sample{list_position + 1}.png"
  files.download(filename)