# Library requirements

In [None]:
!pip install transformers SentencePiece accelerate evaluate sacrebleu

In [None]:
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from tqdm import tqdm
from evaluate import load
from google.colab import files
import sklearn

# Reading and processing data

In [None]:
# Reading the A-OKVQA validation split.

# We sample 102 rows rather than 100: the two extra rows are used as the
# worked examples of the few-shot prompt, leaving 100 rows for evaluation.

In [None]:
# Load the validation questions into a DataFrame and display the first record
# to see the available fields. Later cells read the `image_id`, `question`,
# `choices`, `correct_choice_idx`, `direct_answers` and `rationales` fields.
df_val_qa = pd.read_json('Data/aokvqa_v1p0_val.json')
df_val_qa.iloc[0]

In [None]:
# Draw a reproducible sample of 102 questions (fixed random_state) and order
# it by image id, the same ordering later applied to the captions.
df_val_qa_102 = (
    df_val_qa
    .sample(n=102, random_state=33)
    .sort_values(by=["image_id"])
)
df_val_qa_102


In [None]:
# df_val_qa_102.to_csv("all_data.csv")

# Persist the sampled image ids and keep them as a plain list for filtering
# the caption annotations.
sampled_image_ids = df_val_qa_102['image_id']
sampled_image_ids.to_csv("image_ids.csv")
image_ids_list = sampled_image_ids.tolist()

In [None]:
# Load the caption annotations for the validation images.
# Captions are used directly as the "image context": no image is opened and
# no visual feature is extracted in this notebook.

with open("Data/captions_val2017.json", 'r') as captions_file:
    data = json.load(captions_file)

# one DataFrame row per entry of the "annotations" list
df_val_captions = pd.json_normalize(data["annotations"])

df_val_captions

In [None]:
# Keep only the captions that belong to the images of the 102 sampled questions.

caption_is_sampled = df_val_captions['image_id'].isin(image_ids_list)
df_val_captions_102 = df_val_captions[caption_is_sampled]
df_val_captions_102

In [None]:
# Each image has several captions; keep the first one listed for every image
# and order the result by image id (same ordering as the sampled questions).
#
# The frame is rebuilt with chained calls rather than `inplace=True`:
# `df_val_captions_102` is a filtered selection of `df_val_captions`, and
# mutating such a selection in place triggers pandas' SettingWithCopyWarning.
df_val_captions_102 = (
    df_val_captions_102
    .drop_duplicates(subset=['image_id'])
    .sort_values(by=["image_id"])
)
df_val_captions_102

In [None]:
# Pair every sampled question with the caption of its own image BEFORE
# splitting. The split below (and every later loop) is positional, so both
# frames must have the same length and row order. Joining on image_id
# guarantees this even when two sampled questions share an image, and a missing
# caption raises here instead of silently shifting all later rows.
captions_aligned = df_val_qa_102[["image_id"]].merge(
    df_val_captions_102.drop_duplicates(subset=["image_id"]),
    on="image_id",
    how="left",              # keeps the row order of the questions
    validate="many_to_one",  # at most one caption per image on the right side
)
captions_missing = captions_aligned["caption"].isna()
if captions_missing.any():
    raise ValueError(
        "No caption found for image ids: "
        + str(captions_aligned.loc[captions_missing, "image_id"].tolist())
    )

# first 2 rows -> few-shot examples, remaining rows -> evaluation set
df_val_qa_2, df_val_qa_100 = df_val_qa_102.iloc[:2, :], df_val_qa_102.iloc[2:, :]
df_val_captions_2, df_val_captions_100 = captions_aligned.iloc[:2, :], captions_aligned.iloc[2:, :]

In [None]:
# Save every evaluation question next to the caption used as its image context.
# Both parts get a fresh 0..n-1 index so that they are glued together by
# position rather than by their original row labels.
temp_data = df_val_qa_100.loc[:, ['image_id', 'question']]
temp_images = df_val_captions_100.loc[:, ['caption']]
temp_concat = pd.concat(
    [part.reset_index(drop=True) for part in (temp_data, temp_images)],
    axis=1,
)
temp_concat.to_csv("questions_and_captions.csv")

In [None]:
# The dataset provides both multiple-choice and direct answers for each
# question; only the direct answers are used as ground truth for our task.
# Entry i is the `direct_answers` value of the i-th evaluation question.

truth_answers_list = df_val_qa_100["direct_answers"].tolist()

In [None]:
# Collect the reference rationales/explanations of the selected questions.
# Entry i is the complete `rationales` value of the i-th evaluation question.

truth_rationales_list = df_val_qa_100["rationales"].tolist()

# LLM model and prompting strategies

In [None]:
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

In [None]:
# Load the tokenizer and the sequence-to-sequence model of the
# "google/flan-t5-large" checkpoint, then move the model to the chosen device.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
model = model.to(device)

In [None]:
# Prompt used to obtain an answer for each question.

# It is a few-shot prompt: the instruction is followed by two worked examples
# (image context, question, answer). The question to be answered and its image
# context are appended to this prefix when the model is queried.

# Through trial-and-error, the model was found to perform better when the
# prompt starts with a "please".

# Further analysis about this is included in the Project Report,
# under section 6.2.1 "Prompting strategy", pg. 7


def _build_example(number, caption_row, qa_row):
    """Format one worked example; its answer is the row's correct multiple-choice option."""
    gold_answer = qa_row["choices"][qa_row['correct_choice_idx']]
    return (
        f"Example {number} \n"
        f"Image context: {caption_row['caption']}\n"
        f"Question: {qa_row['question']}\n"
        f"Answer: {gold_answer}\n\n"
    )


answer_task = "Please answer the question given an image context. Use these examples for help. \n\n"
example1 = _build_example(1, df_val_captions_2.iloc[0], df_val_qa_2.iloc[0])
example2 = _build_example(2, df_val_captions_2.iloc[1], df_val_qa_2.iloc[1])

fixed_string = "".join([answer_task, example1, example2])
print(fixed_string)

Please answer the question given an image context. Use these examples for help. 

Example 1 
Image context: A bunch of bananas sitting on top of a wooden table.
Question: What treat can be made with this fruit and ice cream?
Answer: banana split

Example 2 
Image context: A pan filled with onions sitting next to a pan of stew.
Question: The reddish-brown food in the further bowl is what type of food?
Answer: meat




In [None]:
# Ask the model for an answer to every evaluation question.
n = df_val_qa_100.shape[0]
prediction_answers_list = []

# Rebuild the few-shot prefix from its parts instead of reading `fixed_string`:
# that name is reassigned by the rationale-prompt cell, so re-running this cell
# afterwards would otherwise send the wrong instructions to the model.
few_shot_prefix = answer_task + example1 + example2

for i in tqdm(range(n)):
    # The query uses the same line layout as the worked examples; without the
    # separators the caption, "Question:" and "Answer:" run into each other.
    # NOTE(review): this changes the prompt text, so generations can differ
    # from earlier runs; manual annotations of earlier outputs should be rechecked.
    input_text = (
        few_shot_prefix
        + "Image context: " + df_val_captions_100.iloc[i]["caption"] + "\n"
        + "Question: " + df_val_qa_100.iloc[i]["question"] + "\n"
        + "Answer: "
    )
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

    outputs = model.generate(input_ids)
    # Let the tokenizer strip special tokens. Slicing with [1:-1] assumes the
    # sequence ends with an end-of-sequence token and cuts off a real token
    # whenever generation stops at the length limit instead.
    prediction_answers_list.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

prediction_answers_list


In [None]:
# Prompt used to obtain a rationale for each answer generated by the model.

# It is a zero-shot prompt: only the instruction is given, without examples.

# NOTE(review): an earlier comment here said that one of the three dataset
# rationales is selected per sample; the visible code keeps the complete
# `rationales` field as reference - confirm which is intended.

fixed_string = rationale_task = "Please generate explanations for the given answer. \n\n"
print(fixed_string)

Please generate explanations for the given answer. 




In [None]:
# Ask the model to explain each answer it generated in the previous step.
n = df_val_qa_100.shape[0]
prediction_rationales_list = []

for i in tqdm(range(n)):
    each_prediction = (
        "Image context: " + df_val_captions_100.iloc[i]["caption"] + "\n"
        + "Question: " + df_val_qa_100.iloc[i]["question"] + "\n"
        + "Answer: " + prediction_answers_list[i] + "\n"
        + "Explanation: " + "\n\n"
    )
    # Use `rationale_task` directly rather than the shared `fixed_string`, so
    # this cell does not depend on which prompt cell was executed last.
    input_text = rationale_task + each_prediction

    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

    outputs = model.generate(input_ids)
    # Let the tokenizer strip special tokens. Slicing with [1:-1] assumes the
    # sequence ends with an end-of-sequence token and cuts off a real token
    # whenever generation stops at the length limit instead.
    prediction_rationales_list.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

prediction_rationales_list


# Results and evaluation metrics

In [None]:
# Gather the references and the model outputs, one row per evaluation question.

# The image ids are read from the evaluation frame itself rather than from
# `image_ids_list[2:]`, so they stay aligned with the other columns even if
# the number of few-shot examples held out of the sample changes.
results_dictionary = {'image_ids': df_val_qa_100['image_id'].tolist(),
                      'True answers': truth_answers_list,
                      'Predicted answer': prediction_answers_list,
                      'True rationales': truth_rationales_list,
                      'Predicted rationale': prediction_rationales_list}

df_results = pd.DataFrame(results_dictionary)
df_results

In [None]:
# Write the table of references and predictions to disk; the file is offered
# for download in the last cell of the notebook.
df_results.to_csv('answers_and_explanations.csv')

In [None]:
# METEOR score of the generated answers.
# Each prediction is compared with the corresponding entry of
# `truth_answers_list` (the `direct_answers` value of that question).
# `meteor` is reused by the explanation-scoring cell below.

meteor = load("meteor")
answers_results = meteor.compute(predictions=prediction_answers_list, references=truth_answers_list)
print("The METEOR score for the model's answers", answers_results)

In [None]:
# Start the scores report with the METEOR result for the answers: a label
# line, the score and a blank line. Mode 'w' replaces any previous report.
text = 'This is the METEOR score for the answers'
with open('final_scores.csv', 'w') as scores_file:
    scores_file.write("\n".join([text, str(answers_results), "", ""]))

In [None]:
# METEOR score of the generated rationales/explanations, compared with the
# `rationales` value of each question (reuses the `meteor` metric loaded above).

predictions_results = meteor.compute(predictions=prediction_rationales_list, references=truth_rationales_list)
print("The METEOR score for the model's explanations", predictions_results)

In [None]:
# Append the METEOR result for the explanations to the scores report:
# a label line, the score and a blank line.
text = 'This is the METEOR score for the explanations'
with open('final_scores.csv', 'a+') as scores_file:
    scores_file.write(text + "\n" + str(predictions_results) + "\n\n")

In [None]:
# sacreBLEU score of the generated answers.
# Note: `answers_results` is reassigned here and no longer holds the METEOR
# result. `sacrebleu` is reused by the explanation-scoring cell below.

sacrebleu = load("sacrebleu")
answers_results = sacrebleu.compute(predictions=prediction_answers_list, references=truth_answers_list)
print("The sacrebleu score for the model's answers", answers_results)

In [None]:
# Append the sacreBLEU result for the answers to the scores report:
# a label line, the score and a blank line.
text = 'This is the sacrebleu score for the answers'
with open('final_scores.csv', 'a+') as scores_file:
    for chunk in (text, "\n" + str(answers_results) + "\n", "\n"):
        scores_file.write(chunk)

In [None]:
# sacreBLEU score of the generated rationales/explanations.
# Note: `predictions_results` is reassigned here and no longer holds the
# METEOR result.

predictions_results = sacrebleu.compute(predictions=prediction_rationales_list, references=truth_rationales_list)
print("The sacrebleu score for the model's explanations", predictions_results)

In [None]:
# Append the sacreBLEU result for the explanations to the scores report:
# a label line, the score and a blank line.
text = 'This is the sacrebleu score for the explanations'
with open('final_scores.csv', 'a+') as scores_file:
    scores_file.writelines([text, "\n", str(predictions_results), "\n", "\n"])

# Inter-annotator agreement

In [None]:
# Details on the annotation scheme can be found in the Project Report under
# section 6.2.2 "Annotation scheme", pg. 8

# The two lists below are manually entered labels (values 0 or 1), one per
# generated explanation.
# NOTE(review): presumably in the order of `prediction_rationales_list` from
# the annotated run - confirm; labels are not updated when the model is re-run.

# annotator 1 goodness scores for all the explanations
y1_goodness = [1,1,1,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,1,0,1,1,1,0,1,0,0,1,0,0,1,0,0,0,0,1,1,1,1,0,1,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,1,1,1,0,1,1,0,0,1,0,1,0,1,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,0,1,1,1]

# annotator 2 goodness scores for all the explanations
y2_goodness = [1,0,1,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,1,1,1,1,1,1,0,1,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,1,1,0,1,0,0,0,0,0,1,1,0,0,0,1,1,0,1,1,1,0,1,1,1,1,1,0,1,0,0,1,0,1,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,1,1]

In [None]:
# Agreement between the two annotators on the goodness labels, measured with
# Cohen's kappa.

# The function is imported from `sklearn.metrics` explicitly: a bare
# `import sklearn` is not guaranteed to load the `metrics` submodule, in which
# case `sklearn.metrics.cohen_kappa_score` raises AttributeError.
from sklearn.metrics import cohen_kappa_score

iaa_goodness = cohen_kappa_score(y1_goodness, y2_goodness)
iaa_goodness

In [None]:
# Append the goodness agreement score to the scores report: a label line,
# the score and a blank line.
# NOTE(review): the parenthetical in the label is identical to the one used
# for the satisfaction score - confirm the intended wording for goodness.
text = 'This is the inter-annotator agreement score for the goodness (The explanation is satisfying)'
with open('final_scores.csv', 'a+') as scores_file:
    scores_file.write("\n".join([text, str(iaa_goodness), "", ""]))

In [None]:
# The two lists below are manually entered ratings (values 1 to 5), one per
# generated explanation.
# NOTE(review): presumably in the order of `prediction_rationales_list` from
# the annotated run - confirm; ratings are not updated when the model is re-run.

# annotator 1 satisfaction scores for all the explanations
y1_satisfaction = [5,2,4,4,1,5,2,1,5,5,1,1,3,1,1,5,1,2,5,3,2,4,5,3,2,4,2,2,5,2,3,5,1,1,3,1,3,4,3,5,2,5,2,1,4,1,2,1,3,3,4,5,1,1,5,3,3,3,5,4,5,1,4,5,2,3,5,1,2,2,5,5,1,3,2,1,3,5,1,3,4,2,1,1,5,2,2,2,3,3,2,3,4,2,4,5,2,3,4,4]

# annotator 2 satisfaction scores for all the explanations
y2_satisfaction = [5,2,3,2,1,2,1,3,5,4,1,1,2,3,3,5,1,2,5,4,3,4,4,5,3,5,2,2,5,2,1,2,2,2,2,1,3,3,4,3,1,5,4,1,4,1,2,2,1,3,3,4,1,1,2,2,2,1,5,4,4,2,5,5,4,5,5,2,5,3,4,3,2,4,2,1,3,4,1,2,5,1,1,2,5,2,5,2,1,2,2,3,3,2,3,1,1,5,4,4]

In [None]:
# Agreement between the two annotators on the satisfaction ratings, measured
# with Cohen's kappa.

# The function is imported from `sklearn.metrics` explicitly: a bare
# `import sklearn` is not guaranteed to load the `metrics` submodule, in which
# case `sklearn.metrics.cohen_kappa_score` raises AttributeError.
from sklearn.metrics import cohen_kappa_score

# NOTE(review): the ratings are ordinal (1-5) and the default kappa is
# unweighted, so a 4-vs-5 disagreement counts as much as 1-vs-5. Consider
# `weights="linear"` or `weights="quadratic"` if that is not intended.
iaa_satisfaction = cohen_kappa_score(y1_satisfaction, y2_satisfaction)
iaa_satisfaction

In [None]:
# Append the satisfaction agreement score to the scores report: a label line,
# the score and a blank line.
text = 'This is the inter-annotator agreement score for the satisfaction (The explanation is satisfying)'
with open('final_scores.csv', 'a+') as scores_file:
    scores_file.write(text + "\n" + str(iaa_satisfaction) + "\n\n")

# Download results files

In [None]:
# Hand every file written by this notebook to Colab's `files.download`,
# in the same order as they were listed originally.
for result_file in (
    'questions_and_captions.csv',
    'image_ids.csv',
    'answers_and_explanations.csv',
    'final_scores.csv',
):
    files.download(result_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>