In [47]:
import pandas as pd
import json

In [62]:
def _load_unique(path, key):
    """Read a JSON-lines file and keep only the first row for each value of `key`."""
    frame = pd.read_json(path, lines=True)
    first_occurrence = ~frame[key].duplicated()
    return frame.loc[first_occurrence].reset_index(drop=True)


# Riddle-sense and causal-judgement rows are de-duplicated on the question
# text; odd-one-out rows are de-duplicated on their `choices` value instead.
responses_riddle = _load_unique('../responses/zero_shot_responses_riddle_sense.jsonl', 'question')
responses_ooo = _load_unique('../responses/zero_shot_responses_oddoneout.jsonl', 'choices')
responses_cj = _load_unique('../responses/zero_shot_responses_causal_judgement.jsonl', 'question')

In [63]:
# Tag each frame with its task name so rows stay identifiable after stacking.
responses_riddle['task'] = 'riddle'
responses_ooo['task'] = 'ooo'
responses_cj['task'] = 'cj'

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# every task keeps its own 0-based labels, so the index contains duplicates
# and label-based lookups / alignment on `responses` become ambiguous.
responses = pd.concat([responses_riddle, responses_ooo, responses_cj], ignore_index=True)
responses.shape

(323, 8)

In [64]:
# One row per response, carrying the question and its task, plus one boolean
# column per model that is True when `answer in <model response>` holds.
evaluatedf = responses[['question', 'task']].copy()

for model_name in ('deberta', 'gpt3.5', 'gpt4o', 'commandr'):
    evaluatedf[model_name] = [
        gold in reply
        for gold, reply in zip(responses['answer'], responses[model_name])
    ]

In [68]:
# Maps the prefix used in summary column names to the evaluatedf column
# holding that model's per-question correctness flags.
model_columns = {
    'deberta': 'deberta',
    'gpt3_5': 'gpt3.5',
    'gpt4o': 'gpt4o',
    'commandr': 'commandr',
}

# Per-task tallies: correct responses for each model, then the question count.
named_aggs = {f'{prefix}_true': (column, 'sum') for prefix, column in model_columns.items()}
named_aggs['total_questions'] = ('question', 'count')
summary = evaluatedf.groupby('task').agg(**named_aggs)

# Accuracy as a percentage of the task's questions, rounded to 2 decimals.
for prefix in model_columns:
    share_correct = summary[f'{prefix}_true'] / summary['total_questions']
    summary[f'{prefix}_accuracy'] = (share_correct * 100).round(2)

summary.T

task,cj,ooo,riddle
deberta_true,99.0,23.0,11.0
gpt3_5_true,110.0,60.0,34.0
gpt4o_true,125.0,73.0,40.0
commandr_true,102.0,53.0,27.0
total_questions,188.0,86.0,49.0
deberta_accuracy,52.66,26.74,22.45
gpt3_5_accuracy,58.51,69.77,69.39
gpt4o_accuracy,66.49,84.88,81.63
commandr_accuracy,54.26,61.63,55.1


In [74]:
import torch
from transformers import AutoTokenizer, AutoModelForMultipleChoice
from tqdm import tqdm

In [4]:
def predict_answer(model, tokenizer, question, choices):
    """Return the choice that `model` scores highest for `question`.

    Every choice is rendered as "Question: {question} Choice: {choice}", the
    texts are tokenized together (truncated / padded to 128 tokens) and the
    model is run once on a batch that holds this single example.

    Args:
        model: Callable multiple-choice model that supports `.to(device)` and
            returns an object with a `logits` attribute. It is moved to CUDA
            when available, otherwise to CPU. Its train/eval mode is left
            untouched, so call `model.eval()` beforehand if that matters.
        tokenizer: Callable that returns a mapping of tensors when invoked
            with `return_tensors="pt"`.
        question: The question text.
        choices: Candidate answers; any iterable is accepted.

    Returns:
        The element of `choices` at the position of the largest logit.

    Raises:
        ValueError: If `choices` is empty.
    """
    # Materialise once so generators/tuples work and indexing is positional.
    choices = list(choices)
    if not choices:
        raise ValueError("choices must contain at least one candidate answer")

    prompts = [f"Question: {question} Choice: {choice}" for choice in choices]

    encoding = tokenizer(
        prompts,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt"
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # The model expects inputs in shape (batch_size, num_choices, seq_length);
    # add the leading batch dimension and move each tensor to the model's device.
    batch = {name: tensor.unsqueeze(0).to(device) for name, tensor in encoding.items()}

    with torch.no_grad():
        outputs = model(**batch)

    predicted_label = outputs.logits.argmax(dim=1).item()
    return choices[predicted_label]

In [91]:
# Load the fine-tuned DeBERTa checkpoint and its tokenizer.
# The load is skipped when both objects already exist in the kernel, so
# re-running this cell does not reload them; on a fresh kernel they are
# loaded automatically. Set load_flag to True to force a reload.
load_flag = False
if load_flag or 'model' not in globals() or 'tokenizer' not in globals():
    model = AutoModelForMultipleChoice.from_pretrained("../deberta_finetuned/")
    tokenizer = AutoTokenizer.from_pretrained("../deberta_finetuned/")

In [75]:
# Run the fine-tuned model over every response row and count, per task, how
# many predictions contain the gold answer.
total_responses = len(responses)
total_riddle = len(responses[responses['task'] == 'riddle'])
total_cj = len(responses[responses['task'] == 'cj'])
total_ooo = len(responses[responses['task'] == 'ooo'])

# Rows whose task is not one of these three keys are not counted.
correct_by_task = {'riddle': 0, 'cj': 0, 'ooo': 0}

# iterrows() has no length, so pass total= to get a real progress bar and ETA.
for _, row in tqdm(responses.iterrows(), total=total_responses, desc="Predicting"):
    task = row['task']
    answer = row['answer']

    deb_ans = predict_answer(model, tokenizer, row['question'], row['choices'])

    # `in` is a containment test, so an exact match is accepted as well.
    if answer in deb_ans and task in correct_by_task:
        correct_by_task[task] += 1

# Expose the per-task counts under the names the later cells read.
correct_riddle = correct_by_task['riddle']
correct_cj = correct_by_task['cj']
correct_ooo = correct_by_task['ooo']



Predicting: 323it [07:32,  1.40s/it]

In [80]:
# Per-task report for the fine-tuned model: correct count, task size and
# accuracy, each task followed by a separator line.
task_report = (
    ("Riddle", correct_riddle, total_riddle),
    ("Causal Judgement", correct_cj, total_cj),
    ("Odd One Out", correct_ooo, total_ooo),
)
separator = "-"*50

for task_label, n_correct, n_total in task_report:
    print(f"% Correct {task_label} Answers {n_correct}")
    print(f"% Total {task_label} Answers {n_total}")
    print(f"% Accuracy {(n_correct/n_total)*100:.2f}%")
    print(separator)

# Overall figures across the three tasks.
print(separator)
overall_correct = correct_riddle+correct_cj+correct_ooo
print(f"Total Correct Answers {overall_correct}")
print(f"Total Questions {total_responses}")
print(f"% Accuracy {(overall_correct/total_responses)*100:.2f}%")

% Correct Riddle Answers 19
% Total Riddle Answers 49
% Accuracy 38.78%
--------------------------------------------------
% Correct Causal Judgement Answers 103
% Total Causal Judgement Answers 188
% Accuracy 54.79%
--------------------------------------------------
% Correct Odd One Out Answers 28
% Total Odd One Out Answers 86
% Accuracy 32.56%
--------------------------------------------------
--------------------------------------------------
Total Correct Answers 150
Total Questions 323
% Accuracy 46.44%


In [88]:
# Add the fine-tuned DeBERTa results to the per-task summary.
# The values are keyed by task name so pandas aligns them with summary's
# index, instead of relying on the row order that groupby happened to produce.
finetuned_correct = pd.Series({'cj': correct_cj, 'ooo': correct_ooo, 'riddle': correct_riddle})
finetuned_total = pd.Series({'cj': total_cj, 'ooo': total_ooo, 'riddle': total_riddle})

summary['finetun_deb_true'] = finetuned_correct
summary['finetun_deb_accuracy'] = (finetuned_correct / finetuned_total * 100).round(2)

# Display order: correct counts first, then accuracies, then the task size;
# the fine-tuned model sits right after the base DeBERTa in each group.
col_ord = ['deberta_true', 'finetun_deb_true', 'gpt3_5_true', 'gpt4o_true', 'commandr_true',
           'deberta_accuracy', 'finetun_deb_accuracy', 'gpt3_5_accuracy', 'gpt4o_accuracy', 'commandr_accuracy',
           'total_questions']
summary = summary[col_ord]
summary.T

task,cj,ooo,riddle
deberta_true,99.0,23.0,11.0
finetun_deb_true,103.0,28.0,19.0
gpt3_5_true,110.0,60.0,34.0
gpt4o_true,125.0,73.0,40.0
commandr_true,102.0,53.0,27.0
deberta_accuracy,52.66,26.74,22.45
finetun_deb_accuracy,54.79,32.56,38.78
gpt3_5_accuracy,58.51,69.77,69.39
gpt4o_accuracy,66.49,84.88,81.63
commandr_accuracy,54.26,61.63,55.1


In [93]:
# Display the summary untransposed: one row per task, one column per metric.
summary

Unnamed: 0_level_0,deberta_true,finetun_deb_true,gpt3_5_true,gpt4o_true,commandr_true,deberta_accuracy,finetun_deb_accuracy,gpt3_5_accuracy,gpt4o_accuracy,commandr_accuracy,total_questions
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
cj,99,103,110,125,102,52.66,54.79,58.51,66.49,54.26,188
ooo,23,28,60,73,53,26.74,32.56,69.77,84.88,61.63,86
riddle,11,19,34,40,27,22.45,38.78,69.39,81.63,55.1,49
