# Evaluation Script for Autocast Competition
Step 1: Generate `submission.zip` in the same format as the Autocast competition, and download `autocast_test_set_w_answers.csv` from [Google Drive](https://docs.google.com/spreadsheets/d/1O8kcHuLN7BbklXdHfRpVnJ65C5_qpCehBZPYQ6DRVl0/edit?usp=share_link).

Step 2: Put `submission.zip` and `autocast_test_set_w_answers.csv` in the same directory as `evaluation.ipynb`.

Step 3: launch  `evaluation.ipynb`.

We validated this evaluation script with random predictions and obtained the same scores as the official Autocast leaderboard (`Combined Metric: 87.41, T/F: 25.00, MCQ: 39.13, NUM: 23.28`). Feel free to use it for the class competition.



In [1]:
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
# Mount Google Drive at /content/drive so the shared competition folder is
# reachable. The google.colab module is presumably only importable inside
# Colab; skip this cell when running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Work from the shared-drive folder: later cells open
# 'autocast_test_set_w_answers.csv' and 'submission.zip' relative to it.
# NOTE(review): hard-coded path; adjust it to wherever your files live.
os.chdir('/content/drive/Shareddrives/CS542_Competition/Fine_Tuning')

In [4]:
# List the working directory to confirm that submission.zip and
# autocast_test_set_w_answers.csv are present before evaluating.
!ls

 autocast_test_set_w_answers.csv
 autocast_test_set_w_answers.gsheet
 autocast_train_questions_combined_with_tf_negated.csv
 autocast_train_questions_combined_with_tf_negated.xlsx
 BERT_Fine_Tuning
 bert_fine_tuning_test
 BERT_Fine_Tuning_YL
 cached_lm_GPT2Tokenizer_128_test_set.txt
 cached_lm_GPT2Tokenizer_128_test_set.txt.lock
 cached_lm_GPT2Tokenizer_128_train_set.txt
 cached_lm_GPT2Tokenizer_128_train_set.txt.lock
'cleaned_train_set (1).gsheet'
 cleaned_train_set.csv
 cleaned_train_set.gsheet
 distilbert_fine_tuning_test
 distilbert_tuned_model
 distilbert_tuned_model_YL_4-6
 evaluation.ipynb
 filtered_train_set.csv
 GPT2_Few_Shot.ipynb
 gpt2-Fine_Tuning
'GPT-2 Model Training'
 GPT3_Few_Shot
 GPT4_Generated
 mc_gpt3_answers_t0.txt
 mc_gpt3_answers.txt
 mc_gpt3_processed_t0.txt
 olympics_qa.csv
 output
 output_dir
 qa_test.jsonl
 qa_train.jsonl
 starter.ipynb
 submission
 submission.zip
 test_dataset.csv
 test_dataset.gsheet
 test_set.txt
 tf_gpt3_answers.txt
 train_questions.csv.gs

In [5]:
def brier_score(probabilities, answer_probabilities):
    """Return half the squared distance between a forecast and its target.

    Computes ``sum((probabilities - answer_probabilities) ** 2) / 2``.

    Args:
        probabilities: Predicted distribution over the choices. Any
            array-like of numbers (list, tuple, or NumPy array).
        answer_probabilities: Target distribution with the same shape as
            ``probabilities`` (a one-hot vector in this notebook).

    Returns:
        The score as a NumPy float: 0.0 for a perfect forecast, 1.0 when a
        one-hot forecast picks the wrong choice.
    """
    # Coerce to float arrays so plain Python lists are accepted as well;
    # list - list would otherwise raise TypeError.
    probabilities = np.asarray(probabilities, dtype=float)
    answer_probabilities = np.asarray(answer_probabilities, dtype=float)
    return ((probabilities - answer_probabilities) ** 2).sum() / 2



In [6]:
# Build the ground-truth target for every test question.
#   answers[i] -- one-hot NumPy vector for 't/f' and 'mc', float for 'num'
#   qtypes[i]  -- the question type, kept index-aligned with answers
answers_csv = pd.read_csv('autocast_test_set_w_answers.csv')
answers = []
qtypes = []
for row_label, question in answers_csv.iterrows():
    qtype = question['qtype']
    if qtype == 't/f':
        # NOTE(security): eval() runs arbitrary code found in the 'choices'
        # column; only use this with a trusted answers file.
        ans = np.zeros(len(eval(question['choices'])))
        # No -> [1, 0]; any other value is treated as Yes -> [0, 1]
        ans[0 if question['answers'] == 'no' else 1] = 1
    elif qtype == 'mc':
        num_choices = len(eval(question['choices']))  # see eval() note above
        ans = np.zeros(num_choices)
        try:
            # 'A' -> 0, 'B' -> 1, ...
            ans_idx = ord(question['answers']) - ord('A')
        except TypeError:
            # Missing (NaN) or multi-character answer: fall back to choice A.
            ans_idx = 0
        if not 0 <= ans_idx < num_choices:
            # Out-of-range letters fall back to choice A. Negative offsets
            # are rejected explicitly so they cannot wrap around and mark a
            # choice at the end of the vector.
            ans_idx = 0
        ans[ans_idx] = 1
    elif qtype == 'num':
        ans = float(question['answers'])
        if np.isnan(ans):
            ans = 0.0
    else:
        # Fail loudly: silently skipping would reuse the previous row's
        # target and leave answers and qtypes with different lengths.
        raise ValueError(f"Unsupported qtype {qtype!r} at row {row_label}")
    answers.append(ans)
    qtypes.append(qtype)


In [7]:
# Unpack the submission archive into ./submission; -o overwrites files left
# over from a previous run without prompting.
! mkdir -p submission
! unzip -o submission.zip -d submission
# NOTE(security): pickle.load can execute arbitrary code while loading.
# Only evaluate submissions that come from a trusted source.
with open(os.path.join('submission', 'predictions.pkl'), 'rb') as f:
    preds = pickle.load(f)

Archive:  submission.zip
  inflating: submission/predictions.pkl  
  inflating: submission/ramiyunda_predictions.pkl  


In [8]:
# Convert each raw prediction into the same representation as the targets:
# a one-hot NumPy vector for 't/f' and 'mc', a float for 'num'.
# NOTE(review): this cell treats predictions as labels ('no' vs. anything
# else for t/f, a single letter for mc) -- confirm the submission stores
# labels rather than probability vectors.
processed_preds = []
for i, (_, question) in enumerate(answers_csv.iterrows()):
    qtype = question['qtype']
    if qtype == 't/f':
        # NOTE(security): eval() runs arbitrary code found in the 'choices'
        # column; only use this with a trusted answers file.
        ans = np.zeros(len(eval(question['choices'])))
        # No -> [1, 0]; any other value is treated as Yes -> [0, 1]
        ans[0 if preds[i] == 'no' else 1] = 1
    elif qtype == 'mc':
        num_choices = len(eval(question['choices']))  # see eval() note above
        ans = np.zeros(num_choices)
        try:
            # 'A' -> 0, 'B' -> 1, ...
            ans_idx = ord(preds[i]) - ord('A')
        except (TypeError, LookupError):
            # Unparseable or missing prediction: fall back to choice A.
            ans_idx = 0
        if not 0 <= ans_idx < num_choices:
            # Out-of-range letters fall back to choice A. Negative offsets
            # are rejected explicitly so they cannot wrap around and mark a
            # choice at the end of the vector.
            ans_idx = 0
        ans[ans_idx] = 1
    elif qtype == 'num':
        try:
            ans = float(preds[i])
        except (TypeError, ValueError, OverflowError, LookupError):
            # Unparseable or missing prediction counts as 0.0.
            ans = 0.0
        if np.isnan(ans):
            # Treat NaN like any other invalid prediction so a single bad
            # value cannot turn the whole NUM average into NaN.
            ans = 0.0
    else:
        # Fail loudly instead of re-appending the previous row's value.
        raise ValueError(f"Unsupported qtype {qtype!r} at row {i}")

    processed_preds.append(ans)

In [9]:
# Score every question and collect the per-type results:
# Brier score for 't/f' and 'mc', absolute error for everything else (num).
tf_results, mc_results, num_results = [], [], []
brier_buckets = {'t/f': tf_results, 'mc': mc_results}
for pred, truth, kind in zip(processed_preds, answers, qtypes):
    bucket = brier_buckets.get(kind)
    if bucket is None:
        num_results.append(np.abs(pred - truth))
    else:
        bucket.append(brier_score(pred, truth))


# Report each mean as a percentage, then their sum as the combined metric.
print(f"T/F: {np.mean(tf_results)*100:.2f}, MCQ: {np.mean(mc_results)*100:.2f}, NUM: {np.mean(num_results)*100:.2f}")
print(f"Combined Metric: {(np.mean(tf_results) + np.mean(mc_results) + np.mean(num_results))*100:.2f}")




T/F: 52.06, MCQ: 81.87, NUM: 38.22
Combined Metric: 172.15
