In [1]:
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from nltk.stem import PorterStemmer
import re
from collections import Counter
from sklearn.metrics import f1_score, accuracy_score

import json
import rouge
rs = rouge.Rouge()

import spacy
spacy.prefer_gpu()
pos_tagger = spacy.load('en_core_web_sm')
ps = PorterStemmer()

import pandas as pd
import numpy as np
import evaluate

metric = evaluate.combine(["meteor", "rouge"])
bleu = evaluate.load("bleu")
bs = evaluate.load("bertscore")

idk_list = [
    'i dont know',
    'i do not know',
    'im not sure',
    'i am not sure',
    'unsure',
    'possibly',
    'this is not related to my search'
    ]

negation_list = [
    'no',
    'not',
    'none',
    'isnt',
    'isn\'t',
    'dont',
    'don\'t',
    ]

def read_dialog_output(dialog_output):
    import subprocess
    result = subprocess.run(['jq','.metrics', dialog_output], stdout=subprocess.PIPE)
    res_dict = json.loads(result.stdout)
    return res_dict['ndcg@1']['mean'], \
    res_dict['ndcg@5']['mean'], \
    res_dict['ndcg@20']['mean'], \
    res_dict['p@1']['mean'], \
    res_dict['mrr']['mean']
        
def type_answer(answer):
    answer = re.sub(',', '', answer)
    if answer in idk_list:
        return 'idk'
    elif 'yes' in answer.split()[:3]:
        return 'yes'
    elif any([w in answer.split()[:3] for w in negation_list]):
        return 'no'
    else:
        return 'open'

def compute_metrics(refs, cands):
    result = metric.compute(predictions=cands, references=refs, use_stemmer=True)
    bleu3 = bleu.compute(predictions=cands, references=[[r] for r in refs], max_order = 3)
    bleu4 = bleu.compute(predictions=cands, references=[[r] for r in refs], max_order = 4)
    bertscore = bs.compute(predictions=cands, references=refs, lang = 'en')
    result['bleu3'] = bleu3['bleu']
    result['bleu4'] = bleu4['bleu']
    result['bertscore_precision'] = np.mean(bertscore['precision'])
    result['bertscore_recall'] = np.mean(bertscore['recall'])
    result['bertscore_f1'] = np.mean(bertscore['f1'])
    result = {k: round(v * 100, 4) for k, v in result.items() }
    return result


def evaluate_from_output(model_output, column_name, dialog_output):
    model_output_data = pd.read_csv(model_output)
    model_output_data = model_output_data.dropna(subset=['reference'])

    cand_type, ref_type = [], []

    for iter, row in model_output_data.iterrows():
        ref = row['reference'].lower()
        try:
            candidate = row[column_name].lower()
        except:
            candidate = 'no'
        #print(iter, candidate)
        cand_type.append(type_answer(candidate))
        ref_type.append(type_answer(ref))

        t = f1_score(ref_type, cand_type, average = 'macro')
        #t = accuracy_score(ref_type, cand_type)

    model_output_data.to_csv(model_output)

    ndcg1, ndcg5, ndcg20, p1, mrr = read_dialog_output(dialog_output)

    output_df = pd.read_csv(model_output)

    result = compute_metrics(refs = output_df['reference'].values.tolist(), cands = output_df[column_name].values.tolist())

    return result['bleu3'], result['bleu4'], result['rougeL'], result['meteor'], result['bertscore_f1'], t, ndcg1, ndcg5, ndcg20, p1, mrr


2024-01-31 23:29:25.953343: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-31 23:29:26.002211: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package wordnet to /home/zhenduow/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/zhenduow/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/zhenduow/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Qulac Test Result

In [6]:

print("-------------------------------------------------------------------------------------------------------------------------------")
print("|         qulac          |                 generation similarity                |            retrieval performance            |")
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('MODEL','BLEU3','BLEU4', 'ROUGE-L','METEOR', 'BERT-F1', 'TYPE-F1', 'nDCG@1', 'nDCG@5', 'nDCG@20', 'P@1', 'MRR'))



b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/gpt35-qulac-0shot.csv', 'candidate', 'output/gpt35-qulac-0s.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('GPT3.5-0s', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   
 
b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/gpt35-qulac-3shot.csv', 'candidate', 'output/gpt35-qulac-3s.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('GPT3.5-3s', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/gpt4-qulac-0shot.csv', 'candidate', 'output/gpt4-qulac-0s.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('GPT4-0s', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))  
 
b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/gpt4-qulac-3shot.csv', 'candidate', 'output/gpt4-qulac-3s.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('GPT4-3s', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/flan-qulac-0shot.csv', 'candidate', 'output/flan-qulac-0shot.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Flan-xxl-0s', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))          



b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/flan-qulac-3shot.csv', 'candidate', 'output/flan-qulac-3shot.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Flan-xxl-3s', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))          
        

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/llama2-qulac-0shot.csv', 'candidate', 'output/llama2-qulac-0shot.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('LlaMa2-0s', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))                                                                                         


b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/llama2-qulac-3shot.csv', 'candidate', 'output/llama2-qulac-3shot.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('LlaMa2-3s', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))                                                                                                                                                                    
                                                                                        
b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr  = evaluate_from_output('output/t5-small-qulac-30.csv', 'candidate', 'output/t5-small-qulac-30.csv.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('T5-qulac', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   
                                                                                        
b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr  = evaluate_from_output('output/unifiedqa-small-qulac-30.csv', 'candidate', 'output/u-qulac-30.csv.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('UnifiedQA-qulac', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   
                                                                                        
b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr  = evaluate_from_output('output/unifiedqa-small-qulac-roberta.csv', 'candidate', 'output/unifiedqa-small-qulac-roberta.csv.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('UnifiedQA-roberta', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   
                                                                                        
b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr  = evaluate_from_output('output/t5-small-qulac-30-all.csv', 'candidate', 'output/t-qulac-all.csv.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('T5-COOP', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr  = evaluate_from_output('output/t5-small-qulac-short.csv', 'candidate', 'output/t-qulac-all.csv.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('T5-short', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr  = evaluate_from_output('output/t5-small-qulac-long.csv', 'candidate', 'output/t-qulac-all.csv.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('T5-long', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   


b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr  = evaluate_from_output('output/unifiedqa-small-qulac-roberta-all.csv', 'candidate', 'output/u-qulac-all.csv.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('RU-COOP', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr  = evaluate_from_output('output/t5-small-qulac-30.csv', 'question', 'output/query-qulac.csv.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Query only', 
                                                            "-", "-", "-", "-", "-", "-",
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   
                                                            
b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr  = evaluate_from_output('output/t5-small-qulac-30.csv', 'reference', 'output/human-qulac.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Reference', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   
                                                                                        
b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr  = evaluate_from_output('output/t5-small-qulac-30.csv', 'facet', 'output/copy-test.csv.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Copy Intent', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   

-------------------------------------------------------------------------------------------------------------------------------
|         qulac          |                 generation similarity                |            retrieval performance            |
-------------------------------------------------------------------------------------------------------------------------------
|MODEL                   | BLEU3    BLEU4    ROUGE-L  METEOR   BERT-F1  TYPE-F1 | nDCG@1   nDCG@5   nDCG@20  P@1      MRR     |
-------------------------------------------------------------------------------------------------------------------------------
|GPT3.5-0s               | 10.34    7.48     27.9     34.89    87.63    43.13   | 0.2045   0.2012   0.1829   0.267    0.3755  |
-------------------------------------------------------------------------------------------------------------------------------
|GPT3.5-3s               | 14.02    10.39    30.14    31.71    87.92    43.73   | 0.1966   0.1938   0.17

# ClariQ Experiments

In [3]:

print("-------------------------------------------------------------------------------------------------------------------------------")
print("|         clariq         |                 generation similarity                |            retrieval performance            |")
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('MODEL','BLEU3','BLEU4', 'ROUGE-L','METEOR', 'BERT-F1', 'TYPE-F1', 'nDCG@1', 'nDCG@5', 'nDCG@20', 'P@1', 'MRR'))
                                                                                
b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/gpt35-clariq-0shot.csv', 'candidate', 'output/gpt35-clariq-0s.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('GPT3.5', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))  

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/gpt35-clariq-3shot.csv', 'candidate', 'output/gpt35-clariq-3s.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('GPT3.5-3s', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))    
                                                                                                                                                           
b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/gpt4-clariq-0shot.csv', 'candidate', 'output/gpt4-clariq-0s.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('GPT4', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   
                                                                                                                                                                                                                                       
b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/gpt4-clariq-3shot.csv', 'candidate', 'output/gpt4-clariq-3s.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('GPT4-3s', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   


b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/flan-clariq-0shot.csv', 'candidate', 'output/flan-clariq-0shot.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Flan-xxl-0s', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))          

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/flan-clariq-3shot.csv', 'candidate', 'output/flan-clariq-3shot.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Flan-xxl-3s', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))          
                       
b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/llama2-clariq-0shot.csv', 'candidate', 'output/llama2-clariq-0shot.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('LLaMa2-0s', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/llama2-clariq-3shot.csv', 'candidate', 'output/llama2-clariq-3shot.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('LLaMa2-3s', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/t5-small-clariq-30.csv', 'candidate', 'output/t5-small-clariq-30.csv.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('T5-clariq', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))    

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/unifiedqa-small-clariq-30.csv', 'candidate', 'output/unifiedqa-small-clariq-30.csv.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('UnifiedQA-clariq', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))    

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/unifiedqa-small-clariq-roberta.csv', 'candidate', 'output/unifiedqa-small-clariq-roberta.csv.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('UnifiedQA-clariq-roberta', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))    

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr  = evaluate_from_output('output/t5-small-clariq-30-all.csv', 'candidate', 'output/t-clariq-all.csv.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('T5-COOP', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr  = evaluate_from_output('output/unifiedqa-small-clariq-roberta-all.csv', 'candidate', 'output/r-clariq-all.csv.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('RU-COOP', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))    
                                                            
b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/t5-small-clariq-30.csv', 'question', 'output/query-clariq.csv.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Query only', 
                                                            "-", "-", "-", "-", "-", "-",
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/t5-small-clariq-30.csv', 'reference', 'output/reference-clariq.csv.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Reference', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))    

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/t5-small-clariq-30.csv', 'facet', 'output/copy-clariq.csv.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Copy Intent', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))    

-------------------------------------------------------------------------------------------------------------------------------
|         clariq         |                 generation similarity                |            retrieval performance            |
-------------------------------------------------------------------------------------------------------------------------------
|MODEL                   | BLEU3    BLEU4    ROUGE-L  METEOR   BERT-F1  TYPE-F1 | nDCG@1   nDCG@5   nDCG@20  P@1      MRR     |
-------------------------------------------------------------------------------------------------------------------------------
|GPT3.5                  | 9.68     6.96     26.88    34.62    87.3     43.77   | 0.1399   0.1303   0.1127   0.1651   0.2393  |
-------------------------------------------------------------------------------------------------------------------------------
|GPT3.5-3s               | 12.17    8.78     28.44    30.43    87.56    43.82   | 0.1342   0.1243   0.10

## Roberta Gen

In [None]:
import torch as T
import pandas as pd
import tqdm
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
)

device = T.device("cuda")

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForSequenceClassification.from_pretrained("output/roberta-qulac/checkpoint-102")



local_dir = "./output/t5-small-qulac/"
tokenizer = T5Tokenizer.from_pretrained(local_dir)
model = T5ForConditionalGeneration.from_pretrained(local_dir).cuda()
df = pd.read_csv('qulac_test.csv')

for iter, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    test_tokenized = tokenizer.encode_plus(row['unifiedqa-question'], return_tensors="pt")
    test_input_ids = test_tokenized["input_ids"].to(device)


    clf_inputs = roberta_tokenizer(row['unifiedqa-question'], return_tensors="pt")

    with T.no_grad():
        logits = roberta_model(**clf_inputs).logits

    predicted_class_id = logits.argmax().item()

    prefix = ''
    if predicted_class_id == 3 : # 3: yes
        prefix = 'yes'
    elif predicted_class_id == 1: #1: no
        prefix = 'no'
    elif predicted_class_id == 0: # 0: idk
        prefix = 'i dont know'
    else:
        prefix = ''

    tokenized_decoder_input = tokenizer.encode_plus(prefix, return_tensors="pt")
    decoder_input_ids = tokenized_decoder_input["input_ids"].to(device)[0][0].item()
    decoder_input_ids = T.tensor([[0, decoder_input_ids]]).to(device)
    
    df.at[iter, 'decoder-input'] = prefix
    inputs = tokenizer(row['unifiedqa-question'], max_length=128, padding=True, truncation=True)
    inputs['decoder_input_ids'] = decoder_input_ids
    print(prefix)
    print(model(inputs))

    '''
    tokenized_decoder_input = tokenizer.encode_plus(prefix, return_tensors="pt")
    decoder_input_ids = tokenized_decoder_input["input_ids"].to(device)[0][0].item()
    decoder_input_ids = T.tensor([[0, decoder_input_ids]]).to(device)
    
    model.eval()
    if prefix == '':
        beam_output = model.generate(
            input_ids=test_input_ids,
            max_length=96,
            early_stopping=True,
            num_beams=10,
            top_k=50, 
            top_p=0.9, 
            num_return_sequences=10,
            no_repeat_ngram_size=2
        )

    else:
        beam_output = model.generate(
            input_ids=test_input_ids,
            decoder_input_ids = decoder_input_ids,
            max_length=96,
            early_stopping=True,
            num_beams=10,
            top_k=50, 
            top_p=0.9, 
            num_return_sequences=10,
            no_repeat_ngram_size=2
        )

    decoded_output = tokenizer.decode(beam_output[0], skip_special_tokens=True,clean_up_tokenization_spaces=True)

    df.at[iter, 'candidate'] = decoded_output.strip()
    df.at[iter, 'facet_desc'] = df.at[iter, 'facet_desc'].lower()
    df.at[iter, 'question'] = df.at[iter, 'question'].lower()

    output = df[['facet_desc','question','answer','candidate']].copy(deep=True)
    output.columns = ['facet', 'question', 'reference','candidate']
    output.to_csv('qulac_test.csv')
    '''
df.to_csv('qulac_test.csv', index=False)

## Oracle clf

In [None]:
import torch as T
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
)
import tqdm
import pandas as pd
device = T.device("cuda")

local_dir = "./output/unifiedqa-trainer-small-qulac-30/"
tokenizer = T5Tokenizer.from_pretrained(local_dir)
model = T5ForConditionalGeneration.from_pretrained(local_dir).cuda()

df = pd.read_csv('qulac_test.csv')

for iter, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    test_tokenized = tokenizer.encode_plus(row['unifiedqa-question'], return_tensors="pt")
    test_input_ids = test_tokenized["input_ids"].to(device)

    answer_type = row['answer-type']

    ''' 
    prefix = ''
    if answer_type == 'yes' or answer_type == 'no':
        prefix = answer_type
    elif answer_type == 'open':
        prefix = ''
    else:
        prefix = 'i dont know'
    '''

    prefix = row['answer'].split()[0]

    model.eval()
    beam_output = model.generate(
        input_ids=test_input_ids,
        max_length=96,
        early_stopping=True,
        num_beams=10,
        top_k=50, 
        top_p=0.9, 
        num_return_sequences=10,
        no_repeat_ngram_size=2
    )

    decoded_output = [tokenizer.decode(bo, skip_special_tokens=True,clean_up_tokenization_spaces=True) for bo in beam_output]

    if prefix == 'yes' or prefix == 'no':
        filtered_output = [d for d in decoded_output if (len(d)>0) and (d.split()[0] == prefix) ]
        gen = filtered_output[0] if len(filtered_output) > 0 else decoded_output[0]
    else:
        gen = decoded_output[0]

    df.at[iter, 'candidate'] = gen.strip()

    output = df[['facet_desc','question','answer','candidate']].copy(deep=True)
    output.columns = ['facet', 'question', 'reference','candidate']
    output.to_csv('output/unifiedqa-small-qulac-oracle-type.csv')

## Oracle Gen

In [None]:
import torch as T
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
)
import tqdm
import pandas as pd
device = T.device("cuda")

local_dir = "./output/unifiedqa-trainer-small-qulac-30/"
tokenizer = T5Tokenizer.from_pretrained(local_dir)
model = T5ForConditionalGeneration.from_pretrained(local_dir).cuda()

df = pd.read_csv('qulac_test.csv')

for iter, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    test_tokenized = tokenizer.encode_plus(row['unifiedqa-question'], return_tensors="pt")
    test_input_ids = test_tokenized["input_ids"].to(device)

    answer_type = row['answer-type']

    
    prefix = ''
    if answer_type == 'yes' or answer_type == 'no':
        prefix = answer_type
    elif answer_type == 'open':
        prefix = ''
    else:
        prefix = 'i dont know'
    
    #prefix = row['answer'].split()[0]
    if prefix == "":
        model.eval()
        beam_output = model.generate(
            input_ids=test_input_ids,
            max_length=96,
            early_stopping=True,
            num_beams=10,
            top_k=50, 
            top_p=0.9, 
            num_return_sequences=10,
            no_repeat_ngram_size=2
        )
    else:
        tokenized_decoder_input = tokenizer.encode_plus(prefix, return_tensors="pt")
        tokenized_prefix = tokenized_decoder_input["input_ids"].to(device)
        len_tokenized_prefix = T.tensor(len(tokenized_prefix[0]), dtype=T.long, device=device)
        decoder_input_ids =  T.tensor([[0]]).to(device)
        for token in range(len_tokenized_prefix-1):
            decoder_input_ids = T.cat((decoder_input_ids, T.tensor([[tokenized_prefix[0][token]]]).to(device)), 1)

        model.eval()
        beam_output = model.generate(
            input_ids=test_input_ids,
            decoder_input_ids = decoder_input_ids,
            max_length=96,
            early_stopping=True,
            num_beams=10,
            top_k=50, 
            top_p=0.9, 
            num_return_sequences=10,
            no_repeat_ngram_size=2
        )

    decoded_output = tokenizer.decode(beam_output[0], skip_special_tokens=True,clean_up_tokenization_spaces=True)

    df.at[iter, 'candidate'] = decoded_output.strip()

    output = df[['facet_desc','question','answer','candidate']].copy(deep=True)
    output.columns = ['facet', 'question', 'reference','candidate']
    output.to_csv('output/unifiedqa-small-qulac-oracle-gen.csv')

## Custom loss function

In [None]:
import torch as T
from torch import nn
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
)
import torch.nn.functional as F
import tqdm
import pandas as pd
device = T.device("cuda")

local_dir = "./output/unifiedqa-trainer-small-qulac-30/"
tokenizer = T5Tokenizer.from_pretrained(local_dir)
model = T5ForConditionalGeneration.from_pretrained(local_dir).cuda()


df = pd.read_csv('qulac_dev.csv')

input_ids = tokenizer(df.at[1,'unifiedqa-question'], return_tensors="pt").input_ids.to(device)
labels = tokenizer(df.at[1,'answer'], return_tensors="pt").input_ids.to(device)

outputs = model(input_ids=input_ids, labels = labels)

logits = outputs['logits']
bsz, length, vsz = logits.shape

print("logits size", bsz, length, vsz)

print("labels size", labels.shape)


category_labels = T.zeros(bsz)

print("c_l size", category_labels.shape)
print("c_l", category_labels)
category_labels[5][0] = 1
print("c_l", category_labels)



'''
p_idk = 0
for utterance in idk_list:
    tokenized_u = tokenizer(utterance, return_tensors="pt").input_ids[0]
    p_utterance = 1
    for i in range(min(outputs['logits'].size(1), len(tokenized_u[0]))):
        idx = tokenized_u[i]
        p_utterance *= T.exp(outputs['logits'][0][i][idx])
    p_idk += p_utterance
print("p_idk", p_idk)


p_yes = 0
yes_idx = tokenizer('yes', return_tensors="pt").input_ids[0][0]
for i in range(min(outputs['logits'].size(1), 4)):
    p_yes += T.exp(outputs['logits'][0][i][yes_idx])
print('p_yes', p_yes)

'''

p_no = 0
for word in negation_list:   
    word_idx = tokenizer(word, return_tensors="pt").input_ids[0][:-1]
    p_no = 1
    print(word_idx)

        
first_word_logits = T.unsqueeze(outputs['logits'].view(-1, outputs['logits'].size(-1))[0],  dim=0)
rest_logits = outputs['logits'].view(-1, outputs['logits'].size(-1))

first_word_labels = T.unsqueeze(labels.view(-1)[0], dim=0)
rest_labels = labels.view(-1)


gen_len = labels.size(-1)

first_word_weight = 0
loss = first_word_weight * F.cross_entropy(first_word_logits, 
                    first_word_labels, 
                    ignore_index=-100) \
                + F.cross_entropy(rest_logits, 
                    rest_labels, 
                    ignore_index=-100)

print(outputs['loss'])

print(loss)


## Permute answers

In [None]:
import pandas as pd
import numpy as np

name2col = {
    'ref': 'answer1',
    't5': 'answer2',
    'ru': 'answer3',
}

df = pd.read_csv('10.csv')
rand_perm_list = []
for iter, row in df.iterrows():
    a1 = row['answer1']
    a2 = row['answer2']
    a3 = row['answer3']
    rand_perm = np.random.permutation(['ref', 't5', 'ru'])
    df.at[iter, 'answer1'] = row[name2col[rand_perm[0]]]
    df.at[iter, 'answer2'] = row[name2col[rand_perm[1]]]
    df.at[iter, 'answer3'] = row[name2col[rand_perm[2]]]
    rand_perm_list.append(rand_perm)

df.to_csv('randomized_10.csv')
with open('rand_perm_list', 'w') as f:
    for rdp in rand_perm_list:
        for name in rdp:
            f.write(name)
            f.write('\t')
        f.write('\n')

## Convert clariq facet_id to qulac facet_id to reuse qulac qrel file

In [None]:
import pandas as pd
import re

qulac_test_df = pd.read_json('cosearcher/data/qulac.json')
print(qulac_test_df.columns)

for iter, row in qulac_test_df.iterrows():
    qulac_test_df.at[iter, 'facet_desc'] = re.sub('\\\\', '', qulac_test_df.at[iter, 'facet_desc'])

def find_facet_id(facet):
    for iter, row in qulac_test_df.iterrows():
        if facet == qulac_test_df.at[iter, 'facet_desc']:
            return qulac_test_df.at[iter, 'facet_id']
    print(facet)
    return 'error'

clariq_test_df = pd.read_csv('clariq_dev.csv')
clariq_test_df = clariq_test_df[clariq_test_df.topic_id < 201]
for iter,row in clariq_test_df.iterrows():
    clariq_test_df.at[iter, 'topic'] = clariq_test_df.at[iter, 'topic_desc']
    clariq_test_df.at[iter, 'facet_id'] = find_facet_id(clariq_test_df.at[iter, 'facet_desc'])
clariq_test_df = clariq_test_df[['topic','topic_desc','answer','facet_desc','question','facet_id','topic_id']].copy(deep=True)
print(clariq_test_df.head(5))
clariq_test_df.to_json('cosearcher/data/clariq.dev.json')



## Human evaluation

In [None]:
import pandas as pd
import json
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from scipy.stats import f

def TwoSampleT2Test(X, Y):
    nx, p = X.shape
    ny, _ = Y.shape
    delta = np.mean(X, axis=0) - np.mean(Y, axis=0)
    Sx = np.cov(X, rowvar=False)
    Sy = np.cov(Y, rowvar=False)
    S_pooled = ((nx-1)*Sx + (ny-1)*Sy)/(nx+ny-2)
    t_squared = (nx*ny)/(nx+ny) * np.matmul(np.matmul(delta.transpose(), np.linalg.inv(S_pooled)), delta)
    statistic = t_squared * (nx+ny-p-1)/(p*(nx+ny-2))
    F = f(p, nx+ny-p-1)
    p_value = 1 - F.cdf(statistic)
    print(f"Test statistic: {statistic}\nDegrees of freedom: {p} and {nx+ny-p-1}\np-value: {p_value}")
    return statistic, p_value


ban_list = ['A1IU5OP7BBZHZ7',
            'A299J4PKHAEU9H', # evidently not paying attention
            'A26ZENZ5G8AEGM', # evidently not paying attention
            'A9MYC5IGQ2DO4', 
            'A1XH05IKC77OXO',
            'A2BAQ26SMQQEUG',
            'AORHXBTOCXFUK',
            'A26399B1QZ7XJJ',
            'A17D6BK59S31BM',
            'A13XXMDHOULEZ7',
            'A1XO6ONCCTBMKW',
            'A1EX0MEOPF8AHT',
            'A28A3HF3LSEIDT',]
            
new_ban_list = [
    'A2QQY4S73JO639',
    'A273DS7TQWR9M1',
    'A3TUMZ954ORSUC',
    'A1BSOOHNHX51RI',
    'A26399B1QZ7XJJ',
    'A3QI1RV4HQ9MOC',
    'A26ZENZ5G8AEGM',
    'A1EX0MEOPF8AHT',
    'A13XXMDHOULEZ7',
    'A17D6BK59S31BM',
    'A299J4PKHAEU9H',
    'A2BAQ26SMQQEUG',
    'A3O81LHBBI8NPK',
    'A9MYC5IGQ2DO4',
    'AORHXBTOCXFUK'
    ]

white_list = [
    'A14W0AXTJ3R19V',
    'A2KLJKDG90K1PP',
    'A2TBXASXZIRNNW',
    'A3HNEYFOIJWPH1',
    'A3I9XLIHPPWPN1',
    'AA9V4NE8SOA4I',
    'APGX2WZ59OWDN',

]

batch_n = 10
time_lower_bound = 120

'''for i in range(1, batch_n+1):
    pilot_name = 'batch_results_' +str(i) + '.csv'
    batch_df = pd.read_csv(pilot_name)


    if i == 1:
        df = batch_df.copy(deep=True)
    else:
        df = pd.concat([df, batch_df])

df = df.reset_index(drop=True)'''

df = pd.read_csv('output/all_results.csv').reset_index(drop=True)


num_q = df.shape[0] // 5
model_names = ['ref', 't5', 'ru', 'copy']
performances = {k:{'rel':[], 'nat':[]} for k in model_names}
permutations = [l.strip().split('\t') for l in open('new_rand_perm_list_200','r').readlines()[:num_q]]


for i, row in df.iterrows():
    ith_example = int(i/5)
    results = row['Answer.taskAnswers']
    result_dict = json.loads(results)[0]
    for k,v in result_dict['ans1nat'].items():
        if v:
            score = int(k[-1])
            df.at[i, 'ans1nat'] = score
            df.at[i, permutations[ith_example][0]+'-nat'] = score

    for k,v in result_dict['ans1rel'].items():
        if v:
            score = int(k[-1])
            df.at[i, 'ans1rel'] = score
            df.at[i, permutations[ith_example][0]+'-rel'] = score

    for k,v in result_dict['ans2nat'].items():
        if v:
            score = int(k[-1])
            df.at[i, 'ans2nat'] = score
            df.at[i, permutations[ith_example][1]+'-nat'] = score

    for k,v in result_dict['ans2rel'].items():
        if v:
            score = int(k[-1])
            df.at[i, 'ans2rel'] = score
            df.at[i, permutations[ith_example][1]+'-rel'] = score

    for k,v in result_dict['ans3nat'].items():
        if v:
            score = int(k[-1])
            df.at[i, 'ans3nat'] = score
            df.at[i, permutations[ith_example][2]+'-nat'] = score

    for k,v in result_dict['ans3rel'].items():
        if v:
            score = int(k[-1])
            df.at[i, 'ans3rel'] = score
            df.at[i, permutations[ith_example][2]+'-rel'] = score

    for k,v in result_dict['ans4nat'].items():
        if v:
            score = int(k[-1])
            df.at[i, 'ans4nat'] = score
            df.at[i, permutations[ith_example][3]+'-nat'] = score

    for k,v in result_dict['ans4rel'].items():
        if v:
            score = int(k[-1])
            df.at[i, 'ans4rel'] = score
            df.at[i, permutations[ith_example][3]+'-rel'] = score


'''
sns.kdeplot(
    data=df, x="waiting", y="duration", hue="kind", fill=True,
)
'''

for i in range(num_q):
    qi_results = df[i*5: (i+1)*5]
    qi_answer1_nat, qi_answer1_rel = [], []
    qi_answer2_nat, qi_answer2_rel = [], []
    qi_answer3_nat, qi_answer3_rel = [], []
    qi_answer4_nat, qi_answer4_rel = [], []

    for i, row in qi_results.iterrows():
        # if row['WorkTimeInSeconds'] < time_lower_bound or row['WorkerId'] in new_ban_list:
        # if row['WorkTimeInSeconds'] < time_lower_bound :
        #    continue
        qi_answer1_nat.append(row['ans1nat'])
        qi_answer1_rel.append(row['ans1rel'])
        qi_answer2_nat.append(row['ans2nat'])
        qi_answer2_rel.append(row['ans2rel'])
        qi_answer3_nat.append(row['ans3nat'])
        qi_answer3_rel.append(row['ans3rel'])
        qi_answer4_nat.append(row['ans4nat'])
        qi_answer4_rel.append(row['ans4rel'])

    if len(qi_answer1_rel) > 0:
        performances[permutations[(i+1)//5-1][0]]['rel'].append(np.mean(qi_answer1_rel))
        performances[permutations[(i+1)//5-1][0]]['nat'].append(np.mean(qi_answer1_nat))

    if len(qi_answer2_rel) > 0:
        performances[permutations[(i+1)//5-1][1]]['rel'].append(np.mean(qi_answer2_rel))
        performances[permutations[(i+1)//5-1][1]]['nat'].append(np.mean(qi_answer2_nat))

    if len(qi_answer3_rel) > 0:
        performances[permutations[(i+1)//5-1][2]]['rel'].append(np.mean(qi_answer3_rel))
        performances[permutations[(i+1)//5-1][2]]['nat'].append(np.mean(qi_answer3_nat))
    
    if len(qi_answer4_rel) > 0:
        performances[permutations[(i+1)//5-1][3]]['rel'].append(np.mean(qi_answer4_rel))
        performances[permutations[(i+1)//5-1][3]]['nat'].append(np.mean(qi_answer4_nat))
    
# Importing library

print(f'\tRelevance\tNaturalness')
for k in performances.keys():
    print(f"{k}\t{round(np.mean(performances[k]['rel']),3)}\t\t{round(np.mean(performances[k]['nat']),3)}")


# Performing the paired sample t-test
print("T5 and R+U relevance significance test: ", stats.ttest_rel(performances['ru']['rel'], performances['t5']['rel']))
print("T5 and R+U naturalness significance test: ",stats.ttest_rel(performances['ru']['nat'], performances['t5']['nat']))
print("Copy and R+U relevance significance test: ", stats.ttest_rel(performances['ru']['rel'], performances['copy']['rel']))
print("Copy and R+U naturalness significance test: ",stats.ttest_rel(performances['ru']['nat'], performances['copy']['nat']))

# Visualization
rel_l = performances['t5']['rel'] + performances['ru']['rel']
nat_l = performances['t5']['nat'] + performances['ru']['nat']
models = ['T5'] * len(performances['t5']['nat']) + ['Type+UQA'] * len(performances['ru']['nat'])
snsdf = pd.DataFrame(list(zip(rel_l, nat_l, models)),
               columns =['relevance', 'naturalness', 'model'])
sns.set(font_scale=2)
sns.set_theme(style='white')

g = sns.JointGrid(data=snsdf, x="relevance", y="naturalness", hue='model', xlim=(-10, 5.1), ylim=(-10, 5.1))
g.plot_joint(sns.kdeplot, alpha=0.25, fill=True)
g.plot_marginals(sns.kdeplot)
sns.scatterplot(data=snsdf, x="relevance", y="naturalness", hue="model", style="model", ax=g.ax_joint)
plt.show()
plt.savefig('humankde.png')

# hotelling's t-test
print('Hotellings t-test:')
t5 = snsdf.loc[snsdf['model']=='T5'][['relevance', 'naturalness']].copy(deep=True)
ru = snsdf.loc[snsdf['model']=='Type+UQA'][['relevance', 'naturalness']].copy(deep=True)

TwoSampleT2Test(t5, ru)



In [None]:
# person correlation between generation length and scores
generation_lengths = []
full_rel_l, full_nat_l = [], []
for i, row in df.iterrows():
    if i % 5 == 0:
        generation_lengths.append(len(row['Input.reference'].split()))
        generation_lengths.append(len(row['Input.facet'].split()))
        generation_lengths.append(len(row['Input.t5-candidate'].split()))
        generation_lengths.append(len(row['Input.ru-candidate'].split()))

        full_rel_l.append(row['ref-rel'])
        full_rel_l.append(row['copy-rel'])
        full_rel_l.append(row['t5-rel'])
        full_rel_l.append(row['ru-rel'])

        full_nat_l.append(row['ref-nat'])
        full_nat_l.append(row['copy-nat'])
        full_nat_l.append(row['t5-nat'])
        full_nat_l.append(row['ru-nat'])


pearson_df = pd.DataFrame(list(zip(generation_lengths, full_rel_l, full_nat_l)),
               columns =['length', 'relevance', 'naturalness'])

corr, _ = pearsonr(pearson_df['length'].values.tolist(), pearson_df['relevance'].values.tolist())
print('Length-relevance correlation: %.3f' % corr)
corr, _ = pearsonr(pearson_df['length'].values.tolist(), pearson_df['naturalness'].values.tolist())
print('Length-naturalness correlation: %.3f' % corr)

sns.scatterplot(data=pearson_df, x="length", y="relevance")
plt.show()
sns.scatterplot(data=pearson_df, x="length", y="naturalness")
plt.show()
sns.scatterplot(data=pearson_df, x="relevance", y="naturalness")
plt.show()

In [None]:
import numpy as np
from sklearn import datasets
from scipy.stats import f

def TwoSampleT2Test(X, Y):
    nx, p = X.shape
    ny, _ = Y.shape
    delta = np.mean(X, axis=0) - np.mean(Y, axis=0)
    Sx = np.cov(X, rowvar=False)
    Sy = np.cov(Y, rowvar=False)
    S_pooled = ((nx-1)*Sx + (ny-1)*Sy)/(nx+ny-2)
    t_squared = (nx*ny)/(nx+ny) * np.matmul(np.matmul(delta.transpose(), np.linalg.inv(S_pooled)), delta)
    statistic = t_squared * (nx+ny-p-1)/(p*(nx+ny-2))
    F = f(p, nx+ny-p-1)
    p_value = 1 - F.cdf(statistic)
    print(f"Test statistic: {statistic}\nDegrees of freedom: {p} and {nx+ny-p-1}\np-value: {p_value}")
    return statistic, p_value



iris = datasets.load_iris()
versicolor = iris.data[iris.target==1, :2]
virginica = iris.data[iris.target==2, :2]

TwoSampleT2Test(versicolor, virginica)

# new evaluation heuristics Qulac

In [None]:
import torch as T
import pandas as pd
import tqdm
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
)

device = T.device("cuda")

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForSequenceClassification.from_pretrained("output/roberta-qulac/checkpoint-102")

local_dir = "./output/unifiedqa-small-qulac-30-long/"
tokenizer = T5Tokenizer.from_pretrained(local_dir)
model = T5ForConditionalGeneration.from_pretrained(local_dir).cuda()
df = pd.read_csv('qulac_test_long.csv')

for iter, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    test_tokenized = tokenizer.encode_plus(row['unifiedqa-question'], return_tensors="pt")
    test_input_ids = test_tokenized["input_ids"].to(device)


    clf_inputs = roberta_tokenizer(row['unifiedqa-question'], return_tensors="pt")

    with T.no_grad():
        logits = roberta_model(**clf_inputs).logits

    predicted_class_id = logits.argmax().item()

    prefix = ''
    if predicted_class_id == 3 : # 3: yes
        prefix = 'yes'
    elif predicted_class_id == 1: #1: no
        prefix = 'no'
    elif predicted_class_id == 0: # 0: idk
        prefix = 'i dont know'
    else:
        prefix = ''

    tokenized_decoder_input = tokenizer.encode_plus(prefix, return_tensors="pt")
    decoder_input_ids = tokenized_decoder_input["input_ids"].to(device)[0][0].item()
    decoder_input_ids = T.tensor([[0, decoder_input_ids]]).to(device)
    
    model.eval()
    if prefix == '':
        beam_output = model.generate(
            input_ids=test_input_ids,
            max_length=96,
            early_stopping=True,
            num_beams=10,
            top_k=50, 
            top_p=0.9, 
            num_return_sequences=10,
            no_repeat_ngram_size=2
        )

    else:
        beam_output = model.generate(
            input_ids=test_input_ids,
            decoder_input_ids = decoder_input_ids,
            max_length=96,
            early_stopping=True,
            num_beams=10,
            top_k=50, 
            top_p=0.9, 
            num_return_sequences=10,
            no_repeat_ngram_size=2
        )

    decoded_output = tokenizer.decode(beam_output[0], skip_special_tokens=True,clean_up_tokenization_spaces=True)

    df.at[iter, 'candidate'] = decoded_output.strip()
    df.at[iter, 'facet_desc'] = df.at[iter, 'facet_desc'].lower()
    df.at[iter, 'question'] = df.at[iter, 'question'].lower()

    output = df[['facet_desc','question','answer','candidate']].copy(deep=True)
    output.columns = ['facet', 'question', 'reference','candidate']
    output.to_csv('output/unifiedqa-small-qulac-roberta-long.csv')

## new evaluation heuristic ClariQ

In [None]:
import torch as T
import pandas as pd
import tqdm
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
)

device = T.device("cuda")

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForSequenceClassification.from_pretrained("output/roberta-clariq/checkpoint-112")

local_dir = "./output/unifiedqa-small-clariq-30-long/"
tokenizer = T5Tokenizer.from_pretrained(local_dir)
model = T5ForConditionalGeneration.from_pretrained(local_dir).cuda()
df = pd.read_csv('clariq_dev_long.csv')

for iter, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    test_tokenized = tokenizer.encode_plus(row['unifiedqa-question'], return_tensors="pt")
    test_input_ids = test_tokenized["input_ids"].to(device)


    clf_inputs = roberta_tokenizer(row['unifiedqa-question'], return_tensors="pt")

    with T.no_grad():
        logits = roberta_model(**clf_inputs).logits

    predicted_class_id = logits.argmax().item()

    prefix = ''
    if predicted_class_id == 3 : # 3: yes
        prefix = 'yes'
    elif predicted_class_id == 1: #1: no
        prefix = 'no'
    elif predicted_class_id == 0: # 0: idk
        prefix = 'i dont know'
    else:
        prefix = ''

    tokenized_decoder_input = tokenizer.encode_plus(prefix, return_tensors="pt")
    decoder_input_ids = tokenized_decoder_input["input_ids"].to(device)[0][0].item()
    decoder_input_ids = T.tensor([[0, decoder_input_ids]]).to(device)
    
    model.eval()
    if prefix == '':
        beam_output = model.generate(
            input_ids=test_input_ids,
            max_length=96,
            early_stopping=True,
            num_beams=10,
            top_k=50, 
            top_p=0.9, 
            num_return_sequences=10,
            no_repeat_ngram_size=2
        )

    else:
        beam_output = model.generate(
            input_ids=test_input_ids,
            decoder_input_ids = decoder_input_ids,
            max_length=96,
            early_stopping=True,
            num_beams=10,
            top_k=50, 
            top_p=0.9, 
            num_return_sequences=10,
            no_repeat_ngram_size=2
        )

    decoded_output = tokenizer.decode(beam_output[0], skip_special_tokens=True,clean_up_tokenization_spaces=True)

    df.at[iter, 'candidate'] = decoded_output.strip()
    df.at[iter, 'facet_desc'] = df.at[iter, 'facet_desc'].lower()
    df.at[iter, 'question'] = df.at[iter, 'question'].lower()

    output = df[['facet_desc','question','answer','candidate']].copy(deep=True)
    output.columns = ['facet', 'question', 'reference','candidate']
    output.to_csv('output/unifiedqa-small-clariq-roberta-long.csv')

In [None]:
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from nltk.stem import PorterStemmer
import re
from collections import Counter

import json
import rouge
rs = rouge.Rouge()

import spacy
spacy.prefer_gpu()
pos_tagger = spacy.load('en_core_web_sm')
ps = PorterStemmer()

import pandas as pd
import numpy as np
import evaluate
import tqdm

metric = evaluate.combine(["rouge"])

def compute_metrics(refs, cands):
    result = metric.compute(predictions=cands, references=refs, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items() }
    return result['rougeL']

df = pd.read_csv("output/t5-small-qulac-30-all.csv")
for i, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    df.at[i, 'low'] = compute_metrics(refs = [row['reference']], cands = [row['candidate']])

df.to_csv('t-qulac-dev-study.csv')

## Combine the qulac short and qulac long results and evaluate.

In [None]:
import pandas as pd

# UnifiedQA
qulac_long_gen_file = 'output/u-small-qulac-long/generated_predictions.txt'
qulac_long_gen = open(qulac_long_gen_file).readlines()
qulac_long_data_file = 'qulac_test_long.csv'
qulac_long_df = pd.read_csv(qulac_long_data_file)
qulac_short_gen_file = 'output/u-small-qulac-short/generated_predictions.txt'
qulac_short_gen = open(qulac_short_gen_file).readlines()
qulac_short_data_file = 'qulac_test_short.csv'
qulac_short_df = pd.read_csv(qulac_short_data_file)

for iter, row in qulac_long_df.iterrows():
    qulac_long_df.at[iter, 'candidate'] = qulac_long_gen[iter].strip().lower()
    qulac_long_df.at[iter, 'facet_desc'] = qulac_long_df.at[iter, 'facet_desc'].lower()
    qulac_long_df.at[iter, 'question'] = qulac_long_df.at[iter, 'question'].lower()

for iter, row in qulac_short_df.iterrows():
    qulac_short_df.at[iter, 'candidate'] = qulac_short_gen[iter].strip().lower()
    qulac_short_df.at[iter, 'facet_desc'] = qulac_short_df.at[iter, 'facet_desc'].lower()
    qulac_short_df.at[iter, 'question'] = qulac_short_df.at[iter, 'question'].lower()

qulac_full_df = pd.concat([qulac_short_df, qulac_long_df], ignore_index=True)
qulac_full_df.to_csv('output/u-small-qulac-full.csv', index=False)


## ClariQ

In [None]:
import pandas as pd

# UnifiedQA
qulac_long_gen_file = 'output/u-small-clariq-long/generated_predictions.txt'
qulac_long_gen = open(qulac_long_gen_file).readlines()
qulac_long_data_file = 'clariq_dev_long.csv'
qulac_long_df = pd.read_csv(qulac_long_data_file)
qulac_short_gen_file = 'output/u-small-clariq-short/generated_predictions.txt'
qulac_short_gen = open(qulac_short_gen_file).readlines()
qulac_short_data_file = 'clariq_dev_short.csv'
qulac_short_df = pd.read_csv(qulac_short_data_file)

for iter, row in qulac_long_df.iterrows():
    qulac_long_df.at[iter, 'candidate'] = qulac_long_gen[iter].strip().lower()
    qulac_long_df.at[iter, 'facet_desc'] = qulac_long_df.at[iter, 'facet_desc'].lower()
    qulac_long_df.at[iter, 'question'] = qulac_long_df.at[iter, 'question'].lower()

for iter, row in qulac_short_df.iterrows():
    qulac_short_df.at[iter, 'candidate'] = qulac_short_gen[iter].strip().lower()
    qulac_short_df.at[iter, 'facet_desc'] = qulac_short_df.at[iter, 'facet_desc'].lower()
    qulac_short_df.at[iter, 'question'] = qulac_short_df.at[iter, 'question'].lower()

qulac_full_df = pd.concat([qulac_short_df, qulac_long_df], ignore_index=True)
qulac_full_df.to_csv('output/u-small-clariq-full.csv', index=False)


## Evaluate under new cooperativeness partition.

In [None]:
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|         qulac          |                 generation similarity                |            retrieval performance            |")
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('MODEL','BLEU3','BLEU4', 'ROUGE-L','METEOR', 'BERT-F1', 'TYPE-F1', 'nDCG@1', 'nDCG@5', 'nDCG@20', 'P@1', 'MRR'))

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/t5-11b-roberta-qulac.csv', 'candidate', 'output/t5-11b-roberta-qulac.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('T5-11b-roberta-qulac', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/u-11b-roberta-qulac.csv', 'candidate', 'output/u-11b-roberta-qulac.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('U-11b-roberta-qulac', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/f-11b-roberta-qulac.csv', 'candidate', 'output/f-11b-roberta-qulac.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('F-11b-roberta-qulac', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))  
                                                                    
b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/f-11b-roberta-qulac-noprompt.csv', 'candidate', 'output/f-11b-roberta-qulac-noprompt.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('F-11b-roberta-qulac-0', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   

## ClariQ

In [None]:
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|         qulac          |                 generation similarity                |            retrieval performance            |")
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('MODEL','BLEU3','BLEU4', 'ROUGE-L','METEOR', 'BERT-F1', 'TYPE-F1', 'nDCG@1', 'nDCG@5', 'nDCG@20', 'P@1', 'MRR'))


b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/t5-11b-roberta-clariq.csv', 'candidate', 'output/t5-11b-roberta-clariq.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('T5-11b-roberta-clariq', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   

b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/u-11b-roberta-clariq.csv', 'candidate', 'output/u-11b-roberta-clariq.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('U-11b-roberta-clariq', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   
           
b3, b4, r, m, bf1, t, ndcg1, ndcg5, ndcg20, p1, mrr = evaluate_from_output('output/f-11b-roberta-clariq.csv', 'candidate', 'output/f-11b-roberta-clariq.json')
print("-------------------------------------------------------------------------------------------------------------------------------")
print("|{:<24}| {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}| {:<8} {:<8} {:<8} {:<8} {:<8}|".format('F-11b-roberta-clariq', 
                                                            round(b3, 2), round(b4, 2), round(r, 2), round(m, 2),round(bf1, 2),  round(np.mean(t)*100, 2),
                                                            round(ndcg1, 4), round(ndcg5, 4), round(ndcg20, 4), round(p1, 4), round(mrr, 4),
                                                            ))   
                                                          

In [1]:
import pandas as pd
import tqdm
import re

file_list = [
    'output/gpt35-qulac-0shot.csv',
    'output/gpt35-qulac-3shot.csv',
    'output/gpt4-qulac-0shot.csv',
    'output/gpt4-qulac-3shot.csv',
    'output/llama2-qulac-0shot.csv',
    'output/llama2-qulac-3shot.csv',
    'output/flan-qulac-0shot.csv',
    'output/flan-qulac-3shot.csv',

    'output/gpt35-clariq-0shot.csv',
    'output/gpt35-clariq-3shot.csv',
    'output/gpt4-clariq-0shot.csv',
    'output/gpt4-clariq-3shot.csv',
    'output/llama2-clariq-0shot.csv',
    'output/llama2-clariq-3shot.csv',
    'output/flan-clariq-0shot.csv',
    'output/flan-clariq-3shot.csv'
]

for fi in file_list:
    try:
        df = pd.read_csv(fi)
    except:
        continue
    for iter, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        df.at[iter, 'facet_desc'] = df.at[iter, 'facet_desc'].lower()
        df.at[iter, 'question'] = df.at[iter, 'question'].lower()
        try:
            df.at[iter, 'candidate'] = re.sub('[^a-zA-Z0-9 -]', '', df.at[iter, 'candidate']).lower()
            df.at[iter, 'candidate'] = re.sub('user response ', '', df.at[iter, 'candidate']).lower()
        except:
            pass
    df.to_csv(fi, index=False)

100%|██████████| 3714/3714 [00:00<00:00, 4111.31it/s]
100%|██████████| 3714/3714 [00:00<00:00, 4165.88it/s]
100%|██████████| 3714/3714 [00:00<00:00, 4017.83it/s]
100%|██████████| 3714/3714 [00:00<00:00, 4180.07it/s]
100%|██████████| 3714/3714 [00:00<00:00, 4190.23it/s]
100%|██████████| 3714/3714 [00:00<00:00, 4193.36it/s]
100%|██████████| 3714/3714 [00:00<00:00, 4213.46it/s]
100%|██████████| 3714/3714 [00:00<00:00, 4044.16it/s]
100%|██████████| 2034/2034 [00:00<00:00, 4145.19it/s]
100%|██████████| 2034/2034 [00:00<00:00, 3729.91it/s]
100%|██████████| 2034/2034 [00:00<00:00, 4252.35it/s]
100%|██████████| 2034/2034 [00:00<00:00, 4160.19it/s]
100%|██████████| 2034/2034 [00:00<00:00, 4120.07it/s]
100%|██████████| 2034/2034 [00:00<00:00, 4197.77it/s]
