In [169]:
from evaluation import f1_score, exact_match_score, f1, ems
import pickle as pkl
import json
import numpy as np
import itertools

In [166]:
def evaluate(dataset, retriever, k = None, round = 5, topks = [1, 5, 10, 20, 30], k_emb = 15):    
    if retriever != 'knn':
        res = json.load(open('../result/{}/{}_{}.json'.format(dataset, retriever, k), 'rb'))
    else:
        res = json.load(open('../result/{}/{}_{}_{}.json'.format(dataset, retriever, k_emb, k), 'rb'))
    
    
    filter_res = [r for r in res if r['prediction'] != 'System mistake']
    
    f1s, emss, accs = [], [], []
    
    if retriever not in ['golden', 'no']:
        recall, precision, sp_em = [], [], []
    
    for r in filter_res:
        accs.append(('1' in r['grade'])*1.0)
        
        if dataset in ['hotpotqa', 'wikimultihop', 'musique']:
            f1s.append(f1_score(r['prediction'], r['answer']))
            emss.append(exact_match_score(r['prediction'], r['answer']))
            
        elif dataset in ['iirc']:
            f1s.append(f1(r['prediction'], r['answer']))
            emss.append(ems(r['prediction'], r['answer']))
        
        r['corpus'] = list(itertools.chain(*[_.split('\n') for _ in r['corpus']]))
        if retriever not in ['golden', 'no']:
            evi = set([_[1] for _ in r['supports']])
            
            tmp_recall = []
            tmp_precision = []    
            tmp_sp_em = []
            for kk in topks:
                if kk <= k:
                    tmp = set(r['corpus'][:kk])

                    tmp_recall.append(len(evi.intersection(tmp))/len(evi))
                    tmp_precision.append(len(evi.intersection(tmp))/kk)
                    
                    if evi.issubset(tmp):
                        tmp_sp_em.append(1)
                    else:
                        tmp_sp_em.append(0)
                
            
            recall.append(tmp_recall)
            precision.append(tmp_precision)
            sp_em.append(tmp_sp_em)

    
    print('Acc:', np.mean(accs))
    print('F1:', np.mean(f1s))
    print('EM:', np.mean(emss))
    

    if retriever not in ['golden', 'no']:
        print('Recall:', np.mean(np.array(recall), axis = 0))
        print('Precision:', np.mean(np.array(precision), axis = 0))
        print('SP_EM:', np.mean(np.array(sp_em), axis = 0))

# IIRC

In [149]:
# Golden
evaluate(dataset = 'iirc', retriever = 'golden', k = 30)

Acc: 0.6268343815513627
F1: 0.5475670282089076
EM: 0.35639412997903563


In [150]:
# No
evaluate(dataset = 'iirc', retriever = 'no', k = 30)

Acc: 0.1949685534591195
F1: 0.13173226726685847
EM: 0.0859538784067086


In [151]:
# KNN
evaluate(dataset = 'iirc', retriever = 'knn', k = 30, k_emb = 15)

Acc: 0.4381551362683438
F1: 0.3723802032557411
EM: 0.25157232704402516
Recall: [0.00950384 0.04808326 0.1334914  0.44783451 0.46844947]
Precision: [0.0230608  0.02389937 0.03291405 0.05366876 0.03752621]
SP_EM: [0.00209644 0.00628931 0.0230608  0.1802935  0.19496855]


In [152]:
# Tf-IDF
evaluate(dataset = 'iirc', retriever = 'tf-idf', k = 30)

Acc: 0.47468354430379744
F1: 0.4080300029986812
EM: 0.2721518987341772
Recall: [0.27879587 0.48984259 0.55870938 0.61531979 0.64281627]
Precision: [0.63924051 0.23417722 0.13396624 0.07415612 0.05161744]
SP_EM: [0.00632911 0.15400844 0.23417722 0.29535865 0.33966245]


In [153]:
# Bm25
evaluate(dataset = 'iirc', retriever = 'bm25', k = 30)

Acc: 0.4192872117400419
F1: 0.35552822234287274
EM: 0.2348008385744235
Recall: [0.18504383 0.35476622 0.41668096 0.46811353 0.5013071 ]
Precision: [0.42976939 0.1672956  0.09874214 0.0557652  0.03997205]
SP_EM: [0.00209644 0.06918239 0.11949686 0.16561845 0.19496855]


In [154]:
# KG
evaluate(dataset = 'iirc', retriever = 'kg_test_docs_graph', k = 30)

Acc: 0.4630021141649049
F1: 0.4143313677575586
EM: 0.2769556025369979
Recall: [0.27973765 0.49052584 0.56006677 0.61074464 0.64314262]
Precision: [0.64059197 0.23424947 0.13403805 0.07357294 0.05179704]
SP_EM: [0.00634249 0.15433404 0.23678647 0.29175476 0.34038055]


In [155]:
# MDR
evaluate(dataset = 'iirc', retriever = 'mhop', k = 30)

Acc: 0.5084033613445378
F1: 0.43474292747401994
EM: 0.27521008403361347
Recall: [0.21410905 0.35840177 0.47716928 0.54595679 0.57134195]
Precision: [0.5        0.16722689 0.11302521 0.06533613 0.04572829]
SP_EM: [0.00210084 0.11344538 0.21848739 0.27521008 0.29201681]


In [156]:
# LLaMA
evaluate(dataset = 'iirc', retriever = 'llama', k = 30)

Acc: 0.5189075630252101
F1: 0.4499719424311614
EM: 0.30462184873949577
Recall: [0.27937516 0.4202265  0.54062466 0.64572859 0.6896429 ]
Precision: [0.6407563  0.19621849 0.12920168 0.07857143 0.05644258]
SP_EM: [0.00630252 0.13865546 0.2710084  0.37184874 0.42226891]


In [157]:
# DPR
evaluate(dataset = 'iirc', retriever = 'dpr', k = 30)

Acc: 0.4810924369747899
F1: 0.41847556610285236
EM: 0.2689075630252101
Recall: [0.20551471 0.42413215 0.53066477 0.65254185 0.71594888]
Precision: [0.46428571 0.20420168 0.12752101 0.07962185 0.05903361]
SP_EM: [0.00420168 0.11344538 0.23739496 0.3907563  0.48739496]


In [158]:
# T5
evaluate(dataset = 'iirc', retriever = 't5', k = 30)

Acc: 0.5063025210084033
F1: 0.43624963532650135
EM: 0.28991596638655465
Recall: [0.27937516 0.43636295 0.53866387 0.62517522 0.65743904]
Precision: [0.6407563  0.20420168 0.12941176 0.07605042 0.05357143]
SP_EM: [0.00630252 0.16386555 0.25420168 0.34243697 0.38235294]


In [159]:
# KG-T5
evaluate(dataset = 'iirc', retriever = 'kg-t5_test_docs_graph', k = 30)

Acc: 0.4827586206896552
F1: 0.41542967257023256
EM: 0.26939655172413796
Recall: [0.27959607 0.3785544  0.4906285  0.5860916  0.62253454]
Precision: [0.64008621 0.17801724 0.11702586 0.07047414 0.05014368]
SP_EM: [0.00646552 0.07543103 0.16810345 0.27155172 0.31896552]


In [161]:
# KG-LLaMA
evaluate(dataset = 'iirc', retriever = 'kg-llama_test_docs_graph', k = 30)

Acc: 0.4957446808510638
F1: 0.42556766665402457
EM: 0.28085106382978725
Recall: [0.28045938 0.37211423 0.5012125  0.60605976 0.64905875]
Precision: [0.64255319 0.17574468 0.12085106 0.07319149 0.05248227]
SP_EM: [0.00638298 0.06808511 0.19148936 0.3        0.35957447]


In [162]:
# IRCoT
evaluate(dataset = 'iirc', retriever = 'ircot', k = 30)

Acc: 0.4978165938864629
F1: 0.41654503458125347
EM: 0.27729257641921395
Recall: [0.28355002 0.49723884 0.60661189 0.67215661 0.70254234]
Precision: [0.64847162 0.23668122 0.1441048  0.08067686 0.0562591 ]
SP_EM: [0.00655022 0.15720524 0.31659389 0.41484716 0.46724891]


In [163]:
# KG-MDR
evaluate(dataset = 'iirc', retriever = 'kg-mdr_test_docs_graph', k = 30)

Acc: 0.4957805907172996
F1: 0.43211468118765217
EM: 0.29324894514767935
Recall: [0.27879587 0.37387323 0.49525248 0.59532783 0.62717265]
Precision: [0.63924051 0.1742616  0.11835443 0.07120253 0.05014065]
SP_EM: [0.00632911 0.07805907 0.16877637 0.28481013 0.33333333]


In [164]:
# LLM-MDR
evaluate(dataset = 'iirc', retriever = 'llm-mdr', k = 30)

Acc: 0.5297872340425532
F1: 0.44641096283774634
EM: 0.29148936170212764
Recall: [0.27861541 0.42336718 0.54363567 0.634437   0.66495372]
Precision: [0.63829787 0.19659574 0.12851064 0.07574468 0.05312057]
SP_EM: [0.00638298 0.14680851 0.25531915 0.35744681 0.4       ]


# HotpotQA

In [96]:
# Golden
evaluate(dataset = 'hotpotqa', retriever = 'golden', k = 30)

Acc: 0.8218623481781376
F1: 0.7106223583777763
EM: 0.5020242914979757


In [6]:
# No
evaluate(dataset = 'hotpotqa', retriever = 'no', k = 30)

Acc: 0.418
F1: 0.30504087374613686
EM: 0.19


In [139]:
# bm25
evaluate(dataset = 'hotpotqa', retriever = 'bm25', k = 30)

Acc: 0.7075268817204301
F1: 0.4593725352098424
EM: 0.22795698924731184
Recall: [0. 0. 0. 0. 0.]
Precision: [0. 0. 0. 0. 0.]
SP_EM: [0. 0. 0. 0. 0.]


In [140]:
# TF-IDF
evaluate(dataset = 'hotpotqa', retriever = 'tf-idf', k = 30)

Acc: 0.7663934426229508
F1: 0.6463696668433891
EM: 0.4569672131147541
Recall: [0.26240242 0.65369828 0.77736632 0.85395687 0.88483119]
Precision: [0.61065574 0.31065574 0.18565574 0.10266393 0.07110656]
SP_EM: [0.         0.34016393 0.55122951 0.68442623 0.74385246]


In [138]:
# KNN
evaluate(dataset = 'hotpotqa', retriever = 'knn', k = 30, k_emb = 15)

Acc: 0.7157258064516129
F1: 0.5797229717889111
EM: 0.40725806451612906
Recall: [0.0124328  0.0702477  0.22414075 0.67373752 0.690673  ]
Precision: [0.03225806 0.03467742 0.05544355 0.08044355 0.05504032]
SP_EM: [0.         0.00201613 0.02419355 0.41129032 0.4375    ]


In [7]:
# MDR
evaluate(dataset = 'hotpotqa', retriever = 'mhop', k = 30)

Acc: 0.7530364372469636
F1: 0.651630315782495
EM: 0.45546558704453444
Recall: [0.20150376 0.50241469 0.67601697 0.76639194 0.80768749]
Precision: [0.46761134 0.23238866 0.1562753  0.09008097 0.06369771]
SP_EM: [0.         0.20850202 0.41295547 0.54453441 0.61133603]


In [98]:
# DPR
evaluate(dataset = 'hotpotqa', retriever = 'dpr', k = 30)

Acc: nan
F1: nan
EM: nan
Recall: nan
Precision: nan
SP_EM: nan


In [97]:
# LLaMA
evaluate(dataset = 'hotpotqa', retriever = 'llama', k = 30)

Acc: 0.7598343685300207
F1: 0.6682713840916252
EM: 0.47619047619047616
Recall: [0.26046042 0.55204575 0.73144533 0.84109238 0.88452134]
Precision: [0.60662526 0.25797101 0.17308489 0.10062112 0.07094548]
SP_EM: [0.         0.24430642 0.46376812 0.65838509 0.74120083]


In [99]:
# T5
evaluate(dataset = 'hotpotqa', retriever = 't5', k = 30)

Acc: 0.7611336032388664
F1: 0.6515542335201777
EM: 0.4574898785425101
Recall: [0.26225178 0.59471274 0.72060922 0.81345672 0.85586563]
Precision: [0.61133603 0.27773279 0.17044534 0.09716599 0.06842105]
SP_EM: [0.         0.28947368 0.43522267 0.59311741 0.68421053]


In [168]:
# T5-KG-TAGME
evaluate(dataset = 'hotpotqa', retriever = 'kg-t5_graph_tagme_0.8', k = 30)

Acc: 0.7551020408163265
F1: 0.693789147360576
EM: 0.4897959183673469
Recall: [0.27244898 0.46462585 0.64812925 0.81632653 0.87993197]
Precision: [0.64285714 0.21428571 0.15204082 0.09693878 0.07040816]
SP_EM: [0.         0.14285714 0.32653061 0.59183673 0.73469388]


In [101]:
# LLaMA-KG-TAGME
evaluate(dataset = 'hotpotqa', retriever = 'kg-llama_test_docs_graph', k = 30)

Acc: 0.7566462167689162
F1: 0.6631430560697974
EM: 0.4621676891615542
Recall: [0.26254747 0.48196027 0.65941669 0.80701626 0.8650112 ]
Precision: [0.61145194 0.22413088 0.15623722 0.09631902 0.06925699]
SP_EM: [0.         0.15746421 0.34969325 0.58691207 0.69734151]


In [102]:
# MDR-KG-TAGME
evaluate(dataset = 'hotpotqa', retriever = 'kg-mdr_test_docs_graph', k = 30)

Acc: 0.757201646090535
F1: 0.6577063157128517
EM: 0.4609053497942387
Recall: [0.26279639 0.47739075 0.65930335 0.80354203 0.85172937]
Precision: [0.61316872 0.22222222 0.15534979 0.09547325 0.068107  ]
SP_EM: [0.         0.16872428 0.36213992 0.58641975 0.68106996]


In [103]:
# IRCOT
evaluate(dataset = 'hotpotqa', retriever = 'ircot', k = 30)

Acc: 0.7435897435897436
F1: 0.6411763526959606
EM: 0.452991452991453
Recall: [0.26435694 0.65189764 0.81310033 0.87932947 0.90354599]
Precision: [0.61538462 0.31111111 0.19551282 0.10630342 0.07293447]
SP_EM: [0.         0.33547009 0.62393162 0.7457265  0.79487179]


In [141]:
# llm-mdr
evaluate(dataset = 'hotpotqa', retriever = 'llm-mdr', k = 30)

Acc: 0.7689161554192229
F1: 0.6541219363124814
EM: 0.46830265848670755
Recall: [0.26169539 0.56195345 0.75014607 0.8464164  0.8794089 ]
Precision: [0.60940695 0.2605317  0.17546012 0.1002045  0.06993865]
SP_EM: [0.         0.2597137  0.50102249 0.65644172 0.7198364 ]


In [148]:
# kg-chatgpt
evaluate(dataset = 'hotpotqa', retriever = 'kg-chatgpt_test_docs_graph', k = 30)

Acc: 0.7780040733197556
F1: 0.6657932432701811
EM: 0.46028513238289204
Recall: [0.26215692 0.45817089 0.63992823 0.78733392 0.84035496]
Precision: [0.61099796 0.21344196 0.1503055  0.09358452 0.06693822]
SP_EM: [0.         0.13849287 0.32790224 0.54786151 0.64765784]


# 2WikiMQA(Wikimultihop)

In [83]:
# Golden
evaluate(dataset = 'wikimultihop', retriever = 'golden', k = 30)

Acc: 0.726
F1: 0.596932201132201
EM: 0.402


In [84]:
# No
evaluate(dataset = 'wikimultihop', retriever = 'no', k = 30)

Acc: 0.444
F1: 0.25073823953823954
EM: 0.186


In [85]:
# MDR
evaluate(dataset = 'wikimultihop', retriever = 'mhop', k = 30)

Acc: 0.63
F1: 0.5244347985347986
EM: 0.36
Recall: [0.15216667 0.39133333 0.56833333 0.684      0.735     ]
Precision: [0.336      0.1748     0.1296     0.0789     0.05666667]
SP_EM: [0.    0.14  0.32  0.444 0.5  ]


In [86]:
# DPR
evaluate(dataset = 'wikimultihop', retriever = 'dpr', k = 30)

Acc: 0.624
F1: 0.5110422688422688
EM: 0.356
Recall: [0.166      0.48266667 0.6685     0.806      0.8805    ]
Precision: [0.372  0.2228 0.1566 0.0959 0.0708]
SP_EM: [0.    0.196 0.422 0.632 0.762]


In [87]:
# KNN
evaluate(dataset = 'wikimultihop', retriever = 'knn', k = 30, k_emb = 15)

Acc: 0.524
F1: 0.4213374070374071
EM: 0.312
Recall: [0.0075     0.0645     0.172      0.66016667 0.68266667]
Precision: [0.018      0.0324     0.042      0.0778     0.05393333]
SP_EM: [0.    0.    0.022 0.392 0.408]


In [88]:
# bm25
evaluate(dataset = 'wikimultihop', retriever = 'bm25', k = 30)

Acc: 0.558
F1: 0.4055049284049284
EM: 0.308
Recall: [0.1375     0.40816667 0.50266667 0.58816667 0.62416667]
Precision: [0.33       0.192      0.1186     0.0706     0.05026667]
SP_EM: [0.    0.152 0.244 0.328 0.364]


In [89]:
# TF-IDF
evaluate(dataset = 'wikimultihop', retriever = 'tf-idf', k = 30)

Acc: 0.584
F1: 0.44495413475413476
EM: 0.346
Recall: [0.208      0.52416667 0.61216667 0.69566667 0.73666667]
Precision: [0.48   0.244  0.1434 0.0825 0.0588]
SP_EM: [0.    0.236 0.334 0.442 0.5  ]


In [135]:
# LLaMA
evaluate(dataset = 'wikimultihop', retriever = 'llama', k = 30)

Acc: 0.6428571428571429
F1: 0.5702380952380952
EM: 0.42857142857142855
Recall: [0.19642857 0.46428571 0.58928571 0.6875     0.73214286]
Precision: [0.46428571 0.21428571 0.13214286 0.07678571 0.0547619 ]
SP_EM: [0.         0.28571429 0.35714286 0.5        0.53571429]


In [136]:
# T5 
evaluate(dataset = 'wikimultihop', retriever = 't5', k = 30)

Acc: 0.664
F1: 0.5548675102675102
EM: 0.408
Recall: [0.208      0.49166667 0.62783333 0.7155     0.753     ]
Precision: [0.48   0.2268 0.1442 0.0819 0.0574]
SP_EM: [0.    0.236 0.382 0.502 0.55 ]


In [137]:
# IRCoT
evaluate(dataset = 'wikimultihop', retriever = 'ircot', k = 30)

Acc: 0.6181015452538632
F1: 0.5017258434476978
EM: 0.37748344370860926
Recall: [0.20640177 0.52336277 0.66629875 0.72038263 0.75459897]
Precision: [0.47240618 0.24326711 0.15695364 0.08532009 0.0598234 ]
SP_EM: [0.         0.23399558 0.41059603 0.49227373 0.53642384]


In [90]:
# KG-T5-TAGME
evaluate(dataset = 'wikimultihop', retriever = 'kg-t5_test_docs_graph', k = 30)

Acc: 0.6347826086956522
F1: 0.5216227371662154
EM: 0.3760869565217391
Recall: [0.20815217 0.39384058 0.5625     0.7048913  0.75923913]
Precision: [0.47173913 0.17826087 0.12891304 0.08184783 0.05869565]
SP_EM: [0.         0.10652174 0.26521739 0.44130435 0.52173913]


In [91]:
# KG-LLaMA-TAGME
evaluate(dataset = 'wikimultihop', retriever = 'kg-llama_test_docs_graph', k = 30)

Acc: 0.6244897959183674
F1: 0.5244906680620967
EM: 0.37551020408163266
Recall: [0.20816327 0.3872449  0.55697279 0.70748299 0.76411565]
Precision: [0.47959184 0.17714286 0.12897959 0.08306122 0.05979592]
SP_EM: [0.         0.09795918 0.25102041 0.44285714 0.52857143]


In [92]:
# KG-MDR-TAGME
evaluate(dataset = 'wikimultihop', retriever = 'kg-mdr_test_docs_graph', k = 30)

Acc: 0.6094069529652352
F1: 0.5128668990632181
EM: 0.3721881390593047
Recall: [0.2091002  0.37372188 0.55351057 0.71404226 0.767212  ]
Precision: [0.48261759 0.17791411 0.13251534 0.08691207 0.06237219]
SP_EM: [0.         0.07361963 0.2208589  0.44989775 0.53578732]


In [134]:
# llm-mdr
evaluate(dataset = 'wikimultihop', retriever = 'llm-mdr', k = 30)

Acc: 0.6686746987951807
F1: 0.5431234205330591
EM: 0.39759036144578314
Recall: [0.20732932 0.43172691 0.61161312 0.72991968 0.78212851]
Precision: [0.47791165 0.19879518 0.14016064 0.08473896 0.0607095 ]
SP_EM: [0.         0.17871486 0.35341365 0.49196787 0.56024096]


In [145]:
# kg-chatgpt
evaluate(dataset = 'wikimultihop', retriever = 'kg-chatgpt_test_docs_graph', k = 30)

Acc: 0.6161616161616161
F1: 0.49391055633479874
EM: 0.3616161616161616
Recall: [0.20757576 0.34006734 0.46481481 0.61902357 0.67558923]
Precision: [0.47878788 0.15878788 0.10989899 0.07474747 0.05434343]
SP_EM: [0.         0.05858586 0.14343434 0.31515152 0.39393939]


# MuSiQue

In [107]:
# no
evaluate(dataset = 'musique', retriever = 'no', k = 30)

Acc: 0.304
F1: 0.10576349206349206
EM: 0.046


In [108]:
# golden
evaluate(dataset = 'musique', retriever = 'golden', k = 30)

Acc: 0.57
F1: 0.47753571004159245
EM: 0.306


In [109]:
# knn
evaluate(dataset = 'musique', retriever = 'knn', k = 30)

Acc: 0.4470338983050847
F1: 0.30036369136568536
EM: 0.1885593220338983
Recall: [0.01483051 0.09004237 0.22033898 0.64724576 0.64936441]
Precision: [0.02754237 0.03474576 0.04237288 0.06324153 0.04223164]
SP_EM: [0.00211864 0.01483051 0.05084746 0.42584746 0.4279661 ]


In [110]:
# bm25
evaluate(dataset = 'musique', retriever = 'bm25', k = 30)

Acc: 0.444672131147541
F1: 0.3114585725000382
EM: 0.21106557377049182
Recall: [0.08094262 0.31659836 0.43545082 0.53278689 0.5942623 ]
Precision: [0.15778689 0.125      0.08565574 0.05204918 0.0385929 ]
SP_EM: [0.00409836 0.09016393 0.18647541 0.29713115 0.36680328]


In [111]:
# tf-idf
evaluate(dataset = 'musique', retriever = 'tf-idf', k = 30)

Acc: 0.4439918533604888
F1: 0.32499073467565137
EM: 0.2158859470468432
Recall: [0.1089613  0.44806517 0.59470468 0.70672098 0.75254582]
Precision: [0.20773931 0.17433809 0.11588595 0.06904277 0.04881195]
SP_EM: [0.0101833  0.18126273 0.34215886 0.49490835 0.56211813]


In [112]:
# mdr
evaluate(dataset = 'musique', retriever = 'mhop', k = 30)

Acc: 0.48582995951417
F1: 0.3671106165539364
EM: 0.22874493927125505
Recall: [0.06477733 0.22469636 0.38157895 0.4888664  0.54048583]
Precision: [0.12550607 0.08825911 0.07510121 0.04797571 0.0354251 ]
SP_EM: [0.00404858 0.0465587  0.16396761 0.26315789 0.31174089]


In [114]:
# dpr
evaluate(dataset = 'musique', retriever = 'dpr', k = 30)

Acc: 0.4426559356136821
F1: 0.3163832255381551
EM: 0.20321931589537223
Recall: [0.09054326 0.33903421 0.45875252 0.57344064 0.65392354]
Precision: [0.17907445 0.13360161 0.08993964 0.05603622 0.04265594]
SP_EM: [0.00201207 0.06036217 0.18511066 0.32796781 0.44064386]


In [115]:
# t5
evaluate(dataset = 'musique', retriever = 't5', k = 30)

Acc: 0.5194274028629857
F1: 0.4198061117945636
EM: 0.2883435582822086
Recall: [0.11145194 0.34253579 0.51431493 0.63496933 0.69222904]
Precision: [0.21267894 0.13374233 0.10102249 0.06206544 0.04505794]
SP_EM: [0.01022495 0.13496933 0.3006135  0.44171779 0.52351738]


In [116]:
# llama
evaluate(dataset = 'musique', retriever = 'llama', k = 30)

Acc: 0.5223577235772358
F1: 0.4124831011488113
EM: 0.2764227642276423
Recall: [0.10873984 0.29776423 0.48780488 0.62703252 0.70121951]
Precision: [0.20731707 0.11666667 0.09573171 0.06168699 0.04586721]
SP_EM: [0.0101626  0.10365854 0.26829268 0.42276423 0.51626016]


In [130]:
# kg
evaluate(dataset = 'musique', retriever = 'kg_test_docs_graph', k = 30)

Acc: 0.44672131147540983
F1: 0.32895686749230624
EM: 0.2192622950819672
Recall: [0.10758197 0.44672131 0.5932377  0.71618852 0.75922131]
Precision: [0.20491803 0.17377049 0.11557377 0.06977459 0.04931694]
SP_EM: [0.0102459  0.17622951 0.33811475 0.50819672 0.57581967]


In [118]:
# kg-t5
evaluate(dataset = 'musique', retriever = 'kg-t5_graph_tagme_0.9', k = 30)

Acc: 0.5050505050505051
F1: 0.4580751492516199
EM: 0.32323232323232326
Recall: [0.09090909 0.28787879 0.52020202 0.68181818 0.73232323]
Precision: [0.18181818 0.11515152 0.1        0.06565657 0.04713805]
SP_EM: [0.         0.07070707 0.27272727 0.49494949 0.55555556]


In [131]:
# kg-mdr
evaluate(dataset = 'musique', retriever = 'kg-mdr_test_docs_graph', k = 30)

Acc: 0.5122448979591837
F1: 0.41105783172409827
EM: 0.27755102040816326
Recall: [0.10714286 0.28469388 0.50204082 0.64591837 0.69795918]
Precision: [0.20408163 0.11061224 0.09836735 0.06316327 0.04537415]
SP_EM: [0.01020408 0.10204082 0.29591837 0.45510204 0.52040816]


In [142]:
# kg-chatgpt
evaluate(dataset = 'musique', retriever = 'kg-chatgpt_test_docs_graph', k = 30)

Acc: 0.5060728744939271
F1: 0.3865982526377858
EM: 0.2692307692307692
Recall: [0.10931174 0.32287449 0.51012146 0.6305668  0.68016194]
Precision: [0.20850202 0.12550607 0.09959514 0.06163968 0.04433198]
SP_EM: [0.01012146 0.10526316 0.27732794 0.41295547 0.47773279]


In [132]:
# IRCoT
evaluate(dataset = 'musique', retriever = 'ircot', k = 30)

Acc: 0.4514038876889849
F1: 0.34211078877465667
EM: 0.22462203023758098
Recall: [0.10583153 0.45140389 0.61771058 0.73326134 0.77645788]
Precision: [0.20086393 0.17537797 0.11987041 0.0712743  0.05017999]
SP_EM: [0.01079914 0.18790497 0.39308855 0.55507559 0.61987041]


In [128]:
# llm-mdr
evaluate(dataset = 'musique', retriever = 'llm-mdr', k = 30)

Acc: 0.5172413793103449
F1: 0.40851251189516313
EM: 0.281947261663286
Recall: [0.10851927 0.29006085 0.47464503 0.62880325 0.67951318]
Precision: [0.20689655 0.11237323 0.09290061 0.06146045 0.04415145]
SP_EM: [0.01014199 0.10953347 0.26369168 0.43002028 0.48681542]
