In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import pyterrier as pt
from NIRfunction import *
if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


#### Import data and indexer

In [2]:
query = get_query('../Project/NIR2022 dataset/data/train_query.csv')
qrel = get_qrel('../Project/NIR2022 dataset/data/train_qrel.csv')
files = pt.io.find_files("../Project/NIR2022 dataset/data/files")

In [3]:
metrics = ["map", "ndcg_cut_5",  "ndcg_cut_10",  "ndcg_cut_20", "mrt"]

In [4]:
# build the index
# indexer_TREC_stemmed = pt.TRECCollectionIndexer("./indexes/stage1/index_TREC_stemmed",  verbose=True, blocks=False, overwrite=True) 
# indexer_TREC_stemmed.setProperties(**{"termpipelines":"EnglishSnowballStemmer, Stopwords"})
# index_ref_TREC_stemmed = indexer_TREC_stemmed.index(files)

# Or load from files
index_ref_TREC_stemmed = pt.IndexRef.of("./indexes/stage1/index_TREC_stemmed")

# load the index, print the statistics
index_TREC_stemmed = pt.IndexFactory.of(index_ref_TREC_stemmed)
print(index_TREC_stemmed.getCollectionStatistics().toString())

Number of documents: 528155
Number of terms: 739349
Number of postings: 93025387
Number of fields: 0
Number of tokens: 159579026
Field names: []
Positions:   false



#### Without tuned parameters: TF-IDF, BM25, DLM, DPH, PL2

##### Chose a DFR based models

In [6]:
# select 2 best language model based on DFR
BB2 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="BB2")
IFB2  = pt.BatchRetrieve(index_TREC_stemmed, wmodel="IFB2")
In_expB2  = pt.BatchRetrieve(index_TREC_stemmed, wmodel="In_expB2")
In_expC2  = pt.BatchRetrieve(index_TREC_stemmed, wmodel="In_expC2")
InL2  = pt.BatchRetrieve(index_TREC_stemmed, wmodel="InL2")
PL2  = pt.BatchRetrieve(index_TREC_stemmed, wmodel="PL2")
DPH  = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DPH")
LGD = pt.BatchRetrieve(index_TREC_stemmed, wmodel="LGD")


DFR = pt.Experiment([BB2, IFB2, In_expB2, In_expC2, InL2, PL2, DPH, LGD], query, qrel, metrics)
DFR

Unnamed: 0,name,map,ndcg_cut_5,ndcg_cut_10,ndcg_cut_20
0,BR(BB2),0.25211,0.460037,0.445127,0.419073
1,BR(IFB2),0.250774,0.45929,0.444004,0.417938
2,BR(In_expB2),0.252171,0.46037,0.447162,0.420684
3,BR(In_expC2),0.247088,0.467211,0.447204,0.418586
4,BR(InL2),0.25277,0.45647,0.441981,0.417313
5,BR(PL2),0.236868,0.459647,0.437131,0.408411
6,BR(DPH),0.260083,0.470625,0.455465,0.428658
7,BR(LGD),0.257529,0.449115,0.43582,0.413633


In [18]:
DFR = DFR.sort_values("map", ascending = False)

In [19]:
DFR.to_excel("/Users/zhouyuqin/github/NIR/Project/DFR.xlsx")

In [20]:
DFR


Unnamed: 0,name,map,ndcg_cut_5,ndcg_cut_10,ndcg_cut_20
6,BR(DPH),0.260083,0.470625,0.455465,0.428658
7,BR(LGD),0.257529,0.449115,0.43582,0.413633
4,BR(InL2),0.25277,0.45647,0.441981,0.417313
2,BR(In_expB2),0.252171,0.46037,0.447162,0.420684
0,BR(BB2),0.25211,0.460037,0.445127,0.419073
1,BR(IFB2),0.250774,0.45929,0.444004,0.417938
3,BR(In_expC2),0.247088,0.467211,0.447204,0.418586
5,BR(PL2),0.236868,0.459647,0.437131,0.408411


In [5]:
# default parameters
TFIDF  = pt.BatchRetrieve(index_TREC_stemmed, wmodel="TF_IDF") 
BM25 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="BM25")
DLM  = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DirichletLM")
DPH  = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DPH")

pt.Experiment(
    [TFIDF, BM25, DLM, DPH],
    query,
    qrel,
    eval_metrics=["map", "ndcg_cut_5", "ndcg_cut_10", "ndcg_cut_20", "mrt"],
    names=["TF-IDF", "BM25",  "Dirichlet QL", "DPH"]
)

Unnamed: 0,name,map,ndcg_cut_5,ndcg_cut_10,ndcg_cut_20,mrt
0,TF-IDF,0.24989,0.462106,0.444825,0.417291,24.880847
1,BM25,0.248283,0.461505,0.442638,0.415147,18.100074
2,Dirichlet QL,0.245282,0.451374,0.429138,0.407523,17.529297
3,DPH,0.260083,0.470625,0.455465,0.428658,17.787969


### Tune parameters + Query expansion

##### Split the dataset

In [10]:
kf = KFold(n_splits=5, shuffle = True, random_state = 2)

X_train_total = []
X_test_total = []

for train_index, test_index in kf.split(query):
    X_train, X_test = query.iloc[train_index], query.iloc[test_index]
    X_train_total.append(X_train)
    X_test_total.append(X_test)    # y_train, y_test = qrel[train_index], qrel[test_index]

In [28]:
names = ["Bo1", "Bo2", "KLComplete", "KLCorrect", "BA", "Information", "KL", "RM3"]

##### Tune DirichletLM + query expansion

In [42]:
DLM_1 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DirichletLM", controls={"qemodel" : "Bo1", "qe" : "on"})
DLM_2 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DirichletLM", controls={"qemodel" : "Bo2", "qe" : "on"})
DLM_3 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DirichletLM", controls={"qemodel" : "KLComplete", "qe" : "on"})
DLM_4 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DirichletLM", controls={"qemodel" : "KLCorrect", "qe" : "on"})
DLM_5 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DirichletLM", controls={"qemodel" : "BA", "qe" : "on"})
DLM_6 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DirichletLM", controls={"qemodel" : "Information", "qe" : "on"})
DLM_7 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DirichletLM", controls={"qemodel" : "KL", "qe" : "on"})

pipe = (pt.BatchRetrieve(index_TREC_stemmed, wmodel="DirichletLM") >> 
    pt.rewrite.RM3(index_TREC_stemmed) >> 
    pt.BatchRetrieve(index_TREC_stemmed, wmodel="DirichletLM")
)

DLM = pt.Experiment(
    [DLM_1, DLM_2, DLM_3, DLM_4, DLM_5, DLM_6, DLM_7, pipe],
    query,
    qrel,
    eval_metrics= metrics,
    names = names
)

In [43]:
DLM = DLM.sort_values("map", ascending = False)
DLM.to_excel("/Users/zhouyuqin/github/NIR/Project/DLM.xlsx")

In [44]:
DLM

Unnamed: 0,name,map,ndcg_cut_5,ndcg_cut_10,ndcg_cut_20
1,Bo2,0.219162,0.427754,0.392521,0.361419
0,Bo1,0.214429,0.434581,0.395819,0.363839
4,BA,0.212889,0.427173,0.391915,0.359496
2,KLComplete,0.212771,0.426138,0.391917,0.359318
6,KL,0.212708,0.427531,0.390966,0.358518
3,KLCorrect,0.212575,0.427531,0.390966,0.358266
7,RM3,0.207332,0.398033,0.371148,0.344591
5,Information,0.153511,0.371582,0.320324,0.290659


In [24]:
DLM_1  = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DirichletLM",controls={"dirichletlm.mu":2500,  "qemodel" : "Bo2", "qe" : "on"})

DLM_tuned, _= pt.KFoldGridSearch(
    DLM_1,
    {DLM_1 : {"dirichletlm.mu" : [1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2100, 2200, 2300, 24000]}},
    X_train_total,
    qrel,
    "map"
)

pt.Experiment([DLM_1, DLM_tuned], query, qrel, ["map"])

##### Tune DPH + query expansion

In [32]:
DPH_1 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DPH", controls={"qemodel" : "Bo1", "qe" : "on"})
DPH_2 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DPH", controls={"qemodel" : "Bo2", "qe" : "on"})
DPH_3 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DPH", controls={"qemodel" : "KLComplete", "qe" : "on"})
DPH_4 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DPH", controls={"qemodel" : "KLCorrect", "qe" : "on"})
DPH_5 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DPH", controls={"qemodel" : "BA", "qe" : "on"})
DPH_6 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DPH", controls={"qemodel" : "Information", "qe" : "on"})
DPH_7 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DPH", controls={"qemodel" : "KL", "qe" : "on"})

pipe = (pt.BatchRetrieve(index_TREC_stemmed, wmodel="DPH") >> 
    pt.rewrite.RM3(index_TREC_stemmed) >> 
    pt.BatchRetrieve(index_TREC_stemmed, wmodel="DPH")
)

DPH = pt.Experiment(
    [DPH_1, DPH_2, DPH_3, DPH_4, DPH_5, DPH_6, DPH_7, pipe],
    query,
    qrel,
    eval_metrics= metrics,
    names = names
)

In [40]:
DPH = DPH.sort_values("map", ascending = False)
DPH.to_excel("/Users/zhouyuqin/github/NIR/Project/DPH.xlsx")

In [34]:
DPH

Unnamed: 0,name,map,ndcg_cut_5,ndcg_cut_10,ndcg_cut_20
6,KL,0.29837,0.489049,0.475665,0.456866
0,Bo1,0.297829,0.486555,0.473276,0.458004
7,RM3,0.292709,0.479142,0.470422,0.453737
3,KLCorrect,0.288748,0.479881,0.471805,0.452119
1,Bo2,0.287725,0.477591,0.453676,0.438594
2,KLComplete,0.266314,0.466532,0.435494,0.411937
4,BA,0.266147,0.470256,0.436591,0.411907
5,Information,0.01719,0.076495,0.0654,0.058663


##### Tune BM25 + Choose query expansion for BM25

In [35]:
BM25_1 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="BM25", controls={"qemodel" : "Bo1", "qe" : "on"})
BM25_2 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="BM25", controls={"qemodel" : "Bo2", "qe" : "on"})
BM25_3 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="BM25", controls={"qemodel" : "KLComplete", "qe" : "on"})
BM25_4 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="BM25", controls={"qemodel" : "KLCorrect", "qe" : "on"})
BM25_5 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="BM25", controls={"qemodel" : "BA", "qe" : "on"})
BM25_6 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="BM25", controls={"qemodel" : "Information", "qe" : "on"})
BM25_7 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="BM25", controls={"qemodel" : "KL", "qe" : "on"})


pipe = (pt.BatchRetrieve(index_TREC_stemmed, wmodel="BM25") >> 
    pt.rewrite.RM3(index_TREC_stemmed) >> 
    pt.BatchRetrieve(index_TREC_stemmed, wmodel="BM25")
)

BM25 = pt.Experiment(
    [BM25_1, BM25_2, BM25_3, BM25_4, BM25_5, BM25_6, BM25_7, pipe],
    query,
    qrel,
    eval_metrics= metrics,
    names = names
)

In [37]:
BM25

Unnamed: 0,name,map,ndcg_cut_5,ndcg_cut_10,ndcg_cut_20
0,Bo1,0.289461,0.483669,0.468938,0.451236
1,Bo2,0.297664,0.494102,0.470569,0.454162
2,KLComplete,0.27613,0.483014,0.45549,0.431129
3,KLCorrect,0.280671,0.480781,0.462419,0.44298
4,BA,0.276452,0.482902,0.455661,0.429609
5,Information,0.023244,0.129477,0.102496,0.085656
6,KL,0.289594,0.48561,0.464766,0.450432
7,RM3,0.28389,0.473335,0.461891,0.44228


In [39]:
BM25 = BM25.sort_values("map", ascending = False)
BM25.to_excel("/Users/zhouyuqin/github/NIR/Project/BM25.xlsx")

In [14]:
BM25 = pt.BatchRetrieve(index_TREC_stemmed, wmodel="BM25", controls={"c": 0.75, "bm25.k_1": 1.2, "bm25.k_3": 8, "qemodel" : "Bo2", "qe" : "on"})  # default parameters

tuned_BM25, _= pt.KFoldGridSearch(
    BM25,
    {BM25 : {"c" : [0.3, 0.4, 0.5, 0.6, 0.7, 0.8], 
    "bm25.k_1": [0.3, 0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7]}},
    X_train_total,
    qrel,
    "map"
)
pt.Experiment([BM25, tuned_BM25], query, qrel, ["map"])
            

Fold 1
Best map is 0.305355
Best setting is ['BR(BM25) c=0.5', 'BR(BM25) bm25.k_1=0.9']
Fold 2
Best map is 0.305355
Best setting is ['BR(BM25) c=0.5', 'BR(BM25) bm25.k_1=0.9']
Fold 3
Best map is 0.305355
Best setting is ['BR(BM25) c=0.5', 'BR(BM25) bm25.k_1=0.9']
Fold 4
Best map is 0.305355
Best setting is ['BR(BM25) c=0.5', 'BR(BM25) bm25.k_1=0.9']
Fold 5
Best map is 0.305355
Best setting is ['BR(BM25) c=0.5', 'BR(BM25) bm25.k_1=0.9']


Unnamed: 0,name,map
0,BR(BM25),0.297664
1,qid docid docno rank ...,0.305355


All results

In [64]:
DLM_qe  = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DirichletLM",controls={"dirichletlm.mu":1000, "qemodel" : "Bo2", "qe" : "on"})
DPH_qe = pt.BatchRetrieve(index_TREC_stemmed, wmodel="DPH", controls={"qemodel" : "KL", "qe" : "on"})
BM25_qe = pt.BatchRetrieve(index_TREC_stemmed, wmodel="BM25", controls={"qemodel" : "Bo2", "qe" : "on", "c": 0.5, "bm25.k_1": 0.9})   # tuned parameters 
pt.Experiment(
    [DLM_qe, DPH_qe, BM25_qe],
    query,
    qrel,
    eval_metrics=["P_10", "map", "ndcg_cut_5", "ndcg_cut_10", "ndcg_cut_20", "mrt"])

Unnamed: 0,name,P_10,map,ndcg_cut_5,ndcg_cut_10,ndcg_cut_20,mrt
0,BR(DirichletLM),0.392965,0.234583,0.448476,0.414457,0.38284,69.500632
1,BR(DPH),0.468844,0.29837,0.489049,0.475665,0.456866,44.525224
2,BR(BM25),0.466332,0.305355,0.485617,0.472063,0.454454,64.271001


### Output

In [150]:
test_query = get_query('../Project/NIR2022 dataset/data/test_query.csv')

In [154]:
K = 1000
topk_dph = DPH_qe % K
stage_dph = topk_dph.transform(test_query)

In [167]:
with open("../Project/NIR2022 dataset/outputs/test_dph.run", "w") as f:
    for i in stage_dph.iterrows():
        qid = i[1]["qid"]
        rank = i[1]["rank"]
        docno = i[1]["docno"]
        score = i[1]["score"]
        a = f'{qid} Q0 {docno} {rank+1} {score} app526APP000'
        f.write(a + "\n")

In [158]:
K = 1000
topk_bm25 = BM25_qe % K
stage_bm25 = topk_bm25.transform(test_query)

In [170]:
with open("../Project/NIR2022 dataset/outputs/test_bm25.run", "w") as f:
    for i in stage_bm25.iterrows():
        qid = i[1]["qid"]
        rank = i[1]["rank"]
        docno = i[1]["docno"]
        score = i[1]["score"]
        a = f'{qid} Q0 {docno} {rank+1} {score} app526APP001'
        f.write(a + "\n")