# Tuning
Here I tune the basic retrieval models, comparing both different indexes and different weighting models.

In [1]:
import json
import os

import numpy as np
import pandas as pd

In [2]:
import pyterrier as pt
# Extra Terrier package loaded at start-up: the initialisation package for RM3.
TERRIER_PRF_PACKAGE = "com.github.terrierteam:terrier-prf:-SNAPSHOT"

# Only initialise PyTerrier when it has not been started yet, so this cell
# can be re-run safely in a live kernel.
if not pt.started():
    pt.init(boot_packages=[TERRIER_PRF_PACKAGE])

PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [None]:
# Load the training topics (queries) and the relevance judgements (qrels).
# Topics are read entirely as strings. For the qrels, `qid` is read as a
# string up front so that it matches the topics without having to be
# converted in place by later cells.
topics_df = pd.read_csv("NIR2022 dataset/train_query.csv", dtype=str)
qrels_df = pd.read_csv("NIR2022 dataset/train_qrel.csv", dtype={"qid": str})

# Show the size and the first rows of each frame. A bare expression in the
# middle of a cell is not rendered, so both frames are displayed explicitly.
print(topics_df.shape)
display(topics_df.head())
print(qrels_df.shape)
display(qrels_df.head())

In [3]:
# Loading indexes
def _load_index(properties_path):
    """Open the Terrier index described by ``properties_path``.

    Returns the ``(index_ref, index)`` pair: the reference created by
    ``pt.IndexRef.of`` and the index object built from it by
    ``pt.IndexFactory.of``.
    """
    index_ref = pt.IndexRef.of(properties_path)
    return index_ref, pt.IndexFactory.of(index_ref)


# The comparison experiment further down labels these three indexes as
# "No stemming", "Porter" and "Snowball" respectively.
indexref_np, index_np = _load_index('./indexes_p/iterindex_noprocess/data.properties')
indexref1, index1 = _load_index('./indexes_p/iterindex/data.properties')
indexref2, index2 = _load_index('./indexes_p/iterindex_opt/data.properties')

### BM25

In [None]:
# Cross validation: 3-fold grid search of the BM25 parameters on each index.

# Fixed seed so that the shuffled folds -- and therefore the tuned
# parameters -- are reproducible after a kernel restart.
CV_SEED = 42

# Shuffle the topics, then cut at 33% / 66% to obtain the three folds.
# Positional slicing is used instead of np.split(), which on a DataFrame
# goes through the deprecated DataFrame.swapaxes path in recent pandas.
topics_shuffled = topics_df.sample(frac=1, random_state=CV_SEED)
cut1 = int(0.33 * len(topics_shuffled))
cut2 = int(0.66 * len(topics_shuffled))
f1 = topics_shuffled.iloc[:cut1]
f2 = topics_shuffled.iloc[cut1:cut2]
f3 = topics_shuffled.iloc[cut2:]

# Candidate values explored by the grid search.
# NOTE(review): for BM25 the `c` control is presumably the length
# normalisation parameter b, conventionally in [0, 1]; 1.2 looks out of
# range -- confirm the intended values.
para_grid = {
    "c": [1.2, 0.3],
    "bm25.k_1": [0.8, 1.2],
    "bm25.k_3": [0.5, 1],
}

# Starting values of the controls; the grid search overrides them.
BM25_START_CONTROLS = {"c": 0.75, "bm25.k_1": 0.75, "bm25.k_3": 0.75}


def _tune_bm25(index, metric="map"):
    """Grid-search BM25 on ``index`` with 3-fold cross validation.

    Parameters
    ----------
    index
        Terrier index the BM25 retriever runs on.
    metric : str
        Evaluation measure optimised by the grid search.

    Returns
    -------
    tuple
        ``(retriever, results, best_params)``: the BM25 retriever that was
        tuned, followed by the two values returned by
        ``pt.KFoldGridSearch``.
    """
    retriever = pt.BatchRetrieve(
        index, wmodel="BM25", controls=dict(BM25_START_CONTROLS)
    )
    results, best_params = pt.KFoldGridSearch(
        retriever,
        {retriever: para_grid},
        [f1, f2, f3],
        qrels_df,
        metric,
    )
    return retriever, results, best_params


# The topics are loaded as strings, so the qrels query ids must be strings
# too. The conversion is idempotent, so doing it once here is enough.
qrels_df.qid = qrels_df.qid.astype('str')

# All three indexes are tuned on MAP so that the tuned runs are comparable
# (the third run previously optimised nDCG while the other two used MAP).
bm25_idx_np, opt_results_np, opt_para1_np = _tune_bm25(index_np)
bm25_idx1, opt_results1, opt_para1 = _tune_bm25(index1)
bm25_idx2, opt_results2, opt_para2 = _tune_bm25(index2)


In [None]:
# Evaluate the cross-validated BM25 results of the third index (labelled
# "Snowball Stemming") on MAP, nDCG and reciprocal rank.
pt.Experiment(
    [opt_results2],
    topics_df,
    qrels_df,
    eval_metrics=["map", "ndcg", "recip_rank"],
    names=['Snowball Stemming'],
)

The optimal parameters I got are [c: 0.3, k1: 0.8, k3: 0.5].

### Language Model

In [None]:
# Average document length of the index loaded as `index_np`, truncated to
# an integer. It is used below as a value for the DirichletLM `c` control.
collection_stats = index_np.getCollectionStatistics()
adl = int(collection_stats.getAverageDocumentLength())

# Query ids in the qrels must be strings, like the topics (read with dtype=str).
qrels_df.qid = qrels_df.qid.astype('str')

In [None]:
def _dirichlet_retriever(index):
    """Build a DirichletLM retriever on ``index`` with `c` set to ``adl``."""
    return pt.BatchRetrieve(index, wmodel='DirichletLM', controls={'c': adl})


def _tune_dirichlet(retriever):
    """Run the 3-fold grid search over ``para_grid`` for ``retriever``.

    Optimises MAP on the folds f1, f2, f3 and returns the two values
    produced by ``pt.KFoldGridSearch``.
    """
    return pt.KFoldGridSearch(
        retriever,
        {retriever: para_grid},
        [f1, f2, f3],
        qrels_df,
        'map',
    )


# One retriever per index, in the order: index_np, index1, index2.
DLM_np, DLM_idx1, DLM_idx2 = (
    _dirichlet_retriever(idx) for idx in (index_np, index1, index2)
)

# Candidate values of `c`: the average document length plus two fixed values.
para_grid = {'c': [adl, 150, 500]}

# Query ids must be strings to match the topics; converting once is enough
# because the conversion is idempotent.
qrels_df.qid = qrels_df.qid.astype('str')

dlm_results_np, dlm_para_np = _tune_dirichlet(DLM_np)
dlm_results1, dlm_para1 = _tune_dirichlet(DLM_idx1)
dlm_results2, dlm_para2 = _tune_dirichlet(DLM_idx2)

c = 275 for the DLM model.

In [None]:
# Side-by-side comparison of the tuned BM25 and DirichletLM runs on the
# three indexes. The order of the result frames must match `names`.
pt.Experiment(
    [
        opt_results_np,
        opt_results1,
        opt_results2,
        dlm_results_np,
        dlm_results1,
        dlm_results2,
    ],
    topics_df,
    qrels_df,
    eval_metrics=["map", "ndcg", "recip_rank", 'ndcg_cut_10', 'P_10'],
    names=[
        'BM25-No stemming',
        'BM25-Porter',
        'BM25-Snowball',
        'DLM-No stemming',
        'DLM-Porter',
        'DLM-Snowball',
    ],
)

Overall, the Snowball-stemmed index and BM25 performed best, so I use the Snowball index in the later stages.

In [None]:
# Hiemstra language model on index2: 3-fold cross-validated grid search
# over the `c` control, optimising MAP.
para_grid = {"c": [0.15, 0.1, 0.5, 1]}
HLM = pt.BatchRetrieve(index2, wmodel='Hiemstra_LM', controls={'c': 0.15})
HLM_results, HLM_paras = pt.KFoldGridSearch(
    HLM,
    {HLM: para_grid},
    [f1, f2, f3],
    qrels_df,
    'map',
)

In [None]:
# BM25F retriever on index2 with no controls set, scored on MAP only.
bm25f = pt.BatchRetrieve(index2, wmodel='BM25F')
pt.Experiment(
    [bm25f],
    topics_df,
    qrels_df,
    eval_metrics=['map'],
)

In [None]:
# Final comparison on index2: tuned BM25, BM25F, tuned DirichletLM and
# tuned Hiemstra LM. No `bm25f_results` frame is ever created in this
# notebook, so the `bm25f` retriever itself is evaluated; pt.Experiment
# accepts retrievers and pre-computed result frames in the same list.
pt.Experiment(
    [opt_results2, bm25f, dlm_results2, HLM_results],
    topics_df,
    qrels_df,
    eval_metrics=["map", "ndcg", "recip_rank", 'ndcg_cut_10', 'P_10'],
    names=['BM25', 'BM25F', 'DLM', 'HLM'],
)

HLM and BM25F are also compared here.