In [1]:
# PyTerrier provides indexing/retrieval; pandas handles the CSV inputs.
import pyterrier as pt
import pandas as pd
import os
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used in later cells - consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# Initialise PyTerrier only when it has not been started yet, so that
# re-running this cell in a live kernel does not call pt.init() twice.
if not pt.started():
    pt.init()

PyTerrier 0.7.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


In [3]:
# Indexer writing to ./index; overwrite=True replaces an index already there.
# NOTE(review): blocks=True presumably stores term positions, which the SDM
# rewrite used in a later cell would rely on - confirm against PyTerrier docs.
pd_indexer = pt.DFIndexer('./index', overwrite=True, blocks=True)

# Load the collection with every column cast to string; the 'Object ID'
# column is renamed to 'docno'.
df = (
    pd.read_csv('met_dataset.csv')
    .astype(str)
    .rename(columns={'Object ID': 'docno'})
)

# Replace the '|' characters in Tags with spaces (literal match, not a regex).
df['Tags'] = df['Tags'].str.replace('|', ' ', regex=False)

In [4]:
# Columns stored as document metadata next to the indexed text.
meta_columns = [
    'docno',
    'Is Highlight',
    'Is Timeline Work',
    'Department',
    'Title',
    'Culture',
    'Period',
    'Artist Display Name',
    'Country',
    'Tags',
    'image',
]
meta_fields = df[meta_columns]

# Index the 'description' column as the document text. Unpacking the frame
# passes each metadata column as a keyword argument named after the column.
indexref = pd_indexer.index(df["description"], **meta_fields)

In [5]:
# Open the index referenced by `indexref`; the retrieval and query-expansion
# transformers in later cells are built on this object.
index = pt.IndexFactory.of(indexref)

In [6]:
# Queries: every column is kept as a string.
topics = pd.read_csv('topics.csv').astype(str)

# Relevance judgements: everything is a string except 'label', which is cast
# back to an integer.
qrels = (
    pd.read_csv('qrels.csv')
    .astype(str)
    .astype({'label': int})
)

In [7]:
from sklearn.model_selection import train_test_split

RANK_CUTOFF = 100  # number of BM25 results kept per query before re-ranking
SEED = 42          # fixed seed so both splits are reproducible

# Both splits hold out 15% and share the same seed: first carve the test
# topics off the full set, then carve the validation topics off the remainder.
split_options = dict(test_size=0.15, random_state=SEED)

tr_va_topics, test_topics = train_test_split(topics, **split_options)
train_topics, valid_topics = train_test_split(tr_va_topics, **split_options)

In [8]:
# Retriever and query rewriters shared by the features below.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
sdm = pt.rewrite.SDM()
qe = pt.rewrite.Bo1QueryExpansion(index)

# Metadata columns attached to every candidate so they can be scored on the fly.
doc_meta_columns = [
    'Is Highlight',
    'Is Timeline Work',
    'Department',
    'Title',
    'Culture',
    'Period',
    'Artist Display Name',
    'Country',
    'Tags',
    'image',
]


def _field_score(column, wmodel="DirichletLM"):
    """Return a scorer that scores the text held in metadata column `column`."""
    return pt.text.scorer(body_attr=column, takes='docs', wmodel=wmodel)


def _flag_feature(column, expected):
    """Return a feature that is 1 when row[column] equals `expected`, else 0."""
    return pt.apply.doc_score(lambda row: int(row[column] == expected))


# Candidate generation: the top RANK_CUTOFF BM25 results, with metadata attached.
candidates = (bm25 % RANK_CUTOFF) >> pt.text.get_text(index, doc_meta_columns)

# Feature union; the order here must match `fnames` below.
ltr_feats1 = candidates >> (
    pt.transformer.IdentityTransformer()                         # score passed through from the candidates
    ** (sdm >> bm25 >> qe >> bm25)                               # sequential dependence and query expansion
    ** _field_score("Title")                                     # score of title (not originally indexed)
    ** _field_score("Artist Display Name", "CoordinateMatch")    # score of artist name (not originally indexed)
    ** _field_score("Tags")                                      # score of tags (not originally indexed)
    ** _field_score("Country")                                   # score of country (not originally indexed)
    ** _field_score("Department")                                # score of department (not originally indexed)
    ** _field_score("Culture")                                   # score of culture (not originally indexed)
    ** _field_score("Period")                                    # score of period (not originally indexed)
    ** _flag_feature("Is Highlight", 'True')                     # is highlighted
    ** _flag_feature("Is Timeline Work", 'True')                 # is timeline work
    ** _flag_feature("image", '1')                               # has image
    ** pt.BatchRetrieve(index, wmodel="DirichletLM")             # Dirichlet language model
)

# For reference, record the feature names here too (same order as the union).
fnames = [
    "BM25",
    "SDM and QE",
    "Title",
    "Artist Name",
    "Tags",
    "Country",
    "Department",
    "Culture",
    "Period",
    "Is Highlight",
    "Is Timeline Work",
    "hasImage",
    "DirichletLM",
]

In [9]:
import lightgbm as lgb

# this configures LightGBM as LambdaMART
lmart_l = lgb.LGBMRanker(
    task="train",
    silent=False,
    min_data_in_leaf=1,
    min_sum_hessian_in_leaf=1,
    max_bin=255,
    num_leaves=31,
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[10],
    ndcg_at=[10],
    eval_at=[10],
    learning_rate= .1,
    importance_type="gain",
    num_iterations=100,
    early_stopping_rounds=5
)

lmart_x_pipe = ltr_feats1 >> pt.ltr.apply_learned_model(lmart_l, form="ltr", fit_kwargs={'eval_at':[10]})

%time lmart_x_pipe.fit(train_topics, qrels, valid_topics, qrels)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 326
[LightGBM] [Info] Number of data points in the train set: 296, number of used features: 9
[1]	valid_0's ndcg@10: 0.30388
Training until validation scores don't improve for 5 rounds
[2]	valid_0's ndcg@10: 0.130301
[3]	valid_0's ndcg@10: 0.127079
[4]	valid_0's ndcg@10: 0.266235
[5]	valid_0's ndcg@10: 0.222404
[6]	valid_0's ndcg@10: 0.304075
[7]	valid_0's ndcg@10: 0.31669
[8]	valid_0's ndcg@10: 0.326075
[9]	valid_0's ndcg@10: 0.308787
[10]	valid_0's ndcg@10: 0.308787
[11]	valid_0's ndcg@10: 0.308787
[12]	valid_0's ndcg@10: 0.308787
[13]	valid_0's ndcg@10: 0.308787
Early stopping, best iteration is:
[8]	valid_0's ndcg@10: 0.326075
CPU times: user 4.86 s, sys: 390 ms, total: 5.25 s
Wall time: 3.05 s


In [14]:
# Compare the BM25 baseline against the learned re-ranker on the held-out
# test topics.
# The re-ranker's label takes its feature count from `fnames` (13 entries)
# instead of the previous hard-coded "(7f)", which did not match the pipeline.
# Re-run this cell to refresh the recorded results table.
pt.Experiment(
    [bm25 % RANK_CUTOFF, lmart_x_pipe],
    test_topics,
    qrels,
    names=["BM25", f"BM25 + LMart({len(fnames)}f)"],
    eval_metrics=["ndcg_cut_5", "ndcg_cut_10", "ndcg_cut_20", "ndcg"])



Unnamed: 0,name,ndcg_cut_5,ndcg_cut_10,ndcg_cut_20,ndcg
0,BM25,0.799531,0.819001,0.840033,0.737737
1,BM25 + LMart(7f),0.910657,0.931063,0.856404,0.738333
