In [1]:
import pyterrier as pt
import pandas as pd
import os
import warnings
# Install a blanket "ignore" filter: every Python warning raised after this
# point is suppressed, regardless of category or originating module.
# NOTE(review): presumably done to keep cell output uncluttered; it will also
# hide deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
# Initialise PyTerrier only when it has not been started yet, so that
# re-running this cell in a live kernel does not call pt.init() a second time.
already_started = pt.started()
if not already_started:
    pt.init()

PyTerrier 0.7.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


In [3]:
# DataFrame indexer writing to ./index; overwrite=True replaces an index that
# is already there.
# NOTE(review): blocks=True presumably stores term positions, which the SDM
# rewrite used later would rely on -- confirm against the PyTerrier docs.
pd_indexer = pt.DFIndexer(
    './index',
    overwrite=True,
    blocks=True,
    verbose=True,
)

# Load the collection, cast every column to str, and expose the 'Object ID'
# column under the name 'docno'.
# NOTE(review): astype(str) turns missing cells into the text "nan" -- confirm
# the CSV has no empty cells, or that this is acceptable downstream.
raw_collection = pd.read_csv('met_dataset.csv')
df = raw_collection.astype(str).rename(columns={'Object ID': 'docno'})

In [4]:
# Columns stored alongside each document as index metadata.
meta_columns = [
    'docno',
    'Is Highlight',
    'Is Timeline Work',
    'Department',
    'Title',
    'Culture',
    'Period',
    'Artist Display Name',
    'Country',
    'Tags',
    'image',
]
meta_fields = df[meta_columns]

# Index the 'description' text; each metadata column is handed to the indexer
# as a keyword argument named after the column.
indexref = pd_indexer.index(df["description"], **meta_fields)

34427documents [00:26, 1316.32documents/s]                    


In [5]:
# Open the index referenced by `indexref` (returned by the indexing cell) so
# the retrieval and rewriting components below can be built on it.
index = pt.IndexFactory.of(indexref)

In [6]:
# Queries: every column is read and kept as a string.
topics = pd.read_csv('topics.csv').astype(str)

# Relevance judgements: all columns as strings, except 'label', which is
# converted back to an integer.
qrels = (
    pd.read_csv('qrels.csv')
    .astype(str)
    .astype({'label': int})
)

In [7]:
from sklearn.model_selection import train_test_split

RANK_CUTOFF = 50   # number of first-stage results kept per query
SEED = 27          # random_state used for both splits below

# Two successive 80/20 splits: first hold out the test topics, then carve the
# validation topics out of what remains.
tr_va_topics, test_topics = train_test_split(
    topics, test_size=0.2, random_state=SEED
)
train_topics, valid_topics = train_test_split(
    tr_va_topics, test_size=0.2, random_state=SEED
)

In [8]:
def _field_scorer(attr, wmodel="DirichletLM"):
    """Build a text scorer that scores the query against metadata field `attr`."""
    return pt.text.scorer(body_attr=attr, takes='docs', wmodel=wmodel)


def _flag_feature(column, truthy):
    """Build a 0/1 feature: 1 when the `column` value of a row equals `truthy`."""
    return pt.apply.doc_score(lambda row: int(row[column] == truthy))


bm25 = pt.BatchRetrieve(index, wmodel="BM25")
sdm = pt.rewrite.SDM()
qe = pt.rewrite.Bo1QueryExpansion(index)

# Metadata fetched for every candidate so the scorers and flags can read it.
feature_meta = [
    'Is Highlight',
    'Is Timeline Work',
    'Department',
    'Title',
    'Culture',
    'Period',
    'Artist Display Name',
    'Country',
    'Tags',
    'image',
]

# First stage: top RANK_CUTOFF BM25 results, decorated with their metadata.
candidates = (bm25 % RANK_CUTOFF) >> pt.text.get_text(index, feature_meta)

# Feature union; the operand order must stay aligned with `fnames` below.
feature_union = (
    # score passed through from the first stage
    pt.transformer.IdentityTransformer()
    # sequential dependence and query expansion
    ** (sdm >> bm25 >> qe >> bm25)
    # score of title (not originally indexed)
    ** _field_scorer("Title")
    # score of author (not originally indexed)
    ** _field_scorer("Artist Display Name", wmodel="CoordinateMatch")
    # score of tags (not originally indexed)
    ** _field_scorer("Tags")
    # score of country (not originally indexed)
    ** _field_scorer("Country")
    # score of department (not originally indexed)
    ** _field_scorer("Department")
    # score of culture (not originally indexed)
    ** _field_scorer("Culture")
    # score of period (not originally indexed)
    ** _field_scorer("Period")
    # is highlighted
    ** _flag_feature("Is Highlight", 'True')
    # is timeline work
    ** _flag_feature("Is Timeline Work", 'True')
    # has image
    ** _flag_feature("image", '1')
    # Dirichlet language model
    ** pt.BatchRetrieve(index, wmodel="DirichletLM")
)

ltr_feats1 = candidates >> feature_union

# For reference, the feature names in the same order as the union above.
fnames = [
    "BM25",
    "SDM and QE",
    "Title",
    "Artist Name",
    "Tags",
    "Country",
    "Department",
    "Culture",
    "Period",
    "Is Highlight",
    "Is Timeline Work",
    "hasImage",
    "DirichletLM",
]

In [9]:
import lightgbm as lgb

# this configures LightGBM as LambdaMART
lmart_l = lgb.LGBMRanker(
    task="train",
    silent=False,
    min_data_in_leaf=1,
    min_sum_hessian_in_leaf=1,
    max_bin=255,
    num_leaves=31,
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[50],
    ndcg_at=[50],
    eval_at=[50],
    learning_rate= .1,
    importance_type="gain",
    num_iterations=100,
    early_stopping_rounds=5
)

lmart_x_pipe = ltr_feats1 >> pt.ltr.apply_learned_model(lmart_l, form="ltr", fit_kwargs={'eval_at':[50]})

%time lmart_x_pipe.fit(train_topics, qrels, valid_topics, qrels)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 295
[LightGBM] [Info] Number of data points in the train set: 250, number of used features: 11
[1]	valid_0's ndcg@50: 0.939687
Training until validation scores don't improve for 5 rounds
[2]	valid_0's ndcg@50: 0.95161
[3]	valid_0's ndcg@50: 0.971613
[4]	valid_0's ndcg@50: 0.95161
[5]	valid_0's ndcg@50: 0.955503
[6]	valid_0's ndcg@50: 0.955281
[7]	valid_0's ndcg@50: 0.955281
[8]	valid_0's ndcg@50: 0.955281
Early stopping, best iteration is:
[3]	valid_0's ndcg@50: 0.971613
CPU times: user 5.37 s, sys: 904 ms, total: 6.28 s
Wall time: 3.82 s


In [10]:
# Compare the BM25 baseline with the learned re-ranker on the held-out test
# topics. The feature count in the second label is derived from `fnames`
# (13 entries) rather than hard-coded, so it cannot go stale when features
# are added or removed.
pt.Experiment(
    [bm25 % RANK_CUTOFF, lmart_x_pipe],
    test_topics,
    qrels,
    names=["BM25", f"BM25 + LMart({len(fnames)}f)"],
    eval_metrics=["ndcg_cut_5", "ndcg_cut_10", "ndcg_cut_20", "ndcg"])



Unnamed: 0,name,ndcg_cut_5,ndcg_cut_10,ndcg_cut_20,ndcg
0,BM25,0.322932,0.310385,0.284343,0.256477
1,BM25 + LMart(7f),0.487131,0.391346,0.328661,0.282865


In [11]:
# Sanity check: run one ad-hoc query through the trained pipeline and display
# the first five rows of the result frame.
lmart_x_pipe.search('river and forest').head()



Unnamed: 0,qid,docid,docno,score,query,Is Highlight,Is Timeline Work,Department,Title,Culture,Period,Artist Display Name,Country,Tags,image,query_0,features,rank
3,1,672,38036,0.091865,river and forest,False,False,Asian Art,,"India (Punjab Hills, Kangra)",,,,Birds Peacocks Deer Men Women,1,river forest #combine:0=0.1:wmodel=org.terrier...,"[12.432252192304809, 11.95729987750299, 0.0, 0...",0
0,1,22867,437684,-0.020772,river and forest,False,False,European Paintings,The Road from Moret to Saint-MammÃ¨s,,,Alfred Sisley,,Houses Roads Forests Human Figures,1,river forest #combine:0=0.1:wmodel=org.terrier...,"[13.784947509100498, 14.938358968916901, 0.0, ...",1
2,1,23706,459114,-0.020772,river and forest,False,False,Robert Lehman Collection,The Pool (Memory of the Forest of Chambord),,,ThÃ©odore Rousseau,,Forests Landscapes,1,river forest #combine:0=0.1:wmodel=org.terrier...,"[12.60247157324802, 15.552282514543561, 0.0133...",2
1,1,16339,383131,-0.070276,river and forest,False,False,Drawings and Prints,"View of Chepstow, Wales",,,John Scarlett Davis,,Rivers Landscapes Trees,1,river forest #combine:0=0.1:wmodel=org.terrier...,"[13.087937816313579, 14.060267232056386, 0.0, ...",3
37,1,22765,437526,-0.19527,river and forest,False,True,European Paintings,A Forest at Dawn with a Deer Hunt,,,Peter Paul Rubens,,Hunting Deer Forests Dawn,1,river forest #combine:0=0.1:wmodel=org.terrier...,"[8.541998579823973, 9.426145145487176, 0.01330...",4
