In [1]:
import pyterrier as pt
import pandas as pd
import os

In [2]:
if not pt.started():
    pt.init()

PyTerrier 0.7.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


In [3]:
pd_indexer = pt.DFIndexer('./index', overwrite=True, blocks=True, verbose=True)
df = pd.read_csv('met_dataset.csv').astype(str).rename(columns={'Object ID': 'docno'})
df['Tags'] = df['Tags'].apply(lambda x: x.replace('|', ' '))
print(df.head())

  exec(code_obj, self.user_global_ns, self.user_ns)


  Unnamed: 0 Object Number Is Highlight Is Timeline Work Is Public Domain  \
0      29852      2009.224        False             True             True   
1      30293           9.3        False            False             True   
2      30294     12.37.135        False            False            False   
3      30295     13.100.22        False            False             True   
4      30296     13.100.25        False            False             True   

   docno Gallery Number      Department AccessionYear         Object Name  \
0  35155          374.0  Arms and Armor        2009.0            Painting   
1  35968            nan       Asian Art        1909.0  Wall hanging (map)   
2  35969            nan       Asian Art        1912.0      Hanging scroll   
3  35970            nan       Asian Art        1913.0      Hanging scroll   
4  35971            nan       Asian Art        1913.0      Hanging scroll   

   ... Rights and Reproduction  \
0  ...                     nan   
1  ...

In [4]:
meta_fields = df[['docno', 'Is Highlight', 'Is Timeline Work', 'Department', 'Title', 'Culture', 'Period', 'Artist Display Name', 'Country', 'Tags', 'image']]
indexref = pd_indexer.index(df["description"], **meta_fields)

34427documents [00:24, 1389.19documents/s]                    


In [5]:
index = pt.IndexFactory.of(indexref)

In [6]:
topics = pd.read_csv('topics.csv').astype(str)
qrels = pd.read_csv('qrels.csv').astype(str)
qrels['label'] = qrels['label'].astype(int)

In [7]:
RANK_CUTOFF = 100
SEED=42

from sklearn.model_selection import train_test_split

tr_va_topics, test_topics = train_test_split(topics, test_size=0.15, random_state=SEED)
train_topics, valid_topics =  train_test_split(tr_va_topics, test_size=0.15, random_state=SEED)

In [8]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
sdm = pt.rewrite.SDM()
qe = pt.rewrite.Bo1QueryExpansion(index)

ltr_feats1 = (bm25 % RANK_CUTOFF) >> pt.text.get_text(index, ['Is Highlight', 'Is Timeline Work', 'Department', 'Title', 'Culture', 'Period', 'Artist Display Name', 'Country', 'Tags', 'image']) >> (
    pt.transformer.IdentityTransformer()
    ** # sequential dependence and query expansion
    (sdm >> bm25 >> qe >> bm25)
    ** # score of title (not originally indexed)
    (pt.text.scorer(body_attr="Title", takes='docs', wmodel='BM25') ) 
    ** # score of author (not originally indexed)
    (pt.text.scorer(body_attr="Artist Display Name", takes='docs', wmodel='BM25') ) 
    ** # score of tags (not originally indexed)
    (pt.text.scorer(body_attr="Tags", takes='docs', wmodel='BM25') ) 
    ** # score of Country (not originally indexed)
    (pt.text.scorer(body_attr="Culture", takes='docs', wmodel='BM25') ) 
    ** # score of Department (not originally indexed)
    (pt.text.scorer(body_attr="Department", takes='docs', wmodel='BM25') ) 
    ** # score of Culture (not originally indexed)
    (pt.text.scorer(body_attr="Culture", takes='docs', wmodel='BM25') ) 
    ** # score of Period (not originally indexed)
    (pt.text.scorer(body_attr="Period", takes='docs', wmodel='BM25') ) 
    ** # is highlited
    (pt.apply.doc_score(lambda row: int(row["Is Highlight"] == 'True')))
    ** # is Timeline Work
    (pt.apply.doc_score(lambda row: int(row["Is Timeline Work"] == 'True')))
    ** # has image
    (pt.apply.doc_score(lambda row: int( row["image"] == '1' and len(row["image"]) > 0) ))
    ** # abstract coordinate match
    pt.BatchRetrieve(index, wmodel="CoordinateMatch")
)

# for reference, lets record the feature names here too
fnames=["BM25", "SDM and QE", "Title", "Artist Name", "Tags", "Country", "Department", "Culture", "Period", "Is Highlight", "Is Timeline Work", "hasImage" , "CoordinateMatch"]

In [14]:
ltr_feats1.search("Chinese landscape with mountain and river")



  warn("Got number of results different expected from %s, expected %d received %d, feature scores for any missing documents be 0, extraneous documents will be removed" % (repr(m), num_results, len(res)))


Unnamed: 0,qid,docid,docno,rank,score,query,Is Highlight,Is Timeline Work,Department,Title,Culture,Period,Artist Display Name,Country,Tags,image,query_0,features
0,1,1064,44895,0,25.549796,Chinese landscape with mountain and river,False,False,Asian Art,,Japan,Edo period (1615â?868),Kushiro Unsen,,Landscapes,1,chinese landscape mountain river #combine:0=0....,"[25.549795597448846, 26.3693746786104, 0.0, 0...."
1,1,17,35992,1,21.917356,Chinese landscape with mountain and river,False,False,Asian Art,æ?æ¸? æ½é²é¦? ç§æ±è½ç§ æ|Setting ...,China,Ming dynasty (1368â?644),Pan Yunyu,,Boats Landscapes,1,chinese landscape mountain river #combine:0=0....,"[21.91735582273288, 26.412814957530824, 2.3546..."
2,1,986,42347,2,21.831142,Chinese landscape with mountain and river,False,False,Asian Art,çæ¹å«æ¯å³|Eight Views of the Xiao and Xi...,Japan,Muromachi period (1392â?573),,,Mountains Landscapes Boats,1,chinese landscape mountain river #combine:0=0....,"[21.83114215124545, 20.283100961731027, 2.3546..."
3,1,987,42489,3,21.831142,Chinese landscape with mountain and river,False,False,Asian Art,çæ¹å«æ¯å³|Eight Views of the Xiao and Xi...,Japan,Muromachi period (1392â?573),,,Mountains Boats Landscapes,1,chinese landscape mountain river #combine:0=0....,"[21.83114215124545, 20.283100961731027, 2.3546..."
4,1,33498,829383,4,21.816472,Chinese landscape with mountain and river,False,False,Asian Art,å¤©é é¾æ¾¤ç­ãå±±æ°´å³|Landscape,Japan,Muromachi period (1392â?573),Tenâin RyÅ«taku|Unidentified,,Mountains Landscapes,1,chinese landscape mountain river #combine:0=0....,"[21.816471527147698, 21.507435756496346, 2.317..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1,1334,48896,95,12.325978,Chinese landscape with mountain and river,False,False,Asian Art,æ? ä½å ç´é¶´å? è»¸|Playing the zithe...,China,Ming dynasty (1368â?644),Unidentified artist,,Landscapes Musical Instruments Cranes Men,1,chinese landscape mountain river #combine:0=0....,"[12.325977912054366, 11.273054413817853, 0.0, ..."
96,1,1556,49471,96,12.325978,Chinese landscape with mountain and river,False,False,Asian Art,æ¸? ä»»é  ç³å®¤åç¦ªå? è»¸|Meditation in ...,China,Qing dynasty (1644â?911),Ren Yu,,Men Caves,1,chinese landscape mountain river #combine:0=0....,"[12.325977912054366, 10.32608080847633, 0.0, 0..."
97,1,28766,489553,97,12.325978,Chinese landscape with mountain and river,False,False,Modern and Contemporary Art,"Landscape at Saint-Jeannet, Provence",,,FÃ©lix Vallotton,,,1,chinese landscape mountain river #combine:0=0....,"[12.325977912054366, 11.607998733765326, 1.435..."
98,1,28,36008,98,12.312222,Chinese landscape with mountain and river,False,False,Asian Art,æ? ä½å è§é³ç¾æ¼¢å æ|Guanyin an...,China,Ming dynasty (1368â?644),Unidentified artist,,Bodhisattvas Bridges Men Women Buddhism,1,chinese landscape mountain river #combine:0=0....,"[12.312221797949377, 11.633905780625524, 0.0, ..."


In [10]:
import lightgbm as lgb

# this configures LightGBM as LambdaMART
lmart_l = lgb.LGBMRanker(
    task="train",
    silent=False,
    min_data_in_leaf=1,
    min_sum_hessian_in_leaf=1,
    max_bin=255,
    num_leaves=31,
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[10],
    ndcg_at=[10],
    eval_at=[10],
    learning_rate= .1,
    importance_type="gain",
    num_iterations=100,
    early_stopping_rounds=5
)

lmart_x_pipe = ltr_feats1 >> pt.ltr.apply_learned_model(lmart_l, form="ltr", fit_kwargs={'eval_at':[10]})

%time lmart_x_pipe.fit(train_topics, qrels, valid_topics, qrels)



  warn("Got number of results different expected from %s, expected %d received %d, feature scores for any missing documents be 0, extraneous documents will be removed" % (repr(m), num_results, len(res)))


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 233
[LightGBM] [Info] Number of data points in the train set: 296, number of used features: 8
[1]	valid_0's ndcg@10: 0.104946
Training until validation scores don't improve for 5 rounds
[2]	valid_0's ndcg@10: 0.178251
[3]	valid_0's ndcg@10: 0.198222
[4]	valid_0's ndcg@10: 0.32815
[5]	valid_0's ndcg@10: 0.215355
[6]	valid_0's ndcg@10: 0.198222
[7]	valid_0's ndcg@10: 0.198222
[8]	valid_0's ndcg@10: 0.198222
[9]	valid_0's ndcg@10: 0.198222
Early stopping, best iteration is:
[4]	valid_0's ndcg@10: 0.32815
CPU times: user 3.63 s, sys: 343 ms, total: 3.97 s
Wall time: 2.72 s


  warn("Got number of results different expected from %s, expected %d received %d, feature scores for any missing documents be 0, extraneous documents will be removed" % (repr(m), num_results, len(res)))


In [13]:
pt.Experiment(
    [bm25 % RANK_CUTOFF, lmart_x_pipe],
    test_topics,
    qrels, 
    names=["BM25", "BM25 + LMart(7f)"],
    # baseline=0,
    eval_metrics=["map", "recip_rank", "ndcg", "ndcg_cut_10", "mrt"])



  warn("Got number of results different expected from %s, expected %d received %d, feature scores for any missing documents be 0, extraneous documents will be removed" % (repr(m), num_results, len(res)))


Unnamed: 0,name,map,recip_rank,ndcg,ndcg_cut_10,mrt
0,BM25,0.571619,1.0,0.737737,0.819001,25.32233
1,BM25 + LMart(7f),0.42569,1.0,0.717204,0.890789,1018.481468
