In [64]:
!pip install cherche --upgrade -q

In [65]:
!pip install ir_datasets -q

In [66]:
!pip install pytrec_eval



In [67]:
!pip install sentence_transformers



In [68]:
import pytrec_eval

In [69]:
import ir_datasets

In [70]:
#dataset = ir_datasets.load("msmarco-passage/trec-dl-2019/judged")
dataset = ir_datasets.load("vaswani")

In [71]:
import pandas as pd
import json

In [72]:
from cherche import data, retrieve, rank
from sentence_transformers import SentenceTransformer

In [73]:
import numpy as np

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [75]:
for x in dataset.docs_iter():
  print(x)
  break

GenericDoc(doc_id='1', text='compact memories have flexible capacities  a digital data storage\nsystem with capacity up to bits and random and or sequential access\nis described\n')


In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from scipy.sparse import csc_matrix
import typing

In [77]:
vectorizer_queries = TfidfVectorizer()
vectorizer_queries.fit([querytext for queryid, querytext in dataset.queries_iter()])


In [78]:
list(vectorizer_queries.vocabulary_.keys())[:5], len(vectorizer_queries.vocabulary_.keys())

(['measurement', 'of', 'dielectric', 'constant', 'liquids'], 430)

In [80]:
def clean_text(raw_text):
  return " ".join(list(set(aq(raw_text))&set_vocab_queries))

In [81]:
# List of dicts
documents = [{'id':doc,'title':doc,'url':doc,'article':clean_text(text)} for (doc, text) in dataset.docs_iter()]
len(documents)

11429

In [82]:
documents[:3]

[{'id': '1',
  'title': '1',
  'url': '1',
  'article': 'and digital random with data to is have or'},
 {'id': '2',
  'title': '2',
  'url': '2',
  'article': 'analogue and for systems stability computer mathematical amplifiers linear of derivation electronic the equations an'},
 {'id': '3',
  'title': '3',
  'url': '3',
  'article': 'and for circuit calculating which of from construction electronic the to details an given transformer or'}]

In [83]:
#qrles
qrel = {k: {kk:int(vv) for kk,vv in v[['doc_id','relevance']].values} for k, v in pd.DataFrame(dataset.qrels_iter()).groupby('query_id')[['doc_id','relevance']]}
evaluator = pytrec_eval.RelevanceEvaluator(
    qrel, {'map', 'ndcg_cut'})

In [84]:
# Retrieve on field article and evaluate
retriever = retrieve.TfIdf(key="id", on=["article"], documents=documents, k=1000, tfidf=TfidfVectorizer(vocabulary=vocab_queries))


In [85]:
len(list(retriever.tfidf.vocabulary_))

430

In [86]:
retriever("born for")[:30]

[{'id': '6741', 'similarity': 1.0},
 {'id': '1969', 'similarity': 1.0},
 {'id': '8395', 'similarity': 1.0},
 {'id': '8872', 'similarity': 0.8614243268966675},
 {'id': '341', 'similarity': 0.8614243268966675},
 {'id': '4674', 'similarity': 0.8614243268966675},
 {'id': '9582', 'similarity': 0.8462282419204712},
 {'id': '2984', 'similarity': 0.8462282419204712},
 {'id': '10811', 'similarity': 0.8462282419204712},
 {'id': '8498', 'similarity': 0.7628793120384216},
 {'id': '1840', 'similarity': 0.7189940214157104},
 {'id': '7032', 'similarity': 0.7132627964019775},
 {'id': '6742', 'similarity': 0.7132627964019775},
 {'id': '7228', 'similarity': 0.7132627964019775},
 {'id': '7611', 'similarity': 0.7132627964019775},
 {'id': '10560', 'similarity': 0.6957420706748962},
 {'id': '4772', 'similarity': 0.6372761726379395},
 {'id': '4037', 'similarity': 0.6228885054588318},
 {'id': '10942', 'similarity': 0.5962892770767212},
 {'id': '2665', 'similarity': 0.5847053527832031},
 {'id': '1757', 'simila

In [87]:
del documents

In [88]:
retriever.documents[10]

{'id': '11'}

In [89]:
dataset_queries_iter = [[queryid, querytext] for queryid, querytext in dataset.queries_iter()]
#dataset_queries_iter = dataset_queries_iter

In [90]:
run = {queryid:{x['id']:float(x['similarity']) if 'similarity' in x else 1/float(i+1) for i,x in enumerate(retriever(querytext))} for queryid, querytext in dataset_queries_iter}


In [91]:
pd.DataFrame(evaluator.evaluate(run)).T.mean()

map              0.146416
ndcg_cut_5       0.268532
ndcg_cut_10      0.246502
ndcg_cut_15      0.238941
ndcg_cut_20      0.230703
ndcg_cut_30      0.237879
ndcg_cut_100     0.307093
ndcg_cut_200     0.354099
ndcg_cut_500     0.413793
ndcg_cut_1000    0.444316
dtype: float64

In [92]:
len(list(dataset.queries_iter()))

93

In [93]:
#index only retrieved documents when reranking
docsinqueries = list(set([y  for x in [list(v.keys()) for k,v in run.items()] for y in x]))
print(len(docsinqueries))
docsinqueries = set(docsinqueries)
docsinqueries = [{'id':doc,'title':doc,'url':doc,'article':text} for (doc, text) in dataset.docs_iter() if doc in docsinqueries]

11413


In [94]:
# Rerank on field article
ranker = rank.Encoder(
    key = "id",
    on = ["article"],
    encoder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2").encode,
    k = 1000#,
    #path = "encoder_all-mpnet-base-v2.pkl"
)

# Pipeline creation
search = retriever + ranker

In [95]:
# Preindexing
search.add(documents=docsinqueries)

Encoder ranker: 100%|██████████| 179/179 [58:06<00:00, 19.48s/it]


TfIdf retriever
	key      : id
	on       : article
	documents: 11429
Encoder ranker
	key       : id
	on        : article
	normalize : True
	embeddings: 11413

In [96]:
# Retrieve and Rerank on field article and evaluate
run = {queryid:{x['id']:float(x['similarity']) if 'similarity' in x else 1/float(i+1) for i,x in enumerate(search(querytext))} for queryid, querytext in dataset_queries_iter}
pd.DataFrame(evaluator.evaluate(run)).T.mean()

Ranker scoring: 1it [00:00, 76.88it/s]
Ranker sorting: 1it [00:00, 389.01it/s]
Ranker scoring: 1it [00:00, 164.98it/s]
Ranker sorting: 1it [00:00, 486.69it/s]
Ranker scoring: 1it [00:00, 158.77it/s]
Ranker sorting: 1it [00:00, 410.56it/s]
Ranker scoring: 1it [00:00, 162.18it/s]
Ranker sorting: 1it [00:00, 526.79it/s]
Ranker scoring: 1it [00:00, 178.46it/s]
Ranker sorting: 1it [00:00, 418.43it/s]
Ranker scoring: 1it [00:00, 156.31it/s]
Ranker sorting: 1it [00:00, 576.62it/s]
Ranker scoring: 1it [00:00, 160.13it/s]
Ranker sorting: 1it [00:00, 435.59it/s]
Ranker scoring: 1it [00:00, 154.21it/s]
Ranker sorting: 1it [00:00, 558.64it/s]
Ranker scoring: 1it [00:00, 173.13it/s]
Ranker sorting: 1it [00:00, 482.77it/s]
Ranker scoring: 1it [00:00, 171.29it/s]
Ranker sorting: 1it [00:00, 589.17it/s]
Ranker scoring: 1it [00:00, 161.82it/s]
Ranker sorting: 1it [00:00, 581.17it/s]
Ranker scoring: 1it [00:00, 164.14it/s]
Ranker sorting: 1it [00:00, 490.73it/s]
Ranker scoring: 1it [00:00, 191.47it/s]
R

map              0.277529
ndcg_cut_5       0.489387
ndcg_cut_10      0.453051
ndcg_cut_15      0.432176
ndcg_cut_20      0.425705
ndcg_cut_30      0.423970
ndcg_cut_100     0.493365
ndcg_cut_200     0.539920
ndcg_cut_500     0.571134
ndcg_cut_1000    0.575336
dtype: float64

In [97]:
import pickle
file_pi = open('search_tfidf_msmarcopassages_dl19_2024.obj', 'wb')
pickle.dump((search,dataset_queries_iter,qrel), file_pi)

In [98]:
!cp search_tfidf_msmarcopassages_dl19_2024.obj drive/MyDrive/_RESEARCH/CHERCHE/

cp: cannot create regular file 'drive/MyDrive/_RESEARCH/CHERCHE/': No such file or directory


# Rerun using precalculated searcher

In [99]:
!cp drive/MyDrive/_RESEARCH/CHERCHE/search_tfidf_msmarcopassages_dl19_2024.obj .

cp: cannot stat 'drive/MyDrive/_RESEARCH/CHERCHE/search_tfidf_msmarcopassages_dl19_2024.obj': No such file or directory


In [100]:
import pickle
file_pi = open('search_tfidf_msmarcopassages_dl19_2024.obj', 'rb')
(search,dataset_queries_iter,qrel) = pickle.load(file_pi)

In [101]:
evaluator = pytrec_eval.RelevanceEvaluator(
    qrel, {'map', 'ndcg_cut'})

In [102]:
run = {queryid:{x['id']:float(x['similarity']) if 'similarity' in x else 1/float(i+1) for i,x in enumerate(search(querytext))} for queryid, querytext in dataset_queries_iter}
pd.DataFrame(evaluator.evaluate(run)).T.mean()

Ranker scoring: 1it [00:00, 187.82it/s]
Ranker sorting: 1it [00:00, 462.39it/s]
Ranker scoring: 1it [00:00, 128.51it/s]
Ranker sorting: 1it [00:00, 339.48it/s]
Ranker scoring: 1it [00:00, 177.48it/s]
Ranker sorting: 1it [00:00, 369.93it/s]
Ranker scoring: 1it [00:00, 172.77it/s]
Ranker sorting: 1it [00:00, 452.85it/s]
Ranker scoring: 1it [00:00, 140.32it/s]
Ranker sorting: 1it [00:00, 425.13it/s]
Ranker scoring: 1it [00:00, 190.58it/s]
Ranker sorting: 1it [00:00, 394.09it/s]
Ranker scoring: 1it [00:00, 167.22it/s]
Ranker sorting: 1it [00:00, 363.58it/s]
Ranker scoring: 1it [00:00, 192.51it/s]
Ranker sorting: 1it [00:00, 457.24it/s]
Ranker scoring: 1it [00:00, 129.04it/s]
Ranker sorting: 1it [00:00, 143.92it/s]
Ranker scoring: 1it [00:00, 156.84it/s]
Ranker sorting: 1it [00:00, 144.67it/s]
Ranker scoring: 1it [00:00, 152.31it/s]
Ranker sorting: 1it [00:00, 90.10it/s]
Ranker scoring: 1it [00:00, 171.40it/s]
Ranker sorting: 1it [00:00, 270.77it/s]
Ranker scoring: 1it [00:00, 146.68it/s]
R

map              0.277529
ndcg_cut_5       0.489387
ndcg_cut_10      0.453051
ndcg_cut_15      0.432176
ndcg_cut_20      0.425705
ndcg_cut_30      0.423970
ndcg_cut_100     0.493365
ndcg_cut_200     0.539920
ndcg_cut_500     0.571134
ndcg_cut_1000    0.575336
dtype: float64