# Imports

In [1]:
%pip install torch torchaudio torchvision
!pip install sentencepiece
%pip install python-terrier==0.10.0
%pip install scipy nltk



In [2]:
import os
import re
import time
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import pyterrier as pt
from pathlib import Path

import torch
import torchaudio
import torchvision

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

from transformers import pipeline, AutoModel, AutoTokenizer, MarianMTModel, MarianTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import sentencepiece

print(pt.__version__)
pt.init()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


0.10.0


PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8



In [3]:
from google.colab import drive
np.random.seed=42
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
dataset_path = '/content/drive/MyDrive/IR_2023/data'

In [5]:
test_datasets = {}
test_datasets['en'] = pd.read_csv(f'{dataset_path}/test_datasets_en.csv')
test_datasets['de'] = pd.read_csv(f'{dataset_path}/test_datasets_de.csv')
test_datasets['fr'] = pd.read_csv(f'{dataset_path}/test_datasets_fr.csv', encoding='utf-8')

# Data Preprocessing

In [6]:
test_dataset_en = test_datasets['en']
test_dataset_de = test_datasets['de']
test_dataset_fr = test_datasets['fr']

test_dataset_en['wikidata_id'] = test_dataset_en['wikidata_id'].astype(str)
test_dataset_de['wikidata_id'] = test_dataset_de['wikidata_id'].astype(str)
test_dataset_fr['wikidata_id'] = test_dataset_fr['wikidata_id'].astype(str)

common_de_ids = set(test_dataset_en['wikidata_id']).intersection(set(test_dataset_de['wikidata_id']))

common_de_en = test_dataset_en[test_dataset_en['wikidata_id'].isin(common_de_ids)]
common_de_de = test_dataset_de[test_dataset_de['wikidata_id'].isin(common_de_ids)]
common_de = pd.concat([common_de_en, common_de_de], ignore_index=True)
common_de = common_de.reset_index()
test_common_de = common_de


common_fr_ids = set(test_dataset_en['wikidata_id']).intersection(set(test_dataset_fr['wikidata_id']))
common_fr_en = test_dataset_en[test_dataset_en['wikidata_id'].isin(common_fr_ids)]
common_fr_fr = test_dataset_fr[test_dataset_fr['wikidata_id'].isin(common_fr_ids)]
common_fr = pd.concat([common_fr_en, common_fr_fr], ignore_index=True)
common_fr = common_fr.reset_index()
test_common_fr = common_fr

common_de_fr_ids = common_de_ids.intersection(common_fr_ids)
common_de_fr = pd.concat([
    test_dataset_de[test_dataset_de['wikidata_id'].isin(common_de_fr_ids)],
    test_dataset_fr[test_dataset_fr['wikidata_id'].isin(common_de_fr_ids)],
    test_dataset_en[test_dataset_en['wikidata_id'].isin(common_de_fr_ids)]
], ignore_index=True)
common_de_fr = common_de_fr.reset_index()
test_common_de_fr = common_de_fr

print('number of common articles across en and de:', len(test_common_de))
print('number of common articles across en and fr:', len(test_common_fr))
print('number of common articles across en, de and fr:', len(test_common_de_fr))

number of common articles across en and de: 3374
number of common articles across en and fr: 3140
number of common articles across en, de and fr: 123


In [7]:
def preprocess_text(string):
  if not string:
    return "none"

  if isinstance(string, bytes):
    string = string.decode('utf-8', errors='replace')

  string = re.sub(r'[^a-zA-Z0-9]', ' ', string)
  string = re.sub(r'\s+', ' ', string).strip()

  return "none" if string == "" else string

In [8]:
test_common_de['text'] = test_common_de['text'].apply(preprocess_text)
test_common_de = test_common_de.dropna(subset=['text'])

test_common_de['title'] = test_common_de['title'].apply(preprocess_text)
test_common_de = test_common_de.dropna(subset=['title'])


test_common_fr['text'] = test_common_fr['text'].apply(preprocess_text)
test_common_fr = test_common_fr.dropna(subset=['text'])

test_common_fr['title'] = test_common_fr['title'].apply(preprocess_text)
test_common_fr = test_common_fr.dropna(subset=['title'])

In [9]:
test_common_fr

Unnamed: 0.1,index,Unnamed: 0,title,text,language,wikidata_id
0,0,113,Klosterneuburg Geography,It is located on the Danube immediately north ...,en,Q487522
1,1,260,Thomas Phillipps The Collection,In 1798 when Phillipps was 6 years old he alre...,en,Q2147709
2,2,570,Malagasy bulbul Taxonomy and systematics,The Malagasy bulbul was originally described i...,en,Q839001
3,3,628,Solar eclipse of November 13 2012 Visibility,For this eclipse totality was visible from nor...,en,Q51414
4,4,693,Plunderer manga Anime,An anime adaptation was announced by Monthly S...,en,Q50138221
...,...,...,...,...,...,...
3135,3135,67931,Albator 84 LAtlantis de ma jeunesse Synopsis,Un pilote Albator et un ingnieur Alfred se ren...,fr,Q1756233
3136,3136,67948,Passerelle aroportuaire,Une passerelle aroportuaire ou passerelle demb...,fr,Q1061299
3137,3137,67951,Georgina Hogarth Ascendance,Georgina Thompson Hogarth est la fille cadette...,fr,Q3103792
3138,3138,67976,NCSM Magnificent CVL 21 Historique,Troisime unit de la classe Majestic le HMS Mag...,fr,Q2591384


In [10]:
test_common_fr[test_common_fr['language'] == 'fr']

Unnamed: 0.1,index,Unnamed: 0,title,text,language,wikidata_id
1570,1570,5,Tom Araya Enfance,Araya est n Via del Mar au Chili seulement cin...,fr,Q312292
1571,1571,42,Temps terrestre,Le temps terrestre TT est une chelle de temps ...,fr,Q2460513
1572,1572,95,Luis de Pablo Biographie,Luis de Pablo tudie trs jeune la musique mais ...,fr,Q2617839
1573,1573,108,Michal Goleniewski,Michal Goleniewski n le 16 aot 1922 Nieswiez a...,fr,Q2335319
1574,1574,112,Buteni Culture,Daprs le recensement de 2011 la commune compte...,fr,Q12085921
...,...,...,...,...,...,...
3135,3135,67931,Albator 84 LAtlantis de ma jeunesse Synopsis,Un pilote Albator et un ingnieur Alfred se ren...,fr,Q1756233
3136,3136,67948,Passerelle aroportuaire,Une passerelle aroportuaire ou passerelle demb...,fr,Q1061299
3137,3137,67951,Georgina Hogarth Ascendance,Georgina Thompson Hogarth est la fille cadette...,fr,Q3103792
3138,3138,67976,NCSM Magnificent CVL 21 Historique,Troisime unit de la classe Majestic le HMS Mag...,fr,Q2591384


In [11]:
test_common_de['wikidata_id'] = test_common_de['wikidata_id'].astype(str)
test_common_fr['wikidata_id'] = test_common_fr['wikidata_id'].astype(str)
test_common_de_fr['wikidata_id'] = test_common_de_fr['wikidata_id'].astype(str)

test_common_de['index'] = test_common_de['index'].astype(str)
test_common_fr['index'] = test_common_fr['index'].astype(str)
test_common_de_fr['index'] = test_common_de_fr['index'].astype(str)

## Process & translate words

In [12]:
en_fr_model_name = 'Helsinki-NLP/opus-mt-en-fr'
en_fr_tokenizer = MarianTokenizer.from_pretrained(en_fr_model_name)
en_fr_model = MarianMTModel.from_pretrained(en_fr_model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
en_de_model_name = 'Helsinki-NLP/opus-mt-en-de'
en_de_tokenizer = MarianTokenizer.from_pretrained(en_de_model_name)
en_de_model = MarianMTModel.from_pretrained(en_de_model_name)

In [14]:
def extract_unique_words(string_list):
  unique_words = set()
  for string in string_list:
    words = string.split()
    unique_words.update(words)
  return unique_words

In [15]:
unique_de_title_words = extract_unique_words(test_common_de[test_common_de['language'] == 'de']['title'].tolist())
unique_fr_title_words = extract_unique_words(test_common_fr[test_common_fr['language'] == 'fr']['title'].tolist())
print(len(unique_de_title_words))
print(len(unique_fr_title_words))

3467
3564


In [16]:
from nltk.corpus import wordnet as wn

def get_synonyms_fr(word):
  synonyms = set()
  for syn in wn.synsets(word, lang='fra'):
    for lemma in syn.lemmas():
      synonym = lemma.name().replace('_', ' ')
      synonyms.add(synonym)
  return synonyms

def semantic_word_match_fr(word, bow):
  if word in bow:
    return True
  synonyms = get_synonyms_fr(word)
  return any(synonym in bow for synonym in synonyms)

In [17]:
def translate_fr(text, tokenizer, model, bow=None):
  translated = model.generate(**tokenizer(text, return_tensors="pt", padding=True))
  translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
  matched_words = [word for word in translated_text.split() if semantic_word_match_fr(word, bow)]
  if len(matched_words) > 0:
    return ' '.join(matched_words), (len(matched_words) + 1)
  else:
    return translated_text, 1

In [18]:
def translate_de(text, tokenizer, model, bow=None):
  translated = model.generate(**tokenizer(text, return_tensors="pt", padding=True))
  translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
  return translated_text

In [19]:
fr_test_titles = test_common_fr[test_common_fr['language'] == 'en']['title'].tolist()
fr_test_qids = test_common_fr[test_common_fr['language'] == 'en']['wikidata_id'].tolist()
fr_data = {'qid': fr_test_qids,'query': fr_test_titles}
fr_test_cases_en = pd.DataFrame(fr_data)

test_cases_fr = pd.DataFrame()

translated_results = fr_test_cases_en['query'].apply(lambda x: translate_fr(x, en_fr_tokenizer, en_fr_model, unique_fr_title_words))
test_cases_fr['query'] = translated_results.apply(lambda x: x[0])
test_cases_fr['matched_word_count'] = translated_results.apply(lambda x: x[1])

test_cases_fr['qid'] = range(1, len(test_cases_fr) + 1)
test_cases_fr['qid'] = test_cases_fr['qid'].astype(str)

test_cases_fr['query'] = test_cases_fr['query'].apply(preprocess_text)
test_cases_fr = test_cases_fr.dropna()
test_cases_fr

Unnamed: 0,query,matched_word_count,qid
0,de Klosterneuburg,3,1
1,Thomas La,3,2
2,et des,3,3
3,solaire du 13 novembre 2012,6,4
4,Pleunder manga Anime,1,5
...,...,...,...
1565,Origines dans les,4,1566
1566,Esnault,2,1567
1567,des sur la,4,1568
1568,de Tenan,3,1569


In [20]:
de_test_titles = test_common_de[test_common_de['language'] == 'en']['title'].tolist()
de_test_qids = test_common_de[test_common_de['language'] == 'en']['wikidata_id'].tolist()
de_data = {'qid': de_test_qids,'query': de_test_titles}
de_test_cases_en = pd.DataFrame(de_data)

test_cases_de = pd.DataFrame()
test_cases_de['query'] = de_test_cases_en['query'].apply(lambda x: translate_de(x, en_de_tokenizer, en_de_model, unique_de_title_words))
test_cases_de['query'] = test_cases_de['query'].apply(preprocess_text)
test_cases_de['qid'] = range(1, len(test_cases_de) + 1)
test_cases_de['qid'] = test_cases_de['qid'].astype(str)

test_cases_de['query'] = test_cases_de['query'].apply(preprocess_text)
test_cases_de = test_cases_de.dropna()
test_cases_de

Unnamed: 0,query,qid
0,Klosterneuburg Geographie,1
1,198687 Polska Liga Hokejowa Saison,2
2,Ludk Peek Biographie,3
3,Philosophi Naturalis Principia Mathematica Buc...,4
4,Die GClefs,5
...,...,...
1682,Ingham Queensland Geographie,1683
1683,Ari Lehman Biographie,1684
1684,Falerno del Massico,1685
1685,199394 NHL Saison League Gesch ft,1686


In [21]:
fr_subset = test_common_fr[test_common_fr['language'] == 'fr'].copy()
fr_subset['qid'] = range(1, len(fr_subset) + 1)
fr_subset['label'] = 1

fr_qrels = pd.DataFrame()

fr_qrels['label'] = test_cases_fr['matched_word_count'].values
fr_qrels['qid'] = fr_subset['qid'].values
fr_qrels['docno'] = fr_subset['wikidata_id'].values

fr_qrels = fr_qrels.rename(columns={'wikidata_id': 'docno'})

fr_qrels = fr_qrels[['qid', 'docno', 'label']].dropna()

fr_qrels['qid'] = fr_qrels['qid'].astype(str)
fr_qrels['docno'] = fr_qrels['docno'].astype(str)

fr_qrels

Unnamed: 0,qid,docno,label
0,1,Q312292,3
1,2,Q2460513,3
2,3,Q2617839,3
3,4,Q2335319,6
4,5,Q12085921,1
...,...,...,...
1565,1566,Q1756233,4
1566,1567,Q1061299,2
1567,1568,Q3103792,4
1568,1569,Q2591384,3


In [22]:
de_subset = test_common_de[test_common_de['language'] == 'de'].copy()
de_subset['qid'] = range(1, len(de_subset) + 1)
de_subset['label'] = 1

de_qrels = pd.DataFrame()

de_qrels['label'] = de_subset['label'].copy()
de_qrels['qid'] = de_subset['qid'].values
de_qrels['docno'] = de_subset['wikidata_id'].values

de_qrels = de_qrels.rename(columns={'wikidata_id': 'docno'})

de_qrels = de_qrels[['qid', 'docno', 'label']].dropna()

de_qrels['qid'] = de_qrels['qid'].astype(str)
de_qrels['docno'] = de_qrels['docno'].astype(str)

de_qrels

Unnamed: 0,qid,docno,label
1687,1,Q69476,1
1688,2,Q1165694,1
1689,3,Q827188,1
1690,4,Q1005538,1
1691,5,Q1151191,1
...,...,...,...
3369,1683,Q108288,1
3370,1684,Q275441,1
3371,1685,Q1398218,1
3372,1686,Q1275566,1


## Create indexes

In [23]:
def index_dataset(dataset, dataset_name):
  indexer = pt.IterDictIndexer(f'/content/index/{dataset_name}', overwrite=True)
  document_dicts = [{
    'title': row['title'],
    'text': row['text'],
    'docno': row['wikidata_id'],
    'wikidata_id': row['wikidata_id'],
    'index': row['index']
  } for index, row in dataset.iterrows()]

  index_ref = indexer.index(document_dicts, fields=['title', 'text'], meta=['docno',  'wikidata_id', 'index'])
  return index_ref

In [24]:
test_common_fr

Unnamed: 0.1,index,Unnamed: 0,title,text,language,wikidata_id
0,0,113,Klosterneuburg Geography,It is located on the Danube immediately north ...,en,Q487522
1,1,260,Thomas Phillipps The Collection,In 1798 when Phillipps was 6 years old he alre...,en,Q2147709
2,2,570,Malagasy bulbul Taxonomy and systematics,The Malagasy bulbul was originally described i...,en,Q839001
3,3,628,Solar eclipse of November 13 2012 Visibility,For this eclipse totality was visible from nor...,en,Q51414
4,4,693,Plunderer manga Anime,An anime adaptation was announced by Monthly S...,en,Q50138221
...,...,...,...,...,...,...
3135,3135,67931,Albator 84 LAtlantis de ma jeunesse Synopsis,Un pilote Albator et un ingnieur Alfred se ren...,fr,Q1756233
3136,3136,67948,Passerelle aroportuaire,Une passerelle aroportuaire ou passerelle demb...,fr,Q1061299
3137,3137,67951,Georgina Hogarth Ascendance,Georgina Thompson Hogarth est la fille cadette...,fr,Q3103792
3138,3138,67976,NCSM Magnificent CVL 21 Historique,Troisime unit de la classe Majestic le HMS Mag...,fr,Q2591384


In [25]:
test_index_refs = {}
for dataset_name, dataset in [("test_common_de", test_common_de),
                              ("test_common_fr", test_common_fr),
                              ("test_common_de_fr", test_common_de_fr)]:
  print(f"{dataset_name} started at: {time.time()}")
  test_index_refs[dataset_name] = index_dataset(dataset, dataset_name)
  print(f"{dataset_name} finished at: {time.time()}")

test_common_de started at: 1712533733.1835113


  index_ref = indexer.index(document_dicts, fields=['title', 'text'], meta=['docno',  'wikidata_id', 'index'])


test_common_de finished at: 1712533740.419012
test_common_fr started at: 1712533740.4197233
test_common_fr finished at: 1712533745.091908
test_common_de_fr started at: 1712533745.092932
test_common_de_fr finished at: 1712533745.7309587


In [26]:
bm25_fr = pt.BatchRetrieve(test_index_refs['test_common_fr'], wmodel="BM25")
bm25_de = pt.BatchRetrieve(test_index_refs['test_common_de'], wmodel="BM25")

# Run Experiments

In [27]:
from pyterrier.measures import RR, nDCG, MAP

res = pt.Experiment(
  [bm25_fr],
  test_cases_fr,
  fr_qrels,
  names=["bm25_fr"],
  eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100]
)
res

Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,bm25_fr,0.001377,0.002216,0.00251


In [28]:
print(len(fr_qrels))
print(len(test_cases_fr))

1570
1570


In [29]:
res = pt.Experiment(
  [bm25_de],
  test_cases_de,
  de_qrels,
  names=["bm25_de"],
  eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100]
)
res

Unnamed: 0,name,RR@10,nDCG@10,AP@100
0,bm25_de,0.001267,0.002336,0.002424


In [30]:
import ir_datasets
dataset = ir_datasets.load("wikir/en1k/test")
for query in dataset.queries_iter():
  query

In [31]:
dataset = ir_datasets.load("wikir/en1k/test")

index_dir = '/content/index/wiki_test_en_small'
indexer = pt.IterDictIndexer(index_dir, overwrite=True)

document_dicts = [{
  'docno': doc.doc_id,
  'text': doc.text
} for doc in dataset.docs_iter()]


index_ref = indexer.index(document_dicts, fields=["text"])

In [32]:
bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25")

queries = pd.DataFrame({
  'qid': [query.query_id for query in dataset.queries_iter()],
  'query': [query.text for query in dataset.queries_iter()]
})

qrels = pd.DataFrame({
  'qid': [qrel.query_id for qrel in dataset.qrels_iter()],
  'docno': [qrel.doc_id for qrel in dataset.qrels_iter()],
  'label': [qrel.relevance for qrel in dataset.qrels_iter()]
})

test_cases_fr['qid'] = test_cases_fr['qid'].astype(str)
fr_qrels['qid'] = fr_qrels['qid'].astype(str)
fr_qrels['docno'] = fr_qrels['docno'].astype(str)

results = pt.Experiment(
  [~bm25],
  queries,
  qrels,
  eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100]
)

print(results)

              name     RR@10   nDCG@10    AP@100
0  Cache(BR(BM25))  0.662806  0.360993  0.175218


In [33]:
print(len(qrels))
print(len(queries))

4435
100
