# Final Project

Libraries used:


In [2]:
! pip install numpy deep-translator python-dotenv python-terrier==0.10.0



In [3]:
from dotenv import dotenv_values
%load_ext dotenv
%dotenv

env = dotenv_values(".env")  # replace ".env.example" with .env file path

if (env["LANGUAGE_DETECT_API_KEY"] == "YOUR API KEY"):
  raise Exception("Please replace 'YOUR API KEY' with your actual API key in the .env file")

detection_api_key = env["LANGUAGE_DETECT_API_KEY"]

In [4]:
import os
import re
import threading

import deep_translator as dt
import numpy as np
import pandas as pd
import pyterrier as pt
from deep_translator import GoogleTranslator, single_detection

In [5]:
print("numpy version:", np.__version__)
print("deep-translator version:", dt.__version__)
print("pyterrier version:", pt.__version__)

numpy version: 1.26.4
deep-translator version: 1.9.1
pyterrier version: 0.10.0


In [6]:
if not pt.started():
    pt.init()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


## 0. Query formulation

Let the end-user determine what they would like to find


In [7]:
q0 = "How do I repair my bike?"

## 1. Query language identification

Identify the language of the query with the help of the ... library


In [8]:
l0 = single_detection(q0, api_key=detection_api_key)
print(l0)

en


## 2. Query translation

Translate the query into Dutch, English, French, German, Italian, Portuguese, Russian, Spanish, and Chineese. Exlude the original language of the query from the translation set.


In [9]:
qs = dict({l0: q0})

languages = ["en", "fr", "it", "es"]

threads = []

def translate(lang):
    gt = GoogleTranslator(source=l0, target=lang)
    translated = gt.translate(text=q0)
    qs[lang] = translated


for lang in languages:
    if lang != l0:
        t1 = threading.Thread(target=translate, args=(lang,))
        t1.start()
        threads.append(t1)

for t in threads:
    t.join()

qs

{'en': 'How do I repair my bike?',
 'fr': 'Comment réparer mon vélo ?',
 'it': 'Come riparo la mia bicicletta?',
 'es': '¿Cómo reparo mi bicicleta?'}

## 3. Search for documents in the target language

Search for documents in the target language using the translated queries


References:
1. MLWIKIR: APython toolkit for building large-scale Wikipedia-based Information Retrieval Datasets in Chinese, English, French, Italian, Japanese, Spanish and more. [Research paper](https://www.irit.fr/CIRCLE/wp-content/uploads/2020/06/CIRCLE20_22.pdf)
1. [pyterrier jupyter notebook example of spanish document retreival](https://github.com/terrier-org/pyterrier/blob/master/examples/notebooks/non_en_retrieval.ipynb)
1. [WikIR rawa datasets](https://ir-datasets.com/wikir.html)

### 3.1 Indexing of the documents
If the index of the documents is not available, then the documents will be indexed using the pyterrier library.
Otherwise, the index will be loaded from the disk.

In [10]:
def create_index(dataset: str, index_name: str, fields=["text"]): 
    """
    Creates an index for a given dataset using the specified index name and fields.

    Parameters:
    - dataset: The dataset object containing the corpus to be indexed.
    - index_name: The name of the index to be created.
    - fields: A list of fields to be indexed. Default is ["text"].

    Returns:
    - index_ref: The reference to the created index.

    """
    indexer = pt.IterDictIndexer("./indices/" + index_name, verbose=False)
    index_ref = indexer.index(dataset.get_corpus_iter(), fields=fields)
    return index_ref


def find_index(index_name: str):
    """
    Finds the reference to an existing index with the specified name.

    Parameters:
    - index_name: The name of the index to be found.

    Returns:
    - index_ref: The reference to the found index.

    """
    return pt.IndexRef.of("./indices/" + index_name)

In [11]:
datasetNames: dict[str, str] = dict({"fr":"wikir/fr14k", "es": "wikir/es13k", "en":"wikir/en1k", "it":"wikir/it16k"})
datasets = dict()
indeces = dict()

def index(dataset: str, index_name: str, fields=["text"]):
    index_ref = create_index(dataset, index_name, fields)
    indeces[lang] = index_ref

index_threads = []

for [lang, datasetName] in datasetNames.items():
    datasetFolder = datasetName.replace("/", "_")
    dataset = pt.get_dataset("irds:"+datasetName)
    datasets[lang] = dataset

    if os.path.exists("./indices/" + datasetFolder + "/data.properties"):
        print("Index", datasetFolder, "already exists")
        index_ref = find_index(datasetFolder)
        indeces[lang] = index_ref
    else:
        print("Creating index", datasetFolder, " (takes around 1-3 minutes per dataset)")
        thread = threading.Thread(target=index, args=(dataset, datasetFolder))
        thread.start()
        index_threads.append(thread)

for thread in index_threads:
    thread.join()

print(indeces)

Index wikir_fr14k already exists
Index wikir_es13k already exists
Index wikir_en1k already exists
Index wikir_it16k already exists
{'fr': <org.terrier.querying.IndexRef at 0x1719cc090 jclass=org/terrier/querying/IndexRef jself=<LocalRef obj=0x137b6caf8 at 0x17fcee0f0>>, 'es': <org.terrier.querying.IndexRef at 0x17f7a06d0 jclass=org/terrier/querying/IndexRef jself=<LocalRef obj=0x137b6cb28 at 0x17fced330>>, 'en': <org.terrier.querying.IndexRef at 0x171a6b060 jclass=org/terrier/querying/IndexRef jself=<LocalRef obj=0x137b6cb40 at 0x17fceef70>>, 'it': <org.terrier.querying.IndexRef at 0x17f758f40 jclass=org/terrier/querying/IndexRef jself=<LocalRef obj=0x137b6cb48 at 0x17fced710>>}


### 3.2 Retrieval of the documents
The documents will be retrieved using the BM25 retrieval model.

In [12]:
re.sub(r"[^a-zA-Z0-9 ]", "", qs['fr'])

'Comment rparer mon vlo '

In [13]:
import re

document_data: dict[str, pd.DataFrame] = dict()
NUM_RESULTS = 20

for lang in languages:
    index_ref = indeces[lang]
    dataset = datasets[lang]

    pipeline = pt.BatchRetrieve(
        index_ref, wmodel="BM25", metadata=["docno"], num_results=NUM_RESULTS
    ) >> pt.text.get_text(dataset, "text")

    sanitised_query = re.sub(r"[^a-zA-Z0-9 ]", "", qs[lang])

    pandas_df: pd.DataFrame = pipeline.search(sanitised_query)
    document_data[lang] = pandas_df

print(
    "Results for fr",
    document_data["fr"].keys(),
    "- shape:",
    document_data["fr"].shape,
    "top 5:"
)

document_data["fr"].head()

Results for fr Index(['qid', 'docid', 'docno', 'rank', 'score', 'query', 'text'], dtype='object') - shape: (20, 7) top 5:


Unnamed: 0,qid,docid,docno,rank,score,query,text
0,1,401968,403301,0,21.629457,Comment rparer mon vlo,elle est issue de la fusion en 1981 de l opéra...
1,1,485591,487240,1,21.287983,Comment rparer mon vlo,elle suit la guérison d un possédé muet et fai...
2,1,195615,196210,2,19.694506,Comment rparer mon vlo,pour saluer bien bas on fait acte de soumissio...
3,1,163481,164009,3,19.58124,Comment rparer mon vlo,l histoire raconte comment les personnages des...
4,1,129221,129661,4,19.424128,Comment rparer mon vlo,le dæmon serait donc en quelque sorte la manif...


In [14]:
top_document = document_data["fr"].head(1)["text"].values[0]

print("Words in document:", len(top_document.split(" ")))

Words in document: 200


## 4. Document translation

Translate the documents back to English to be processed by other algorithms


In [19]:
documents_translated: list[str] = []

def translate_document(document_text: str, gt: GoogleTranslator):
    translated = gt.translate(text=document_text)
    documents_translated.append(translated)

threads = []

for lang, docs in document_data.items():
    text_docs = docs["text"].tolist()
    gt = GoogleTranslator(source=lang, target="en")

    for text in text_docs:
        t = threading.Thread(target=translate_document, args=(text, gt))
        t.start()
        threads.append(t)

for t in threads:
    t.join()

In [20]:
documents_translated

['in recent years the co operative has grown to be arguably the dominant bicycle retailer in scotland with the largest bicycle stores in each respective city the business offers products from many leadings brands including specialized whyte brompton kalkhoff frog and giant the longest established of all edinburgh bicycle cooperative stores is the bruntsfield bike shop to be found in a victorian building on whitehouse loan skirted by the meadows cyclepath and overlooking the bruntsfield links since its opening in 1979 the bruntsfield shop has grown in size and reputation winning the coveted best bike shop in the uk award in 2017 by cycling weekly along with a comprehensive retail offering of bikes accessories clothing and components the bruntsfield shop also offers bike repairs and servicing the canonmills store in edinburgh is found on rodney street edinburgh the bike shop became an edinburgh bicycle cooperative store in 2013 previously it traded as city cycles and then the bike chain 

## 5. Find domain-specific keywords

Find the most frequent words in the documents, exlude the 1000 most used words in the English language


## 6. Concat the keywords with the original query

Concatenate the keywords with the original query and search for documents in the original language


## 7. Evaluation

Evaluate the results of the search

Reference:
1. GitHub: [pyterrier/examples/notebooks
/retrieval_and_evaluation.ipynb](https://github.com/terrier-org/pyterrier/blob/master/examples/notebooks/retrieval_and_evaluation.ipynb)
1. GitHub: [pyterrier/examples/notebooks
/experiment.ipynb](https://github.com/terrier-org/pyterrier/blob/master/examples/notebooks/experiment.ipynb)

### 7.1 Precision / Recall

See how many of the returned documents are relevant. Did the number of relevant documents increase?


### 7.2 Keyword diversification

Did the number of unique keywords increase compared to naive domain-specific keyword identification?
