# Final Project

Libraries used:


In [13]:
! pip install numpy deep-translator python-dotenv python-terrier==0.10.0

Collecting ray
  Downloading ray-2.10.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (13 kB)
Collecting click>=7.0 (from ray)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting filelock (from ray)
  Downloading filelock-3.13.3-py3-none-any.whl.metadata (2.8 kB)
Collecting jsonschema (from ray)
  Using cached jsonschema-4.21.1-py3-none-any.whl.metadata (7.8 kB)
Collecting msgpack<2.0.0,>=1.0.0 (from ray)
  Downloading msgpack-1.0.8-cp310-cp310-macosx_11_0_arm64.whl.metadata (9.1 kB)
Collecting protobuf!=3.19.5,>=3.15.3 (from ray)
  Downloading protobuf-5.26.0-cp37-abi3-macosx_10_9_universal2.whl.metadata (592 bytes)
Collecting aiosignal (from ray)
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting frozenlist (from ray)
  Downloading frozenlist-1.4.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting attrs>=22.2.0 (from jsonschema->ray)
  Using cached attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting jsonschema-specificatio

In [2]:
from dotenv import dotenv_values
%load_ext dotenv
%dotenv

env = dotenv_values(".env")  # replace ".env.example" with .env file path

if (env["LANGUAGE_DETECT_API_KEY"] == "YOUR API KEY"):
  raise Exception("Please replace 'YOUR API KEY' with your actual API key in the .env file")

detection_api_key = env["LANGUAGE_DETECT_API_KEY"]

In [16]:
import os
import re
import threading

import deep_translator as dt
import numpy as np
import pandas as pd
import pyterrier as pt
from deep_translator import GoogleTranslator, single_detection

In [4]:
print("numpy version:", np.__version__)
print("deep-translator version:", dt.__version__)
print("pyterrier version:", pt.__version__)

numpy version: 1.26.4
deep-translator version: 1.9.1
pyterrier version: 0.10.0


In [17]:
if not pt.started():
    pt.init()

## 0. Query formulation

Let the end-user determine what they would like to find


In [6]:
q0 = "How do I repair my bike?"

## 1. Query language identification

Identify the language of the query with the help of the ... library


In [7]:
l0 = single_detection(q0, api_key=detection_api_key)
print(l0)

en


## 2. Query translation

Translate the query into Dutch, English, French, German, Italian, Portuguese, Russian, Spanish, and Chineese. Exlude the original language of the query from the translation set.


In [8]:
qs = dict({l0: q0})

languages = ["en", "fr", "it", "es"]

for lang in languages:
    if lang != l0:
        qs[lang] = translated = GoogleTranslator(source=l0, target=lang).translate(
            text=q0
        )

print(qs)

{'en': 'How do I repair my bike?', 'fr': 'Comment réparer mon vélo ?', 'it': 'Come riparo la mia bicicletta?', 'es': '¿Cómo reparo mi bicicleta?'}


## 3. Search for documents in the target language

Search for documents in the target language using the translated queries


References:
1. MLWIKIR: APython toolkit for building large-scale Wikipedia-based Information Retrieval Datasets in Chinese, English, French, Italian, Japanese, Spanish and more. [Research paper](https://www.irit.fr/CIRCLE/wp-content/uploads/2020/06/CIRCLE20_22.pdf)
1. [pyterrier jupyter notebook example of spanish document retreival](https://github.com/terrier-org/pyterrier/blob/master/examples/notebooks/non_en_retrieval.ipynb)
1. [WikIR rawa datasets](https://ir-datasets.com/wikir.html)

### 3.1 Indexing of the documents
If the index of the documents is not available, then the documents will be indexed using the pyterrier library.
Otherwise, the index will be loaded from the disk.

In [10]:
def create_index(dataset: str, index_name: str, fields=["text"]): 
    """
    Creates an index for a given dataset using the specified index name and fields.

    Parameters:
    - dataset: The dataset object containing the corpus to be indexed.
    - index_name: The name of the index to be created.
    - fields: A list of fields to be indexed. Default is ["text"].

    Returns:
    - index_ref: The reference to the created index.

    """
    indexer = pt.IterDictIndexer("./indices/" + index_name, verbose=False)
    index_ref = indexer.index(dataset.get_corpus_iter(), fields=fields)
    return index_ref


def find_index(index_name: str):
    """
    Finds the reference to an existing index with the specified name.

    Parameters:
    - index_name: The name of the index to be found.

    Returns:
    - index_ref: The reference to the found index.

    """
    return pt.IndexRef.of("./indices/" + index_name)

In [25]:
datasetNames: dict[str, str] = dict({"fr":"wikir/fr14k", "es": "wikir/es13k", "en":"wikir/en1k", "it":"wikir/it16k"})
datasets = dict()
indeces = dict()

def index(dataset: str, index_name: str, fields=["text"]):
    index_ref = create_index(dataset, index_name, fields)
    indeces[lang] = index_ref

index_threads = []

for [lang, datasetName] in datasetNames.items():
    datasetFolder = datasetName.replace("/", "_")
    dataset = pt.get_dataset("irds:"+datasetName)
    datasets[lang] = dataset

    if os.path.exists("./indices/" + datasetFolder + "/data.properties"):
        print("Index", datasetFolder, "already exists")
        index_ref = find_index(datasetFolder)
        indeces[lang] = index_ref
    else:
        print("Creating index", datasetFolder, " (takes around 1-3 minutes per dataset)")
        thread = threading.Thread(target=index, args=(dataset, datasetFolder))
        thread.start()
        index_threads.append(thread)

for thread in index_threads:
    thread.join()

print(indeces)

Index wikir_fr14k already exists
Index wikir_es13k already exists
Index wikir_en1k already exists
Index wikir_it16k already exists
{'fr': <org.terrier.querying.IndexRef at 0x14785c720 jclass=org/terrier/querying/IndexRef jself=<LocalRef obj=0x136d79900 at 0x14776d030>>, 'es': <org.terrier.querying.IndexRef at 0x1473b1620 jclass=org/terrier/querying/IndexRef jself=<LocalRef obj=0x136d79910 at 0x1523f3630>>, 'en': <org.terrier.querying.IndexRef at 0x151130270 jclass=org/terrier/querying/IndexRef jself=<LocalRef obj=0x136d79930 at 0x1523f3750>>, 'it': <org.terrier.querying.IndexRef at 0x15249d9e0 jclass=org/terrier/querying/IndexRef jself=<LocalRef obj=0x136d79960 at 0x1523f3670>>}


### 3.2 Retrieval of the documents
The documents will be retrieved using the BM25 retrieval model.

In [47]:
import re

document_data: dict[str, pd.DataFrame] = dict()

for lang in languages:
    index_ref = indeces[lang]
    dataset = datasets[lang]

    pipeline = pt.BatchRetrieve(
        index_ref, wmodel="BM25", metadata=["docno"], num_results=5
    ) >> pt.text.get_text(dataset, "text")

    sanitised_query = re.sub(r"[^a-zA-Z0-9 ]", "", qs[lang])

    pandas_df: pd.DataFrame = pipeline.search(sanitised_query)
    document_data[lang] = pandas_df

print(
    "Results for fr",
    document_data["fr"].keys(),
    "- shape:",
    document_data["fr"].shape,
    "top 5:"
)

document_data["fr"].head()

Results for fr Index(['qid', 'docid', 'docno', 'rank', 'score', 'query', 'text'], dtype='object') - shape: (5, 7) top 5:


Unnamed: 0,qid,docid,docno,rank,score,query,text
0,1,401968,403301,0,21.629457,Comment rparer mon vlo,elle est issue de la fusion en 1981 de l opéra...
1,1,485591,487240,1,21.287983,Comment rparer mon vlo,elle suit la guérison d un possédé muet et fai...
2,1,195615,196210,2,19.694506,Comment rparer mon vlo,pour saluer bien bas on fait acte de soumissio...
3,1,163481,164009,3,19.58124,Comment rparer mon vlo,l histoire raconte comment les personnages des...
4,1,129221,129661,4,19.424128,Comment rparer mon vlo,le dæmon serait donc en quelque sorte la manif...


In [50]:
top_document = document_data["fr"].head(1)["text"].values[0]

print(len(top_document))

1120


## 4. Document translation

Translate the documents back to English to be processed by other algorithms


In [None]:
documents_translated: Dict[str, list[str]] = dict()

for lang, docs in documents.items():
    documents_translated[lang] = [
        GoogleTranslator(source=lang, target="en").translate(text=d) for d in docs
    ]

TypeError: 'NotImplementedType' object is not iterable

## 5. Find domain-specific keywords

Find the most frequent words in the documents, exlude the 1000 most used words in the English language


## 6. Concat the keywords with the original query

Concatenate the keywords with the original query and search for documents in the original language


## 7. Evaluation

Evaluate the results of the search

Reference:
1. GitHub: [pyterrier/examples/notebooks
/retrieval_and_evaluation.ipynb](https://github.com/terrier-org/pyterrier/blob/master/examples/notebooks/retrieval_and_evaluation.ipynb)
1. GitHub: [pyterrier/examples/notebooks
/experiment.ipynb](https://github.com/terrier-org/pyterrier/blob/master/examples/notebooks/experiment.ipynb)

### 7.1 Precision

See how many of the returned documents are relevant. Did the number of relevant documents increase?


### 7.2 Keyword diversification

Did the number of unique keywords increase compared to naive domain-specific keyword identification?
