# Final Project

Libraries used:


In [21]:
! pip install numpy deep-translator python-dotenv python-terrier==0.10.0



In [22]:
from dotenv import dotenv_values
%load_ext dotenv
%dotenv

env = dotenv_values(".env")  # replace ".env.example" with .env file path

if (env["LANGUAGE_DETECT_API_KEY"] == "YOUR API KEY"):
  raise Exception("Please replace 'YOUR API KEY' with your actual API key in the .env file")

detection_api_key = env["LANGUAGE_DETECT_API_KEY"]

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [23]:
import os
import re
import threading

import deep_translator as dt
import numpy as np
import pandas as pd
import pyterrier as pt
from deep_translator import GoogleTranslator, single_detection

In [24]:
print("numpy version:", np.__version__)
print("deep-translator version:", dt.__version__)
print("pyterrier version:", pt.__version__)

numpy version: 1.26.4
deep-translator version: 1.9.1
pyterrier version: 0.10.0


In [25]:
if not pt.started():
    pt.init()

## 0. Query formulation

Let the end-user determine what they would like to find


In [26]:
q0 = "How do I repair my bike?"

## 1. Query language identification

Identify the language of the query with the help of the ... library


In [27]:
l0 = single_detection(q0, api_key=detection_api_key)
print(l0)

en


## 2. Query translation

Translate the query into Dutch, English, French, German, Italian, Portuguese, Russian, Spanish, and Chineese. Exlude the original language of the query from the translation set.


In [28]:
qs = dict({l0: q0})

languages = ["en", "fr", "it", "es"]

for lang in languages:
    if lang != l0:
        qs[lang] = translated = GoogleTranslator(source=l0, target=lang).translate(
            text=q0
        )

print(qs)

{'en': 'How do I repair my bike?', 'fr': 'Comment réparer mon vélo ?', 'it': 'Come riparo la mia bicicletta?', 'es': '¿Cómo reparo mi bicicleta?'}


## 3. Search for documents in the target language

Search for documents in the target language using the translated queries


References:
1. MLWIKIR: APython toolkit for building large-scale Wikipedia-based Information Retrieval Datasets in Chinese, English, French, Italian, Japanese, Spanish and more. [Research paper](https://www.irit.fr/CIRCLE/wp-content/uploads/2020/06/CIRCLE20_22.pdf)
1. [pyterrier jupyter notebook example of spanish document retreival](https://github.com/terrier-org/pyterrier/blob/master/examples/notebooks/non_en_retrieval.ipynb)
1. [WikIR rawa datasets](https://ir-datasets.com/wikir.html)

### 3.1 Indexing of the documents
If the index of the documents is not available, then the documents will be indexed using the pyterrier library.
Otherwise, the index will be loaded from the disk.

In [29]:
def create_index(dataset: str, index_name: str, fields=["text"]): 
    """
    Creates an index for a given dataset using the specified index name and fields.

    Parameters:
    - dataset: The dataset object containing the corpus to be indexed.
    - index_name: The name of the index to be created.
    - fields: A list of fields to be indexed. Default is ["text"].

    Returns:
    - index_ref: The reference to the created index.

    """
    indexer = pt.IterDictIndexer("./indices/" + index_name, verbose=False)
    index_ref = indexer.index(dataset.get_corpus_iter(), fields=fields)
    return index_ref


def find_index(index_name: str):
    """
    Finds the reference to an existing index with the specified name.

    Parameters:
    - index_name: The name of the index to be found.

    Returns:
    - index_ref: The reference to the found index.

    """
    return pt.IndexRef.of("./indices/" + index_name)

In [30]:
datasetNames: dict[str, str] = dict({"fr":"wikir/fr14k", "es": "wikir/es13k", "en":"wikir/en1k", "it":"wikir/it16k"})
datasets = dict()
indices = dict()

def index(dataset: str, index_name: str, fields=["text"]):
    index_ref = create_index(dataset, index_name, fields)
    indices[lang] = index_ref

index_threads = []

for [lang, datasetName] in datasetNames.items():
    datasetFolder = datasetName.replace("/", "_")
    dataset = pt.get_dataset("irds:"+datasetName)
    datasets[lang] = dataset

    if os.path.exists("./indices/" + datasetFolder + "/data.properties"):
        print("Index", datasetFolder, "already exists")
        index_ref = find_index(datasetFolder)
        indices[lang] = index_ref
    else:
        print("Creating index", datasetFolder, " (takes around 1-3 minutes per dataset)")
        thread = threading.Thread(target=index, args=(dataset, datasetFolder))
        thread.start()
        index_threads.append(thread)

for thread in index_threads:
    thread.join()

print(indices)

Index wikir_fr14k already exists
Index wikir_es13k already exists
Index wikir_en1k already exists
Index wikir_it16k already exists
{'fr': <org.terrier.querying.IndexRef at 0x14e303380 jclass=org/terrier/querying/IndexRef jself=<LocalRef obj=0x7fbf9addfdb0 at 0x14e188f50>>, 'es': <org.terrier.querying.IndexRef at 0x14dbe0a90 jclass=org/terrier/querying/IndexRef jself=<LocalRef obj=0x7fbf9addfdd0 at 0x14e188210>>, 'en': <org.terrier.querying.IndexRef at 0x14dbe0860 jclass=org/terrier/querying/IndexRef jself=<LocalRef obj=0x7fbf9addfdd8 at 0x14e188bb0>>, 'it': <org.terrier.querying.IndexRef at 0x14e2f1760 jclass=org/terrier/querying/IndexRef jself=<LocalRef obj=0x7fbf9addfde0 at 0x14e188e90>>}


### 3.2 Retrieval of the documents
The documents will be retrieved using the BM25 retrieval model.

In [31]:
import re

document_data: dict[str, pd.DataFrame] = dict()

for lang in languages:
    index_ref = indices[lang]
    dataset = datasets[lang]

    pipeline = pt.BatchRetrieve(
        index_ref, wmodel="BM25", metadata=["docno"], num_results=5
    ) >> pt.text.get_text(dataset, "text")

    sanitised_query = re.sub(r"[^a-zA-Z0-9 ]", "", qs[lang])

    pandas_df: pd.DataFrame = pipeline.search(sanitised_query)
    document_data[lang] = pandas_df

print(
    "Results for fr",
    document_data["fr"].keys(),
    "- shape:",
    document_data["fr"].shape,
    "top 5:"
)

document_data["fr"].head()

Results for fr Index(['qid', 'docid', 'docno', 'rank', 'score', 'query', 'text'], dtype='object') - shape: (5, 7) top 5:


Unnamed: 0,qid,docid,docno,rank,score,query,text
0,1,401968,403301,0,21.629457,Comment rparer mon vlo,elle est issue de la fusion en 1981 de l opéra...
1,1,485591,487240,1,21.287983,Comment rparer mon vlo,elle suit la guérison d un possédé muet et fai...
2,1,195615,196210,2,19.694506,Comment rparer mon vlo,pour saluer bien bas on fait acte de soumissio...
3,1,163481,164009,3,19.58124,Comment rparer mon vlo,l histoire raconte comment les personnages des...
4,1,129221,129661,4,19.424128,Comment rparer mon vlo,le dæmon serait donc en quelque sorte la manif...


In [32]:
top_document = document_data["fr"].head(1)["text"].values[0]

print("Words in document:", len(top_document.split(" ")))

Words in document: 200


## 4. Document translation

Translate the documents back to English to be processed by other algorithms


In [33]:
documents_translated: dict[str, list[str]] = dict()

# TODO parallelize these 2 loops
for lang, docs in document_data.items():
    text_docs = docs["text"].tolist()

    documents_translated[lang] = [
        GoogleTranslator(source=lang, target="en").translate(text=d) for d in text_docs
    ]

In [34]:
documents_translated["fr"]

['it resulted from the merger in 1981 of the royal Flemish opera of Antwerp and the opera of Ghent due to the financial difficulties faced by the operas of Ghent and Antwerp the intercommunal society opera for flanders opera voor flaanderen ovv was created in 1981 it involves the cities of ghent and antwerp and the Flemish community the two municipal operas were thus merged in 1986 the ovv is short of financial means and can no longer pay salaries the Flemish community refuses to intervene and the opera is occupied by staff a commission of inquiry from the Flemish council produces a harsh report for the administration of the ovv the Flemish executive implements a restructuring plan involving the dissolution of the ovv which takes place the Flemish executive considers that a quality opera must continue to exist in flanders on July 20, 1988 the non-profit association asbl vlaamse operastichting vlos foundation of flemish opera in french is created the vlos takes over most of the staff of

## 5. Find domain-specific keywords

Find the most frequent words in the documents, exlude the 1000 most used words in the English language


In [37]:
from collections import Counter

# We have N documents per language and L langauges

# Convert each document to a set of words associated with its occurrence
def document_to_wordset(doc):
    # Create list of all words in doc
    allwords_list = doc.split(" ")

    # Length of this list is the total number of terms in doc
    total_terms = len(allwords_list)

    # Create a dictionary with terms as keys and occurrence count
    occurrence_set = Counter(allwords_list)

    return occurrence_set, total_terms

for k in documents_translated.keys():
    for doc in documents_translated[k]:
        print(document_to_wordset(doc))
    


(Counter({'the': 18, 'in': 13, 'and': 8, 'shop': 8, 'bike': 7, 'bicycle': 4, 'edinburgh': 4, 'bruntsfield': 4, 'to': 3, 'offers': 3, 'of': 3, 'on': 3, 'by': 3, 'has': 2, 'grown': 2, 'be': 2, 'with': 2, 'stores': 2, 'city': 2, 'cooperative': 2, 'is': 2, 'found': 2, 'a': 2, 'its': 2, '2017': 2, 'cycling': 2, 'weekly': 2, 'retail': 2, 'offering': 2, 'accessories': 2, 'clothing': 2, 'components': 2, 'also': 2, 'repairs': 2, 'servicing': 2, 'canonmills': 2, 'store': 2, 'street': 2, 'aberdeen': 2, 'recent': 1, 'years': 1, 'co': 1, 'operative': 1, 'arguably': 1, 'dominant': 1, 'retailer': 1, 'scotland': 1, 'largest': 1, 'each': 1, 'respective': 1, 'business': 1, 'products': 1, 'from': 1, 'many': 1, 'leadings': 1, 'brands': 1, 'including': 1, 'specialized': 1, 'whyte': 1, 'brompton': 1, 'kalkhoff': 1, 'frog': 1, 'giant': 1, 'longest': 1, 'established': 1, 'all': 1, 'victorian': 1, 'building': 1, 'whitehouse': 1, 'loan': 1, 'skirted': 1, 'meadows': 1, 'cyclepath': 1, 'overlooking': 1, 'links': 

## 6. Concat the keywords with the original query

Concatenate the keywords with the original query and search for documents in the original language


## 7. Evaluation

Evaluate the results of the search

Reference:
1. GitHub: [pyterrier/examples/notebooks
/retrieval_and_evaluation.ipynb](https://github.com/terrier-org/pyterrier/blob/master/examples/notebooks/retrieval_and_evaluation.ipynb)
1. GitHub: [pyterrier/examples/notebooks
/experiment.ipynb](https://github.com/terrier-org/pyterrier/blob/master/examples/notebooks/experiment.ipynb)

### 7.1 Precision / Recall

See how many of the returned documents are relevant. Did the number of relevant documents increase?


### 7.2 Keyword diversification

Did the number of unique keywords increase compared to naive domain-specific keyword identification?
