# Final Project

Libraries used:


In [1]:
! pip install numpy deep-translator python-dotenv



In [2]:
from dotenv import dotenv_values
%load_ext dotenv
%dotenv

env = dotenv_values(".env")  # replace ".env" with your actual .env file path

if (env["LANGUAGE_DETECT_API_KEY"] == "YOUR API KEY"):
  raise Exception("Please replace 'YOUR API KEY' with your actual API key in the .env file")

detection_api_key = env["LANGUAGE_DETECT_API_KEY"]

In [3]:
import numpy as np
from langdetect import detect
import deep_translator as dt
from deep_translator import GoogleTranslator, single_detection

In [4]:
print("numpy version:", np.__version__)
print("deep-translator version:", dt.__version__)

numpy version: 1.26.4
deep-translator version: 1.9.1


## 0. Query formulation

Let the end-user determine what they would like to find


In [5]:
q0 = "How do I repair my bike?"

## 1. Query language identification

Identify the language of the query with the help of the ... library


In [6]:
l0 = single_detection(q0, api_key=detection_api_key)
print(l0)

en


## 2. Query translation

Translate the query into Dutch, English, French, German, Italian, Portuguese, Russian, Spanish, and Chineese. Exlude the original language of the query from the translation set.


In [7]:
qs = dict({l0: q0})

languages = ["nl", "en", "fr", "de", "it", "pt", "ru", "es", "zh-CN"]

for lang in languages:
    if lang != l0:
        qs[lang] = translated = GoogleTranslator(source=l0, target=lang).translate(
            text=q0
        )

print(qs)

{'en': 'How do I repair my bike?', 'nl': 'Hoe repareer ik mijn fiets?', 'fr': 'Comment réparer mon vélo ?', 'de': 'Wie repariere ich mein Fahrrad?', 'it': 'Come riparo la mia bicicletta?', 'pt': 'Como faço para consertar minha bicicleta?', 'ru': 'Как мне отремонтировать свой велосипед?', 'es': '¿Cómo reparo mi bicicleta?', 'zh-CN': '如何修理我的自行车？'}


## 3. Search for documents in the target language

Search for documents in the target language using the translated queries


References:
1. MLWIKIR: APython toolkit for building large-scale Wikipedia-based Information Retrieval Datasets in Chinese, English, French, Italian, Japanese, Spanish and more. [Research paper](https://www.irit.fr/CIRCLE/wp-content/uploads/2020/06/CIRCLE20_22.pdf)
1. [pyterrier jupyter notebook example of spanish document retreival](https://github.com/terrier-org/pyterrier/blob/master/examples/notebooks/non_en_retrieval.ipynb)
1. [WikIR rawa datasets](https://ir-datasets.com/wikir.html)

In [8]:
def search(query, target_language="es") -> list[str]:
    return NotImplemented

In [9]:
from typing import Dict

documents: Dict[str, list[str]] = dict()

for lang, q in qs.items():
    documents[lang] = search(q, lang)

## 4. Document translation

Translate the documents back to English to be processed by other algorithms


In [10]:
documents_translated: Dict[str, list[str]] = dict()

for lang, docs in documents.items():
    documents_translated[lang] = [
        GoogleTranslator(source=lang, target="en").translate(text=d) for d in docs
    ]

TypeError: 'NotImplementedType' object is not iterable

## 5. Find domain-specific keywords

Find the most frequent words in the documents, exlude the 1000 most used words in the English language


## 6. Concat the keywords with the original query

Concatenate the keywords with the original query and search for documents in the original language


## 7. Evaluation

Evaluate the results of the search

Reference:
1. GitHub: [pyterrier/examples/notebooks
/retrieval_and_evaluation.ipynb](https://github.com/terrier-org/pyterrier/blob/master/examples/notebooks/retrieval_and_evaluation.ipynb)
1. GitHub: [pyterrier/examples/notebooks
/experiment.ipynb](https://github.com/terrier-org/pyterrier/blob/master/examples/notebooks/experiment.ipynb)

### 7.1 Precision

See how many of the returned documents are relevant. Did the number of relevant documents increase?


### 7.2 Keyword diversification

Did the number of unique keywords increase compared to naive domain-specific keyword identification?
