# Final Project

Libraries used:


In [None]:
! pip install numpy deep-translator python-dotenv python-terrier==0.10.0 unidecode sent2vec scipy sentence-transformers scikit-learn

## 0. Initialising libraries

In [None]:
from dotenv import dotenv_values
%load_ext dotenv
%dotenv

env = dotenv_values(".env")  # replace ".env.example" with .env file path

if (env["LANGUAGE_DETECT_API_KEY"] == "YOUR API KEY"):
  raise Exception("Please replace 'YOUR API KEY' with your actual API key in the .env file")

detection_api_key = env["LANGUAGE_DETECT_API_KEY"]

In [None]:
import os
import re
import threading

import deep_translator as dt
import numpy as np
import pandas as pd
import pyterrier as pt
from sent2vec.vectorizer import Vectorizer
from scipy import spatial
from deep_translator import GoogleTranslator, single_detection

In [None]:
# Report the versions of the main libraries that are in use for this run.
for version_label, library in (
    ("numpy version:", np),
    ("deep-translator version:", dt),
    ("pyterrier version:", pt),
):
    print(version_label, library.__version__)

In [None]:
# Initialise PyTerrier only when it has not been started yet, so that
# re-running this cell does not call pt.init() a second time.
if not pt.started():
    pt.init()

## 1. Query formulation

Let the end-user determine what they would like to find


In [None]:
# The original user query; its language is detected and it is translated in
# the following cells.
q0 = "How do I repair my bike?"

## 2. Query language identification

Identify the language of the query with the help of the `single_detection` function of the deep-translator library (an API key is required)


In [None]:
# Detect the language of the original query. The result is used further down
# as the translation source language and as the key of the original query.
l0 = single_detection(q0, api_key=detection_api_key)
print(l0)

## 3. Query translation

Translate the query into Dutch, English, French, German, Italian, Portuguese, Russian, Spanish, and Chinese. Exclude the original language of the query from the translation set. The current implementation covers English, French, Italian, and Spanish.


In [None]:
from concurrent.futures import ThreadPoolExecutor

# Queries keyed by language code, seeded with the original query.
qs = {l0: q0}

# Languages in which documents are searched.
languages = ["en", "fr", "it", "es"]


def translate(lang):
    """
    Translates the original query into the given language and stores it in `qs`.

    Parameters:
    - lang: The language code of the translation target.

    """
    gt = GoogleTranslator(source=l0, target=lang)
    translated = gt.translate(text=q0)
    qs[lang] = translated


# The query already exists in its original language, so that one is skipped.
target_languages = [lang for lang in languages if lang != l0]

# The guard avoids creating an executor with zero workers, which is an error.
if target_languages:
    # The translations run concurrently. Unlike bare threads, whose exceptions
    # are lost and would leave `qs` incomplete, the executor re-raises a failed
    # translation here.
    with ThreadPoolExecutor(max_workers=len(target_languages)) as executor:
        # list() consumes the results, which is what surfaces worker exceptions.
        list(executor.map(translate, target_languages))

qs

## 4. Search for documents in the target language

Search for documents in the target language using the translated queries


References:
1. MLWIKIR: A Python toolkit for building large-scale Wikipedia-based Information Retrieval Datasets in Chinese, English, French, Italian, Japanese, Spanish and more. [Research paper](https://www.irit.fr/CIRCLE/wp-content/uploads/2020/06/CIRCLE20_22.pdf)
1. [pyterrier jupyter notebook example of Spanish document retrieval](https://github.com/terrier-org/pyterrier/blob/master/examples/notebooks/non_en_retrieval.ipynb)
1. [WikIR raw datasets](https://ir-datasets.com/wikir.html)

### 4.1 Indexing of the documents
If the index of the documents is not available, then the documents will be indexed using the pyterrier library.
Otherwise, the index will be loaded from the disk.

In [None]:
def create_index(dataset, index_name: str, fields=None):
    """
    Creates an index for a given dataset using the specified index name and fields.

    Parameters:
    - dataset: The dataset object containing the corpus to be indexed. It must
      provide a get_corpus_iter() method.
    - index_name: The name of the index to be created; the index is written to
      "./indices/" + index_name.
    - fields: A list of fields to be indexed. Default is ["text"].

    Returns:
    - index_ref: The reference to the created index.

    """
    # None instead of a list literal as the default, so that no mutable default
    # object is shared between calls.
    if fields is None:
        fields = ["text"]

    indexer = pt.IterDictIndexer("./indices/" + index_name, verbose=False)
    index_ref = indexer.index(dataset.get_corpus_iter(), fields=fields)
    return index_ref


def find_index(index_name: str):
    """
    Returns the reference for the index stored under the given name.

    Parameters:
    - index_name: The name of the index; it is looked up at
      "./indices/" + index_name.

    Returns:
    - index_ref: The reference obtained from pt.IndexRef.of for that location.

    """
    index_location = "./indices/" + index_name
    index_ref = pt.IndexRef.of(index_location)
    return index_ref

In [None]:
# Language code -> name of the dataset that is searched for that language.
datasetNames: dict[str, str] = {
    "fr": "wikir/fr14k",
    "es": "wikir/es13k",
    "en": "wikir/en1k",
    "it": "wikir/it16k",
}
datasets = dict()
indeces = dict()


def index(dataset, index_name: str, fields=None, target_lang=None):
    """
    Creates the index of one dataset and stores its reference in `indeces`.

    Parameters:
    - dataset: The dataset object containing the corpus to be indexed.
    - index_name: The name of the index to be created.
    - fields: A list of fields to be indexed. Default is ["text"].
    - target_lang: The key under which the reference is stored in `indeces`.
      Callers that run this function in a thread must pass it explicitly. When
      omitted, the current value of the module-level `lang` variable is used,
      which is only correct if that variable has not changed in the meantime.

    """
    if fields is None:
        fields = ["text"]
    if target_lang is None:
        target_lang = lang

    indeces[target_lang] = create_index(dataset, index_name, fields)


index_threads = []

for lang, datasetName in datasetNames.items():
    datasetFolder = datasetName.replace("/", "_")
    dataset = pt.get_dataset("irds:" + datasetName)
    datasets[lang] = dataset

    if os.path.exists("./indices/" + datasetFolder + "/data.properties"):
        print("Index", datasetFolder, "already exists")
        indeces[lang] = find_index(datasetFolder)
    else:
        print("Creating index", datasetFolder, " (takes around 1-3 minutes per dataset)")
        # The language is handed to the thread explicitly. Reading the loop
        # variable from inside the thread would pick up whatever language the
        # loop has advanced to by the time indexing finishes.
        thread = threading.Thread(
            target=index,
            args=(dataset, datasetFolder),
            kwargs={"target_lang": lang},
        )
        thread.start()
        index_threads.append(thread)

for thread in index_threads:
    thread.join()

# An indexing thread that raised leaves no entry behind; report that here
# instead of failing later with a KeyError during retrieval.
missing_languages = [code for code in datasetNames if code not in indeces]
if missing_languages:
    raise RuntimeError("Indexing failed for: " + ", ".join(missing_languages))

print(indeces)

### 4.2 Retrieval of the documents
The documents will be retrieved using the BM25 retrieval model.

In [None]:
import unidecode

def sanitise_query(query: str):
    """
    Normalises a query for retrieval: the query is passed through unidecode,
    every character other than the letters a-z/A-Z, the digits 0-9 and the
    space is dropped, and the remainder is converted to lowercase.

    Parameters:
    - query: The query to be sanitised.

    Returns:
    - The sanitised, lowercase query.

    """
    ascii_query = unidecode.unidecode(query)
    stripped_query = re.sub(r"[^a-zA-Z0-9 ]", "", ascii_query)
    lowercase_query = stripped_query.lower()
    return lowercase_query

In [None]:
# Number of documents retrieved per language.
NUM_RESULTS = 20

# Retrieved documents per language code.
document_data: dict[str, pd.DataFrame] = dict()

for lang in languages:
    # BM25 ranking on the index of this language ...
    bm25 = pt.BatchRetrieve(
        indeces[lang], wmodel="BM25", metadata=["docno"], num_results=NUM_RESULTS
    )
    # ... followed by a lookup of the "text" field in the matching dataset.
    pipeline = bm25 >> pt.text.get_text(datasets[lang], "text")

    # The query in this language is sanitised before it is searched.
    document_data[lang] = pipeline.search(sanitise_query(qs[lang]))

# Show the French results as an example.
french_results = document_data["fr"]
print("Results for fr", french_results.keys(), "- shape:", french_results.shape, "top 5:")

french_results.head()

## 5. Document translation

Translate the documents back to English to be processed by other algorithms


In [None]:
from concurrent.futures import ThreadPoolExecutor

# Upper bound on the number of translation requests that run at the same time.
MAX_TRANSLATION_WORKERS = 8

# Translated document texts per source language, in retrieval order.
documents_translated_dict: dict[str, list[str]] = {}


def translate_document(lang: str, index: int, document_text: str):
    """
    Translates one document to English and stores the result in
    `documents_translated_dict`.

    Parameters:
    - lang: The language code of the document; also the key of the result list.
    - index: The position in the result list at which the translation is stored.
    - document_text: The text to be translated.

    """
    gt = GoogleTranslator(source=lang, target="en")
    translated = gt.translate(text=document_text)

    documents_translated_dict[lang][index] = translated


with ThreadPoolExecutor(max_workers=MAX_TRANSLATION_WORKERS) as executor:
    futures = []

    for lang, docs in document_data.items():
        text_docs: list[str] = docs["text"].tolist()
        # Pre-sized, so that every worker can write to the position of its document.
        documents_translated_dict[lang] = [None] * len(text_docs)

        # Named `position` rather than `index`, so that the loop does not
        # overwrite the index() helper of the indexing step.
        for position, text in enumerate(text_docs):
            futures.append(executor.submit(translate_document, lang, position, text))

    # result() re-raises the exception of a failed translation. Without this, a
    # failure would leave a None placeholder behind that breaks the later steps.
    for future in futures:
        future.result()

# Join the translated documents to one list
documents_translated = []
for docs in documents_translated_dict.values():
    documents_translated += docs

## 6. Find domain-specific keywords

Find the most frequent words in the documents, excluding the 1000 most used words in the English language


In [None]:
from collections import Counter
import math

# We have N documents per language and L languages

# Convert each document to a set of words associated with its occurrence
def document_to_wordset(doc):
    """
    Counts how often each term occurs in a document.

    Parameters:
    - doc: The text of the document.

    Returns:
    - occurrence_set: A Counter with the terms as keys and their occurrence
      count as values.
    - total_terms: The total number of terms in the document.

    """
    # split() without an argument splits on any run of whitespace. Splitting on
    # a single space would yield empty-string terms for repeated spaces and
    # would keep words separated by a newline or tab together as one term.
    allwords_list = doc.split()

    # Length of this list is the total number of terms in doc
    total_terms = len(allwords_list)

    # Create a dictionary with terms as keys and occurrence count
    occurrence_set = Counter(allwords_list)

    return occurrence_set, total_terms

def compute_term_frequencies(doc_wordset: Counter, total_terms: int) -> Counter:
    """
    Converts occurrence counts into relative term frequencies.

    Parameters:
    - doc_wordset: The occurrence count per term of one document.
    - total_terms: The total number of terms in that document.

    Returns:
    - A new Counter in which every count is divided by total_terms; the given
      Counter is left unchanged.

    """
    return Counter(
        {term: occurrences / total_terms for term, occurrences in doc_wordset.items()}
    )

def compute_idf(total_docs_in_corpus, docs_contain_term):
    """
    Computes the inverse document frequency log((1 + N) / (1 + df)).

    Parameters:
    - total_docs_in_corpus: The number of documents in the corpus (N).
    - docs_contain_term: The number of documents that contain the term (df).

    Returns:
    - The inverse document frequency; it is 0.0 for a term that occurs in
      every document.

    """
    return math.log((1 + total_docs_in_corpus) / (1 + docs_contain_term))


def compute_tfidf_per_term(frequency_wordsets: list[Counter]) -> Counter:
    """
    Computes one tf-idf score per term over a list of documents.

    The score of a term is its average term frequency, taken over the
    documents that contain the term, multiplied by its inverse document
    frequency within the given list.

    Parameters:
    - frequency_wordsets: One Counter per document that maps each term of the
      document to its term frequency.

    Returns:
    - tfidf_counter: A Counter that maps each term to its tf-idf score.

    """
    # The total number of documents in the corpus is simply the list length
    total_documents = len(frequency_wordsets)

    # A single pass collects, per term, the number of documents it appears in
    # and the sum of its term frequencies. Terms that do not occur in a
    # document contribute nothing for that document.
    document_occurrence_counter = Counter()
    summed_term_frequency_counter = Counter()

    for frequency_wordset in frequency_wordsets:
        for term, term_frequency in frequency_wordset.items():
            document_occurrence_counter[term] += 1
            summed_term_frequency_counter[term] += term_frequency

    # Combine average term frequency and inverse document frequency per term
    tfidf_counter = Counter()

    for term, docs_with_term in document_occurrence_counter.items():
        avg_term_frequency = summed_term_frequency_counter[term] / docs_with_term
        tfidf_counter[term] = avg_term_frequency * compute_idf(total_documents, docs_with_term)

    return tfidf_counter


# Term frequencies of every translated document, one Counter per document
all_frequency_wordsets: list[Counter] = [
    compute_term_frequencies(*document_to_wordset(doc)) for doc in documents_translated
]

tfidf_per_term = compute_tfidf_per_term(all_frequency_wordsets)

# Based on tfidf, keep the 100 'most domain-specific' terms
domain_specific_terms = [term for term, _ in tfidf_per_term.most_common(100)]

print(domain_specific_terms)

## 7. Rank the domain-specific keywords by semantic similarity

In [None]:
# Embed the translated documents
vectorizer = Vectorizer()
vectorizer.run(documents_translated)
vectors = vectorizer.vectors

# Embed the sanitised original query with a vectorizer of its own
q0_sanitised = sanitise_query(q0)
q0_vectorizer = Vectorizer()
q0_vectorizer.run([q0_sanitised])
q0_vector = q0_vectorizer.vectors[0]

# Cosine distance between the query and every document
data = {"distance": [], "document": []}

for document_vector, document in zip(vectors, documents_translated):
    data["distance"].append(spatial.distance.cosine(q0_vector, document_vector))
    data["document"].append(document)

# Smallest distance first
df = pd.DataFrame(data)
df.sort_values(by=["distance"], ascending=True, inplace=True)
df

In [None]:
# `df` is sorted by ascending distance, so the best match is its first row and
# the worst match is its last row. Positional access (.iloc) is required here:
# sorting keeps the original row labels, so a fixed label does not follow the
# sort order.
print("best doc:", df["document"].iloc[0])
print("worst doc:", df["document"].iloc[-1])

In [None]:
# Compare the embedding of the first term with the embeddings of the others
vectorizer = Vectorizer()
terms = ["bike", "bikes", "citybike", "citybikes", "Citybikes", "sushi", "dog"]
vectorizer.run(terms)
vectors = vectorizer.vectors

reference_vector = vectors[0]

for term, vector in zip(terms[1:], vectors[1:]):
    dist = spatial.distance.cosine(reference_vector, vector)
    print("Distance between 'bike' and", term, ":", dist)

In [None]:
import pandas as pd

# The individual words of the sanitised query
q0s_sanitised = sanitise_query(q0).split(" ")

# One embedding per query word
q0_vectorizer = Vectorizer()
q0_vectorizer.run(q0s_sanitised)

# One embedding per domain specific term
dst_q0_vectorizer = Vectorizer()
dst_q0_vectorizer.run(domain_specific_terms)

data = dict({"q0 term": [], "dst": [], "distance": []})

# Cosine distance for every pair of query word and domain specific term
for q0_term, q0_vec in zip(q0s_sanitised, q0_vectorizer.vectors):
    for dst_term, dst_vec in zip(domain_specific_terms, dst_q0_vectorizer.vectors):
        data["q0 term"].append(q0_term)
        data["dst"].append(dst_term)
        data["distance"].append(spatial.distance.cosine(q0_vec, dst_vec))

# Smallest distance first
df = pd.DataFrame(data)
df.sort_values(by=["distance"], ascending=True, inplace=True)
df

In [None]:
import pandas as pd

# Embed the query and the sanitised documents in one run: the first vector
# belongs to the query, the remaining ones to the documents in their order.
vectorizer = Vectorizer()
vectorizer.run([q0] + [sanitise_query(x) for x in documents_translated])
vectors = vectorizer.vectors

query_vector = vectors[0]

data = dict({"q0": [], "doc": [], "distance": []})

for doc, doc_vector in zip(documents_translated, vectors[1:]):
    data["q0"].append(q0)
    data["doc"].append(doc)
    data["distance"].append(spatial.distance.cosine(query_vector, doc_vector))

# Show the ten documents with the smallest distance to the query
df = pd.DataFrame(data)
df.sort_values(by=["distance"], ascending=True, inplace=True)
df.head(10)

## 8. Concat the keywords with the original query

Concatenate the keywords with the original query and search for documents in the original language


In [None]:
# Step 6 extracted a number of domain specific terms from the documents that
# were retrieved in the different languages and translated to English.
# The reformulated query is the original query followed by these terms,
# separated by single spaces.

reformulated_query = " ".join([q0, *domain_specific_terms])

print(reformulated_query)


## 9. Evaluation

Evaluate the results of the search

Reference:
1. GitHub: [pyterrier/examples/notebooks/retrieval_and_evaluation.ipynb](https://github.com/terrier-org/pyterrier/blob/master/examples/notebooks/retrieval_and_evaluation.ipynb)
1. GitHub: [pyterrier/examples/notebooks/experiment.ipynb](https://github.com/terrier-org/pyterrier/blob/master/examples/notebooks/experiment.ipynb)

### 9.1 Precision / Recall

See how many of the returned documents are relevant. Did the number of relevant documents increase?


### 9.2 Keyword diversification

Did the number of unique keywords increase compared to naive domain-specific keyword identification?
