# Document Retrieval
This project focuses on document retrieval, a core task in information retrieval where the goal is to rank documents by their relevance to a given query. Using a dataset containing over 199,000 documents across multiple languages, the aim is to implement a model that can effectively retrieve the most relevant documents.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the '/kaggle/input' directory tree and print the full path of every file found in it
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Let's import the necessary libraries.

In [None]:
# Data Handling
import pandas as pd
import numpy as np

# Text Preprocessing
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import nltk
nltk.download('stopwords')

# Counting and Data Structures
from collections import Counter

# Similarity Computation
from sklearn.metrics.pairwise import cosine_similarity

# Sparse Matrices
import scipy.sparse as sp

# Progress Bar
from tqdm import tqdm

Before we start, we need to import the pre-computed BM25 matrices, IDF dictionaries and vocabularies for all languages calculated previously.

In [None]:
# Paths of the precomputed artefacts, keyed first by language code and then by artefact kind.
# Every language follows the same file-naming scheme, so the table is generated from the codes.
data_paths = {
    lang_code: {
        'documents': f'Data/corpus_{lang_code}_processed.csv',
        'idf': f'Data/idf_dict_{lang_code}.pkl',
        'vocabulary': f'Data/bm25_vocabulary_{lang_code}.pkl',
        'bm25': f'Data/bm25_matrix_{lang_code}.npz',
    }
    for lang_code in ('en', 'fr', 'de', 'es', 'it', 'ar', 'ko')
}

# One container per artefact kind; each maps a language code to the object loaded for it
documents, idf_dicts, vocabularies, bm25_matrices = {}, {}, {}, {}

# Read the four artefacts of every language listed in data_paths
for lang, paths in data_paths.items():
    documents[lang] = pd.read_csv(paths['documents'])          # processed corpus (CSV)
    idf_dicts[lang] = pd.read_pickle(paths['idf'])             # pickled IDF dictionary
    vocabularies[lang] = pd.read_pickle(paths['vocabulary'])   # pickled vocabulary
    bm25_matrices[lang] = sp.load_npz(paths['bm25'])           # sparse BM25 matrix (.npz)

Now, let's load the file containing the queries for which we need to retrieve the top-10 relevant documents.

In [None]:
# Load the CSV file containing the queries to evaluate (the dev set, 'Data/dev.csv')
dev_path = 'Data/dev.csv'
dev_df = pd.read_csv(dev_path)

The following will be used to process all queries in the DataFrame in the same way the documents were processed.

In [None]:
# Stopword sets keyed by language code; NLTK identifies its lists by full language name
stopwords_dict = {
    code: set(stopwords.words(name))
    for code, name in (
        ('en', 'english'),
        ('fr', 'french'),
        ('de', 'german'),
        ('es', 'spanish'),
        ('it', 'italian'),
        ('ar', 'arabic'),
    )
}

# Korean stopwords are read from an external text file, one entry per line
with open('Data/stopwords-ko.txt', 'r', encoding='utf-8') as f: # /kaggle/input/korean-stop-words/stopwords-ko.txt
    stopwords_dict['ko'] = set(f.read().splitlines())

# Snowball stemmers keyed by language code
stemmer_dict = {
    code: SnowballStemmer(name)
    for code, name in (
        ('en', 'english'),
        ('fr', 'french'),
        ('de', 'german'),
        ('es', 'spanish'),
        ('it', 'italian'),
    )
}
# Arabic and Korean are registered without a stemmer (None means "leave tokens unstemmed")
stemmer_dict.update({'ar': None, 'ko': None})

# Stem a token list with the stemmer registered for the given language
def apply_stemming(tokens, lang):
    """
    Return the stemmed form of each token.
    tokens : list of word tokens
    lang : language code used to look up the stemmer in stemmer_dict

    When no stemmer is registered for the language (unknown code or a None entry),
    the input list is returned unchanged.
    """
    stemmer = stemmer_dict.get(lang)
    if not stemmer:
        return tokens
    return list(map(stemmer.stem, tokens))

# Normalise one piece of text (document or query) according to its language
def preprocess_single_text(text, lang):
    """
    Lowercase, tokenize, filter and stem a text, returning it as one space-separated string.
    text : raw text to process
    lang : language code selecting the stopword set and the stemmer
    """
    # Languages without a registered stopword list simply filter nothing
    stop_words = stopwords_dict.get(lang, set())

    # Keep tokens that are purely alphabetic and are not stopwords
    kept_tokens = [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]

    # Stem (when a stemmer exists for the language) and glue the tokens back together
    return ' '.join(apply_stemming(kept_tokens, lang))

We also need to create the function that processes the query and returns the top 10 most relevant documents.

In [None]:
def process_query_bm25(query, lang, bm25_matrix, idf_dict, vocabulary, top_n=10):
    """
    Rank the documents of one language against a query and return the best matches.
    query : raw query text
    lang : language code, forwarded to preprocess_single_text
    bm25_matrix : document matrix compared against the query vector; it must have
        len(vocabulary) columns for the cosine similarity to be computable
    idf_dict : mapping from term to its IDF weight
    vocabulary : sequence of terms; a term's position is its column index
    top_n : maximum number of documents to return

    Returns a NumPy array holding the row indices of the top_n most similar
    documents, most similar first. The array is empty when top_n is not positive.
    """
    # Guard first: with top_n == 0 the slice [-top_n:] further down equals [0:] and
    # would hand back the whole ranking instead of no documents.
    if top_n <= 0:
        return np.empty(0, dtype=np.intp)

    # Preprocess the query exactly like the documents, then split it into terms
    query_tokens = preprocess_single_text(query, lang).split()

    # Query vector over the vocabulary: IDF weight for terms present, zero elsewhere
    query_vector = np.zeros(len(vocabulary))
    for term in set(query_tokens):
        if term not in idf_dict:
            continue
        try:
            term_index = vocabulary.index(term)
        except ValueError:
            # The term has an IDF weight but no column in the matrix; skip it like
            # any other out-of-vocabulary term instead of aborting the query.
            continue
        query_vector[term_index] = idf_dict[term]

    # cosine_similarity expects 2D inputs, so present the query as a single row
    query_vector = query_vector.reshape(1, -1)

    # One similarity score per row of bm25_matrix
    similarities = cosine_similarity(query_vector, bm25_matrix).flatten()

    # argsort is ascending: take the last top_n positions and reverse them
    top_n_indices = np.argsort(similarities)[-top_n:][::-1]

    return top_n_indices

Finally, all that remains for us to do is to process each query and store the corresponding top 10 document IDs.

In [None]:
# Run every query of dev_df through the BM25 retriever and collect its top 10 document IDs
results = []

# tqdm wraps the row iterator so a progress bar is shown while queries are processed
for idx, row in tqdm(dev_df.iterrows(), total=len(dev_df), desc="Processing Queries"):
    query, lang = row['query'], row['lang']

    # Rank documents using the BM25 matrix, IDF dictionary and vocabulary of the query's language
    top_n_doc_indices = process_query_bm25(
        query,
        lang,
        bm25_matrices[lang],
        idf_dicts[lang],
        vocabularies[lang],
        top_n=10,
    )

    # Translate positional row indices into the 'docid' values of that language's corpus
    top_n_docids = documents[lang].iloc[top_n_doc_indices]['docid'].tolist()

    # Each ID is rendered as a string, and the list itself is stored in its string form
    results.append({
        'id': idx,
        'docids': str([f'{docid}' for docid in top_n_docids]),
    })

# One row per query: columns 'id' and 'docids'
results_df = pd.DataFrame(results)

Thus, we generated the submission, which is a CSV file with the following two columns: 'id' and 'docids'. The column 'id' refers to the id of the query in the test.csv and the column 'docids' refers to the list of retrieved document ids. In order to evaluate our model, we need to check whether the identifier of the positive document is among the first 10 documents returned by our model for each query. If so, the function returns 1, otherwise it returns 0.

In [None]:
# Function to calculate Recall@10
def calculate_recall_at_10(dev_row, top_10_docs):
    """
    This function calculates Recall@10 for a specific query.
    dev_row : row from the dev dataframe containing the query details
        (anything indexable by 'positive_docs' works)
    top_10_docs : the retrieved document IDs for the query, given either as a
        collection of IDs or as the string form of a list,
        e.g. "['doc-en-1', 'doc-en-2']"

    Returns 1 if the positive document ID is one of the retrieved IDs, otherwise 0.
    """
    import ast  # local import: only needed to decode stringified ID lists

    # Get the positive document ID from dev
    positive_doc = dev_row['positive_docs']

    if isinstance(top_10_docs, str):
        # `in` on a raw string is a substring test, so 'doc-en-74' would wrongly
        # match inside 'doc-en-7459'. Decode the text into real IDs first.
        try:
            decoded = ast.literal_eval(top_10_docs)
        except (ValueError, SyntaxError):
            # Not a Python literal: treat the text itself as a single document ID
            decoded = top_10_docs
        top_10_docs = decoded if isinstance(decoded, (list, tuple, set)) else [decoded]

    # Exact membership: 1 = success, 0 = failure
    return 1 if positive_doc in top_10_docs else 0

# Calculate Recall@10 for all queries
# Each dev_df row is scored against the 'docids' entry of results_df found at position row.name.
# NOTE(review): row.name is the row's index label in dev_df but is used as a position here;
# this assumes dev_df keeps a default 0..n-1 index so both coincide — confirm.
dev_df['recall_at_10'] = dev_df.apply(lambda row: calculate_recall_at_10(row, results_df.iloc[row.name]['docids']), axis=1)

All that remains is to calculate the average Recall@10 for each language by grouping queries according to language, and to calculate the overall average Recall@10 for all languages across the entire database.

In [None]:
# Recall@10 per language: mean of the per-query 0/1 scores within each 'lang' group
recall_by_lang = dev_df['recall_at_10'].groupby(dev_df['lang']).mean()

# Show the per-language figures under their heading
print("\nRecall@10 per language:", recall_by_lang, sep="\n")

# Overall Recall@10: mean of the per-query scores over the whole dev set
overall_recall = dev_df['recall_at_10'].mean()

# Show the overall figure
print("\nOverall Recall@10:", overall_recall)

These scores give us a more in-depth idea of the performance of our model.