### Reading dataset from *JSONL* file

In [1]:
import json
import dill
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Converting JSONL file into JSON
data = dict()
with open('../webis-touche2020/corpus.jsonl', 'r', encoding='utf-8') as f:
    for cnt, line in enumerate(f):
        d = json.loads(line)
        data[d['_id']] = d['title'] + ' ' + d['text']
        
docs = list(data.values())
keys = list(data.keys())

## Text Processing
Processing dataset text based on a bunch of rules:
###### Rule(1): 

In [2]:
# TODO (1): No whitespace!
# 'goingto' 'playingPoker' 'movingpictures' 'memorizingtables'
# So there are two cases: 
# 1- 'goingto'. So the program has to discover words that are grammatically correct but are not separated with whitespace.
# 2- 'playingPoker' ..
# 
# TODO (2): Hyphenated expressions!
# Example: Should-we-deport-all-illegal-immigrants-we-find-in-the-US
#
# TODO (3): Spell checking!
# Example: "occurec"
#
# TODO (4): Points and commas in between!
# Example: men.Affirmative (Frequent)
# 
# TODO (5): Handling Links!
# Example: //www.commieblaster.com
#          //sas-space.sas.ac.uk/2563/1/Amicus79_Ahmed
#          
# TODO (6): Handling abbreviations and acronyms
# 
# TODO (7): Swearing words handling?!
# 
# TODO (8): Handling Emojis!

import re
import string
from nltk.corpus import wordnet as wn
from spellchecker import SpellChecker
from emoji import emoji_list, is_emoji, demojize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

example = 'USA is a text with some abbreviations lik OOP and it is not expected of pythn language to detect US unknownwords because FBI is not here! But hey! This is UK stff! FASEB gun is great!'

email_example = {
    'doc1': " Hi!! ?? ,,  *** ## //www.commieblaster.com yoyo 🪀🪀",
    'doc2': "hey this is me email   USA FAB 🔗  //sas-space.sas.ac.uk/2563/1/Amicus79_Ahmed ant youu this iss misspeled",
    "doc3": "ttiip://google.com 😎  😂😂 yo!!!!"
}

# TODO: Detect abbreviations in a given text 
def detect_abbreviations(text):
    pattern = r'\b[A-Z]{2,}\b'
    abbs = re.findall(pattern, text)
    return abbs

def detect_url(doc):
    """
    Detects almost all kinds of different URLs
    
    Args:
    - doc (str): A string to look for all the matches of the url pattern 
    
    Returns:
    - (list): List of matches. Returns None if no matches
    """
    pattern = r'(?:https?\:)?(?:\/\/)?(?:[a-zA-Z0-9-]+\.)+[a-zA-Z0-9]+(?:\/[a-zA-z0-9]+)*'
    return re.match(pattern, doc)
        
def wordnet_abbreviation_expand(abbreviation):
    """
    Find abbreviations using WordNet.

    Args:
    - abbreviation (str): The abbreviation to search for in WordNet.

    Returns:
    - res (list): A list of strings that represent possible meanings of the word.
    """
    res = []
    synsets = wn.synsets(abbreviation)
    for synset in synsets:
        ext_abb = str(synset)[8:-7].replace('_',' ')
        res.append(ext_abb)
    return res


def custom_tokenizer(doc):
    import re
    from nltk.tokenize import word_tokenize
    from emoji import emoji_list, is_emoji, demojize
    
    def detect_url(doc):
        pattern = r'(?:https?\:)?(?:\/\/)?(?:[a-zA-Z0-9-]+\.)+[a-zA-Z0-9]+(?:\/[a-zA-z0-9]+)*'
        return re.match(pattern, doc)
    
    punctuation_pattern = r'[!"#$%&\'()*+,\-./:;<=>?@[\\\]^_`{|}~]'
    filtered_tokens = []
    tokens = word_tokenize(doc)

    for token in tokens:
        # URL handling
        if detect_url(token) is not None:
            filtered_tokens.append("Link")
        # Emoji Handling
        elif emoji_list(token):
            normalized_emoji = demojize(token).replace(':', ' ').strip()
            filtered_tokens.append(normalized_emoji)
        else: 
            # If token matches a punctuation mark; ignore it
            if re.match(punctuation_pattern, token) is None:
                filtered_tokens.append(token) 
    return filtered_tokens 

#### Defining `TfidfVectorizer` model 


In [3]:
# Defining Vectorizer 
vectorizer = TfidfVectorizer(      
    tokenizer=custom_tokenizer,
    stop_words="english",
)

#### Train `TfidfVectorizer` model on dataset and queries
Then storing `tfidf_matrix` and the `vectorizer` instance as `pickle` files

In [4]:
""" 
    Query looks like:
    "query": {
        "_id": string, // Can be mapped to numbers
        "text": string,
        "metadata": {
            "description": string,
            "narrative": string,
        },
    }
"""
queries = dict()
with open('../webis-touche2020/queries.jsonl', 'r', encoding='utf-8') as f:
    for cnt, line in enumerate(f):
        query = json.loads(line)
        query_metadata = query['metadata']['narrative']
        queries[query['_id']] = query['text'] 

# Train model 
tfidf_matrix = vectorizer.fit_transform(docs)

# with open('feature_names.txt', 'w', encoding='utf-8') as fff:
#     for feat_name in vectorizer.get_feature_names_out():
#         fff.writelines(feat_name + ' ')

# Writing tfidf matrix to a compressed binary file 
# with open('matrix.pkl', 'wb') as mf:
#     dill.dump(tfidf_matrix, mf)
# mf.close()

# # Writing vectorizer 'model' to a compressed binary file
# with open('model.pkl', 'wb') as bf:
#     dill.dump(vectorizer, bf)
# bf.close()



KeyboardInterrupt: 

#### Read `vectorizer` and `tfidf_matrix` from `pickle` files

In [14]:

def get_tfidf_matrix():
    with open('matrix.pkl', 'rb') as mf:
        tfidf_matrix = dill.load(mf)
    mf.close()
    return tfidf_matrix

def get_vectorizer_instance() -> TfidfVectorizer:
    with open('model.pkl', 'rb') as mf:
        vectorizer = dill.load(mf)
    mf.close()
    return vectorizer

### Handling User Query

In [15]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# tfidf_matrix = get_tfidf_matrix()
# vectorizer = get_vectorizer_instance()

def find_top_k_results(user_query, tfidf_matrix, docs, k=10): 
    # Transform user query into vector
    user_query_vector = vectorizer.transform([user_query])

    # Calculating cosine similarity between user query and stored data
    similarites = cosine_similarity(user_query_vector, tfidf_matrix)
    
    # Get indecies of top K elements
    top_indices = np.argpartition(similarites, -k, axis=None)[-k:]

    # Sort top K results by similarity score
    top_indices_sorted = top_indices[np.argsort(similarites.ravel()[top_indices])]

    # Get top K results
    top_results = [(docs[i], keys[i], similarites[0, i]) for i in top_indices_sorted]
    
    return top_results


user_query = "Should teachers get tenure?"
k = 10
top_results = find_top_k_results(user_query, tfidf_matrix, docs, k)


# Print
with open('results.txt', 'w', encoding='utf-8') as rf:
    rf.writelines
    for doc, doc_id, similarity_score in top_results:
        rf.writelines(f"Doc Id: {doc_id}\n")
        # rf.writelines(f"Document: {doc}\n")
        # rf.writelines(f"Cosine Similarity Score: {similarity_score}\n")
        # rf.writelines('--------------------------------------\n')
rf.close()


## Evaluation
#### Calculating *precision* score and *recall* score

In [16]:
from typing import List, Dict
from itertools import islice
import csv

qrels_file = '../webis-touche2020/qrels/test.tsv'

def create_qrels_inverted(qrels_file):
    inverted_index = {}
    with open(qrels_file, 'r') as f:
        next(f) # Skip first line of the file
        for line in f:
            query_id, document_id, score = line.strip().split('\t')
            score = int(score)
            query_id = int(query_id)
            if query_id not in inverted_index:
                inverted_index[query_id] = []
            inverted_index[query_id].append((document_id, score))
                    
    # Sort index in descending order
    for query_id, doc_scores in inverted_index.items():
        inverted_index[query_id] = sorted(doc_scores, key=lambda x: x[1], reverse=True)
        
    return inverted_index

def get_relevant_docs_for_query_i(query_id, k=10):
    relevant_docs = set()
    # for query_id, doc_scores in zip(qrels.keys(), qrels.values()):
    for doc_id, score in qrels[query_id]:
        relevant_docs.add(doc_id)
    return relevant_docs

def calculate_average_precision(relevant_docs, retrieved_docs, k=10): 
    precision_sum = 0
    true_positives_at_k = 0
    for i, retrieved_doc in enumerate(retrieved_docs, start=1):
        if retrieved_doc in relevant_docs:
            true_positives_at_k += 1
            precision_sum += true_positives_at_k / i
    return precision_sum / true_positives_at_k if true_positives_at_k > 0 else 0
        
def evaluate(top_results: List, query_id: str, k):
    relevant_docs = get_relevant_docs_for_query_i(query_id)
    retrieved_docs = set(result[1] for result in top_results)
   
    true_positives = retrieved_docs.intersection(relevant_docs)
    
    # Precision
    precision = len(true_positives) / k if k > 0 else 0
    # Recall 
    recall = len(true_positives)/ len(relevant_docs) if len(relevant_docs) > 0 else 0
    # Average Precision
    ap = calculate_average_precision(relevant_docs, retrieved_docs)
    
    return precision, recall, ap

qrels = create_qrels_inverted(qrels_file)

aps = []
with open('evaluation.tsv', 'w', newline='', encoding='utf-8') as ef:  
    writer = csv.writer(ef, delimiter='\t')  
    writer.writerow(['Query-id', 'Precision', 'Recall', 'AP'])
    for query_id in queries.keys():
        p, r, ap = evaluate(top_results, query_id, k)
        aps.append(ap)
        writer.writerow([query_id, p, r, ap])
        if p != 0:
            print(f"Query {query_id}: Precision: {p},  Recall: {r}, AP: {ap}")
ef.close()


mAP = sum(aps) / len(aps)
print(f"Mean Average Precision (mAP): {mAP}")

Query 1: Precision: 0.7,  Recall: 0.1590909090909091, AP: 0.9129251700680271
Mean Average Precision (mAP): 0.018631125919755655
