In [14]:
import json
import nltk
import pandas as pd
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from elasticsearch import Elasticsearch, helpers
import re 


In [30]:
# NLTK data packages required by word_tokenize, wordnet and stopwords below.
NLTK_RESOURCES = ("punkt", "wordnet", "omw-1.4", "stopwords", "punkt_tab")

# One loop instead of five copy-pasted calls; packages that are already
# installed are reported as up to date rather than downloaded again.
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\victo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\victo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\victo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\victo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\victo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [13]:
# Forward slashes are accepted by open() on Windows as well as POSIX, whereas
# a backslash-separated relative path only resolves on Windows.
corpus_path = "../../trec-covid/corpus.jsonl"

# Read the JSON Lines corpus: one JSON object per line.
with open(corpus_path, "r", encoding="utf-8") as f:
    # Blank lines are skipped because json.loads would raise on them.
    data = [json.loads(line) for line in f if line.strip()]

# DataFrame creation: this is our corpus.
df = pd.DataFrame(data)

In [15]:
def clear_text(text):
    """Clean and standardize a piece of text.

    The text is lower-cased, HTML tags, digits and punctuation (including
    underscores) are replaced by spaces, and runs of whitespace are collapsed
    into a single space.

    Args:
        text: The raw string, or a missing value (None / NaN).

    Returns:
        The cleaned string, or "" when ``text`` is missing.
    """
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"<[^>]+>", " ", text)  # HTML tags
    text = re.sub(r"\d+", " ", text)  # digits
    # \w also matches "_", so [^\w\s] alone would leave underscores in place.
    text = re.sub(r"[^\w\s]|_", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Join title and body (missing values become ""), then normalize the result.
df["clean_text"] = (
    df["title"].fillna("") + " " + df["text"].fillna("")
).apply(clear_text)

# Keep only the documents that still have text after cleaning.
df = df.loc[df["clean_text"].ne("")]

In [2]:
# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words("english"))

def get_synonyms(word):
    """Return the WordNet synonyms of ``word``, excluding the word itself.

    Every lemma of every synset returned by ``wordnet.synsets(word)`` is
    collected. Underscores in lemma names are replaced by spaces, and names
    equal to ``word`` (ignoring case) are dropped so that a word is never
    listed as its own synonym.

    Args:
        word: The term to look up.

    Returns:
        A sorted list of unique synonym strings (empty when there are none).
        Sorting keeps the result reproducible between runs; the order of a
        bare ``list(set)`` depends on string hashing.
    """
    target = word.lower()
    lemma_names = (
        lemma.name().replace("_", " ")
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    )
    return sorted({name for name in lemma_names if name.lower() != target})

In [3]:
# Load the queries from the JSON Lines file. Forward slashes keep the relative
# path usable on Windows and POSIX alike.
with open("../../trec-covid/queries.jsonl", "r", encoding="utf-8") as f:
    # Blank lines are skipped because json.loads would raise on them.
    queries = [json.loads(line) for line in f if line.strip()]

In [4]:
# Expand every query with the WordNet synonyms of its non-stop-word tokens.
expanded_queries = []

for q in queries:
    original_text = q["metadata"]["query"]
    tokens = word_tokenize(original_text)

    # Dict keys act as an insertion-ordered set: the original tokens come
    # first, followed by the synonyms. Unlike a plain set, the joined string
    # is then identical from one run to the next.
    expanded_terms = dict.fromkeys(tokens)

    for token in tokens:
        lowered = token.lower()
        # Only alphabetic, non-stop-word tokens are expanded.
        if lowered.isalpha() and lowered not in stop_words:
            expanded_terms.update(dict.fromkeys(sorted(get_synonyms(lowered))))

    expanded_queries.append({
        "_id": q["_id"],
        "original": original_text,
        "expanded": " ".join(expanded_terms),
    })

In [17]:
# Elasticsearch connection and the name of the index used by the cells below.
client = Elasticsearch("http://localhost:9200")
index_name = "my_index"

# Source of the scripted similarity:
# sqrt(term freq) * (log-scaled idf + 1) * 1/sqrt(doc length), times the query boost.
tfidf_script = (
    "double tf = Math.sqrt(doc.freq); "
    "double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; "
    "double norm = 1/Math.sqrt(doc.length); "
    "return query.boost * tf * idf * norm;"
)

vsm_settings = {
    "settings": {
        "number_of_shards": 1,
        "similarity": {
            "scripted_tfidf": {
                "type": "scripted",
                "script": {"source": tfidf_script},
            },
        },
        # The same "english" analyzer is configured for indexing and searching.
        "analysis": {
            "analyzer": {
                "default": {"type": "english"},
                "default_search": {"type": "english"},
            },
        },
    },
    "mappings": {
        "properties": {
            # The "text" field is scored with the scripted similarity above.
            "text": {"type": "text", "similarity": "scripted_tfidf"},
        },
    },
}


In [None]:
# Drop the index so that the next cell rebuilds it. index_name replaces the
# repeated literal, and ignore_unavailable=True keeps a run against a cluster
# without the index from failing here.
client.indices.delete(index=index_name, ignore_unavailable=True)

In [19]:
# Create the index with the mappings and settings defined above, unless it
# already exists.
if not client.indices.exists(index=index_name):
    response = client.indices.create(index=index_name, body=vsm_settings)
    print("Index created:", response)
else:
    print(f"Index '{index_name}' already exists.")

# One bulk action per document. Zipping the two columns yields the same
# actions as DataFrame.iterrows() without building a Series for every row.
docs = [
    {"_id": doc_id, "_source": {"text": clean_text}}
    for doc_id, clean_text in zip(df["_id"], df["clean_text"])
]

# Index all documents in bulk; the printed response is the helper's return value.
bulk_response = helpers.bulk(client, docs, index=index_name)
print("Success:", bulk_response)

Index 'my_index' already exists.
Success: (171331, [])


In [24]:
import os

def pretty_search_response(response, file_path, query_id, run_id="expanded_run"):
    """Write the hits of a search response to ``file_path``, one line per hit.

    Each line has the form ``<query_id> Q0 <doc_id> <rank> <score> <run_id>``,
    where ``rank`` starts at 1 and follows the order of the hits in
    ``response``. An existing file is overwritten, and the parent directory
    is created when it does not exist yet.

    Args:
        response: Search response; ``response["hits"]["hits"]`` must be an
            iterable of hits carrying ``"_id"`` and ``"_score"``.
        file_path: Path of the output file.
        query_id: Identifier written in the first column of every line.
        run_id: Label written in the last column of every line.
    """
    # Writing into a directory that does not exist would raise
    # FileNotFoundError; a bare file name has no directory part to create.
    out_dir = os.path.dirname(file_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(file_path, "w", encoding="utf-8") as f:
        for rank, hit in enumerate(response["hits"]["hits"], start=1):
            doc_id = hit["_id"]
            score = hit["_score"]
            f.write(f"{query_id} Q0 {doc_id} {rank} {score} {run_id}\n")

In [27]:
# Directory that receives one run file per query. It is set once instead of
# on every iteration, and created up front so that writing the first file
# cannot fail on a missing directory.
base_path = r"50"
os.makedirs(base_path, exist_ok=True)

for q in expanded_queries:
    query_text = q["expanded"]
    query_id = q["_id"]

    # Match the expanded query against the "text" field; at most 50 hits are requested.
    response = client.search(
        index=index_name,
        size=50,
        query={"match": {"text": query_text}},
    )

    file_name = os.path.join(base_path, f"query_{query_id}.txt")
    pretty_search_response(response, file_name, query_id)