In [None]:
!pip install elasticsearch

In [None]:
import pandas as pd
import json
from pathlib import Path

# Corpus location, relative to the notebook's working directory. It is built
# with pathlib so the separator is correct on every OS; a literal backslash in
# the path only resolves on Windows.
corpus_path = Path("trec-covid") / "corpus.jsonl"

# Read the JSON Lines file: one JSON object per line.
data = []
with open(corpus_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (for example a trailing empty line), which
        # json.loads would reject with a JSONDecodeError.
        if line.strip():
            data.append(json.loads(line))

# Build the corpus DataFrame: one row per parsed JSON object.
df = pd.DataFrame(data)

In [None]:
import re

# Patterns used by clear_text, compiled once instead of on every call.
_HTML_TAG_RE = re.compile(r"<[^>]+>")
_DIGITS_RE = re.compile(r"\d+")
# `\w` matches the underscore, so it has to be listed explicitly to be treated
# as punctuation like every other symbol.
_PUNCTUATION_RE = re.compile(r"[^\w\s]|_")
_WHITESPACE_RE = re.compile(r"\s+")


def clear_text(text):
    """Clean and standardize a piece of text.

    The text is lowercased; HTML tags, digits and punctuation (underscores
    included) are each replaced by a space; finally runs of whitespace are
    collapsed into a single space and the result is stripped.

    Args:
        text: The value to clean. Missing values (None / NaN) yield an empty
            string; other non-string scalars are converted with str() first.

    Returns:
        The cleaned text, or "" when nothing is left.
    """
    if pd.isna(text):
        return ""
    if not isinstance(text, str):
        # Avoid an AttributeError on .lower() for numeric or other scalars.
        text = str(text)

    text = text.lower()
    text = _HTML_TAG_RE.sub(" ", text)
    text = _DIGITS_RE.sub(" ", text)
    text = _PUNCTUATION_RE.sub(" ", text)
    text = _WHITESPACE_RE.sub(" ", text).strip()

    return text

# Join title and body (missing values become empty strings) and clean the result.
title_part = df["title"].fillna("")
body_part = df["text"].fillna("")
df["clean_text"] = (title_part + " " + body_part).map(clear_text)

# Keep only the rows whose cleaned text is not empty.
has_content = df["clean_text"].ne("")
df = df[has_content]

In [None]:
# Preview the first rows of the id, title, cleaned-text and raw-text columns.
preview_columns = ["_id", "title", "clean_text", "text"]
print(df[preview_columns].head())

In [30]:
# Show the column labels of the corpus DataFrame.
corpus_columns = df.columns
print(corpus_columns)

Index(['_id', 'title', 'text', 'metadata', 'clean_text'], dtype='object')


In [4]:
import os
from getpass import getpass

from elasticsearch import Elasticsearch, helpers

# Connection details. The API key is a secret and must not be stored in the
# notebook: it is read from the ES_API_KEY environment variable, falling back
# to an interactive prompt. The endpoint can be overridden through ES_URL.
# NOTE(review): a key was previously embedded in this cell; it should be
# revoked and replaced.
es_url = os.environ.get(
    "ES_URL",
    "https://my-elasticsearch-project-ff8e64.es.eu-west-1.aws.elastic.cloud:443",
)
es_api_key = os.environ.get("ES_API_KEY") or getpass("Elasticsearch API key: ")

client = Elasticsearch(es_url, api_key=es_api_key)

index_name = "search-0y01"

# Index definition: a scripted TF-IDF similarity applied to an English-analyzed
# `text` field.
vsm_settings = {
    "settings": {
        "number_of_shards": 1,
        "similarity": {
            "scripted_tfidf": {
                "type": "scripted",
                "script": {
                    # score = boost * sqrt(tf) * (log((N + 1) / (df + 1)) + 1) / sqrt(doc length)
                    "source": "double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;"
                }
            }
        },
        "analysis": {
            "analyzer": {
                "default": {
                    "type": "english"
                },
                "default_search": {
                    "type": "english"
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "text": {
                "type": "text",
                # Use the built-in `english` analyzer, matching the index
                # defaults above. The mapping used to name
                # "custom_english_analyzer", which is not defined anywhere
                # under settings.analysis.analyzer.
                "analyzer": "english",
                "similarity": "scripted_tfidf"
            }
        }
    }
}



In [None]:

# Create the index with its settings and mappings unless it already exists.
# The two sections are passed as individual keyword arguments because the
# catch-all `body=` argument is deprecated in the current Python client.
if not client.indices.exists(index=index_name):
    response = client.indices.create(
        index=index_name,
        settings=vsm_settings["settings"],
        mappings=vsm_settings["mappings"],
    )
    print("Index created:", response)
else:
    print(f"Index '{index_name}' already exists.")


# One bulk action per corpus row: the document id plus the cleaned text as the
# only source field. Zipping the two columns avoids DataFrame.iterrows(), which
# builds a full Series object for every row.
docs = [
    {
        "_id": doc_id,
        "_source": {
            "text": clean_text
        }
    }
    for doc_id, clean_text in zip(df["_id"], df["clean_text"])
]
bulk_response = helpers.bulk(client, docs, index=index_name)
print("Success:", bulk_response)

Index 'search-0y01' already exists.
Success: (171331, [])


In [43]:
import json
from pathlib import Path

# Queries file, located relative to the working directory in the same
# `trec-covid` folder as the corpus, instead of an absolute path that only
# exists on one machine.
queries_path = Path("trec-covid") / "queries.jsonl"

# Read the JSON Lines file: one query object per line.
queries = []
with open(queries_path, "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines, which json.loads would reject.
        if line.strip():
            queries.append(json.loads(line))

In [42]:
import os

def pretty_search_response(response, file_path):
    """Print the hits of a search response and write the same text to a file.

    Args:
        response: Search response mapping. The hits are read from
            response["hits"]["hits"]; each hit must provide "_id" and
            "_source"["text"].
        file_path: Destination text file. It is overwritten if it exists, and
            missing parent directories are created first.
    """
    # Make sure the output folder exists so open() does not fail with
    # FileNotFoundError on a fresh checkout.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    hits = response["hits"]["hits"]
    with open(file_path, "w", encoding="utf-8") as f:
        if len(hits) == 0:
            msg = "Your search returned no results."
            print(msg)
            f.write(msg + "\n")
        else:
            for hit in hits:
                # Named doc_id rather than `id` to avoid shadowing the builtin.
                doc_id = hit["_id"]
                text = hit["_source"]["text"]
                pretty_output = f"\nID: {doc_id},\nText: {text}\n"
                print(pretty_output)
                f.write(pretty_output)

In [46]:
# Number of hits requested per query. The results folder carries the same
# number in its name, as it did when both were hard-coded to 50.
top_k = 50

# Output folder, relative to the working directory, instead of an absolute path
# that only exists on one machine. It is computed once, outside the loop, and
# created up front so writing the result files cannot fail on a missing folder.
base_path = os.path.join("trec-covid", "res", str(top_k))
os.makedirs(base_path, exist_ok=True)

# Run every query as a `match` query on the `text` field and save its hits to a
# file named after the query id.
for q in queries:
    query_text = q["metadata"]["query"]
    query_id = q["_id"]

    response = client.search(
        index=index_name,
        size=top_k,
        query={
            "match": {
                "text": query_text
            }
        }
    )
    file_name = os.path.join(base_path, f"query_{query_id}.txt")

    pretty_search_response(response, file_path=file_name)




ID: 8ccl9aui,
Text: mosaic evolution of the severe acute respiratory syndrome coronavirus severe acute respiratory syndrome sars is a deadly form of pneumonia caused by a novel coronavirus a viral family responsible for mild respiratory tract infections in a wide variety of animals including humans pigs cows mice cats and birds analyses to date have been unable to identify the precise origin of the sars coronavirus we used bayesian neighbor joining and split decomposition phylogenetic techniques on the sars virus replicase surface spike matrix and nucleocapsid proteins to reveal the evolutionary origin of this recently emerging infectious agent the analyses support a mammalian like origin for the replicase protein an avian like origin for the matrix and nucleocapsid proteins and a mammalian avian mosaic origin for the host determining spike protein a bootscan recombination analysis of the spike gene revealed high nucleotide identity between the sars virus and a feline infectious perit