In [1]:
!pip install elasticsearch

Defaulting to user installation because normal site-packages is not writeable
Collecting elasticsearch
  Downloading elasticsearch-9.0.1-py3-none-any.whl.metadata (8.5 kB)
Collecting elastic-transport<9,>=8.15.1 (from elasticsearch)
  Downloading elastic_transport-8.17.1-py3-none-any.whl.metadata (3.8 kB)
Downloading elasticsearch-9.0.1-py3-none-any.whl (905 kB)
   ---------------------------------------- 0.0/905.5 kB ? eta -:--:--
   ---------------------------------------- 0.0/905.5 kB ? eta -:--:--
   ----------- ---------------------------- 262.1/905.5 kB ? eta -:--:--
   ----------------------- ---------------- 524.3/905.5 kB 1.2 MB/s eta 0:00:01
   ----------------------- ---------------- 524.3/905.5 kB 1.2 MB/s eta 0:00:01
   --------------------------------- ---- 786.4/905.5 kB 881.6 kB/s eta 0:00:01
   -------------------------------------- 905.5/905.5 kB 768.9 kB/s eta 0:00:00
Downloading elastic_transport-8.17.1-py3-none-any.whl (64 kB)
Installing collected packages: elastic


[notice] A new release of pip is available: 25.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
import json
from elasticsearch.helpers import bulk
import pandas as pd

# Corpus path. Forward slashes resolve on Windows as well as on POSIX systems,
# whereas a backslash-separated path only works on Windows.
corpus_path = '../trec-covid/corpus.jsonl'

# Read the JSON Lines corpus: one JSON object per line.
# Blank lines (e.g. a trailing empty line) are skipped instead of crashing json.loads.
with open(corpus_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Dataframe creation
# This is our corpus: one row per parsed JSON object.
df = pd.DataFrame(data)

In [91]:
# Request the root endpoint of the local Elasticsearch node to confirm the
# server is reachable before the Python client is used further down.
!curl http://localhost:9200/

{
  "name" : "DESKTOP-7VCER44",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "4KOjzn0zQLCGNiwWwmPhMg",
  "version" : {
    "number" : "9.0.1",
    "build_flavor" : "default",
    "build_type" : "zip",
    "build_hash" : "73f7594ea00db50aa7e941e151a5b3985f01e364",
    "build_date" : "2025-04-30T10:07:41.393025990Z",
    "build_snapshot" : false,
    "lucene_version" : "10.1.0",
    "minimum_wire_compatibility_version" : "8.18.0",
    "minimum_index_compatibility_version" : "8.0.0"
  },
  "tagline" : "You Know, for Search"
}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100   539  100   539    0     0  82529      0 --:--:-- --:--:-- --:--:-- 89833


In [21]:
import re

# The code cleans and standardizes text by converting it to lowercase, 
# removing HTML tags, digits, and punctuation, and collapsing multiple spaces into one.
def clear_text(text):
    """Normalize a raw document string for indexing.

    Steps, in order: lowercase, replace HTML-like tags, digit runs and
    punctuation with a space, then collapse whitespace runs and trim.

    Args:
        text: The raw string, or a missing value (None / NaN).

    Returns:
        The cleaned string; "" when ``text`` is missing.
    """
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"\d+", " ", text)
    # "_" counts as a word character for \w, so it has to be listed explicitly;
    # otherwise underscores survive the punctuation pass.
    text = re.sub(r"[^\w\s]|_", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Concatenate title and body (missing values treated as empty) and normalize the result.
df["clean_text"] = (df["title"].fillna("") + " " + df["text"].fillna("")).apply(clear_text)
# Remove rows where clean_text is empty.
# .copy() makes the filtered frame independent, so re-running this cell does not
# assign a column on a slice of the previous frame (SettingWithCopyWarning).
df = df[df["clean_text"] != ""].copy()

In [10]:
# Show the first five rows of the id, title, cleaned text and raw text columns.
print(df.loc[:, ["_id", "title", "clean_text", "text"]].head(5))

        _id                                              title  \
0  ug7v899j  Clinical features of culture-proven Mycoplasma...   
1  02tnwd4m  Nitric oxide: a pro-inflammatory mediator in l...   
2  ejv2xln0    Surfactant protein-D and pulmonary host defense   
3  2b73a28n               Role of endothelin-1 in lung disease   
4  9785vg6d  Gene expression in epithelial cells in respons...   

                                          clean_text  \
0  clinical features of culture proven mycoplasma...   
1  nitric oxide a pro inflammatory mediator in lu...   
2  surfactant protein d and pulmonary host defens...   
3  role of endothelin in lung disease endothelin ...   
4  gene expression in epithelial cells in respons...   

                                                text  
0  OBJECTIVE: This retrospective chart review des...  
1  Inflammatory diseases of the respiratory tract...  
2  Surfactant protein-D (SP-D) participates in th...  
3  Endothelin-1 (ET-1) is a 21 amino acid pept

In [22]:
# List the DataFrame's column labels.
print(df.columns)

Index(['_id', 'title', 'text', 'metadata', 'clean_text'], dtype='object')


In [133]:
from elasticsearch import Elasticsearch, helpers

# Client for the local Elasticsearch instance (plain HTTP on port 9200).
client = Elasticsearch("http://localhost:9200")

index_name = "my_index"

# Index definition for the vector-space-model run:
#   - a scripted TF-IDF similarity named "scripted_tfidf",
#   - the built-in "english" analyzer for both indexing and searching,
#   - a single "text" field scored with that similarity.
vsm_settings = {
    "settings": {
        "number_of_shards": 1,
        "similarity": {
            "scripted_tfidf": {
                "type": "scripted",
                "script": {
                    # Adjacent literals are concatenated into one script string:
                    # sqrt(tf) * (log((N+1)/(df+1)) + 1) * 1/sqrt(length) * boost
                    "source": (
                        "double tf = Math.sqrt(doc.freq); "
                        "double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; "
                        "double norm = 1/Math.sqrt(doc.length); "
                        "return query.boost * tf * idf * norm;"
                    )
                },
            }
        },
        "analysis": {
            "analyzer": {
                "default": {"type": "english"},
                "default_search": {"type": "english"},
            }
        },
    },
    "mappings": {
        "properties": {
            "text": {"type": "text", "similarity": "scripted_tfidf"},
        }
    },
}



In [134]:
# Drop any previous copy of the index so it can be rebuilt with the current settings.
# ignore_unavailable=True keeps a fresh run (index not created yet) from raising NotFoundError;
# index_name is used instead of repeating the literal so the name is defined in one place.
client.indices.delete(index=index_name, ignore_unavailable=True)

ObjectApiResponse({'acknowledged': True})

In [135]:
# Create the index with mappings and settings
if not client.indices.exists(index=index_name):
    response = client.indices.create(index=index_name, body=vsm_settings)
    print("Index created:", response)
else:
    print(f"Index '{index_name}' already exists.")

# One bulk action per document: the corpus id becomes the Elasticsearch _id and
# only the cleaned text is stored. Zipping the two columns avoids building a
# pandas Series for every row, which is what iterrows() does.
docs = [
    {
        "_id": doc_id,
        "_source": {
            "text": body_text
        }
    }
    for doc_id, body_text in zip(df["_id"], df["clean_text"])
]
bulk_response = helpers.bulk(client, docs, index=index_name)
print("Success:", bulk_response)

# Refresh explicitly so searches issued right after indexing see every document.
client.indices.refresh(index=index_name)

Index created: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'}
Success: (171331, [])


In [26]:
import json

# Load the queries from the JSON Lines file: one JSON object per line.
# Blank lines are skipped so a trailing empty line does not crash json.loads;
# forward slashes keep the relative path valid on Windows and POSIX alike.
with open("../trec-covid/queries.jsonl", "r", encoding="utf-8") as f:
    queries = [json.loads(line) for line in f if line.strip()]

In [136]:
import os

def pretty_search_response(response, file_path):
    """Print the hits of a search response and mirror them into a text file.

    Args:
        response: Mapping with the hit list under ``response["hits"]["hits"]``;
            each hit provides ``"_id"`` and ``"_source"["text"]``.
        file_path: Destination file; it is opened in write mode, so any
            existing content is replaced.

    Returns:
        None. A fixed notice is printed and written when there are no hits.
    """
    with open(file_path, "w", encoding="utf-8") as out:
        hits = response["hits"]["hits"]

        # Guard clause: nothing matched, emit the notice and stop.
        if not hits:
            notice = "Your search returned no results."
            print(notice)
            out.write(notice + "\n")
            return

        for hit in hits:
            entry = f"\nID: {hit['_id']},\nText: {hit['_source']['text']}\n"
            print(entry)
            out.write(entry)

In [137]:
# Directory that receives one result file per query. It is created up front because
# open(..., "w") does not create missing parent directories. Forward slashes keep the
# relative path valid on Windows and POSIX alike.
base_path = "res/20"
os.makedirs(base_path, exist_ok=True)

# Loop through the queries and process each
for q in queries:
    query_text = q["metadata"]["query"]
    query_id = q["_id"]

    # Top-20 retrieval with a plain match query on the indexed text field.
    response = client.search(
        index=index_name,
        size=20,
        query={
            "match": {
                "text": query_text
            }
        }
    )
    file_name = os.path.join(base_path, f"query_{query_id}.txt")

    pretty_search_response(response, file_path=file_name)




ID: pl48ev5o,
Text: origin and evolution of the novel coronavirus


ID: h8ahn8fw,
Text: origin and evolution of the novel coronavirus


ID: k86pf2yf,
Text: coronavirus origins signs prevention and management of patients


ID: irkjiqll,
Text: coronavirus origins signs prevention and management of patients


ID: 6foz003n,
Text: diversity of coronaviruses in bats insights into origin of sars coronavirus


ID: jpnbppry,
Text: bat origin of a new human coronavirus there and back again


ID: bp9xz9wk,
Text: coronavirus


ID: vj000wal,
Text: coronavirus


ID: be0mr85h,
Text: coronavirus


ID: sfs5hsr9,
Text: coronavirus


ID: j1cdoxqs,
Text: coronavirus


ID: jkejiuf2,
Text: coronaviruses origin and evolution


ID: kyrkx2ii,
Text: the novel coronavirus originating in wuhan china challenges for global health governance


ID: ehjui7u6,
Text: the novel coronavirus originating in wuhan china challenges for global health governance


ID: a7w6lael,
Text: origin of sars remains a mystery


ID: m4kz