In [None]:
!pip install elasticsearch

In [None]:
import pandas as pd
import json

# Corpus path. Forward slashes are accepted by open() on Windows, Linux and
# macOS alike; a backslash separator only resolves on Windows.
corpus_path = 'trec-covid/corpus.jsonl'

# Read the JSON Lines file: one JSON document per non-empty line
data = []
with open(corpus_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line is not valid JSON; skip it instead of crashing in json.loads
            continue
        data.append(json.loads(line))

# DataFrame creation
# This is our corpus
df = pd.DataFrame(data)

In [38]:
import re

# Cleans and standardizes text: lowercases it, removes HTML tags, digits and
# punctuation (underscores included), and collapses runs of whitespace.
def clear_text(text):
    """Return a normalized version of ``text`` for indexing.

    Steps, in order: lowercase; replace HTML tags, digit runs and every
    punctuation character (including ``_``) with a space; collapse whitespace
    runs into a single space and trim both ends.

    Args:
        text: The raw string, or a missing value (``None`` / ``NaN``).

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing or nothing
        is left after cleaning.
    """
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"<[^>]+>", " ", text)  # HTML tags
    text = re.sub(r"\d+", " ", text)  # digits
    # \w treats "_" as a word character, so [^\w\s] alone would keep
    # underscores; remove them explicitly together with other punctuation.
    text = re.sub(r"[^\w\s]|_", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    return text

# df["title"]=df["title"].apply(clear_text)
# Combine title and body (missing values become empty strings) and normalize
# the result with clear_text to obtain the searchable field.
df = df.assign(
    clean_text=(df["title"].fillna("") + " " + df["text"].fillna("")).apply(clear_text)
)
# Keep only the rows whose clean_text is non-empty
df = df.loc[df["clean_text"].ne("")]

In [None]:
# Preview the first rows of the id, title, cleaned-text and raw-text columns.
print(df[["_id", "title","clean_text","text"]].head())

In [30]:
# Show the column labels currently present in the DataFrame.
print(df.columns)

Index(['_id', 'title', 'text', 'metadata', 'clean_text'], dtype='object')


In [None]:
import os

from elasticsearch import Elasticsearch, helpers

# Credentials are read from the environment so that no secret is stored in the
# notebook. Set ELASTICSEARCH_API_KEY (and optionally ELASTICSEARCH_URL) before
# starting the kernel.
# NOTE(review): the API key that was previously hardcoded in this cell has been
# exposed and should be revoked and replaced.
es_api_key = os.environ.get("ELASTICSEARCH_API_KEY")
if not es_api_key:
    raise RuntimeError(
        "ELASTICSEARCH_API_KEY is not set; export it before running this cell."
    )

client = Elasticsearch(
    os.environ.get(
        "ELASTICSEARCH_URL",
        "https://my-elasticsearch-project-ff8e64.es.eu-west-1.aws.elastic.cloud:443",
    ),
    api_key=es_api_key,
)

index_name = "search-0y01"

# Index settings and mappings: a scripted TF-IDF similarity plus English analysis.
vsm_settings = {
    "settings": {
        "number_of_shards": 1,
        "similarity": {
            # score = boost * sqrt(term freq) * (log((docCount+1)/(docFreq+1)) + 1) * 1/sqrt(doc length)
            "scripted_tfidf": {
                "type": "scripted",
                "script": {
                    "source": "double tf = Math.sqrt(doc.freq); double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; double norm = 1/Math.sqrt(doc.length); return query.boost * tf * idf * norm;"
                }
            }
        },
        "analysis": {
            "analyzer": {
                "default": {
                    "type": "english"
                },
                "default_search": {
                    "type": "english"
                },
                # The field mappings below refer to this analyzer by name, so it
                # has to be declared here; an undeclared analyzer name makes
                # index creation fail.
                "custom_english_analyzer": {
                    "type": "english"
                }
            }
        }
    },
    "mappings": {
        "properties": {
            # "title": {
            #     "type": "text",
            #     "analyzer": "custom_english_analyzer",
            #     "similarity": "scripted_tfidf"},
            "text": {
                "type": "text",
                "analyzer": "custom_english_analyzer",
                "similarity": "scripted_tfidf"
            }
        }
    }
}

# Ensure the index exists: it is created with vsm_settings only when missing;
# an existing index is left untouched.
if client.indices.exists(index=index_name):
    print(f"Index '{index_name}' already exists.")
else:
    response = client.indices.create(index=index_name, body=vsm_settings)
    print("Index created:", response)


# One bulk action per DataFrame row: the corpus id becomes the document id and
# only the cleaned text is sent as the document source.
docs = [
    {"_id": doc_id, "_source": {"text": cleaned}}
    for doc_id, cleaned in zip(df["_id"], df["clean_text"])
]
bulk_response = helpers.bulk(client, docs, index=index_name)
print("Success:", bulk_response)

Index 'search-0y01' already exists.
Success: (171331, [])
