In [None]:
!pip install datasets rank-bm25 nltk scikit-learn tqdm


Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [None]:
import nltk

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look up
# the "punkt_tab" resource rather than the legacy "punkt" pickle, so fetch both
# to keep tokenization working regardless of the installed NLTK version.
nltk.download("punkt")
nltk.download("punkt_tab")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from datasets import load_dataset

# Load the "pqa_labeled" configuration of the PubMedQA dataset.
pubmedqa = load_dataset("pubmed_qa", "pqa_labeled")
# Printing the DatasetDict lists its splits with their columns and row counts.
print(pubmedqa)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

pqa_labeled/train-00000-of-00001.parquet:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
        num_rows: 1000
    })
})


In [None]:
# Number of examples in the train split.
len(pubmedqa["train"])


1000

In [None]:
# Normalize PubMedQA into a clean format

def normalize_pubmedqa(dataset_split):
    """Flatten PubMedQA items into plain question/context records.

    The passages found under ``item["context"]["contexts"]`` are joined with
    single spaces into one string. Only the question and that merged context
    are kept; every record is tagged with the constant source ``"PubMedQA"``.

    Args:
        dataset_split: Iterable of items, each providing ``item["question"]``
            and ``item["context"]["contexts"]`` (an iterable of strings).

    Returns:
        list[dict]: One ``{"question", "context", "source"}`` dict per item,
        in input order.
    """
    def _to_record(entry):
        # Merge the passages first, then assemble the record.
        merged_context = " ".join(entry["context"]["contexts"])
        return {
            "question": entry["question"],
            "context": merged_context,
            "source": "PubMedQA",
        }

    return [_to_record(entry) for entry in dataset_split]

# Build the flat record list from the train split.
pubmed_records = normalize_pubmedqa(pubmedqa["train"])

# Sanity check: how many records were produced, and what one looks like.
print("Total normalized records:", len(pubmed_records))
print(pubmed_records[0])


Total normalized records: 1000
{'question': 'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?', 'context': 'Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants. The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in ear

In [None]:
from nltk.tokenize import word_tokenize

def chunk_text(text, chunk_size=400, overlap=50):
    """Split ``text`` into overlapping chunks of NLTK tokens.

    The text is tokenized with ``word_tokenize`` and each chunk is the
    space-joined run of up to ``chunk_size`` tokens. Consecutive chunks start
    ``chunk_size - overlap`` tokens apart, so they share ``overlap`` tokens.
    Chunking stops as soon as a chunk reaches the end of the token list, which
    avoids emitting a trailing chunk made only of already-covered tokens.

    NOTE(review): a later cell redefines ``chunk_text`` with ``str.split``;
    whichever definition is executed last is the one in effect.

    Args:
        text: String to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: Chunks in document order; empty if ``text`` has no tokens.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would either fail in range() or yield nothing,
        # and a negative overlap would silently skip tokens between chunks.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    tokens = word_tokenize(text)
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
        # This chunk already contains the final token; any further window
        # would only repeat the tail of it.
        if start + chunk_size >= len(tokens):
            break

    return chunks


In [None]:
import nltk
# Repeats the earlier "punkt" download; when the data is already present NLTK
# only reports the package as up-to-date.
nltk.download("punkt")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Sanity-check the normalized data: container type, record count, per-record keys.
print(type(pubmed_records))
print(len(pubmed_records))
print(pubmed_records[0].keys())


<class 'list'>
1000
dict_keys(['question', 'context', 'source'])


In [None]:
# Show the first normalized record in full.
print(pubmed_records[0])


{'question': 'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?', 'context': 'Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants. The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD), and ce

In [None]:
def chunk_text(text, chunk_size=400, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Each chunk is the space-joined run of up to ``chunk_size`` words.
    Consecutive chunks start ``chunk_size - overlap`` words apart, so they
    share ``overlap`` words. Chunking stops as soon as a chunk reaches the end
    of the word list, which avoids emitting a trailing chunk made only of
    words the previous chunk already contains.

    Args:
        text: String to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: Chunks in document order; empty if ``text`` has no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would either fail in range() or yield nothing,
        # and a negative overlap would silently skip words between chunks.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # This chunk already contains the final word; any further window
        # would only repeat the tail of it.
        if start + chunk_size >= len(words):
            break

    return chunks

print("Safe chunk_text loaded")


Safe chunk_text loaded


In [None]:
# Split every record's context into chunks, skipping whitespace-only pieces,
# and carry the record's source label along with each chunk.
chunked_docs = [
    {"text": piece, "source": entry["source"]}
    for entry in pubmed_records
    for piece in chunk_text(entry["context"])
    if piece.strip()
]

print("Total chunks created:", len(chunked_docs))
print(chunked_docs[0]["text"][:300])


Total chunks created: 1009
Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cel


In [None]:
from rank_bm25 import BM25Okapi

# Each chunk is tokenized by splitting on whitespace only, so tokens keep their
# original case and any attached punctuation.
tokenized_corpus = [doc["text"].split() for doc in chunked_docs]
# Build the BM25 index over the tokenized chunks (same order as chunked_docs).
bm25 = BM25Okapi(tokenized_corpus)

print("BM25 index built successfully")


BM25 index built successfully


In [None]:
def retrieve_bm25(query, k=5):
    """Return the ``k`` highest-scoring chunks for ``query`` under BM25.

    The query is split on whitespace (no lowercasing or punctuation stripping)
    and scored against every chunk through the module-level ``bm25`` index.
    Chunks with equal scores keep their original relative order because the
    sort is stable.

    Args:
        query: Search string.
        k: Number of top results to keep (applied as a slice ``[:k]``).

    Returns:
        list[dict]: ``{"score", "text"}`` dicts in descending score order; the
        text is looked up by position in the module-level ``chunked_docs``.
    """
    scores = bm25.get_scores(query.split())

    # Pair each score with its chunk position, then rank on the score alone.
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    return [
        {"score": scores[position], "text": chunked_docs[position]["text"]}
        for position, _ in ranked[:k]
    ]


# Smoke-test the retriever with a sample question.
# NOTE(review): retrieve_bm25 splits the query on whitespace, so "What" keeps
# its capital letter and "pressure?" keeps its question mark — confirm that
# matching against such tokens is intended.
query = "What causes high blood pressure?"
results = retrieve_bm25(query, k=5)

# Print the first 300 characters of each hit, numbered from 1.
for i, res in enumerate(results, 1):
    print(f"\nResult {i}")
    print(res["text"][:300])



Result 1
The objectives of this study were to investigate longitudinal predictors of fear of recurrence in survivors of head and neck cancer (HNC) using Leventhal's Common Sense Model (CSM) as a framework. The research questions were as follows: (a) to what extent do HNC patients report fear of cancer recurr

Result 2
longitudinal descriptive study. 2 large nursing homes in Turin, Italy. 418 dependent elderly (83 males, 335 females, mean age 83.7+/-8.5 y, range 55-102) living in the nursing homes. the prevalence of peripheral arterial disease (PAD) was evaluated using a Doppler Ultrasound measurement of AAI (Ankl

Result 3
Diabetes mellitus (DM) is undiagnosed in approximately half of the patients actually suffering from the disease. In addition, the prevalence of DM is more than twice as high as in patients with periodontitis when compared to periodontally healthy subjects. Thus, a high number of patients with period

Result 4
To examine patterns of knowledge and attitudes among adul