In [14]:
!pip install langchain faiss-cpu sentence-transformers transformers datasets pypdf rank_bm25 -U langchain-community


from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from langchain.llms import HuggingFacePipeline
import numpy as np
import csv, os, re



In [15]:
from google.colab import files
uploaded = files.upload()

Saving 13. Atlas of Diabetes Mellitus (3rd Edition).pdf to 13. Atlas of Diabetes Mellitus (3rd Edition) (1).pdf


In [17]:
from nltk.tokenize import sent_tokenize
from langchain.schema import Document
import re

# 📄 Load and Preprocess PDF
pdf_path = "/content/13. Atlas of Diabetes Mellitus (3rd Edition).pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# 🧹 Clean text and add metadata
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace/newlines
    text = re.sub(r'\.([A-Z])', r'. \1', text)  # Fix spacing after periods
    return text.strip()

for doc in documents:
    doc.page_content = clean_text(doc.page_content)
    doc.metadata["source_file"] = os.path.basename(pdf_path)

# 🧠 Choose chunking strategy: "recursive", "sentence", or "paragraph"
chunking_strategy = "recursive"   # Change this to "sentence" or "paragraph" if needed
chunk_size = 500
chunk_overlap = 200

chunks = []

if chunking_strategy == "recursive":
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(documents)

elif chunking_strategy == "sentence":
    for doc in documents:
        sentences = sent_tokenize(doc.page_content)
        current_chunk = []
        current_length = 0
        for sent in sentences:
            if current_length + len(sent) > chunk_size:
                chunk_text = " ".join(current_chunk)
                chunks.append(Document(page_content=chunk_text, metadata=doc.metadata))
                current_chunk = current_chunk[-(chunk_overlap // len(sent) + 1):]  # Simple overlap
                current_length = sum(len(s) for s in current_chunk)
            current_chunk.append(sent)
            current_length += len(sent)
        if current_chunk:
            chunks.append(Document(page_content=" ".join(current_chunk), metadata=doc.metadata))

elif chunking_strategy == "paragraph":
    for doc in documents:
        paragraphs = re.split(r'\n\s*\n', doc.page_content)
        current_chunk = []
        current_length = 0
        for para in paragraphs:
            para = para.strip()
            if not para:
                continue
            if current_length + len(para) > chunk_size:
                chunks.append(Document(page_content="\n\n".join(current_chunk), metadata=doc.metadata))
                current_chunk = current_chunk[-1 * (chunk_overlap // len(para) + 1):]
                current_length = sum(len(p) for p in current_chunk)
            current_chunk.append(para)
            current_length += len(para)
        if current_chunk:
            chunks.append(Document(page_content="\n\n".join(current_chunk), metadata=doc.metadata))

else:
    raise ValueError("Invalid chunking strategy selected.")

# 📚 Extract text
texts = [doc.page_content for doc in chunks]
print(f"✅ Chunking complete using '{chunking_strategy}' — Total chunks: {len(chunks)}")


✅ Chunking complete using 'recursive' — Total chunks: 930


In [18]:
# Dense: PubMedBERT + FAISS
embedding_model = HuggingFaceEmbeddings(model_name='pritamdeka/S-PubMedBert-MS-MARCO')
vector_store = FAISS.from_texts(texts, embedding_model)

# BM25: Sparse
tokenized_texts = [text.split(" ") for text in texts]
bm25 = BM25Okapi(tokenized_texts)

# SentenceTransformer for encoding query
dense_encoder = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')

In [19]:
# Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_text(text):
    return summarizer(text, max_length=100, min_length=30, do_sample=False)[0]['summary_text']

# Reorder by query relevance
def reorder_by_query(query, context):
    query_terms = query.lower().split()
    sentences = context.split('. ')
    scored = sorted(sentences, key=lambda s: sum(word in s.lower() for word in query_terms), reverse=True)
    return '. '.join(scored)

# Highlight query terms
def highlight_terms(text, query):
    for word in query.lower().split():
        text = re.sub(fr'\b{word}\b', f'[HIGHLIGHT]{word}[/HIGHLIGHT]', text, flags=re.IGNORECASE)
    return text

# Deduplication
def deduplicate(sentences):
    unique = []
    seen = set()
    for s in sentences:
        norm = s.strip().lower()
        if norm not in seen:
            seen.add(norm)
            unique.append(s)
    return unique


Device set to use cpu


In [20]:
model_name = "tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer,
                max_new_tokens=256, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)

llm = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu


In [21]:
def hybrid_retrieve(query, top_k=5):
    # Sparse
    query_tokens = query.lower().split()
    bm25_scores = bm25.get_scores(query_tokens)
    sparse_indices = np.argsort(bm25_scores)[::-1][:top_k]

    # Dense
    dense_query = dense_encoder.encode([query], convert_to_numpy=True)[0]
    _, dense_indices = vector_store.index.search(np.array([dense_query]), top_k)

    # Merge
    hybrid_indices = list(set(sparse_indices).union(set(dense_indices[0])))
    return [chunks[i] for i in hybrid_indices]


In [22]:
EVAL_TEMPLATE = """
You are an evaluator. Rate the following answer using the source documents provided.

Question: {question}

Answer: {answer}

Sources:
{sources}

Give a score from 1 to 5 for:
FAITHFULNESS: Is it consistent with sources?
RELEVANCE: Does it answer the question?

Format:
FAITHFULNESS: <score>, RELEVANCE: <score>
"""

def get_auto_scores(llm, question, answer, sources):
    combined = "\n---\n".join([doc.page_content[:500] for doc in sources])
    prompt = EVAL_TEMPLATE.format(question=question, answer=answer, sources=combined)
    response = llm(prompt)
    try:
        match = re.search(r"FAITHFULNESS: (\d).*RELEVANCE: (\d)", response, re.DOTALL)
        return int(match.group(1)), int(match.group(2))
    except Exception as e:
        print("⚠️ Eval parse failed:", e)
        return None, None

def log_evaluation_to_csv(filepath, question, answer, sources, faithfulness, relevance):
    file_exists = os.path.isfile(filepath)
    with open(filepath, 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if not file_exists:
            writer.writerow(['Question', 'Answer', 'Sources', 'Faithfulness', 'Relevance'])
        src = "\n---\n".join([doc.page_content[:300] for doc in sources])
        writer.writerow([question, answer, src, faithfulness, relevance])


In [None]:
# test reasoning + hybrid performance
query0 = "What are the main types of diabetes and how are they different?"
query2 = "What is the difference between hypoglycemia and hyperglycemia?"

# sparse retrieval testing
query3 = "What is the normal range for blood sugar levels?"
query4 = "How is diabetes diagnosed?"

# dense retrieval testing
query5 = "What complications arise from chronic hyperglycemia?"

# test sequential understanding
query6 = "What is the lifecycle of insulin from secretion to absorption?"

# test embedding quality
query7 = "What does HbA1c mean and why is it important?"

# test noisy input
query8 = "H0w 2 treet dibetes wit diet?"

# role based query
query9 = "Advise a diabetic patient on how to manage their condition while fasting during Ramadan."
query10 = "Explain to an elderly patient how exercise can help control blood sugar levels."

retrieved_docs = hybrid_retrieve(query9)

# Preprocess context
raw_context = "\n\n".join([doc.page_content[:500] for doc in retrieved_docs])
summarized = summarize_text(raw_context)
reordered = reorder_by_query(query9, summarized)
highlighted = highlight_terms(reordered, query9)
final_context = '. '.join(deduplicate(highlighted.split('. ')))

# Generate answer
# prompt = f"Based on the context below, answer the question:\n\nContext:\n{final_context}\n\nQuestion: {query}"
# answer = llm(prompt)

# Role based
prompt = f"""
You are an experienced medical advisor.

Based on the following context, answer the patient's question in simple and accurate terms:

Context:
{final_context}

Question:
{query9}
"""
answer = llm(prompt)


print("\n📝 Question:", query9)
print("\n✅ Answer:\n", answer)
print("\n📚 Retrieved Snippets:\n")
for i, doc in enumerate(retrieved_docs):
    print(f"Source {i+1}:\n{doc.page_content[:300]}\n---")

# Evaluate
faithfulness, relevance = get_auto_scores(llm, query9, answer, retrieved_docs)
print(f"\n🤖 Auto Evaluation — Faithfulness: {faithfulness}, Relevance: {relevance}")

# Log
log_evaluation_to_csv("rag_results.csv", query9, answer, retrieved_docs, faithfulness, relevance)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.



📝 Question: Advise a diabetic patient on how to manage their condition while fasting during Ramadan.

✅ Answer:
 
You are an experienced medical advisor.

Based on the following context, answer the patient's question in simple and accurate terms:

Context:
There is as yet no consensus [HIGHLIGHT]on[/HIGHLIGHT] [HIGHLIGHT]how[/HIGHLIGHT] often patients should check [HIGHLIGHT]their[/HIGHLIGHT] blood glucose. It allows patients [HIGHLIGHT]to[/HIGHLIGHT] make [HIGHLIGHT]their[/HIGHLIGHT] own adjustments [HIGHLIGHT]to[/HIGHLIGHT] insulin dosages and helps [HIGHLIGHT]to[/HIGHLIGHT] avoid hypoglycemia. Self-monitoring of blood glucose has become an integral part of modern insulin treatment. [HIGHLIGHT]a[/HIGHLIGHT] registered dietician is best placed [HIGHLIGHT]to[/HIGHLIGHT] offer advice [HIGHLIGHT]on[/HIGHLIGHT] recommended diets.

Question:
Advise a diabetic patient on how to manage their condition while fasting during Ramadan.

Answer:
It is recommended that patients with diabetes fast 