# **Installing Required Dependencies**

In [None]:
# Install the third-party packages imported by this notebook and by the Streamlit
# app it generates (-q keeps pip's output quiet).
# NOTE(review): versions are unpinned, so a re-run may resolve different releases —
# consider pinning once a working set is known.
%pip install -q pdfplumber sentence-transformers torch tqdm pyngrok streamlit datasets groq


# **Configuring the Groq Cloud Client for Llama 3**

In [None]:
import os
from getpass import getpass

from groq import Groq

# Never store the API key in the notebook: read it from the environment, or prompt
# for it without echoing. It is kept in os.environ so that child processes started
# from this kernel inherit it.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Groq API key: ")
client = Groq(api_key=os.environ["GROQ_API_KEY"])

# **Importing Required Libraries and Model**

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample
from tqdm import tqdm
import pdfplumber
from google.colab import files
import numpy as np
from transformers import pipeline , AutoTokenizer, AutoModelForCausalLM



# Load the MedEmbed sentence-embedding model into the notebook kernel.
# NOTE(review): the Streamlit app written further down loads its own model inside a
# separate process, so this instance does not appear to be used by it — confirm
# whether this (large) download is still needed.
model = SentenceTransformer("abhinand/MedEmbed-large-v0.1")


# **Configuring Hugging Face Token**

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store the access token in the notebook: take it from the HF_TOKEN
# environment variable, or prompt for it without echoing.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")
login(token=os.environ["HF_TOKEN"])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# **Source Code**

In [31]:
%%writefile streamlit_app.py
import streamlit as st
import pdfplumber
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import CoSENTLoss, MatryoshkaLoss
from io import BytesIO

# Inject custom CSS that gives the app a Spotify-like dark theme (black background,
# green accents). unsafe_allow_html=True is passed so the <style> tag is emitted as
# HTML instead of being shown as text.
# NOTE(review): selectors such as `.css-ffhzg2` and `.sidebar .sidebar-content` look
# tied to the markup of a particular Streamlit release — confirm they still match.
st.markdown("""
    <style>
        body {
            background-color: #191414;  /* Black background */
            color: white;  /* White text */
        }
        .stButton button {
            background-color: #1DB954;  /* Spotify green */
            color: white;
            border: none;
            padding: 12px 24px;
            border-radius: 5px;
            font-weight: bold;
            font-size: 16px;
            cursor: pointer;
        }
        .stButton button:hover {
            background-color: #1ed760;  /* Slightly lighter green on hover */
        }
        .stTextInput input, .stSelectbox select, .stFileUploader input {
            background-color: #191414;  /* Dark background for input fields */
            color: white;  /* White text inside inputs */
            border: 1px solid #1DB954;  /* Green border */
        }
        .stTextInput input:focus, .stSelectbox select:focus, .stFileUploader input:focus {
            border-color: #1ed760;  /* Light green on focus */
        }
        .sidebar .sidebar-content {
            background-color: #191414;  /* Dark sidebar */
            color: white;  /* White text in sidebar */
        }
        .css-ffhzg2 {
            background-color: #191414;  /* Background for markdown */
            color: white;
        }
        .stFileUploader {
            background-color: #191414;  /* Dark background for file uploader */
            color: white;  /* White text */
        }
        .stFileUploader input {
            color: white;
        }
        h1 {
            color: #1DB954;  /* Spotify green color for the title */
        }
    </style>
""", unsafe_allow_html=True)

# PDF Parsing
def parse_pdf(uploaded_file):
    """Extract the text of every page of an uploaded PDF.

    Args:
        uploaded_file: Binary file-like object exposing ``read()`` whose content
            is a PDF document.

    Returns:
        The text of all pages that yielded text, joined with newlines so that the
        last line of one page is not fused with the first line of the next.
        Returns ``""`` if the PDF cannot be read; the error is reported to the
        user through ``st.error``.
    """
    try:
        with pdfplumber.open(BytesIO(uploaded_file.read())) as pdf:
            # extract_text() may give None (or "") for a page without extractable
            # text, e.g. a scanned image; such pages are skipped instead of
            # aborting the whole document.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
    except Exception as e:
        st.error(f"Error reading the PDF: {e}")
        return ""
    return "\n".join(page_text for page_text in page_texts if page_text)

# Preprocess PDF Text
def preprocess_pdf_text(extracted_text, chunk_size=5):
    """Split text into chunks of ``chunk_size`` consecutive lines.

    The lines of each chunk are joined with single spaces and the result is
    stripped; chunks that end up empty are dropped.

    Args:
        extracted_text: Newline-separated text.
        chunk_size: Number of lines merged into each chunk.

    Returns:
        List of non-empty chunk strings, in document order.
    """
    rows = extracted_text.split("\n")
    chunks = []
    for start in range(0, len(rows), chunk_size):
        merged = " ".join(rows[start : start + chunk_size]).strip()
        if merged:
            chunks.append(merged)
    return chunks

# Truncate text for concise display
def truncate_text(text, max_length=250):
    """Shorten ``text`` for display.

    Text of at most ``max_length`` characters is returned unchanged. Longer text
    is cut to its first ``max_length`` characters, stripped of surrounding
    whitespace, and suffixed with ``"..."``.
    """
    if len(text) > max_length:
        return text[:max_length].strip() + "..."
    return text

# Define the model and loss
def initialize_model_with_loss(selected_model_path):
    """Load a SentenceTransformer and build a Matryoshka-wrapped CoSENT loss for it.

    Args:
        selected_model_path: Model identifier or path passed to
            ``SentenceTransformer``.

    Returns:
        Tuple ``(model, matryoshka_loss)``: the loaded model and a
        ``MatryoshkaLoss`` wrapping a ``CoSENTLoss`` over the dimensions
        768, 512, 256, 128 and 64.
    """
    encoder = SentenceTransformer(selected_model_path)
    nested_loss = MatryoshkaLoss(
        model=encoder,
        loss=CoSENTLoss(model=encoder),
        matryoshka_dims=[768, 512, 256, 128, 64],
    )
    return encoder, nested_loss

# Min-max rescale similarity scores to the 0-100 range for display
def rescale_similarity_to_100(similarity_matrix):
    """Linearly rescale scores so the minimum maps to 0 and the maximum to 100.

    Args:
        similarity_matrix: Array-like of similarity scores.

    Returns:
        NumPy array of the same shape with values in ``[0, 100]``. When every
        score is identical (for example a single candidate) the range is zero
        and min-max scaling is undefined; all entries are then reported as 100
        instead of NaN. An empty input is returned unchanged.
    """
    scores = np.asarray(similarity_matrix)
    if scores.size == 0:
        return scores

    min_sim = np.min(scores)
    spread = np.max(scores) - min_sim
    if spread == 0:
        # Avoid 0/0 -> NaN: every entry ties for the top score.
        return np.full(scores.shape, 100.0)
    return ((scores - min_sim) / spread) * 100

# Calculate Metrics
def calculate_metrics(similarity_matrix, ground_truth, top_k=10):
    """Compute MRR, NDCG@top_k and Recall@top_k averaged over the queries.

    Args:
        similarity_matrix: 2-D array-like; row ``i`` holds the score of every
            document for query ``i`` (higher means more similar).
        ground_truth: Sequence whose entry ``i`` is the collection of relevant
            document indices for query ``i``.
        top_k: Rank cut-off used for NDCG and Recall.

    Returns:
        Tuple ``(mrr, ndcg, recall)``. Each sum is divided by
        ``len(ground_truth)``, so a query without relevant documents contributes
        zero. ``(0.0, 0.0, 0.0)`` is returned when ``ground_truth`` is empty.
    """
    num_queries = len(ground_truth)
    if num_queries == 0:
        # Nothing to average over; avoids dividing by zero below.
        return 0.0, 0.0, 0.0

    mrr, ndcg, recall = 0.0, 0.0, 0.0

    for idx, query_similarities in enumerate(similarity_matrix):
        # A set gives constant-time membership tests and ignores repeated indices.
        relevant_docs = set(ground_truth[idx])
        if not relevant_docs:
            continue

        # Document indices ordered from most to least similar.
        ranked_indices = np.argsort(query_similarities)[::-1]
        top_ranked = ranked_indices[:top_k]

        # Mean Reciprocal Rank (MRR): reciprocal rank of the first relevant hit.
        for rank, doc_idx in enumerate(ranked_indices):
            if doc_idx in relevant_docs:
                mrr += 1 / (rank + 1)
                break

        # Normalized Discounted Cumulative Gain (NDCG) with binary relevance.
        dcg = sum(
            1 / np.log2(rank + 2)
            for rank, doc_idx in enumerate(top_ranked)
            if doc_idx in relevant_docs
        )
        idcg = sum(
            1 / np.log2(rank + 2)
            for rank in range(min(len(relevant_docs), top_k))
        )
        ndcg += (dcg / idcg) if idcg > 0 else 0

        # Recall: share of the relevant documents found within the top_k.
        retrieved_relevant = sum(1 for doc_idx in top_ranked if doc_idx in relevant_docs)
        recall += retrieved_relevant / len(relevant_docs)

    return mrr / num_queries, ndcg / num_queries, recall / num_queries

# Generate ground truth based on similarity threshold
def generate_ground_truth(query_embeddings, candidate_embeddings, threshold_percentile=80, top_k=10):
    """Derive a pseudo ground truth by thresholding query/candidate similarities.

    The similarity matrix is the matrix product of the query embeddings and the
    transposed candidate embeddings. A candidate counts as relevant for a query
    when its score is at or above the ``threshold_percentile``-th percentile of
    all scores in the matrix.

    Args:
        query_embeddings: 2-D torch tensor with one row per query.
        candidate_embeddings: 2-D torch tensor with one row per candidate.
        threshold_percentile: Percentile of all scores used as the cut-off.
        top_k: Accepted but not used by this function.

    Returns:
        Tuple ``(ground_truth, similarity_matrix)``: a list holding, per query,
        the list of relevant candidate indices, and the similarity matrix as a
        NumPy array.
    """
    scores = torch.mm(query_embeddings, candidate_embeddings.T).cpu().numpy()
    cutoff = np.percentile(scores.flatten(), threshold_percentile)

    relevant_per_query = [
        [col for col, value in enumerate(row) if value >= cutoff]
        for row in scores
    ]
    return relevant_per_query, scores

# Main Workflow: page title and embedding-model selection
st.title("Medical Document Retrieval with Model Selection")

# Dropdown for Model Selection.
# Maps the label shown in the dropdown to the model identifier that is later
# passed to initialize_model_with_loss().
model_options = {
    "MedEmbed": "abhinand/MedEmbed-large-v0.1",
    "AllMiniLMv6": "sentence-transformers/all-MiniLM-L6-v2",
    "BioClinical": "emilyalsentzer/Bio_ClinicalBERT",
    "PubMed": "pritamdeka/S-PubMedBert-MS-MARCO",
}
selected_model_name = st.selectbox("Select a model:", list(model_options.keys()))
selected_model_path = model_options[selected_model_name]

# Upload -> chunk -> embed -> rank -> (optional) summarize.
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
if uploaded_file is not None:
    extracted_text = parse_pdf(uploaded_file)
    if extracted_text:
        st.success("PDF Text Extracted Successfully!")
        candidates = preprocess_pdf_text(extracted_text)

        query = st.text_input("Enter your query:")
        if query and not candidates:
            # Whitespace-only text produces no chunks; encoding an empty list
            # would break the similarity computation below.
            st.warning("The PDF contains no usable text to search.")
        elif query:
            # Initialize model with Matryoshka loss (the loss is not needed for
            # inference; only the model is used below).
            model, matryoshka_loss = initialize_model_with_loss(selected_model_path)

            # Encode the query and candidates
            query_embeddings = model.encode([query], convert_to_tensor=True)
            candidate_embeddings = model.encode(candidates, convert_to_tensor=True)

            # Generate similarity matrix and ground truth
            ground_truth, similarity_matrix = generate_ground_truth(query_embeddings, candidate_embeddings)

            # Rescale the similarity matrix to 0-100 range
            similarity_matrix_rescaled = rescale_similarity_to_100(similarity_matrix)

            # Sidebar for Metrics and Derived Ground Truth
            with st.sidebar:
                st.header("Metrics and Ground Truth")
                st.write("**Automatically Derived Ground Truth:**")
                st.write(ground_truth)

                # Metrics Calculation.
                # NOTE: the ground truth is derived from these same similarity
                # scores, so the metrics describe self-consistency rather than
                # retrieval quality against human relevance labels.
                mrr, ndcg, recall = calculate_metrics(similarity_matrix_rescaled, ground_truth, top_k=10)
                st.metric(label="Mean Reciprocal Rank (MRR)", value=f"{mrr:.4f}")
                st.metric(label="Normalized Discounted Cumulative Gain (NDCG)", value=f"{ndcg:.4f}")
                st.metric(label="Recall", value=f"{recall:.4f}")

            # Information Retrieval
            ranked_indices = np.argsort(similarity_matrix_rescaled[0])[::-1]
            top_k = 3  # Number of best-matching chunks shown and summarized
            top_candidates = [candidates[idx] for idx in ranked_indices[:top_k]]

            st.write(f"Top {top_k} Relevant Content:")
            for idx, candidate in enumerate(top_candidates):
                truncated_candidate = truncate_text(candidate, max_length=250)
                score = similarity_matrix_rescaled[0][ranked_indices[idx]]
                st.write(f"Rank {idx + 1}: {truncated_candidate} (Score: {score:.2f})")

            if st.button("Summarization"):
                # Summarization Logic
                import os

                from groq import Groq

                # The API key must come from the environment of the process that
                # launches Streamlit; it is never stored in the source file.
                groq_api_key = os.environ.get("GROQ_API_KEY")
                if not groq_api_key:
                    st.error("GROQ_API_KEY is not set. Define it in the environment that launches Streamlit to enable summarization.")
                    st.stop()
                client = Groq(api_key=groq_api_key)

                prompt = """
                You are a helpful and concise assistant that follows the ReAct pattern step-by-step.
                Your task is to summarize the provided content into a short, context-rich response without adding external information.



                For every task:
                1. Thought: Summarize the content and context, focusing on sessions, timing, and patient progress.
                2. Action: Extract key details such as:
                  - Document Type (if available)
                  - Heading (if available)
                  - Date of birth (if available)
                  - Physician name (if available)
                  - Evaluation date (or relevant dates, e.g., sessions completed or missed)
                  - Motor strength (if mentioned)
                  - Session details (number of sessions attended, missed, or planned)
                  - ICD/CPT codes (if provided)
                  - Physical therapy plan (including number of sessions completed, missed, or planned)
                3. Observation: Briefly describe the patient's current status, progress, and next steps. If any sessions are mentioned, include them in the description, such as how many sessions have been attended, missed, or planned. Highlight any changes in treatment or future therapy plans.
                4. Answer: Provide a concise summary by considering the following from Thought, Action, and Observation:
                   "On [evaluation date], [Document Type], attended by [physician]. The patient [condition/progress]. The patient has completed [number] sessions and missed [number] sessions due to [reason]. [Next steps].
                """

                input_text = "\n".join(top_candidates)

                # Sending the request to generate a summary
                chat_completion = client.chat.completions.create(
                    messages=[
                        {"role": "system", "content": prompt},
                        {"role": "user", "content": input_text}
                    ],
                    model="llama3-8b-8192",
                    temperature=0.3,
                    max_tokens=1000,  # Limit the summary length
                    top_p=1,
                    stop=None,
                    stream=False,
                )

                # Getting and displaying the summary
                summary = chat_completion.choices[0].message.content.strip()

                st.write("Summary of Top Relevant Content:")
                st.write(summary)

Overwriting streamlit_app.py


# **Configuring Ngrok**

In [None]:
!ngrok config add-authtoken 2pNSREBSxUcY9bKMdBDQ2GVAi06_6eQPJKfwjhpGstZSAw6rP

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


# **Creating a Tunnel to Access the Streamlit App Directly from Colab**

In [None]:
import subprocess

from pyngrok import ngrok

STREAMLIT_PORT = "8501"

# Kill existing Ngrok tunnels if any
if ngrok.get_tunnels():
    ngrok.kill()

# Stop a Streamlit server left running by a previous execution of this cell;
# otherwise it keeps the port busy and the new server cannot bind to it.
if "process" in globals() and process.poll() is None:
    process.terminate()
    try:
        process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        process.kill()

# Start Streamlit in the background
process = subprocess.Popen(["streamlit", "run", "streamlit_app.py", "--server.port", STREAMLIT_PORT])

# Expose Streamlit app using Ngrok (same port the server was started on)
public_url = ngrok.connect(STREAMLIT_PORT)
print(f"Streamlit app is live at: {public_url}")


Streamlit app is live at: NgrokTunnel: "https://6279-34-73-220-170.ngrok-free.app" -> "http://localhost:8501"
