<a href="https://colab.research.google.com/github/viniciusmmartins/PosFiap-AI-TechChallange-Fase3/blob/main/Tech_Challange_fase3_RAG_Gemma_4b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG de protocolos médicos

## Dependency install

In [None]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install -q transformers torch accelerate langchain langchain-community langchain-core langchain-chroma sentence-transformers chromadb huggingface_hub

Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-chroma
  Downloading langchain_chroma-1.1.0-py3-none-any.whl.metadata (1.9 kB)
Collecting chromadb
  Downloading chromadb-1.3.7-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.3-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8

## Imports

In [None]:
import requests
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from huggingface_hub import upload_folder
import json
from google.colab import userdata

## Download Data from Your API

In [None]:
# Fetch the protocols from the REST endpoint and decode the JSON body.
url = "http://72.60.57.100:8000/protocols"
# Without a timeout, requests waits indefinitely if the server never answers.
resp = requests.get(url, timeout=30)
resp.raise_for_status()  # fail loudly on any 4xx/5xx response
protocols = resp.json()

# Show the size and one sample record instead of dumping the whole payload.
print(f"Fetched {len(protocols)} protocols")
print(protocols[:1])

[{'id': 'proto_0001', 'name': 'Stroke - Protocol 1', 'category': 'Toxicology', 'description': 'Standard protocol for handling Stroke in Toxicology context.', 'created_at': '2025-10-16T21:18:07.959960', 'steps': [{'step_number': 1, 'description': 'Step 1 description', 'action': 'Monitor ECG', 'duration_minutes': 8}, {'step_number': 2, 'description': 'Step 2 description', 'action': 'Check Vitals', 'duration_minutes': 41}, {'step_number': 3, 'description': 'Step 3 description', 'action': 'Check Blood Glucose', 'duration_minutes': 27}, {'step_number': 4, 'description': 'Step 4 description', 'action': 'Administer Antibiotics', 'duration_minutes': 7}, {'step_number': 5, 'description': 'Step 5 description', 'action': 'Order CT Scan', 'duration_minutes': 1}, {'step_number': 6, 'description': 'Step 6 description', 'action': 'Perform CPR', 'duration_minutes': 55}, {'step_number': 7, 'description': 'Step 7 description', 'action': 'Administer Oxygen', 'duration_minutes': 1}, {'step_number': 8, 'de

## Convert each protocol into a clean RAG text document

In [None]:
def protocol_to_text(proto):
    """
    Render one protocol record as a plain-text document for embedding in RAG.

    Args:
        proto: Mapping with the keys "name", "category", "description" and
            "created_at", plus an optional "steps" list. Each step is a
            mapping with "step_number", "description", "action" and
            "duration_minutes".

    Returns:
        A string made of a four-line header, a blank line, a "Steps:" line
        and one "- Step N: ..." bullet per step (newline-separated). When
        there are no steps the text ends right after the "Steps:" line.
        The protocol id is not part of the text.

    Raises:
        KeyError: If a header key or a step key is missing.
    """
    header = (
        f"Name: {proto['name']}\n"
        f"Category: {proto['category']}\n"
        f"Description: {proto['description']}\n"
        f"Created at: {proto['created_at']}\n\n"
        "Steps:\n"
    )

    # A missing "steps" key and an explicit null both mean "no steps";
    # `.get("steps", [])` alone would still hand back None for the latter.
    steps = proto.get("steps") or []

    step_lines = [
        f"- Step {step['step_number']}: {step['description']}. "
        f"Action: {step['action']}. "
        f"Duration: {step['duration_minutes']} minutes."
        for step in steps
    ]

    return header + "\n".join(step_lines)


# Build one text document per protocol, in the same order as `protocols`.
documents = [protocol_to_text(p) for p in protocols]

# Preview a single document; printing the whole list floods the cell output.
if documents:
    print(documents[0])

print("Total protocols loaded:", len(documents))

['Name: Stroke - Protocol 1\nCategory: Toxicology\nDescription: Standard protocol for handling Stroke in Toxicology context.\nCreated at: 2025-10-16T21:18:07.959960\n\nSteps:\n- Step 1: Step 1 description. Action: Monitor ECG. Duration: 8 minutes.\n- Step 2: Step 2 description. Action: Check Vitals. Duration: 41 minutes.\n- Step 3: Step 3 description. Action: Check Blood Glucose. Duration: 27 minutes.\n- Step 4: Step 4 description. Action: Administer Antibiotics. Duration: 7 minutes.\n- Step 5: Step 5 description. Action: Order CT Scan. Duration: 1 minutes.\n- Step 6: Step 6 description. Action: Perform CPR. Duration: 55 minutes.\n- Step 7: Step 7 description. Action: Administer Oxygen. Duration: 1 minutes.\n- Step 8: Step 8 description. Action: Check Blood Glucose. Duration: 17 minutes.', 'Name: Overdose - Protocol 2\nCategory: Neurology\nDescription: Standard protocol for handling Overdose in Neurology context.\nCreated at: 2025-02-01T21:18:08.057818\n\nSteps:\n- Step 1: Step 1 descr

## Chunk the Documents

In [None]:
# Splitter configuration: 512-unit chunks with a 100-unit overlap between
# neighbouring chunks; the separator list runs from paragraph breaks down to
# the empty string.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_overlap=100,
    chunk_size=512,
)

# Flatten the per-document pieces into one list, preserving document order.
chunks = []
for doc in documents:
    chunks += splitter.split_text(doc)

print(f"Generated {len(chunks)} chunks from {len(documents)} protocols.")

Generated 1980 chunks from 1000 protocols.


## Create Embeddings


In [None]:
# Embedding model used to vectorise the chunks: BGE base (English), run on
# CPU, with normalisation of the output vectors requested at encode time.
embed_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    encode_kwargs=dict(normalize_embeddings=True),
    model_kwargs=dict(device="cpu"),  # Change to "cuda" if GPU available
)

  embed_model = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Create ChromaDB Vector Store

In [None]:
# Mount Google Drive at /content/drive; the Chroma database directory used
# below lives under this mount point.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# Directory (on the mounted Drive) where Chroma persists the collection.
# NOTE(review): the folder name says "gemma7b" while the Hub repo is named
# "...gemma-4b..."; the path is kept as is so existing data is still found.
db_dir = "/content/drive/MyDrive/AI/tech challenge/rag/chroma_protocols_gemma7b"

client = chromadb.PersistentClient(path=db_dir)

collection = client.get_or_create_collection("protocols_collection")

batch_size = 100
total_chunks = len(chunks)

for i in range(0, total_chunks, batch_size):
    batch_chunks = chunks[i : i + batch_size]
    # Ids are derived from the chunk's position in `chunks`.
    batch_ids = [f"chunk_{j}" for j in range(i, i + len(batch_chunks))]

    # Compute embeddings manually to ensure control
    batch_embeddings = embed_model.embed_documents(batch_chunks)

    # upsert instead of add: the store is persistent and the ids are fixed,
    # so re-running this cell must overwrite existing entries rather than
    # collide with them. Ids left over from an earlier run with more chunks
    # are not removed here.
    collection.upsert(
        ids=batch_ids,
        documents=batch_chunks,
        embeddings=batch_embeddings
    )
    print(f"Processed batch {i} to {i+len(batch_chunks)}")

Processed batch 0 to 100
Processed batch 100 to 200
Processed batch 200 to 300
Processed batch 300 to 400
Processed batch 400 to 500
Processed batch 500 to 600
Processed batch 600 to 700
Processed batch 700 to 800
Processed batch 800 to 900
Processed batch 900 to 1000
Processed batch 1000 to 1100
Processed batch 1100 to 1200
Processed batch 1200 to 1300
Processed batch 1300 to 1400
Processed batch 1400 to 1500
Processed batch 1500 to 1600
Processed batch 1600 to 1700
Processed batch 1700 to 1800
Processed batch 1800 to 1900
Processed batch 1900 to 1980


## Upload to Hugging Face

In [None]:
# Target dataset repository on the Hugging Face Hub.
repo_id = "gerson-analista/rag-gemma-4b-tech-challenge"
# Upload the Chroma directory written by the indexing cell. Previously this
# variable held "./chroma" and was never used, which hid what was uploaded.
folder_path = db_dir

upload_folder(
    folder_path=folder_path,
    repo_id=repo_id,
    repo_type="dataset",
    token=userdata.get('HF_TOKEN')  # read from Colab secrets, never hardcoded
)

print("Uploaded to HuggingFace:", repo_id)

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...0-5c9674efcf89/header.bin: 100%|##########|   100B /   100B            

  ...ls_gemma7b/chroma.sqlite3:  45%|####5     | 3.79MB / 8.33MB            

  ...674efcf89/data_level0.bin:  45%|####5     | 1.46MB / 3.21MB            

  ...f89/index_metadata.pickle:  45%|####5     | 17.2kB / 37.9kB            

  ...0-5c9674efcf89/length.bin:  45%|####5     | 1.82kB / 4.00kB            

  ...9674efcf89/link_lists.bin:  45%|####5     | 3.92kB / 8.62kB            

Uploaded to HuggingFace: gerson-analista/rag-gemma-7b-tech-challenge


## Test: How to use the RAG index from Hugging Face

In [None]:
from huggingface_hub import snapshot_download
from sentence_transformers import SentenceTransformer
import chromadb
import os

# --------------------------------------------------
# 1. Download dataset from Hugging Face
# --------------------------------------------------
folder = snapshot_download(
    repo_id="gerson-analista/rag-gemma-4b-tech-challenge",
    repo_type="dataset",
    token=userdata.get('HF_TOKEN'),  # optional if public
)

# --------------------------------------------------
# 2. Auto-detect Chroma DB path
# --------------------------------------------------
# The first directory in the snapshot that contains chroma.sqlite3 is used
# as the database root.
db_path = None
for root, dirs, files in os.walk(folder):
    if "chroma.sqlite3" in files:
        db_path = root
        break

if not db_path:
    raise RuntimeError("❌ Chroma database not found")

print(f"✅ Chroma DB found at: {db_path}")

# --------------------------------------------------
# 3. Initialize Chroma Persistent Client
# --------------------------------------------------
client = chromadb.PersistentClient(path=db_path)

collections = client.list_collections()
print("📦 Available collections:", collections)

collection = client.get_collection(name="protocols_collection")

print(f"📄 Total documents in collection: {collection.count()}")

# --------------------------------------------------
# 4. Load EMBEDDING MODEL (MUST MATCH INDEX)
# --------------------------------------------------
# ✅ Dataset was indexed using BGE
embed_model = SentenceTransformer(
    "BAAI/bge-base-en-v1.5",
    device="cpu"
)

# --------------------------------------------------
# 5. Query
# --------------------------------------------------
query = "List steps to stroke protocols"
# The index was built with normalize_embeddings=True, so the query vector is
# normalised the same way to keep both sides in the same space.
q_emb = embed_model.encode([query], normalize_embeddings=True).tolist()

# --------------------------------------------------
# 6. Perform retrieval
# --------------------------------------------------
# Ask for the three nearest chunks to the query embedding.
results = collection.query(
    query_embeddings=q_emb,
    n_results=3
)

print(results)

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

4d98f51b-ff2a-453b-bacc-3ec03b2201aa/hea(…):   0%|          | 0.00/100 [00:00<?, ?B/s]

4d98f51b-ff2a-453b-bacc-3ec03b2201aa/len(…):   0%|          | 0.00/4.00k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/2.51k [00:00<?, ?B/s]

4d98f51b-ff2a-453b-bacc-3ec03b2201aa/ind(…):   0%|          | 0.00/37.9k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

4d98f51b-ff2a-453b-bacc-3ec03b2201aa/dat(…):   0%|          | 0.00/3.21M [00:00<?, ?B/s]

4d98f51b-ff2a-453b-bacc-3ec03b2201aa/lin(…):   0%|          | 0.00/8.62k [00:00<?, ?B/s]

chroma.sqlite3:   0%|          | 0.00/8.33M [00:00<?, ?B/s]

d481020a-384e-467a-9540-5c9674efcf89/dat(…):   0%|          | 0.00/3.21M [00:00<?, ?B/s]

d481020a-384e-467a-9540-5c9674efcf89/len(…):   0%|          | 0.00/4.00k [00:00<?, ?B/s]

d481020a-384e-467a-9540-5c9674efcf89/ind(…):   0%|          | 0.00/37.9k [00:00<?, ?B/s]

d481020a-384e-467a-9540-5c9674efcf89/lin(…):   0%|          | 0.00/8.62k [00:00<?, ?B/s]

✅ Chroma DB found at: /root/.cache/huggingface/hub/datasets--gerson-analista--rag-gemma-4b-tech-challenge/snapshots/c6dcb4873f67b5f87282f3e7720a2975083d1466
📦 Available collections: [Collection(name=protocols_collection)]
📄 Total documents in collection: 1980


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

{'ids': [['chunk_866', 'chunk_1193', 'chunk_1009']], 'embeddings': None, 'documents': [['Name: Stroke - Protocol 431\nCategory: Cardiology\nDescription: Standard protocol for handling Stroke in Cardiology context.\nCreated at: 2025-09-01T21:18:08.064787\n\nSteps:\n- Step 1: Step 1 description. Action: Monitor ECG. Duration: 37 minutes.\n- Step 2: Step 2 description. Action: Check Blood Glucose. Duration: 47 minutes.\n- Step 3: Step 3 description. Action: Start IV Fluids. Duration: 20 minutes.\n- Step 4: Step 4 description. Action: Perform CPR. Duration: 15 minutes.', 'Name: Stroke - Protocol 600\nCategory: Cardiology\nDescription: Standard protocol for handling Stroke in Cardiology context.\nCreated at: 2025-08-22T21:18:08.068616\n\nSteps:\n- Step 1: Step 1 description. Action: Check Blood Glucose. Duration: 58 minutes.\n- Step 2: Step 2 description. Action: Give Aspirin. Duration: 2 minutes.\n- Step 3: Step 3 description. Action: Administer Oxygen. Duration: 12 minutes.', 'Name: Strok