<a href="https://colab.research.google.com/github/vgu-its24-psd/MedDiag/blob/main/Pipeline_to_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import data from Google Drive

In [None]:
from google.colab import drive

# Project folder inside the mounted Drive; the ingestion cell walks this tree.
PATH2FOLDER = '/content/drive/MyDrive/Pipeline_to_RAG'

# Mount Google Drive so that PATH2FOLDER becomes reachable on the local filesystem.
drive.mount('/content/drive')

In [None]:
!pip install -qU langchain-qdrant
!pip install -qU langchain-huggingface

In [None]:
from transformers import pipeline
from PIL import Image
import requests
import torch

In [None]:
import os
import json

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model shared by the vector store for indexing and querying.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

In [None]:
# Initialize the image summary pipeline (MedGemma 4B, bfloat16 weights, on the GPU).
image_summary_pipe = pipeline(
    "image-text-to-text",
    model="google/medgemma-4b-it",
    torch_dtype=torch.bfloat16,
    device="cuda",
)

In [None]:
def image_summary(image_url, max_new_tokens=200):
    """Describe an image from a medical diagnostic perspective using ``image_summary_pipe``.

    Args:
        image_url: Location of the image, handed directly to ``PIL.Image.open``.
            Despite the name, this must be something ``Image.open`` accepts
            (the ingestion cell passes a local file path).
        max_new_tokens: Upper bound on the number of tokens the model may
            generate for the description. Defaults to 200.

    Returns:
        The ``content`` of the last message in the pipeline's generated chat,
        i.e. the model's description of the image.
    """
    # The context manager releases the underlying file handle once the
    # pipeline has consumed the image.
    with Image.open(image_url) as image:
        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": "You are the expert in Tropical deases image analysis"}]
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Analyze the provided image and describe it strictly from a medical diagnostic perspective, without naming or suggesting any specific disease"},
                    {"type": "image", "image": image}
                ]
            }
        ]
        output = image_summary_pipe(text=messages, max_new_tokens=max_new_tokens)
    # The pipeline returns the whole conversation; the final entry is the model's reply.
    return output[0]["generated_text"][-1]["content"]

In [None]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain.storage import InMemoryStore
from langchain.retrievers.multi_vector import MultiVectorRetriever
import getpass
import os
import uuid

# SECURITY: credentials must never be stored in the notebook. The API key is
# read from the QDRANT_API_KEY environment variable, with an interactive prompt
# as fallback. Any key that was previously committed in this cell should be
# treated as compromised and rotated.
QDRANT_URL = os.environ.get(
    "QDRANT_URL",
    "https://2fe338c1-dc5a-45ea-98fc-5a653ed6567d.us-east4-0.gcp.cloud.qdrant.io:6333",
)
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY") or getpass.getpass("Qdrant API key: ")
COLLECTION_NAME = "demo_collection"

client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
)

# Start from an empty collection on every run. This is the explicit form of
# the deprecated `recreate_collection` helper: drop if present, then create.
if client.collection_exists(COLLECTION_NAME):
    client.delete_collection(COLLECTION_NAME)
client.create_collection(
    collection_name=COLLECTION_NAME,
    # The vector size must match the output dimension of the embedding model.
    vectors_config=VectorParams(size=768, distance=Distance.COSINE),
)
vector_store = QdrantVectorStore(
    client=client,
    collection_name=COLLECTION_NAME,
    embedding=embeddings,
)
# The storage layer for the parent documents
store = InMemoryStore()
# Metadata key that links a vector-store entry to its document in `store`.
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(vectorstore=vector_store, docstore=store, id_key=id_key)

In [None]:
from langchain_core.documents import Document

# Every text chunk and every image summary is collected here as one Document.
all_docs = []

for dirpath, _, filenames in os.walk(PATH2FOLDER):
    # Sorted so the ingestion order does not depend on the directory listing order.
    json_names = sorted(name for name in filenames if name.endswith(".json"))
    parsed_any_json = False

    # --- Text chunks: one Document per entry of "text_chunks" in each JSON file ---
    for fname in json_names:
        fpath = os.path.join(dirpath, fname)
        try:
            with open(fpath, "r", encoding="utf-8") as f:
                data = json.load(f)
        except json.JSONDecodeError as exc:
            # Best effort: report the malformed file and keep going.
            print(f"Skipping malformed JSON file {fpath}: {exc}")
            continue
        parsed_any_json = True
        for chunk in data.get("text_chunks", []):
            all_docs.append(
                Document(
                    page_content=chunk.get("text", ""),
                    metadata={
                        "doc_id": chunk["chunk_id"],
                        "filename": chunk["metadata"]["filename"],
                    },
                )
            )

    # --- Images: summarised ONCE per folder ---
    # Previously this walk ran once for every JSON file in the folder, so a
    # folder with several JSON files re-ran the model on each image and
    # indexed duplicate summaries. Images are still only ingested for folders
    # that contain at least one readable JSON file.
    if not parsed_any_json:
        continue
    image_path = os.path.join(dirpath, "images")
    for dirpath2, _, filenames2 in os.walk(image_path):
        for fname2 in sorted(filenames2):
            if not fname2.endswith(".png"):
                continue
            fpath2 = os.path.join(dirpath2, fname2)
            all_docs.append(
                Document(
                    page_content=image_summary(fpath2),
                    metadata={
                        # Images have no chunk id of their own, so a fresh one is generated.
                        "doc_id": str(uuid.uuid4()),
                        "filename": fname2,
                    },
                )
            )

# add all at once
if all_docs:
    retriever.vectorstore.add_documents(all_docs)
    # The docstore is keyed by the same doc_id that is stored in each document's metadata.
    retriever.docstore.mset([(d.metadata["doc_id"], d) for d in all_docs])
else:
    print(f"No documents found under {PATH2FOLDER}; nothing was indexed.")

In [None]:
# Run a sample query against the retriever and print each hit,
# followed by a blank line and an 80-character rule.
docs = retriever.invoke("dengue fever symptoms")
for doc in docs:
    print("\n\n".join([str(doc), "-" * 80]))

# Image Summary

# Vector Store