In [35]:
# Install the third-party packages imported by the cells below into the kernel's environment.
# NOTE(review): the output recorded for this cell reports "No module named pip", so the
# install did not actually run in that environment — confirm the packages are provided
# some other way before relying on this cell.
%pip install pydantic_settings langchain langchain-core langchain-google-genai langchain-qdrant fastembed langchain-community qdrant-client langgraph

Note: you may need to restart the kernel to use updated packages.


c:\Users\alexa\OneDrive\Desktop\hackathon\.venv\Scripts\python.exe: No module named pip


In [28]:
from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    """Typed notebook settings read from environment variables and a local ``.env`` file."""

    # API key handed to the Google Generative AI clients created in later cells.
    GOOGLE_API_KEY: str
    # extra="ignore": pydantic-settings rejects unknown inputs by default, so any
    # unrelated key stored in the same .env file would otherwise raise a ValidationError.
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")


# Single settings instance shared by the rest of the notebook.
env = Settings()

In [29]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Gemini embedding client authenticated with the key loaded into `env`.
# NOTE(review): no cell visible in this notebook reads `embeddings_2`; the later cells
# construct their own embedding objects — confirm whether this one is still needed.
embeddings_2 = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001", google_api_key=env.GOOGLE_API_KEY)

In [30]:
from qdrant_client.http.models import Distance

# Name of the Qdrant collection used by the ingestion cells below.
collection_name = "doctors_packages"
# Vector size passed to VectorParams when the collection is created.
# NOTE(review): presumably this has to equal the output size of whichever embedding
# model writes into the collection — confirm for each model used.
dimension = 768
# Distance metric passed to VectorParams when the collection is created.
distance = Distance.COSINE

In [31]:
# Load the doctor records (doctors_final.json) that feed the vector store.
import json

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the platform's
# locale encoding, which can reject or garble non-ASCII characters in the records.
with open("doctors_final.json", "r", encoding="utf-8") as f:
    doctors_data = json.load(f)

# Show one record as a quick sanity check of the schema.
print(doctors_data[0])

{'id': '5eae0017-40dd-4961-869f-79d9e45d87f2', 'name': 'Adventia Emilia Krysna Sipi Seda, M.M., M.Psi., Psikolog', 'specialization_name': 'Psikologi', 'specialization_name_en': 'Psychology', 'sub_specialization_name': 'Psikolog', 'sub_specialization_name_en': 'Psychologist', 'hospital_name': 'Siloam Hospitals Yogyakarta'}


In [32]:
from qdrant_client import QdrantClient

# Qdrant client running in local in-memory mode (used here for testing, see notes below).
client = QdrantClient(":memory:")

# Previously a limited embedding was used (not every asset was read), but testing succeeded for the Yogyakarta area.
# Next we use FastEmbed; the earlier attempt with Qdrant Cloud still failed, so for now we use in-memory mode for testing.


In [33]:
from qdrant_client.http.models import VectorParams

# Create the collection only when it is not there yet, so the cell can be re-run safely.
collection_missing = not client.collection_exists(collection_name=collection_name)
if collection_missing:
    vector_settings = VectorParams(size=dimension, distance=distance)
    client.create_collection(
        collection_name=collection_name,
        vectors_config=vector_settings,
    )
    

In [34]:
# Alternative ingestion path: embed locally with FastEmbed and upsert raw Qdrant points.
# NOTE(review): this writes FastEmbed vectors into `collection_name`, the same collection
# that the Google-embedding cell further down also writes to and queries. Vectors from two
# different models are not comparable — confirm that only one ingestion path is meant to run.
from qdrant_client.models import PointStruct
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings

print("Initializing FastEmbed model...")
embeddings = FastEmbedEmbeddings(
    cache_dir="./embedding_cache",
    model_name="jinaai/jina-embeddings-v2-base-en",
)
print("Model initialized.")

# --- Batch Processing Logic ---
batch_size = 128


def _doctor_payload(row):
    """Build the point payload (searchable text plus metadata) for one doctor record."""
    text = f"Dokter: {row['name']}, Spesialisasi: {row['specialization_name_en']} ({row['sub_specialization_name_en']}), Praktik di: {row['hospital_name']}"
    return {
        "page_content": text,
        "metadata": {
            "id": row['id'],
            "name": row['name'],
            "specialization_name": row['specialization_name'],
            "specialization_name_en": row['specialization_name_en'],
            "sub_specialization_name": row['sub_specialization_name'],
            "sub_specialization_name_en": row['sub_specialization_name_en'],
            "hospital_name": row['hospital_name'],
        },
    }


def _embed_and_upsert(rows):
    """Embed one batch of doctor records and upsert them into the collection."""
    payloads = [_doctor_payload(row) for row in rows]
    vectors = embeddings.embed_documents([payload["page_content"] for payload in payloads])
    # Fail loudly instead of silently dropping records if the embedder returns too few vectors.
    if len(vectors) != len(rows):
        raise ValueError(
            f"Expected {len(rows)} embeddings but received {len(vectors)}"
        )
    # Points are created with their final vector, so no placeholder is ever stored.
    points = [
        PointStruct(id=row['id'], vector=vector, payload=payload)
        for row, vector, payload in zip(rows, vectors, payloads)
    ]
    client.upsert(collection_name=collection_name, points=points)


# Walk the records in fixed-size chunks; only a trailing partial chunk is labelled "final".
for start in range(0, len(doctors_data), batch_size):
    batch_rows = doctors_data[start:start + batch_size]
    if len(batch_rows) >= batch_size:
        print(f"Processing a batch of {len(batch_rows)} doctors...")
    else:
        print(f"Processing the final batch of {len(batch_rows)} doctors...")
    _embed_and_upsert(batch_rows)

print("All data successfully upserted!")

Initializing FastEmbed model...


PackageNotFoundError: No package metadata was found for fastembed

In [None]:
from langchain_core.documents import Document
from langchain_qdrant import QdrantVectorStore
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Wrap every doctor record in a LangChain Document: a short searchable sentence as the
# page content, and a copy of the full record as metadata.
documents = [
    Document(
        page_content=f"Dokter: {doctor['name']}, Spesialisasi: {doctor['specialization_name_en']}, Lokasi: {doctor['hospital_name']}",
        metadata=doctor.copy(),
    )
    for doctor in doctors_data
]

# Pass the key explicitly (like the other Google clients in this notebook) instead of
# relying on GOOGLE_API_KEY being exported in the process environment: the Settings
# class reads .env but does not export its values.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=env.GOOGLE_API_KEY,
)
collection_name = "doctors_packages"

# QdrantVectorStore is the same class the retriever cell uses to read this collection.
qdrant_vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)

print("Menambahkan dokumen ke vector store in-memory...")
# Reuse each record's own id as the point id so re-running this cell overwrites the
# existing points instead of inserting duplicates under freshly generated ids.
qdrant_vector_store.add_documents(
    documents,
    ids=[doctor["id"] for doctor in doctors_data],
)

print("Vector store berhasil dibuat dengan data dokumen.")

Menambahkan dokumen ke vector store in-memory...
Vector store berhasil dibuat dengan data dokumen.


In [None]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from langchain_google_genai import GoogleGenerativeAIEmbeddings

def get_retriever(qdrant_client: QdrantClient, search_kwargs=None):
    """Build a LangChain retriever over the ``doctors_packages`` Qdrant collection.

    Args:
        qdrant_client: Qdrant client that holds the collection.
        search_kwargs: Optional dict forwarded to ``as_retriever(search_kwargs=...)``,
            for example ``{"k": 10}``. ``None`` keeps the retriever's defaults.

    Returns:
        The retriever returned by ``QdrantVectorStore.as_retriever``.
    """
    # Pass the key explicitly, as the other Google clients in this notebook do, instead
    # of relying on GOOGLE_API_KEY being exported in the process environment.
    embeddings_model = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        google_api_key=env.GOOGLE_API_KEY,
    )
    collection_name = "doctors_packages"

    vector_store = QdrantVectorStore(
        client=qdrant_client,
        collection_name=collection_name,
        embedding=embeddings_model,
    )

    if search_kwargs is None:
        return vector_store.as_retriever()
    return vector_store.as_retriever(search_kwargs=search_kwargs)

# Build the notebook-level retriever on top of the shared Qdrant client.
retriever = get_retriever(qdrant_client=client)
print("Retriever berhasil diperbaiki dan dibuat!")

Retriever berhasil diperbaiki dan dibuat!


In [None]:
from langchain_core.tools import tool
from typing import Annotated, List

@tool
def search_doctor_packages(query: Annotated[str, "search query must contain keywords related to Doctors Data packages"]) -> List[str]:
    """Search for Doctor Data packages by name or specialization or location."""
    # Query the vector store directly with an explicit k: whether extra keyword
    # arguments given to retriever.invoke() reach the search depends on the installed
    # langchain-core version, so the requested 10 hits could be silently ignored.
    vector_store = get_retriever(client).vectorstore
    results = vector_store.similarity_search(query, k=10)
    # Only the text of each hit is returned to the caller.
    return [result.page_content for result in results]

In [None]:
# access the Google Gemini API
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

# Chat model shared by the graph nodes below (get_context and generate),
# authenticated with the key loaded into `env`.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    api_key=env.GOOGLE_API_KEY,
)

In [None]:
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
# Define state for application
class State(TypedDict):
    """State dictionary read and partially updated by each graph node."""

    # The user's original question; read by get_context and generate.
    question: str
    # Retrieved document texts; written by retrieve, read by generate.
    context: List[str]
    # Search string produced by get_context; read by retrieve.
    search: str
    # Final model reply; written by generate.
    answer: str

In [None]:
def get_context(state: State):
    """Turn the user's question into the search string used by the retrieval step.

    Sends ``state["question"]`` to the LLM under an Indonesian system prompt that asks
    it to extract the doctor's specialization and practice location, expressed in English.

    Returns:
        A partial state update of the form ``{"search": <model reply text>}``.
    """
    # NOTE(review): the prompt refers to a required answer structure and to null values
    # for missing keys, but never spells that structure out; the raw reply text is what
    # gets used as the search string — confirm this is intended.
    system_instructions = """
                Anda adalah sebuah sistem ahli yang berfungsi tunggal untuk mengurai 
             pertanyaan pengguna terkait pencarian dokter. Tugas utama Anda
              adalah mengekstrak dua informasi kunci dari pertanyaan tersebut: 
             spesialisasi dokter dan lokasi praktiknya, yang bisa berupa nama kota
              atau rumah sakit. Nilai untuk kedua kunci ini harus selalu dalam bahasa Inggris,
              sehingga Anda perlu secara cerdas menerjemahkan istilah Bahasa Indonesia 
             ke padanan bahasa Inggrisnya, misalnya "jantung" menjadi "Cardiology" atau
              "surabaya" menjadi "Surabaya". Apabila salah satu informasi, baik spesialisasi
              maupun lokasi, tidak ditemukan dalam pertanyaan pengguna, 
             maka Anda harus menggunakan nilai null untuk kunci yang bersangkutan. 
             Jangan pernah menyertakan teks pembuka, penjelasan, atau karakter apa pun
              di luar struktur jawaban yang diminta.
            """
    extraction_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_instructions),
            ("human", "{question}"),
        ]
    )
    reply = (extraction_prompt | llm).invoke({"question": state["question"]})
    return {"search": reply.content}

In [None]:
def retrieve(state: State):
    """Look up doctor records matching the search string produced by ``get_context``.

    Returns:
        A partial state update ``{"context": [...]}`` holding the page contents
        returned by the ``search_doctor_packages`` tool.
    """
    # A @tool-decorated function is a tool object, not a plain function: run it through
    # .invoke() with its named argument rather than the deprecated direct-call form.
    retrieved_docs = search_doctor_packages.invoke({"query": state["search"]})
    return {"context": retrieved_docs}

In [None]:
def generate(state: State):
    """Produce the final answer from the user's question and the retrieved context.

    Returns:
        A partial state update of the form ``{"answer": <model reply text>}``.
    """
    system_instructions = """
                You are an assistant that provides information about Doctor Informations at Siloam hospitals.
                You will generate a response based on the context provided.
                The response should be concise and relevant to the question asked.
                package list knowledge: 
                {context}
                If the context is empty, you can provide a general response about Doctor Information packages.
            """
    answer_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_instructions),
            ("human", "{question}"),
        ]
    )
    # The context list is handed to the template as-is, alongside the question.
    prompt_inputs = {"question": state["question"], "context": state["context"]}
    reply = (answer_prompt | llm).invoke(prompt_inputs)
    return {"answer": reply.content}

In [None]:
# Wire the three nodes in order (get_context -> retrieve -> generate) and enter at get_context.
graph_builder = StateGraph(State)
graph_builder.add_sequence([get_context, retrieve, generate])
graph_builder.add_edge(START, "get_context")
graph = graph_builder.compile()

In [None]:
# Run the compiled graph once on a sample question, starting from an otherwise empty state.
initial_state = {
    "question": "Saya batuk berdarah, ke dokter mana saya sebaiknya konsultasi? saya berada di solo",
    "context": [],
    "search": "",
    "answer": "",
}
response = graph.invoke(initial_state)
print(response["answer"])

TypeError: get_retriever() missing 1 required positional argument: 'qdrant_client'