In [None]:
import os
# Placeholders for the two secrets. setdefault() only writes the empty
# placeholder when the variable is missing, so a key that is already exported
# in the shell is no longer overwritten with "". Real values are expected to
# come from the environment or from the ../.env file loaded in the next cell.
for _secret_name in ("OPENAI_API_KEY", "LANGSMITH_API_KEY"):
    os.environ.setdefault(_secret_name, "")

# Non-secret LangSmith settings: enable tracing and name the target project.
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = "microbiology-assistance-burmese"

In [None]:
from dotenv import load_dotenv
# Read ../.env into os.environ. override=True lets values from the file
# replace anything that is already set in the process environment.
load_dotenv(
    dotenv_path="../.env",
    override=True,
)

True

In [3]:
# Debug check, kept disabled so the key is never echoed into saved cell output:
# os.environ.get("OPENAI_API_KEY")

In [1]:
from typing import List

import nest_asyncio
from langsmith import traceable
from openai import OpenAI
from openai.types.chat import ChatCompletion

from utils import get_vector_db_retriever

In [2]:
# Application version tag (a float; not referenced by the other cells shown here).
APP_VERSION: float = 1.0

# Provider / model pair: used as trace metadata on call_openai, and MODEL_NAME
# is also the default `model` argument of that function.
MODEL_NAME: str = "gpt-4o-mini"
MODEL_PROVIDER: str = "openai"

In [3]:
# Initial, minimal system prompt for the question-answering assistant.
# NOTE(review): RAG_SYSTEM_PROMPT is assigned again in a later cell, so in a
# top-to-bottom run this value is replaced before any request is sent.
RAG_SYSTEM_PROMPT = """You are a helpful assistant for Question-Answering tasks.
User questions may be written in English or Burmese.
Always translate your final response into **Burmese only**.
If you don’t know the answer, say so in Burmese.
Keep answers concise (no more than 3 sentences).
"""

In [14]:
# Detailed system prompt sent as the "system" message by generate_response.
# This assignment replaces the shorter RAG_SYSTEM_PROMPT from the previous cell.
# NOTE(review): the prompt asks the model to use previous exchanges, but
# generate_response sends only this prompt plus a single user message, so no
# conversation history reaches the model — confirm whether that is intended.
RAG_SYSTEM_PROMPT = """You are an expert Burmese-language assistant specializing in question-answering tasks.

## CORE INSTRUCTIONS:
- Analyze the retrieved context carefully to provide accurate answers
- Accept questions in both English and Burmese languages
- **ALWAYS respond exclusively in Burmese** regardless of the question language
- Maintain conversation context and refer to previous exchanges when relevant
- Handle follow-up questions by linking them to the original question and previous answers, unless new context explicitly overrides

## RESPONSE GUIDELINES:
1. **Answer Structure**: Provide direct, factual responses based solely on the retrieved context
2. **Length**: Keep responses concise (maximum 3 sentences)
3. **Accuracy**: Only answer what you can confidently derive from the provided context
4. **Unknown Information**: If the context doesn't contain sufficient information, clearly state in Burmese:  
   **"ပေးထားသော အချက်အလက်များတွင် ဤမေးခွန်းအတွက် လုံလောက်သော အဖြေမရှိပါ။"**

## CONTEXT USAGE:
- Prioritize the most recent and relevant context pieces
- Synthesize information from multiple context sources when applicable
- Cite specific details from context when providing factual claims
- When the user asks a follow-up question, use both the **original question** and the **previous answer** to interpret their intent
- Do not add information not present in the retrieved context

## RESPONSE QUALITY:
- Use natural, fluent Burmese language
- Adapt tone to match the question's formality level
- Provide helpful clarifications when the context allows
- Structure complex answers with clear logical flow

## FALLBACK BEHAVIOR:
If no relevant context is provided or the context is insufficient:  
**"ပေးထားသော အချက်အလက်များကို အခြေခံ၍ ဤမေးခွန်းကို မဖြေနိုင်ပါ။ ပိုမိုတိကျသော အချက်အလက်များ လိုအပ်ပါသည်။"**

Remember: Your primary goal is to be a reliable, accurate, and helpful Burmese-language assistant that provides contextually grounded responses.
"""


In [5]:
# Apply the nest_asyncio patch before the retriever is built.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop — confirm against utils.get_vector_db_retriever.
nest_asyncio.apply()

# Shared OpenAI client, created with no explicit arguments.
openai_client = OpenAI()

# Retriever used by retrieve_documents; built by the project-local helper.
retriever = get_vector_db_retriever()

In [6]:
@traceable(run_type="chain", metadata={'vectordb': 'sklearn'})
def retrieve_documents(question: str):
    """Look up documents for ``question`` with the module-level retriever.

    The run is traced as a "chain" with ``vectordb: sklearn`` metadata.

    Args:
        question: The user's question, passed unchanged to ``retriever.invoke``.

    Returns:
        Whatever ``retriever.invoke`` returns for the question.
    """
    matches = retriever.invoke(question)
    return matches

In [7]:
@traceable(
    run_type="llm",
    metadata={
        'model_provider': MODEL_PROVIDER,
        'model_name': MODEL_NAME,
    }
)
def call_openai(messages: List[dict], model: str = MODEL_NAME, temperature: float = 0.0) -> ChatCompletion:
    """Send a chat-completion request through the shared OpenAI client.

    Args:
        messages: Chat messages, each a dict (the caller in this notebook
            supplies "role" and "content" keys).
        model: Model identifier forwarded to the API; defaults to MODEL_NAME.
        temperature: Sampling temperature forwarded to the API; defaults to 0.0.

    Returns:
        The complete response object from ``chat.completions.create`` — not a
        plain string. Callers read the generated text via
        ``.choices[0].message.content``.
    """
    # NOTE: the trace metadata above is fixed when the decorator runs, so it
    # keeps reporting MODEL_NAME even if a different `model` is passed in.
    return openai_client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature
    )

In [8]:
@traceable(run_type="chain")
def generate_response(question: str, documents):
    """Build the chat messages for a question and forward them to call_openai.

    Args:
        question: The user's question.
        documents: Iterable of objects exposing a ``page_content`` attribute;
            their contents are joined with blank lines to form the context.

    Returns:
        The value returned by ``call_openai`` for the assembled messages.
    """
    formatted_docs = "\n\n".join([doc.page_content for doc in documents])

    system_message = {"role": "system", "content": RAG_SYSTEM_PROMPT}
    user_message = {
        "role": "user",
        "content": f"Context: {formatted_docs}\nQuestion: {question}",
    }

    return call_openai([system_message, user_message])

In [9]:
@traceable(run_type="chain")
def langsmith_rag(question: str):
    """Answer ``question`` by retrieving documents and then generating a reply.

    Args:
        question: The user's question.

    Returns:
        The message content of the first choice in the completion returned by
        ``generate_response``.
    """
    completion = generate_response(question, retrieve_documents(question))
    first_choice = completion.choices[0]
    return first_choice.message.content

In [10]:
# Single call that attaches extra metadata to the trace at call time
# through the langsmith_extra keyword.
question = "အဏုဇီဝဗေဒ ဆိုတာ ဘာလဲ"
ai_answer = langsmith_rag(
    question,
    langsmith_extra={"metadata": {"runtime_metadata": "foo"}},
)
print(ai_answer)

အဏုဇီဝဗေဒ MicroBiology သည် မျက်စိဖြင့် မမြင်နိုင်သော အဏုဇီဝ သက်ရှိများကို လေ့လာသည့် သိပ္ပံပညာရပ် ဖြစ်သည် ။ ဤပညာရပ်တွင် ဘက်တီးရီးယား ၊ ဗိုင်းရစ်၊ မှို၊ နှင့် protozoa များကို အဓိက လေ့လာသည် ။


In [11]:
import uuid
# Identifier shared by the calls below via trace metadata. Stored as a string
# so the metadata value is plain text rather than a uuid.UUID object.
thread_id = str(uuid.uuid4())

In [12]:
# First call tagged with the shared thread_id in its trace metadata.
question = "Bacteria ၏ ဖွဲ့စည်းပုံ"
ai_answer = langsmith_rag(
    question,
    langsmith_extra={"metadata": {"thread_id": thread_id}},
)
print(ai_answer)

Bacteria သည် prokaryotic cell ဖြစ်ပြီး၊ နျူကလိယပ် အမြှေးပါး မရှိပါ။ ၎င်းတွင် cell wall၊ cell membrane၊ cytoplasm နှင့် ribosomes များ ပါဝင်သည်။ အချို့တွင် flagella သို့မဟုတ် pili လည်း ရှိနိုင်သည်။


In [13]:
# Second call carrying the same thread_id in its trace metadata.
question = "အဏုဇီဝဗေဒ ဆိုတာ ဘာလဲ"
ai_answer = langsmith_rag(
    question,
    langsmith_extra={"metadata": {"thread_id": thread_id}},
)
print(ai_answer)

အဏုဇီဝဗေဒ MicroBiology သည် မျက်စိဖြင့် မမြင်နိုင်သော အဏုဇီဝ သက်ရှိများကို လေ့လာသည့် သိပ္ပံပညာရပ် ဖြစ်သည် ။ ဤပညာရပ်တွင် ဘက်တီးရီးယား ၊ ဗိုင်းရစ်၊ မှို၊ နှင့် protozoa များကို အဓိက လေ့လာသည် ။
