## Create Langfuse Client

In [1]:
import os
from dotenv import load_dotenv

# Read settings from the local .env file; override=True lets values from the file
# replace variables that are already set in the process environment.
load_dotenv(".env", override=True)

# Echo the active environment name so the run is easy to identify in the output.
active_env = os.getenv("ENV_NAME")
print(f"Loaded environment: {active_env}")

Loaded environment: dev


In [2]:
from langfuse import get_client

# Langfuse credentials are secrets: they must be supplied through the environment
# (for example the .env file loaded in the first cell), never written into the notebook.
# NOTE(review): any key pair that was ever committed in this cell should be rotated.
_required_langfuse_vars = ("LANGFUSE_SECRET_KEY", "LANGFUSE_PUBLIC_KEY")
_missing_langfuse_vars = [name for name in _required_langfuse_vars if not os.getenv(name)]
if _missing_langfuse_vars:
    # Fail fast with an actionable message instead of a vague authentication error later.
    raise RuntimeError(
        "Missing Langfuse credentials: "
        + ", ".join(_missing_langfuse_vars)
        + ". Define them in .env or in the shell environment."
    )

# Only the non-secret host keeps a local-development fallback; a value that is
# already configured in the environment is respected.
os.environ.setdefault("LANGFUSE_HOST", "http://localhost:3000")

langfuse = get_client()

# Verify connection
if langfuse.auth_check():
    print("Langfuse client is authenticated and ready!")
else:
    print("Authentication failed. Please check your credentials and host.")

Langfuse client is authenticated and ready!


## Create Dataset

In [5]:
import pandas as pd

# Benchmark questions and their expected responses are stored in an Excel workbook.
BENCHMARK_WORKBOOK = "data/sample-question-for-chatbot-new-tanpa-dtt.xlsx"
benchmark_dataset = pd.read_excel(BENCHMARK_WORKBOOK)

# Preview the first rows to sanity-check the loaded columns.
benchmark_dataset.head()

Unnamed: 0,NO,ROOT FOLDER,TIPE PERIZINAN,INSTALASI,QUESTION,Expected Respone,Dokumen,Halaman,Unnamed: 8
0,1,PPN PERMIT,PLO,Kombinasi,Sertifikat PLO mana saja pada Subholding PPN y...,"FT Serui,DPPU Kalimarau, TLPG Wayame dll",DPPU Kalimarau,Halaman 2,
1,2,PPN PERMIT,PLO,DPPU Ahmad Yani,Apa lembaga inspeksi yang melakukan inspeksi u...,PT Sucofindo,DPPU Ahmad Yani,Halaman 6,
2,3,PPN PERMIT,PLO,FT Cepu,Berapa sisa umur layan untuk FT Cepu berdasark...,13 Tahun sejak Oktober 2023,6. PLO FT Cepu,Halaman 1,
3,4,PPN PERMIT,PLO,Pipa Penyalur CY,Apa saja instalasi yang dihubungkan oleh pipa ...,"FT Lomanis, FT Maos, FT Rewulu,TBBM Teras",12. PLO Pipa Penyalur CY I dan 13. PLO Pipa P...,CY I Halaman 4 dan CY II Halaman 3,
4,5,PPN PERMIT,PLO,Pipeline perak -Juanda,Apa saja temuan yang di lampirkan oleh pihak i...,a. Memastikan dokumen Penelaahan desain pada s...,Pipeline Perak - Juanda,Halaman 10 - 17,


In [6]:
# Keep only the rows that carry an expected response, then renumber them from 0
# so positional and label-based indexing agree.
has_expected_response = benchmark_dataset["Expected Respone"].notna()
cleaned_benchmark_dataset = benchmark_dataset.loc[has_expected_response].reset_index(drop=True)

# Preview the last rows of the cleaned frame.
cleaned_benchmark_dataset.tail()

Unnamed: 0,NO,ROOT FOLDER,TIPE PERIZINAN,INSTALASI,QUESTION,Expected Respone,Dokumen,Halaman,Unnamed: 8
75,76,SHU PERMIT,IZIN LINGKUNGAN,EP Asset 1- Field Pangkalan Susu,Berapa jumlah sumur produksi dan injeksi yang ...,84 Sumur produksi dan 38 sumur Injeksi,3. Izin Lingkungan SKKL No. 361_MenLHK_SETJEN_...,Halaman 8,
76,77,SHU PERMIT,IZIN LINGKUNGAN,EP Asset 1 - Field Jambi,Kapan terakhir persetujuan lingkungan di EP As...,2024,29. SK No 1103 Tahun 2024 Adendum andal RKL RP...,Halaman 1,
77,78,SHU PERMIT,IZIN LINGKUNGAN,EP Asset 1 - Field Rantau,Apa saja Limbah B3 dan Sumber Limbah B3 yang t...,Aki / Baterai Bekas\nSumber: Fasilitas Produks...,2. Izin Lingkungan SKKL No. 361_MenLHK_SETJEN_...,Halaman 69 - 74,
78,79,SHU PERMIT,IZIN LINGKUNGAN,PHE Jambi Merang,Apa saja sumber dampak penting pada tahap kont...,Sumber Dampak : Penerimaan dan Mobilisasi tena...,8. SK.391_Menlhk_Setjen_PLA.4_9_2018 - Izin Li...,Halaman 5 -19,
79,80,SHU PERMIT,IZIN LINGKUNGAN,PHE NSO,Sebutkan ruang lingkup kegiatan yang tertuang ...,\n\n1. Operasional tenaga kerja eksisting.\n\n...,SKKL - SK.319_MENLH_SETJEN_PLA.4_4_2023,Halaman 4-5,


In [None]:
# Disabled one-time setup: creates the benchmark dataset in Langfuse.
# NOTE(review): presumably left commented out because the dataset already exists
# (the experiment cell later fetches it by the same name) — confirm before re-enabling.
# Uncomment together with the item-upload cell below when publishing a new dataset.
# DATASET_NAME = "pdh-permit-agent-tanpa-dtt-v2"

# langfuse.create_dataset(
#     name=DATASET_NAME,
#     description="Experiment dataset for PDH Permit Agent dengan tambahan pertanyaan tentang metadata dokumen"
# )

In [None]:
# Disabled one-time setup: uploads every cleaned benchmark row to Langfuse as a
# dataset item (question as input, expected response as expected output, and the
# permit descriptors as metadata). Requires DATASET_NAME from the dataset-creation cell.
# The expected-output key below reads 'Expected Respone' to match the column shown
# in the dataframe preview; the frame has no 'Expected Answer' column.
# NOTE(review): "Dokumen " carries a trailing space — verify it matches the real
# spreadsheet header before re-enabling.
# for idx, row in cleaned_benchmark_dataset.iterrows():
#     additional_metadata = {
#         "root_folder": row["ROOT FOLDER"],
#         "tipe_perizinan": row["TIPE PERIZINAN"],
#         "instalasi": row["INSTALASI"],
#         "dokumen": row["Dokumen "],
#         "tipe": "Pertanyaan tanpa dtt"
#     }
    
#     langfuse.create_dataset_item(
#         dataset_name=DATASET_NAME,
#         input=row['QUESTION'],
#         expected_output=row['Expected Respone'],
#         metadata=additional_metadata
#     )

## Prepare RAGAS Benchmark

In [3]:
from langchain_openai import AzureChatOpenAI
from ragas.llms.base import LangchainLLMWrapper

# Azure OpenAI chat model that acts as the judge; every connection setting is
# read from environment variables.
azure_judge_model = AzureChatOpenAI(
    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
    model=os.getenv("AZURE_OPENAI_MODEL_NAME"),
    validate_base_url=False,
)

# Wrap the LangChain model so the RAGAS metrics can use it as their evaluator.
evaluator_llm = LangchainLLMWrapper(azure_judge_model)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from src.permit_agent.agent_langchain import agent

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import SimpleCriteriaScore

from langfuse.langchain import CallbackHandler

from pprint import pprint

langfuse_handler = CallbackHandler()

TEST_DATA_ID = 5

user_input = cleaned_benchmark_dataset.iloc[TEST_DATA_ID]['QUESTION']
reference_output = cleaned_benchmark_dataset.iloc[TEST_DATA_ID]['Expected Respone']

print(f"\n ======= User Input ========\nQuestion: {user_input}\n")
agent_response = await agent.ainvoke({"messages": [user_input]}, config={"recursion_limit": 10, "callbacks": [langfuse_handler]})

sample = SingleTurnSample(
    user_input=user_input,
    response=agent_response["messages"][-1].content,
    reference=reference_output
)

print("===== Agent Response =======")
pprint(sample.model_dump())

scorer =  SimpleCriteriaScore(
    name="course_grained_score",
    definition="Score 1 to 100 by similarity",
    llm=evaluator_llm
)

score_result = await scorer.single_turn_ascore(sample)
print("\n===== Score Result =======")
print(f"Expected Output: {reference_output}")
print(f"Score result: {score_result}")


Question: Sebutkan nomor PLO untuk IT Makassar LPG dan Fuel !

{'multi_responses': None,
 'reference': '878/49-7/PLO/DMT/2023 dan 879/49-7/PLO/DMT/2023',
 'reference_context_ids': None,
 'reference_contexts': None,
 'response': 'Berikut nomor Persetujuan Layak Operasi (PLO) untuk IT '
             'Makassar:\n'
             '\n'
             '- IT Makassar LPG: 879/49-7/PLO/DMT/2023\n'
             '- IT Makassar Fuel: 878/49-7/PLO/DMT/2023\n'
             '\n'
             'Kedua PLO tersebut diterbitkan pada tanggal 8 Desember 2023 dan '
             'berlaku hingga 18 Januari 2027 oleh PT Pertamina Patra Niaga '
             'untuk kegiatan penyimpanan dan niaga LPG serta Bahan Bakar '
             'Minyak di Integrated Terminal Makassar, Sulawesi Selatan [PLO '
             'Integrated Terminal Makassar (LPG).pdf, Page 1][PLO Integrated '
             'Terminal Makassar (Fuel).pdf, Page 1].',
 'retrieved_context_ids': None,
 'retrieved_contexts': None,
 'rubrics': None,
 'user_inp

In [9]:
# Display the raw agent result from the previous run: the whole message list with
# each message's metadata, not only the final answer text.
agent_response

{'messages': [HumanMessage(content='Sebutkan nomor PLO untuk IT Makassar LPG dan Fuel !', additional_kwargs={}, response_metadata={}, id='b1b23cf0-643d-47df-b3ad-40510e0d8a30'),
  AIMessage(content='', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 25, 'prompt_tokens': 895, 'total_tokens': 920, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4o-2024-11-20', 'system_fingerprint': 'fp_b54fe76834', 'id': 'chatcmpl-Cp4qyXWy2LfeRA2uGzGXJTSWZxY5J', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'jailbreak': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered

## Test With Dataset Item ID

| Dataset ID | Question |
|------------|----------|
| 72e9a7cd-0e20-4959-9ff5-70286aeaf0bd | Sebutkan area pada PGN SOR 1 yang paling cepat akan kadaluwarsa dan kapan kadaluwarsanya ? | 
| 420198ca-a14c-4868-a8af-74c481ce1e5a | Berapa jumlah dokumen PLO yang dimiliki oleh RU 2 ? |
| c6f5ee4c-5d60-4f43-a46e-9b2d2e439878 | Sebutkan PLO (CA TAHUN 2020) PMO PGN beserta Lokasinya |
| aa30233b-a472-4f70-ba2c-549f5ccd43e8 | Sebutkan nomor SK Perstujuan Lingkungan yang ada di SOR 2 ! |
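| 665248e9-8d14-40b2-8c39-33353090779f | Sebutkkan nomor KKPR yang dimiliki oleh IT Balongan ! |
| dd22dd8c-238c-43a5-accc-03e60218125f | Sebutkan Instalasi milik PGN yang memiliki KKPR dengan tanggal terbit paling lama ! |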

In [4]:
# Langfuse dataset-item IDs that are re-tested one by one in the loop further down.
# The trailing comments paraphrase each item's question in English.
filtered_testing_id = [
    "72e9a7cd-0e20-4959-9ff5-70286aeaf0bd",  # PGN SOR 1 area that expires soonest, and when
    "420198ca-a14c-4868-a8af-74c481ce1e5a",  # number of PLO documents held by RU 2
    "c6f5ee4c-5d60-4f43-a46e-9b2d2e439878",  # PLO (CA year 2020) of PMO PGN with locations
    "aa30233b-a472-4f70-ba2c-549f5ccd43e8",  # environmental approval decree numbers in SOR 2
    "665248e9-8d14-40b2-8c39-33353090779f",  # KKPR numbers held by IT Balongan
    "dd22dd8c-238c-43a5-accc-03e60218125f"  # PGN installation whose KKPR has the oldest issue date
]

In [17]:
from src.permit_agent.agent_langchain import agent

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import SimpleCriteriaScore

from langfuse.langchain import CallbackHandler

from pprint import pprint

langfuse_handler = CallbackHandler()

DATASET_ID = "665248e9-8d14-40b2-8c39-33353090779f"

dataset_item = langfuse.api.dataset_items.get(id=DATASET_ID)
user_input = "Sebutkkan nomor KKPRL yang dimiliki oleh IT Balongan !"
reference_output = dataset_item.expected_output

print(f"\n ======= User Input ========\nQuestion: {user_input}\n")
agent_response = await agent.ainvoke({"messages": [user_input]}, config={"recursion_limit": 10, "callbacks": [langfuse_handler]})

sample = SingleTurnSample(
    user_input=user_input,
    response=agent_response["messages"][-1].content,
    reference=reference_output
)

print("===== Agent Response =======")
pprint(sample.model_dump())

scorer =  SimpleCriteriaScore(
    name="course_grained_score",
    definition="Score 1 to 100 by similarity",
    llm=evaluator_llm
)

score_result = await scorer.single_turn_ascore(sample)
print("\n===== Score Result =======")
print(f"Expected Output: {reference_output}")
print(f"Score result: {score_result}")


Question: Sebutkkan nomor KKPRL yang dimiliki oleh IT Balongan !

{'multi_responses': None,
 'reference': '21112310513200012,\n'
              '21112310513200009,\n'
              '21112310513200008,\n'
              '21112310513200011,\n'
              '21112310513200006,\n'
              '21112310513200010, dan\n'
              '21112310513200007',
 'reference_context_ids': None,
 'reference_contexts': None,
 'response': 'Berikut adalah daftar nomor KKPRL yang dimiliki oleh IT Balongan '
             '(PT Pertamina Patra Niaga):\n'
             '\n'
             '1. Integrated Terminal Balongan (Dermaga Island Berth)\n'
             '   - Nomor KKPRL: 21112310513200006\n'
             '\n'
             '2. Single Point Mooring (SPM) 35.000 DWT Integrated Terminal '
             'Balongan\n'
             '   - Nomor KKPRL: 21112310513200009\n'
             '\n'
             '3. Submarine Pipeline SPM 35.000 DWT Integrated Terminal '
             'Balongan\n'
             '   - Nomor 

In [6]:
from src.permit_agent.agent_langchain import agent

async def run_agent(user_input, langfuse_handler, recursion_limit: int = 10):
    """Send a single question to the permit agent and return its raw result.

    Args:
        user_input: The question; it is passed to the agent as the only entry
            of the "messages" list.
        langfuse_handler: Callback handler placed in the run config's
            "callbacks" list.
        recursion_limit: Value placed under "recursion_limit" in the run
            config. Defaults to 10, the value that used to be hard-coded, so
            existing two-argument calls behave exactly as before.

    Returns:
        Whatever ``agent.ainvoke`` returns; callers in this notebook read the
        answer from ``result["messages"][-1].content``.
    """
    run_config = {"recursion_limit": recursion_limit, "callbacks": [langfuse_handler]}
    return await agent.ainvoke({"messages": [user_input]}, config=run_config)

In [9]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import SimpleCriteriaScore

from langfuse.langchain import CallbackHandler

# Callback handler shared by the agent runs that follow.
langfuse_handler = CallbackHandler()

# Instruction given to the judge model for the similarity score.
SCORING_DEFINITION = "Score 1 to 100 by similarity. Give higher scores for more relevant and accurate answers."

scorer = SimpleCriteriaScore(
    name="course_grained_score",
    definition=SCORING_DEFINITION,
    llm=evaluator_llm,
)

async def course_grained_score(user_input: str, response: dict, reference: str, scorer: SimpleCriteriaScore = scorer) -> int:
    """Score an agent answer against a reference using the given criteria scorer.

    Args:
        user_input: The question that was sent to the agent.
        response: The raw agent result. It is indexed as
            ``response["messages"][-1].content``, so it must be a mapping with
            a "messages" sequence (hence ``dict``, not ``list``).
        reference: The expected answer to compare against.
        scorer: Scorer whose ``single_turn_ascore`` is awaited. Defaults to
            the module-level ``scorer`` bound when this function is defined.

    Returns:
        The value returned by ``scorer.single_turn_ascore`` for the sample.
    """
    # NOTE(review): "course" in the name looks like a typo for "coarse"; it is kept
    # because the same string is used as the score name elsewhere in the notebook.
    final_message = response["messages"][-1]
    sample = SingleTurnSample(
        user_input=user_input,
        response=final_message.content,
        reference=reference,
    )
    return await scorer.single_turn_ascore(sample)

In [11]:
for dataset_items_id in filtered_testing_id:
    dataset_item = langfuse.api.dataset_items.get(id=dataset_items_id)
    user_input = dataset_item.input
    reference_output = dataset_item.expected_output

    print(f"\n ======= User Input ========\nQuestion: {user_input}")
    agent_response = await run_agent(user_input, langfuse_handler)

    score_result = await course_grained_score(user_input, agent_response, reference_output, scorer)
    print(f"Score result: {score_result}")

    get_client().flush()


Question: Sebutkan area pada PGN SOR 1 yang paling cepat akan kadaluwarsa dan kapan kadaluwarsanya ? 
Score result: 100

Question: Berapa jumlah dokumen PLO yang dimiliki oleh RU 2 ?
Score result: 35

Question: Sebutkan PLO (CA TAHUN 2020) PMO PGN beserta Lokasinya
Score result: 82

Question: Sebutkan nomor SK Perstujuan Lingkungan yang ada di SOR 2 !
Score result: 100

Question: Sebutkkan nomor KKPR yang dimiliki oleh IT Balongan !
Score result: 35

Question: Sebutkan Instalasi milik PGN yang memiliki KKPR dengan tanggal terbit paling lama !
Score result: 98


## Running Experiment

In [13]:
from datetime import datetime
from langfuse.langchain import CallbackHandler

langfuse_handler = CallbackHandler()
nowtime = datetime.now()

MODEL_NAME = os.getenv("AZURE_OPENAI_MODEL_NAME")
ENV_NAME = os.getenv("ENV_NAME")
DATASET_NAME = "pdh-permit-agent-tanpa-dtt-v2"
EXPERIMENT_NAME = f"Updated tool function - {nowtime.strftime('%Y-%m-%d %H:%M')} {MODEL_NAME} - {ENV_NAME}"
DESCRIPTION = "Update tool instructions. Update tools default behavior to always use metadata filtering first before vector search"
experiment_metadata = {
    "model": os.getenv("AZURE_OPENAI_MODEL_NAME"),
    "permitMetadata" : os.getenv("COSMOS_DB_DATABASE_ID"),
    "vector_store": os.getenv("AZURE_AI_SEARCH_INDEX_NAME"),
    "temperature": 0.0,
    "max_tokens": 1000,
    "timeout": 500,
    "search_type": "semantic_ranking",
    "semantic_configuration": os.getenv("AZURE_AI_SEARCH_SEMANTIC_CONFIGURATION"),
    "agent_type": "langchain"
}

dataset = get_client().get_dataset(name=DATASET_NAME)

for item in dataset.items:

    with item.run(
        run_name=EXPERIMENT_NAME,
        run_description=DESCRIPTION,
        run_metadata=experiment_metadata
    ) as root_span:

        output = await run_agent(item.input, langfuse_handler)

        root_span.update_trace(input=item.input, output=output['messages'][-1].content)

        root_span.score_trace(
            name="course_grained_score",
            comment="Score 1 to 100 by similarity",
            value=await course_grained_score(item.input, output, str(item.expected_output))
        )

get_client().flush()