In [1]:
import json
import pandas as pd
json_file_path = '../results/pubmed_results.json'

with open(json_file_path, 'r', encoding='utf-8') as f:
    loaded_data = json.load(f)

df = pd.DataFrame.from_dict(loaded_data)

In [2]:
from trulens.providers.openai import OpenAI
from dotenv import load_dotenv
import os

#load_dotenv()
provider = OpenAI()

🦑 Initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `TruSession` to prevent this.


In [3]:
from trulens.apps.virtual import VirtualApp
from trulens.core import Select
from trulens.core import Feedback

virtual_app = dict(
    llm=dict(
        modelname="GPT3.5Turbo"
    ),
    template="information about the template I used in my app",
    debug="all of these fields are completely optional"
)

virtual_app = VirtualApp(virtual_app) # can start with the prior dictionary
virtual_app[Select.RecordCalls.llm.maxtokens] = 1024

retriever_component = Select.RecordCalls.retriever
virtual_app[retriever_component] = "Retriever"

context_call = retriever_component.get_context
context = context_call.rets[:]

In [4]:
len = 20

In [5]:
from trulens.apps.virtual import VirtualRecord

data_dict = df.to_dict('records')

data = []

for record in data_dict[:len]:
    rec = VirtualRecord(
        main_input=record['question'],
        main_output=record['answer'],
        meta=record['id'],
        calls=
            {
                context_call: dict(
                    args=[record['question']],
                    rets=[record['contexts']]
                )
            }
        )
    data.append(rec)

In [6]:
df_modified = df.rename(columns={"question": "query", "ground_truth": "expected_response"})

In [7]:
from trulens.feedback import GroundTruthAgreement

# Define a groundedness feedback function
f_groundedness = (
    Feedback(
        provider.groundedness_measure_with_cot_reasons, name="Groundedness"
    )
    .on(context.collect())
    .on_output()
)

# Question/answer relevance between overall question and answer.
f_qa_relevance = Feedback(
    provider.relevance_with_cot_reasons, name="Answer Relevance"
).on_input_output()

#comprehensiveness evaluation in RAG with the retrieved context (i.e. the source) and response (i.e. the summary)
f_comprehensiveness = Feedback(
    provider.comprehensiveness_with_cot_reasons, name="Comprehensiveness"
).on_input_output()


f_groundtruth = Feedback(
    GroundTruthAgreement(df_modified[:len], provider=provider).agreement_measure, name="Ground Truth Eval"
).on_input_output()

✅ In Groundedness, input source will be set to __record__.app.retriever.get_context.rets[:].collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Comprehensiveness, input source will be set to __record__.main_input or `Select.RecordInput` .
✅ In Comprehensiveness, input summary will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Ground Truth Eval, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Ground Truth Eval, input response will be set to __record__.main_output or `Select.RecordOutput` .


In [8]:
from trulens.apps.virtual import TruVirtual

virtual_recorder = TruVirtual(
    app_name="pubmedQA-demo2",
    app=virtual_app,
    feedbacks=[f_groundedness]
)
# feedbacks=[f_qa_relevance,f_groundedness,f_groundtruth,f_comprehensiveness]


In [9]:
import time
pace_settings = {
    "seconds_per_period": 30.0,  # Reduce from 60.0 to 30.0 seconds
    "marks_per_second": 0.5       # Reduce from 1.0 to 0.5 marks per second
}
global current_pace_settings
current_pace_settings = pace_settings
i = 1
for record in data:
     # Wait based on the pace settings
    time.sleep(pace_settings["seconds_per_period"] / pace_settings["marks_per_second"])
    print(i,record)
    i = i + 1
    virtual_recorder.add_record(record)

Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


1 Record(record_hash_0bfd797f3e68eb8362a72c595a7cf784) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


2 Record(record_hash_77ed6903f2fa07f1be4719fc9ce9971c) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


3 Record(record_hash_682bef179209fd0d53bf00b5953e01ec) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


4 Record(record_hash_c5090b6b8680adf301eae26a2a5ee855) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


5 Record(record_hash_e6338b60392c9425750a41d725d8ed69) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


6 Record(record_hash_293cf45b07860022035b7330f144633e) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


7 Record(record_hash_82bad820a101a4689f8b04787bec7c2f) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


8 Record(record_hash_8402aff1d3ebb0d0b682156e2bfca7e7) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


9 Record(record_hash_58fe7f996c7cc27ec8bde20480baca3f) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


10 Record(record_hash_7ec6381c14307392f006af7779bffdb4) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


11 Record(record_hash_940eea106b86328ad943ba6e187ff777) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


12 Record(record_hash_6c4f5b802f753ea5eb7570fcdadc3779) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


13 Record(record_hash_cd013307101b6d1dac8b646ca6588317) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


14 Record(record_hash_908caa3333ad2e983e8210b3207412b7) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


15 Record(record_hash_108ffea4873a012b713d3e975349f5b8) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


16 Record(record_hash_257356189f318b02de84273559c92755) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


17 Record(record_hash_55070a93e96288cfef0539769830570b) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


18 Record(record_hash_da38d7ff1ffbe2b6b14f77e13cdf82ab) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


19 Record(record_hash_033d74ef0e4cbe52175e68ce4d7cb74b) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root

20 Record(record_hash_9b79d27f962ebfae122a1cf2480f8847) with 2 calls:
  RecordAppCall: .root -> app.retriever.get_context
  RecordAppCall: .root



Could not find an instance of DummyEndpoint. trulens will create an endpoint for cost tracking.


In [10]:
from trulens.core import TruSession
from trulens.dashboard import run_dashboard

session = TruSession()

In [11]:
appId=virtual_recorder.app_id
print(appId)
records_df, feedbacks =session.get_records_and_feedback(app_ids=[appId])

app_hash_567b82187ab75369b432d4a48baed4ed


In [12]:
import json

records_df["id"] = records_df["record_json"].apply(lambda x : json.loads(x)['meta'])
records_df = pd.merge(records_df,df_modified,on=['id'])

In [13]:
session.get_leaderboard(app_ids=[appId])

Unnamed: 0_level_0,Unnamed: 1_level_0,Groundedness,latency,total_cost
app_name,app_version,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pubmedQA-demo2,base,0.950292,0.000121,0.0


In [14]:
records_df

Unnamed: 0,app_id,app_json,type,record_id,input,output,tags,record_json,cost_json,perf_json,...,latency,total_tokens,total_cost,cost_currency,id,query,answer,contexts,expected_response,label
0,app_hash_567b82187ab75369b432d4a48baed4ed,"{""tru_class_info"": {""name"": ""TruVirtual"", ""mod...",VirtualApp(trulens.apps.virtual),record_hash_9b79d27f962ebfae122a1cf2480f8847,"""Internal derangement of the temporomandibular...","""Ultrasound demonstrates considerably lower se...",,"{""record_id"": ""record_hash_9b79d27f962ebfae122...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-11-25T02:48:37.791382"", ""...",...,9.3e-05,0,0.0,USD,22668712,Internal derangement of the temporomandibular ...,Ultrasound demonstrates considerably lower sen...,[The aim of this study was to assess the diagn...,No. The present study does not support the rec...,PASS
1,app_hash_567b82187ab75369b432d4a48baed4ed,"{""tru_class_info"": {""name"": ""TruVirtual"", ""mod...",VirtualApp(trulens.apps.virtual),record_hash_033d74ef0e4cbe52175e68ce4d7cb74b,"""Two-year follow-up survey of patients with al...","""The prognosis of patients with allergic conta...",,"{""record_id"": ""record_hash_033d74ef0e4cbe52175...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-11-25T02:48:37.789361"", ""...",...,9.1e-05,0,0.0,USD,24359102,Two-year follow-up survey of patients with all...,The prognosis of patients with allergic contac...,[Skin diseases are the most frequently recogni...,"Yes. At follow-up, patients with contact urtic...",PASS
2,app_hash_567b82187ab75369b432d4a48baed4ed,"{""tru_class_info"": {""name"": ""TruVirtual"", ""mod...",VirtualApp(trulens.apps.virtual),record_hash_da38d7ff1ffbe2b6b14f77e13cdf82ab,"""Does the SCL 90-R obsessive-compulsive dimens...","""No, the SCL 90-R obsessive-compulsive subscal...",,"{""record_id"": ""record_hash_da38d7ff1ffbe2b6b14...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-11-25T02:48:37.787322"", ""...",...,9.7e-05,0,0.0,USD,9582182,Does the SCL 90-R obsessive-compulsive dimensi...,"No, the SCL 90-R obsessive-compulsive subscale...",[To investigate the relevance of the Symptom C...,Yes. Our data suggest that the SCL 90-R is bes...,PASS
3,app_hash_567b82187ab75369b432d4a48baed4ed,"{""tru_class_info"": {""name"": ""TruVirtual"", ""mod...",VirtualApp(trulens.apps.virtual),record_hash_55070a93e96288cfef0539769830570b,"""Does desflurane alter left ventricular functi...","""No change in left ventricular function was no...",,"{""record_id"": ""record_hash_55070a93e96288cfef0...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-11-25T02:48:37.785071"", ""...",...,0.000103,0,0.0,USD,10456814,Does desflurane alter left ventricular functio...,No change in left ventricular function was not...,[Although desflurane is commonly used to contr...,No. This study demonstrates that in patients a...,PASS
4,app_hash_567b82187ab75369b432d4a48baed4ed,"{""tru_class_info"": {""name"": ""TruVirtual"", ""mod...",VirtualApp(trulens.apps.virtual),record_hash_257356189f318b02de84273559c92755,"""Doppler examination of uteroplacental circula...","""Yes, the study found that higher resistance i...",,"{""record_id"": ""record_hash_257356189f318b02de8...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-11-25T02:48:37.782968"", ""...",...,9.2e-05,0,0.0,USD,17551944,Doppler examination of uteroplacental circulat...,"Yes, the study found that higher resistance in...",[To determine whether spectral Doppler measure...,Yes. Transvaginal Doppler examination can dete...,PASS
5,app_hash_567b82187ab75369b432d4a48baed4ed,"{""tru_class_info"": {""name"": ""TruVirtual"", ""mod...",VirtualApp(trulens.apps.virtual),record_hash_108ffea4873a012b713d3e975349f5b8,"""Is fetal anatomic assessment on follow-up ant...","""Yes, fetal anatomic assessment on follow-up a...",,"{""record_id"": ""record_hash_108ffea4873a012b713...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-11-25T02:48:37.780948"", ""...",...,9.1e-05,0,0.0,USD,17715311,Is fetal anatomic assessment on follow-up ante...,"Yes, fetal anatomic assessment on follow-up an...",[The purpose of this study was to evaluate the...,Yes. A fetal anatomic survey on follow-up sono...,PASS
6,app_hash_567b82187ab75369b432d4a48baed4ed,"{""tru_class_info"": {""name"": ""TruVirtual"", ""mod...",VirtualApp(trulens.apps.virtual),record_hash_908caa3333ad2e983e8210b3207412b7,"""Out of the smokescreen II: will an advertisem...","""The antismoking advertisement had a significa...",,"{""record_id"": ""record_hash_908caa3333ad2e983e8...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-11-25T02:48:37.778675"", ""...",...,9.6e-05,0,0.0,USD,17565137,Out of the smokescreen II: will an advertiseme...,The antismoking advertisement had a significan...,[To evaluate the effect of an antismoking adve...,Yes. This real-world study suggests that placi...,PASS
7,app_hash_567b82187ab75369b432d4a48baed4ed,"{""tru_class_info"": {""name"": ""TruVirtual"", ""mod...",VirtualApp(trulens.apps.virtual),record_hash_cd013307101b6d1dac8b646ca6588317,"""Does Mammographic Density have an Impact on t...","""The study found that there was a trend toward...",,"{""record_id"": ""record_hash_cd013307101b6d1dac8...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-11-25T02:48:37.776643"", ""...",...,9.2e-05,0,0.0,USD,26471488,Does Mammographic Density have an Impact on th...,The study found that there was a trend towards...,[Limited and conflicting data exist on an asso...,No. Mammographic density is not associated wit...,PASS
8,app_hash_567b82187ab75369b432d4a48baed4ed,"{""tru_class_info"": {""name"": ""TruVirtual"", ""mod...",VirtualApp(trulens.apps.virtual),record_hash_6c4f5b802f753ea5eb7570fcdadc3779,"""Is the covering of the resection margin after...","""The retrospective analysis did not find a sta...",,"{""record_id"": ""record_hash_6c4f5b802f753ea5eb7...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-11-25T02:48:37.774580"", ""...",...,0.00011,0,0.0,USD,24073931,Is the covering of the resection margin after ...,The retrospective analysis did not find a stat...,"[In recent years, many advances in pancreatic ...",No. The results show no significant difference...,PASS
9,app_hash_567b82187ab75369b432d4a48baed4ed,"{""tru_class_info"": {""name"": ""TruVirtual"", ""mod...",VirtualApp(trulens.apps.virtual),record_hash_940eea106b86328ad943ba6e187ff777,"""Is ultrasound equal to X-ray in pediatric fra...","""Ultrasound is not equal to X-ray in pediatric...",,"{""record_id"": ""record_hash_940eea106b86328ad94...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2024-11-25T02:48:37.772340"", ""...",...,0.000114,0,0.0,USD,20401819,Is ultrasound equal to X-ray in pediatric frac...,Ultrasound is not equal to X-ray in pediatric ...,[Ultrasound is currently not established for t...,Yes. Ultrasound not only has comparable sensit...,PASS
