In [1]:
import json
import pandas as pd
import dspy
import os

json_file_path = '../results/pubmed_results.json'

with open(json_file_path, 'r', encoding='utf-8') as f:
    loaded_data = json.load(f)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df= pd.DataFrame.from_dict(loaded_data)

In [3]:
df.head()

Unnamed: 0,question,answer,contexts,ground_truth,label,id
0,Is ganglionated plexi ablation during Maze IV ...,The absence of ganglionated plexi ablation was...,[We investigated the role of surgical ablation...,No. GP ablation did not prove to be beneficial...,PASS,25985014
1,Production of chemokines by perivascular adipo...,"Yes, perivascular adipose tissue (pWAT) may pl...",[Obesity is associated with an increased risk ...,Yes. Human pWAT has chemotactic properties thr...,PASS,16195477
2,Is a 9-month treatment sufficient in tuberculo...,"Yes, a 9-month treatment was found to be suffi...",[Tuberculosis has increased in parallel with t...,Yes. Tuberculous enterocolitis can be managed ...,PASS,12848629
3,A patient with myelomeningocele: is untetherin...,The study found that patients with myelomening...,[Tethering of the spinal cord is thought to in...,No. The study results suggested that spinal co...,PASS,20594006
4,Do some U.S. states have higher/lower injury m...,"Yes, some U.S. states have higher/lower injury...",[This article examines the hypothesis that the...,Yes. Group 1 states are likely to exhibit abov...,PASS,15995461


In [4]:
from cleanlab_studio import Studio
import os 

studio = Studio(os.environ["CLEANLAB_STUDIO_API_KEY"])

In [6]:
def create_tlm_prompt(row):
    return f"Context: {row['contexts']}\n\nUser Question: {row['question']}"

df['prompt'] = df.apply(create_tlm_prompt, axis=1)

In [7]:
faithfulness_groundedness_eval_criteria = {
    "custom_eval_criteria": [
        {
            "name": "Faithfulness & Groundedness",
            "criteria": "Determine if the Response is solely based on information available in the Context (no additional facts are mentioned in the Response that are not stated in the Context). \
                        Also determine if the Response does not contradict any information in the Context. If the Context contains no information available to answer the Question, a good Response should state 'there is no information available.'"
        }
    ]
}

In [8]:
tlm_faithfulness_groundedness = studio.TLM(options=faithfulness_groundedness_eval_criteria)

In [9]:
res_faithfulness_groundedness = tlm_faithfulness_groundedness.get_trustworthiness_score(df['prompt'].tolist(), df['answer'].tolist())
res_faithfulness_groundedness_df = pd.DataFrame(res_faithfulness_groundedness)

Querying TLM... 100%|██████████|


In [10]:
df_results = pd.concat([df, res_faithfulness_groundedness_df], axis=1)
df_results[['question', 'answer', 'trustworthiness_score', 'log']]

Unnamed: 0,question,answer,trustworthiness_score,log
0,Is ganglionated plexi ablation during Maze IV ...,The absence of ganglionated plexi ablation was...,0.909700,{'custom_eval_criteria': [{'name': 'Faithfulne...
1,Production of chemokines by perivascular adipo...,"Yes, perivascular adipose tissue (pWAT) may pl...",0.937776,{'custom_eval_criteria': [{'name': 'Faithfulne...
2,Is a 9-month treatment sufficient in tuberculo...,"Yes, a 9-month treatment was found to be suffi...",0.987289,{'custom_eval_criteria': [{'name': 'Faithfulne...
3,A patient with myelomeningocele: is untetherin...,The study found that patients with myelomening...,0.926890,{'custom_eval_criteria': [{'name': 'Faithfulne...
4,Do some U.S. states have higher/lower injury m...,"Yes, some U.S. states have higher/lower injury...",0.872269,{'custom_eval_criteria': [{'name': 'Faithfulne...
...,...,...,...,...
195,Do n-terminal proBrain natriuretic peptide lev...,"Yes, higher levels of NT-proBNP predict mortal...",0.910131,{'custom_eval_criteria': [{'name': 'Faithfulne...
196,Does simvastatin Treatment Affect Serum Vitami...,"No, simvastatin therapy did not significantly ...",0.987390,{'custom_eval_criteria': [{'name': 'Faithfulne...
197,Is standard magnetic resonance imaging inadequ...,"Yes, standard MRI reports by ""non-expert"" radi...",0.975056,{'custom_eval_criteria': [{'name': 'Faithfulne...
198,Does indication-based prescribing prevent wron...,The study found that indication-based prescrib...,0.929742,{'custom_eval_criteria': [{'name': 'Faithfulne...


In [15]:
df_results.to_json('../results/cleanlab_results.json', orient='records', force_ascii=False, indent=4)

In [11]:
df_results.head()

Unnamed: 0,question,answer,contexts,ground_truth,label,id,prompt,trustworthiness_score,log
0,Is ganglionated plexi ablation during Maze IV ...,The absence of ganglionated plexi ablation was...,[We investigated the role of surgical ablation...,No. GP ablation did not prove to be beneficial...,PASS,25985014,Context: ['We investigated the role of surgica...,0.9097,{'custom_eval_criteria': [{'name': 'Faithfulne...
1,Production of chemokines by perivascular adipo...,"Yes, perivascular adipose tissue (pWAT) may pl...",[Obesity is associated with an increased risk ...,Yes. Human pWAT has chemotactic properties thr...,PASS,16195477,Context: ['Obesity is associated with an incre...,0.937776,{'custom_eval_criteria': [{'name': 'Faithfulne...
2,Is a 9-month treatment sufficient in tuberculo...,"Yes, a 9-month treatment was found to be suffi...",[Tuberculosis has increased in parallel with t...,Yes. Tuberculous enterocolitis can be managed ...,PASS,12848629,Context: ['Tuberculosis has increased in paral...,0.987289,{'custom_eval_criteria': [{'name': 'Faithfulne...
3,A patient with myelomeningocele: is untetherin...,The study found that patients with myelomening...,[Tethering of the spinal cord is thought to in...,No. The study results suggested that spinal co...,PASS,20594006,"Context: [""Tethering of the spinal cord is tho...",0.92689,{'custom_eval_criteria': [{'name': 'Faithfulne...
4,Do some U.S. states have higher/lower injury m...,"Yes, some U.S. states have higher/lower injury...",[This article examines the hypothesis that the...,Yes. Group 1 states are likely to exhibit abov...,PASS,15995461,Context: ['This article examines the hypothesi...,0.872269,{'custom_eval_criteria': [{'name': 'Faithfulne...
