In [3]:
!pip install -q 'httpx==0.27.2'

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m71.7/76.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import openai
# from google.colab import userdata
# openai_api_key = userdata.get('OPENAI_API_KEY')

from time import sleep
import os
import pandas as pd
import json
import csv
from ast import literal_eval
import re
import numpy as np

In [8]:
# Load the document corpus and the training claims; both files are JSON Lines.
corpus, train_df = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in ('corpus.jsonl', 'claims_train.jsonl')
)

In [9]:
# Keep only the claims whose 'evidence' entry is not an empty dict, then renumber the rows.
train_df = (
    train_df
    .loc[lambda frame: frame['evidence'].map(lambda ev: ev != {})]
    .reset_index(drop=True)
)

In [11]:
# Iterate through train_df and, for every cited document that has annotated evidence,
# look up its abstract and title in corpus. Each (claim, document) pair becomes one row
# holding the joined abstract, title, gold evidence sentences and label.

def build_extended_train_records(claims_df, corpus_df):
    """Return one record per (claim, cited document with evidence) pair.

    Args:
        claims_df: DataFrame with 'id', 'claim', 'evidence' and 'cited_doc_ids' columns.
            'evidence' maps str(doc_id) to a list of dicts, each holding 'sentences'
            (indices into the document's abstract) and 'label'.
        corpus_df: DataFrame with 'doc_id', 'title' and 'abstract' columns, where
            'abstract' is a sequence of sentence strings.

    Returns:
        A list of dicts with the keys 'id', 'claim', 'abstract', 'title',
        'gold_evidence', 'label', 'cited_doc_ids' and 'evidence'.

    Raises:
        KeyError: if a cited document that has evidence is missing from corpus_df.
    """
    # Index the corpus once instead of scanning the whole frame for every document.
    # Keeping the first duplicate mirrors the previous `.iloc[0]` lookup.
    docs_by_id = corpus_df.drop_duplicates(subset='doc_id').set_index('doc_id')

    records = []
    for _, claim_row in claims_df.iterrows():
        evidence = claim_row['evidence']
        cited_doc_ids = [int(x) for x in claim_row['cited_doc_ids']]

        for doc_id in cited_doc_ids:
            # Cited documents without annotated evidence carry no label; skip them.
            if str(doc_id) not in evidence:
                continue
            document = docs_by_id.loc[doc_id]
            abstract = document['abstract']

            gold_evidence = []
            labels = []
            for evidence_dict in evidence[str(doc_id)]:
                wanted = set(evidence_dict['sentences'])
                gold_evidence.extend(
                    sentence for idx, sentence in enumerate(abstract) if idx in wanted
                )
                labels.append(evidence_dict['label'])

            records.append({
                'id': claim_row['id'],
                'claim': claim_row['claim'],
                'abstract': " ".join(abstract),
                'title': document['title'],
                'gold_evidence': gold_evidence,
                # Take the first label: deterministic, unlike list(set(...))[0], whose
                # pick is arbitrary when a document carries more than one label.
                'label': labels[0],
                'cited_doc_ids': cited_doc_ids,
                'evidence': evidence,
            })
    return records


extended_train_records = build_extended_train_records(train_df, corpus)

# Write the file in one pass with mode 'w' so re-running the cell does not append duplicates.
with open('extended_train.jsonl', 'w') as f:
    for record in extended_train_records:
        f.write(json.dumps(record) + '\n')

# Build the frame once from the records instead of growing it row by row.
extended_train_df = pd.DataFrame(
    extended_train_records,
    columns=['id', 'claim', 'abstract', 'title', 'gold_evidence', 'label', 'cited_doc_ids', 'evidence'],
)



In [8]:
# Persist the extended training table as CSV, leaving out the DataFrame index.
extended_train_df.to_csv('extended_claims_train.csv', index=False)

In [5]:
def _load_openai_api_key():
    """Return the OpenAI API key from Colab secrets, or from the environment outside Colab.

    The module-level `userdata` import is commented out, so referencing it directly
    raises NameError on a fresh kernel; import it here and fall back gracefully.
    """
    try:
        from google.colab import userdata
    except ImportError:
        return os.environ.get('OPENAI_API_KEY')
    return userdata.get('OPENAI_API_KEY')


client = openai.OpenAI(api_key=_load_openai_api_key())

# Source text of the base simulation module; embedded in the generation prompt.
with open('simulation_utils.py', 'r') as f:
    simulation_template = f.read()

# Source text of the example bacteria simulation; embedded in the generation prompt.
with open('bacteria.py', 'r') as f:
    example_bacteria = f.read()

In [38]:

def get_gpt_response(id, claim, reference_text, gold_evidence, model="gpt-4o-mini", **kwargs):
    """Ask the chat model to write a Python simulation that tests `claim`.

    Reads the notebook-level `client`, `simulation_template` and `example_bacteria`.
    The user prompt and the model reply are saved to 'gpt_results/prompt_{id}.txt'
    and 'gpt_results/response_{id}.txt'.

    Args:
        id: identifier used in the saved file names.
        claim: the claim text to verify.
        reference_text: passage of supporting text placed in the prompt.
        gold_evidence: gold evidence sentences (a sequence of strings, or one string).
        model: chat model name.
        **kwargs: extra arguments forwarded to `client.chat.completions.create`.
            Sampling parameters and timeout default to deterministic settings but are
            no longer overwritten when the caller supplies them; `model` and
            `messages` are always set by this function.

    Returns:
        The stripped text of the first choice ('' if the reply has no content).
    """
    os.makedirs('gpt_results/', exist_ok=True)

    # Join sentences with a space so they are not fused together in the prompt.
    if isinstance(gold_evidence, str):
        gold_text = gold_evidence
    else:
        gold_text = " ".join(gold_evidence)

    system_prompt = """
        You are a scientific claim inspector, who can catch fake claims accurately and verify correct claims efficiently.
        Your main process is building simulation for building environment for testing out claims. You compare the results
        of a simulation with a given claim and provide verification results as Supported (for True claims) and Refuted (for False claims).

        You will be given the following information below:
        1. A Claim
        2. A passage of supporting text from a scientific article that can be used to help verify the claim.
           Sometimes you will be provided with specific gold_sentences where the evidence of claim's veracity can be found.
        3. An example simulation that can be used as a template to help guide your construction of a simulation

        Your output will be a Python simulation whose output will clearly determine whether the claim is likely supported or refuted.

        Also, You must output the simulation between clodeblocks (```), or the simulation will not be correctly parsed.

        For example:
        ```
        # python comment stating the input claim
        # code fo the simulation
        # print the output of the simulation
        print(result)
        ```
    """

    user_prompt = f"""
        Claim: {claim}
        Reference Text: {reference_text}

        Gold Sentence that can provide the evidence of claim verification: {gold_text}

        ```
        # simulation_utils.py
        {simulation_template}

        # bacteria simulation example
        {example_bacteria}
        ```

        Rules for the output:
        1. Generate simulation according to the given Claim, take Reference Text to figure out the objects of the simulation and their properties.
        2. The generated code should not generate any errors
        3. If you are extending upon the given base template, or using code from the base template, include it in your code by using either of the two ways mentioned below:
            a. import statements (importing the classes from base template simulation_utils.py)
            OR
            b. writing the whole base template code before your simulation code

        4. Make sure the output of the simulation is appropriate to the given instructions.
        5. It clearly determines/shows if the  claim is supported or refuted.

        Please generate your simulation now
    """

    messages=[{"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}]

    # Deterministic decoding by default; explicit caller overrides are respected.
    kwargs.setdefault("temperature", 0.0)
    kwargs.setdefault("top_p", 1)
    kwargs.setdefault("frequency_penalty", 0.0)
    kwargs.setdefault("presence_penalty", 0.0)
    kwargs.setdefault("timeout", 4*10*60)  # 40 minutes
    kwargs["model"] = model
    kwargs["messages"] = messages

    response = client.chat.completions.create(**kwargs)
    # The message content can be None; treat that as an empty reply instead of crashing.
    content = (response.choices[0].message.content or "").strip()

    with open(f'gpt_results/prompt_{id}.txt', 'w') as f:
        f.write(user_prompt)

    with open(f'gpt_results/response_{id}.txt', 'w') as f:
        f.write(content)

    return content


In [39]:
def parse_gpt_response(response, sim_id=None):
    """Extract the fenced code from a model reply and save it as a script.

    Args:
        response: the model reply text containing one or more ``` fenced blocks.
        sim_id: identifier used in the output file name. When omitted, the
            notebook-level `id` is used, which is what earlier callers relied on.

    Returns:
        The code of all fenced blocks joined by newlines ('' if there is none).
        The same text is written to 'simscripts/simulation_{sim_id}.py'.
    """
    if sim_id is None:
        sim_id = id
    os.makedirs('simscripts/', exist_ok=True)

    # The optional group swallows an info string such as "python" after the opening
    # fence; without it the captured code starts with a bare `python` line, which
    # raises NameError when executed.
    code_block_pattern = r'```(?:[ \t]*[\w+\-.]*[ \t]*\r?\n)?(.*?)```'
    code_blocks = re.findall(code_block_pattern, response, re.DOTALL)
    # Newline-join so that consecutive blocks never fuse into one line.
    code_block = '\n'.join(code_blocks)

    with open(f'simscripts/simulation_{sim_id}.py', 'w') as f:
        f.write(code_block)

    return code_block

def run_generated_simulation(code_block, sim_id=None):
    """Execute generated simulation code and capture what it prints.

    Args:
        code_block: Python source to execute.
        sim_id: identifier used in the output file names. When omitted, the
            notebook-level `id` is used, which is what earlier callers relied on.

    Returns:
        (std_out, std_err): the text printed by the code (including anything printed
        before a failure) and the formatted traceback, or '' if it ran cleanly.
        Both are also written to 'simscripts/std_out_{sim_id}.txt' and
        'simscripts/std_err_{sim_id}.txt'.
    """
    import traceback
    from contextlib import redirect_stdout
    from io import StringIO

    if sim_id is None:
        sim_id = id
    os.makedirs('simscripts/', exist_ok=True)

    captured = StringIO()
    std_err = ""
    try:
        # redirect_stdout restores sys.stdout even when the generated code raises.
        with redirect_stdout(captured):
            # SECURITY: this executes model-generated code with full interpreter
            # privileges; only run it in a disposable/sandboxed environment.
            # A single fresh namespace serves as globals so that functions and methods
            # defined by the script can see the script's other top-level names.
            exec(code_block, {"__name__": "__main__"})
    except Exception:
        std_err = traceback.format_exc()
    std_out = captured.getvalue()

    with open(f'simscripts/std_out_{sim_id}.txt', 'w') as f:
        f.write(std_out)

    with open(f'simscripts/std_err_{sim_id}.txt', 'w') as f:
        f.write(std_err)

    return std_out, std_err


In [53]:
# Load the annotated training set (JSON Lines).
# NOTE(review): absolute Colab path; the Evaluation section reads a file of the same name
# from '../data/data/processed_data/' — confirm which copy is canonical.
train_annotated = pd.read_json('/content/extended_train_annotated.jsonl', lines=True)

In [63]:
# Label distribution within the 'directional' category.
train_annotated.loc[train_annotated['category'] == 'directional', 'label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
SUPPORT,51
CONTRADICT,41


In [61]:
# Number of annotated rows per claim category.
train_annotated['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
biology,267
cellular,126
directional,92
population,53
medicine,22
epidemiology,4


In [54]:
# Draw one example row. The fixed random_state makes the draw repeatable, so the
# outputs of the following cells can be reproduced after Restart & Run All.
row = train_annotated.sample(1, random_state=42).iloc[0]
row

Unnamed: 0,526
id,1327
claim,Tuberculosis incidence occurs at higher rates ...
category,population
abstract,This paper seeks to establish the strength of ...
title,Tuberculosis mortality in England and Wales du...
gold_evidence,[A strong association was found between all TB...
label,CONTRADICT
cited_doc_ids,"[24241932, 22194407]"
evidence,"{'24241932': [{'sentences': [6], 'label': 'CON..."


In [60]:
# Unpack the sampled row into the notebook-level names that the following cells use,
# then request a simulation for it and show the raw model reply.
id, claim, abstract, gold_evidence = (
    row[field] for field in ('id', 'claim', 'abstract', 'gold_evidence')
)

response = get_gpt_response(id, claim, abstract, gold_evidence)
print(response)


```
# Claim: A mutation in HNF4A leads to an increased risk of developing diabetes by the age of 14 years.
# The simulation will assess the impact of HNF4A mutations on insulin secretion and diabetes risk.

from simulation_utils import GameObject, Container

class HNF4A(GameObject):
    def __init__(self, mutation_status="wild_type"):
        super().__init__("HNF4A")
        self.mutation_status = mutation_status
        self.insulin_secretion = 100  # baseline insulin secretion

    def tick(self):
        if self.mutation_status == "mutated":
            self.insulin_secretion -= 30  # decreased insulin secretion due to mutation

class DiabetesRisk(GameObject):
    def __init__(self, risk_level=0):
        super().__init__("DiabetesRisk")
        self.risk_level = risk_level

    def assess_risk(self, insulin_secretion):
        if insulin_secretion < 70:  # threshold for increased risk
            self.risk_level += 1  # increase risk if insulin secretion is low

class Simulation:


In [29]:
# Pull the generated script out of the model reply, execute it, then show what it
# printed followed by any captured error text.
code_block = parse_gpt_response(response)
std_out, std_err = run_generated_simulation(code_block)

print(std_out, std_err, sep="\n")

In the environment, you see: 
	A mouse_with_autophagy_deficiency with autophagy deficiency: True, insulin resistance level: 0.
	A mouse_without_autophagy_deficiency with autophagy deficiency: False, insulin resistance level: 0.

Ticking...
In the environment, you see: 
	A mouse_with_autophagy_deficiency with autophagy deficiency: True, insulin resistance level: 1.
	A mouse_without_autophagy_deficiency with autophagy deficiency: False, insulin resistance level: 0.

Ticking...
In the environment, you see: 
	A mouse_with_autophagy_deficiency with autophagy deficiency: True, insulin resistance level: 2.
	A mouse_without_autophagy_deficiency with autophagy deficiency: False, insulin resistance level: 0.

Ticking...
In the environment, you see: 
	A mouse_with_autophagy_deficiency with autophagy deficiency: True, insulin resistance level: 3.
	A mouse_without_autophagy_deficiency with autophagy deficiency: False, insulin resistance level: 0.

Ticking...
In the environment, you see: 
	A mouse_w

In [41]:

def get_gpt_verification(id, sim_output, claim, model="gpt-4o-mini", **kwargs):
    """Ask the chat model whether a simulation's output supports or refutes `claim`.

    Reads the notebook-level `client`. The user prompt and the model reply are saved
    to 'gpt_ver_results/prompt_{id}.txt' and 'gpt_ver_results/response_{id}.txt'.

    Args:
        id: identifier used in the saved file names.
        sim_output: text describing the simulation output (and error, if any).
        claim: the claim text to verify.
        model: chat model name.
        **kwargs: extra arguments forwarded to `client.chat.completions.create`.
            Sampling parameters and timeout default to deterministic settings but are
            no longer overwritten when the caller supplies them; `model` and
            `messages` are always set by this function.

    Returns:
        The stripped text of the first choice ('' if the reply has no content). The
        prompt asks for a Python dict literal with 'claim', 'verification_result'
        and 'reason'.
    """
    os.makedirs('gpt_ver_results/', exist_ok=True)

    system_prompt = """
        You are a scientific claim inspector, who can catch fake claims accurately and verify correct claims efficiently.
        Your main responsibility is to look at the output of a simulation.
        The simulation was run with the elements relevant to the given claim, and it displays a output that can give you details of the veraicity of the claim.

        Inspect the output and understand the claim, to verify if the claim is supported or refuted.

        You will be given the following information below:
        1. A Claim
        2. The output of a Simulation that can be used to help verify the claim.

        Your output will be a python dictionary containing the following information:
        {
            "claim": the given claim as it is,
            "verification_result": "Supported" or "Refuted" or None if you are unable to verify the claim,
            "reason": reason of the verification result, or reason of the None output
        }

    """

    user_prompt = f"""
        Claim: {claim}
        Simulation Output: {sim_output}


        Rules for the output:
        1. Make sure the output of the simulation is appropriate to the given instructions.
        2. If your verification_result is None, make sure to include the reason as to why you were note able to determine the result
        3. Your verification result should only include None if the simulation output is an execution error, otherwise you should provide a definite result and a definite reason.

        Please generate your verification result now
    """

    messages=[{"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}]

    # Deterministic decoding by default; explicit caller overrides are respected.
    kwargs.setdefault("temperature", 0.0)
    kwargs.setdefault("top_p", 1)
    kwargs.setdefault("frequency_penalty", 0.0)
    kwargs.setdefault("presence_penalty", 0.0)
    kwargs.setdefault("timeout", 4*10*60)  # 40 minutes
    kwargs["model"] = model
    kwargs["messages"] = messages

    response = client.chat.completions.create(**kwargs)
    # The message content can be None; treat that as an empty reply instead of crashing.
    content = (response.choices[0].message.content or "").strip()

    with open(f'gpt_ver_results/prompt_{id}.txt', 'w') as f:
        f.write(user_prompt)

    with open(f'gpt_ver_results/response_{id}.txt', 'w') as f:
        f.write(content)

    return content


In [42]:

# Bundle the captured stdout and error text into one block for the verifier model.
# The literal's line breaks and indentation are part of the text that gets sent.
sim_output = f"""
        Simulation Output:
        {std_out}

        Simulation Error:
        {std_err}
    """

# Ask the model for a verdict on the claim given the simulation output, and show the raw reply.
response = get_gpt_verification(id, sim_output, claim)
print(response)

{
    "claim": "Autophagy deficiency in the liver increases vulnerability to insulin resistance.",
    "verification_result": "Supported",
    "reason": "The simulation shows that the mouse with autophagy deficiency has increasing levels of insulin resistance (from 0 to 5), while the mouse without autophagy deficiency maintains an insulin resistance level of 0. This indicates that autophagy deficiency is associated with increased insulin resistance."
}


In [50]:
# Parse the verifier's reply as a Python literal to confirm it is a well-formed dict.
literal_eval(response)

{'claim': 'Autophagy deficiency in the liver increases vulnerability to insulin resistance.',
 'verification_result': 'Supported',
 'reason': 'The simulation shows that the mouse with autophagy deficiency has increasing levels of insulin resistance (from 0 to 5), while the mouse without autophagy deficiency maintains an insulin resistance level of 0. This indicates that autophagy deficiency is associated with increased insulin resistance.'}

In [None]:
# !unzip '/content/verification_responses.zip' -d '/content/verification_responses/'

## Evaluation

In [61]:
# Number of annotated rows per claim category.
# NOTE(review): this repeats an earlier cell with the same execution count — likely a leftover; confirm.
train_annotated['category'].value_counts()

category
biology         267
cellular        126
directional      92
population       53
medicine         22
epidemiology      4
Name: count, dtype: int64

In [32]:
def parse_verification_response(text):
    """Turn one saved verifier reply into a dict, or None when it cannot be parsed.

    The reply may be a bare dict literal or wrapped in a ```, ```python or ```json
    fence. Python-literal syntax is tried first, then JSON (for replies that use
    null/true/false).
    """
    fenced = re.findall(r'```(?:python|json)?(.*?)```', text, re.DOTALL)
    payload = (fenced[0] if fenced else text).strip()
    for loader in (literal_eval, json.loads):
        try:
            parsed = loader(payload)
        except (ValueError, SyntaxError, TypeError):
            continue
        return parsed if isinstance(parsed, dict) else None
    return None


train_annotated = pd.read_json('../data/data/processed_data/extended_train_annotated.jsonl', lines=True)
ver_dir = '../gpt_ver_results'
# Sorted so the files are processed in a stable order across runs.
ver_files = sorted(filename for filename in os.listdir(ver_dir) if filename.startswith("response"))

# Only these three categories are evaluated; gold labels are mapped onto the verifier's vocabulary.
train_results = train_annotated[train_annotated['category'].isin(['medicine', 'cellular', 'directional'])].copy()
train_results['label'] = np.where(train_results['label'] == 'CONTRADICT', 'Refuted', 'Supported')

train_results['verification_result'] = None
train_results['reason'] = None


for r in ver_files:
    with open(os.path.join(ver_dir, r), 'r') as f:
        ver_res = parse_verification_response(f.read())

    # An unparseable reply leaves the row's verdict as None, which the summary below
    # counts as an error, instead of aborting the whole evaluation.
    if ver_res is None:
        continue

    # File names look like 'response_<id>.txt'.
    target_id = int(r.split('_')[-1].split('.')[0])
    is_target = train_results['id'] == target_id

    train_results.loc[is_target, 'verification_result'] = ver_res.get('verification_result')
    train_results.loc[is_target, 'reason'] = ver_res.get('reason')

# Compare before casting to str so a missing verdict (None) never equals a label.
train_results['matched'] = train_results['verification_result'] == train_results['label']
train_results['verification_result'] = train_results['verification_result'].astype(str)

In [16]:
# Persist the per-row evaluation results as CSV, leaving out the DataFrame index.
train_results.to_csv("../data/data/processed_data/train_results.csv", index=False)

In [None]:
# train_results_directional[train_results_directional['matched'] == False]['verification_result'].value_counts()
# train_results.groupby(['category'])['matched'].agg(
#     correct_predictions = lambda x: (x == True).sum(),
#     incorrect_match=lambda x: (x == False).sum(),
#     errors=lambda x: (x == 'None').sum()

# )

def _tally_category(group):
    """Count correct, incorrect and errored rows for one category's slice of train_results."""
    # Rows whose verdict is the string 'None' are counted as errors.
    n_errors = (group['verification_result'] == 'None').sum()
    n_unmatched = (~group['matched']).sum()
    return {
        'correct_match': group['matched'].sum(),
        # Unmatched rows minus the errored ones are the genuinely wrong verdicts.
        'incorrect_match': n_unmatched - n_errors,
        'errors': n_errors,
        'total': len(group),
    }


results = {
    category: _tally_category(group)
    for category, group in train_results.groupby('category')
}

pd.DataFrame(results).T


Unnamed: 0,correct_match,incorrect_match,errors,total
cellular,68,27,31,126
directional,51,19,22,92
medicine,13,6,3,22


In [62]:
# Mispredictions only: rows that did not match the gold label yet did receive a verdict
# (the string 'None' marks rows without one).
incorrect_results = (
    train_results
    .loc[train_results['matched'] == False]
    .loc[lambda frame: frame['verification_result'] != 'None']
    .reset_index(drop=True)
)

In [64]:
# Gold-label distribution of the mispredicted 'medicine' claims.
incorrect_results.loc[incorrect_results['category'] == 'medicine', 'label'].value_counts()

label
Supported    5
Refuted      1
Name: count, dtype: int64

In [78]:
# Print every mispredicted 'medicine' claim with its gold evidence and gold label.
for i, row in incorrect_results[incorrect_results['category'] == 'medicine'].iterrows():
    # Join with a space: the evidence sentences were previously fused ("...activity.Combined...").
    print("Evidence: ", " ".join(row['gold_evidence']))
    print("Claim: ", row['claim'])
    print("Status: ", row['label'])
    print("-"*50)

Evidence:  Policies requiring discontinuation of methadone in 32% of all programs contradict the evidence base for efficacy of long-term replacement therapies and potentially result in relapse of previously stable patients.
Claim:  32% of liver transplantation programs required patients to discontinue methadone treatment in 2001.
Status:  Supported
--------------------------------------------------
Evidence:  RESULTS Tricyclic antidepressant medication and stress management therapy each produced larger reductions in headache activity, analgesic medication use, and headache-related disability than placebo, but antidepressant medication yielded more rapid improvements in headache activity.Combined therapy was more likely to produce clinically significant (>/=50%) reductions in headache index scores (64% of participants) than antidepressant medication (38% of participants; P =.006), stress management therapy (35%; P =.003), or placebo (29%; P =.001).CONCLUSIONS Our results indicate that a

In [65]:
# Gold-label distribution of the mispredicted 'cellular' claims.
incorrect_results.loc[incorrect_results['category'] == 'cellular', 'label'].value_counts()

label
Supported    22
Refuted       5
Name: count, dtype: int64

In [77]:
# Print every mispredicted 'cellular' claim with its gold evidence and gold label.
for i, row in incorrect_results[incorrect_results['category'] == 'cellular'].iterrows():
    # Join with a space: the evidence sentences were previously fused ("...mice).Individuals...").
    print("Evidence: ", " ".join(row['gold_evidence']))
    print("Claim: ", row['claim'])
    print("Status: ", row['label'])
    print("-"*50)

Evidence:  We report that activation of basophils by autoreactive IgE causes their homing to lymph nodes, promoting T helper type 2 (T(H)2) cell differentiation and enhancing the production of self-reactive antibodies that cause lupus-like nephritis in mice lacking the Src family protein tyrosine kinase Lyn (Lyn(-/-) mice).Individuals with SLE also have elevated serum IgE, self-reactive IgEs and activated basophils that express CD62 ligand (CD62L) and the major histocompatibility complex (MHC) class II molecule human leukocyte antigen-DR (HLA-DR), parameters that are associated with increased disease activity and active lupus nephritis.Thus, in Lyn(-/-) mice, basophils and IgE autoantibodies amplify autoantibody production that leads to lupus nephritis, and in individuals with SLE IgE autoantibodies and activated basophils are factors associated with disease activity and nephritis.
Claim:  A T helper 2 cell (Th2) environment impedes disease development in patients with systemic lupus e

In [66]:
# Gold-label distribution of the mispredicted 'directional' claims.
incorrect_results.loc[incorrect_results['category'] == 'directional', 'label'].value_counts()

label
Supported    13
Refuted       6
Name: count, dtype: int64

In [79]:
# Print every mispredicted 'directional' claim with its gold evidence and gold label.
for i, row in incorrect_results[incorrect_results['category'] == 'directional'].iterrows():
    # Join with a space so multi-sentence evidence is not fused into one run of text.
    print("Evidence: ", " ".join(row['gold_evidence']))
    print("Claim: ", row['claim'])
    print("Status: ", row['label'])
    print("-"*50)

Evidence:  No association was detected between bariatric surgery and fractures, cancer, or stroke.
Claim:  Bariatric surgery increases rates of colorectal cancer.
Status:  Supported
--------------------------------------------------
Evidence:  No association was detected between bariatric surgery and fractures, cancer, or stroke.
Claim:  Bariatric surgery increases rates of postmenopausal breast cancer.
Status:  Supported
--------------------------------------------------
Evidence:  The suppression of TDP-43 mitochondrial localization abolishes WT and mutant TDP-43-induced mitochondrial dysfunction and neuronal loss, and improves phenotypes of transgenic mutant TDP-43 mice.
Claim:  Blocking the interaction between TDP-43 and respiratory complex I proteins ND3 and ND6 prevents TDP-43-induced neuronal loss.
Status:  Supported
--------------------------------------------------
Evidence:  Risk of dementia associated with a longer reproductive period was most pronounced in APOE epsilon4 car