In [1]:
import concurrent
import concurrent.futures
import json
import os
import time

import numpy as np
import pandas as pd
from openai import OpenAI, AzureOpenAI
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [2]:
def gpt_call(api_source, config, gpt_model, system_content, prompt, drug, adr, temperature):
    """Ask a chat model to judge one drug / adverse-reaction statement.

    Args:
        api_source: 'OpenAI' or 'Azure'; also the key into ``config`` that holds
            the credentials for that provider.
        config: nested mapping. ``config[api_source]['openai_api_key']`` is always
            read; ``config[api_source]['openai_api_endpoint']`` is read for Azure.
        gpt_model: model name forwarded to the chat-completions call.
        system_content: text of the system message.
        prompt: template with two positional ``{}`` slots, filled as (adr, drug).
        drug: drug name (second slot of the template).
        adr: adverse-reaction term (first slot of the template).
        temperature: sampling temperature forwarded to the API.

    Returns:
        The message content of the first choice in the response.

    Raises:
        Exception: if ``api_source`` is neither 'OpenAI' nor 'Azure'.
    """
    # Reject unknown providers before touching the config.
    if api_source not in ('OpenAI', 'Azure'):
        raise Exception(f"Unexpected API source requested: {api_source}")

    if api_source == 'OpenAI':
        client = OpenAI(api_key=config[api_source]['openai_api_key'])
    else:
        client = AzureOpenAI(
            api_key=config[api_source]['openai_api_key'],
            api_version="2023-12-01-preview",
            azure_endpoint=config[api_source]['openai_api_endpoint'],
        )

    # The template receives the ADR first and the drug second.
    conversation = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": prompt.format(adr, drug)},
    ]
    response = client.chat.completions.create(
        messages=conversation,
        model=gpt_model,
        temperature=temperature,
    )
    return response.choices[0].message.content

In [14]:
# Load credentials and model settings from the shared config file. The context
# manager closes the file handle as soon as parsing finishes.
with open('../config.json') as config_file:
    config = json.load(config_file)

organization = ""

# Section of config.json to read; gpt_call accepts 'OpenAI' or 'Azure'.
api_source = 'OpenAI'

api_key = config[api_source]['openai_api_key'] #constants.AZURE_OPENAI_KEY
api_endpoint = config[api_source]['openai_api_endpoint']

gpt_model = config[api_source]["gpt_model"]
# Passed to gpt_call as the sampling temperature.
temperature = 0

In [15]:
nruns = 3

# System messages, keyed by a short persona name. The text (including its
# leading/trailing whitespace) is sent to the model verbatim.
system_options = {}
system_options["lie-detector"] = """
    You are a lie detector. You will be given a statement and you have to determine if it is true or false.
    Do not include an explanation. Only respond with "true" or "false".
    """
system_options["pharmaco-expert"] = """
    You are a pharmacologist. You will be given a statement regarding a drug and potential adverse reaction
    and you have to determine state whether it is true or false.
    Do not include an explanation. Only respond with "true" or "false".
    """

# User-prompt templates, keyed by phrasing. Each has two positional slots that
# gpt_call fills as (adverse reaction, drug).
prompt_options = {}
prompt_options["side-effect"] = """
True or False: {} is a side effect caused by {}.
"""
prompt_options["adverse-reaction"] = """
{} is an adverse reaction of {}.
"""
prompt_options["caused-by"] = """
{} is caused by {}.
"""

In [16]:
# Choose the persona / phrasing pair used by the runs below; the two names
# also form the output CSV filename.
system_name, prompt_name = "pharmaco-expert", "adverse-reaction"

system_content = system_options[system_name]
prompt = prompt_options[prompt_name]

In [17]:
# Load adverse_reactions.csv and preview its first rows.
# NOTE(review): absolute, user-specific path — this only runs on the author's machine.
adr_section = pd.read_csv('/Users/undinagisladottir/Documents/Columbia/Tatonetti_Lab/20231113_onsides/adverse_reactions.csv')
adr_section.head()

Unnamed: 0,ingredients_rxcuis,ingredients_names,num_ingredients,pt_meddra_id,pt_meddra_term,percent_labels,num_labels
0,6916,metolazone,1,10000059,Abdominal discomfort,0.034483,29
1,6916,metolazone,1,10000060,Abdominal distension,1.0,29
2,6916,metolazone,1,10000081,Abdominal pain,1.0,29
3,6916,metolazone,1,10001507,Agranulocytosis,1.0,29
4,6916,metolazone,1,10001682,Alkalosis hypochloraemic,1.0,29


In [18]:
# Load the active-label ADR table, remove rows with no ingredient name, then
# keep the five identifying columns de-duplicated. The shape is printed after
# every step so the effect of each filter is visible.
active_labels_path = '/Users/undinagisladottir/Documents/Columbia/Tatonetti_Lab/20231113_onsides/adverse_reactions_active_labels.csv'
adr_section = pd.read_csv(active_labels_path)
print(adr_section.shape)

missing_ingredient_idx = adr_section.index[adr_section['ingredients_names'].isna()]
adr_section.drop(index=missing_ingredient_idx, inplace=True)
print(adr_section.shape)

# Column order matters: later cells index the rows by position.
kept_columns = ['pt_meddra_term', 'pt_meddra_id', 'set_id', 'ingredients_rxcuis', 'ingredients_names']
adr_section = adr_section.loc[:, kept_columns].drop_duplicates()
print(adr_section.shape)

(2904397, 7)
(2881519, 7)
(2881519, 5)


In [5]:
# One plain Python list per DataFrame row, in column order.
rows_to_run = adr_section.to_numpy().tolist()

In [None]:
# Peek at up to the first 100 rows that will be sent to the model. The bound
# avoids an IndexError when fewer than 100 rows are available.
for i in range(min(100, len(rows_to_run))):
    print(rows_to_run[i])

In [None]:
# Unpack each row by position. The order follows the column subset taken from
# adr_section: 0 = pt_meddra_term, 1 = pt_meddra_id, 2 = set_id,
# 3 = ingredients_rxcuis, 4 = ingredients_names.
for row in rows_to_run:
    drug = row[4]
    adr = row[0]
    pt_meddra_id = row[1]
    # Read from `row`; `i` is an unrelated integer loop counter from another cell.
    spl_id = row[2]
    rxcui = row[3]

In [None]:
def run_iteration(row):
    """Query the model for one (drug, ADR) row and return the labelled result.

    Args:
        row: positional sequence where index 0 is the ADR term, 1 the MedDRA PT
            id, 3 the ingredient RxCUI(s) and 4 the drug name.

    Returns:
        ``[drug, rxcui, adr, pt_meddra_id, gpt_out]`` on success, or ``None``
        when the API call raises (the error is printed, not re-raised).
    """
    drug = row[4]
    adr = row[0]
    pt_meddra_id = row[1]
    # Take the RxCUI from this row, not from the module-level loop counter `i`.
    rxcui = row[3]

    try:
        gpt_out = gpt_call(api_source, config, gpt_model, system_content, prompt, drug, adr, temperature)
        return [drug, rxcui, adr, pt_meddra_id, gpt_out]
    except Exception as err:
        # Best effort: report the failing pair and let the caller filter out None.
        print(f"Encountered an exception for row: {drug} {adr}. Error message below:")
        print(err)
        return None

In [None]:
# Run every row through the model on a thread pool and collect the replies.
for i in range(1):
    results = list()

    # `executor` rather than `exec`, which would shadow the builtin.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        results.extend(list(tqdm(
            executor.map(run_iteration, rows_to_run),
            total=len(rows_to_run)
        )))

    # Column names mirror, in order, the five values run_iteration returns:
    # [drug, rxcui, adr, pt_meddra_id, gpt_out]. Failed rows (None) are dropped.
    gpt_output = pd.DataFrame(
        [r for r in results if r is not None],
        columns=['drug_name', 'rxcui', 'adr_name', 'pt_meddra_id', 'gpt_output']
    )

In [None]:
# Collapse each free-text reply to 'true' / 'false': any reply containing
# "true" (case-insensitively) counts as 'true'. Then show label proportions.
gpt_output['response'] = gpt_output['gpt_output'].map(
    lambda reply: 'true' if 'true' in reply.lower() else 'false'
)
gpt_output['response'].value_counts(normalize=True)

In [None]:
# Show the output filename, then run all rows and write the replies to CSV.
print(f'{system_name}_{prompt_name}_run{i}.csv')
for i in range(1):
    results = list()

    # `executor` rather than `exec`, which would shadow the builtin.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        results.extend(list(tqdm(
            executor.map(run_iteration, rows_to_run),
            total=len(rows_to_run)
        )))

    # Column names mirror, in order, the five values run_iteration returns:
    # [drug, rxcui, adr, pt_meddra_id, gpt_out]. Failed rows (None) are dropped.
    gpt_output = pd.DataFrame(
        [r for r in results if r is not None],
        columns=['drug_name', 'rxcui', 'adr_name', 'pt_meddra_id', 'gpt_output']
    )
    gpt_output.to_csv(f'{system_name}_{prompt_name}_run{i}.csv')

In [None]:
# Label each reply 'true' / 'false' (substring match on "true", case-insensitive),
# then drop, in place, the rows whose drug name is missing.
gpt_output['response'] = gpt_output['gpt_output'].map(
    lambda reply: 'true' if 'true' in reply.lower() else 'false'
)
missing_drug_idx = gpt_output.index[gpt_output['drug_name'].isna()]
gpt_output.drop(index=missing_drug_idx, inplace=True)

In [None]:
# Show replies where the model was asked about the literal drug name "nan".
nan_drug_reply = 'False. (The statement is incomplete and "nan" is not a recognizable drug or substance.)'
gpt_output.loc[gpt_output['gpt_output'] == nan_drug_reply].head()

In [None]:
# Proportion of 'true' vs 'false' labels.
gpt_output['response'].value_counts(normalize=True)

In [None]:
# Preview the first rows of the model output table.
gpt_output.head()

## Agreement with Reference Set

In [None]:
# Reload a saved run, label each reply 'true' / 'false' (substring match on
# "true", case-insensitive) and drop, in place, rows with no drug name.
gpt_output = pd.read_csv('lie-detector_side-effect_run0.csv')
gpt_output['response'] = gpt_output['gpt_output'].map(
    lambda reply: 'true' if 'true' in reply.lower() else 'false'
)
missing_drug_idx = gpt_output.index[gpt_output['drug_name'].isna()]
gpt_output.drop(index=missing_drug_idx, inplace=True)

In [None]:
# Load the reference-set labels (path relative to the notebook's working
# directory) and preview the first rows.
reference_set = pd.read_csv('reference_set_labels.csv')
reference_set.head()

In [21]:
# Load the table mapping reference-set conditions (cohort_id / condition_name)
# to MedDRA preferred terms, and preview it.
# NOTE(review): absolute, user-specific path — this only runs on the author's machine.
meddra_reference_map = pd.read_csv('/Users/undinagisladottir/Documents/Columbia/Tatonetti_Lab/causal-drug-ades/data/onsides_mapping.csv')
meddra_reference_map.head()

Unnamed: 0,cohort_id,condition_name,meddra_pt_id,meddra_pt_term,Positive Controls (N)
0,500000401,Acute kidney injury,10038435,Renal failure,20
1,500000401,Acute kidney injury,10062237,Renal impairment,8
2,500000401,Acute kidney injury,10069339,Acute kidney injury,6
3,500000301,Acute liver injury,10000804,Acute hepatic failure,5
4,500000301,Acute liver injury,10019663,Hepatic failure,45


In [22]:
# Preview the condition -> MedDRA term mapping again.
meddra_reference_map.head()

Unnamed: 0,cohort_id,condition_name,meddra_pt_id,meddra_pt_term,Positive Controls (N)
0,500000401,Acute kidney injury,10038435,Renal failure,20
1,500000401,Acute kidney injury,10062237,Renal impairment,8
2,500000401,Acute kidney injury,10069339,Acute kidney injury,6
3,500000301,Acute liver injury,10000804,Acute hepatic failure,5
4,500000301,Acute liver injury,10019663,Hepatic failure,45


In [None]:
# Attach the reference-set condition to each model reply by matching the ADR
# name against the MedDRA preferred term. The mapping table's term column is
# `meddra_pt_term`; it has no `concept_name` column.
data = gpt_output.merge(meddra_reference_map, how='inner', left_on='adr_name', right_on='meddra_pt_term')
data.head()

In [None]:
# Preview the reference-set labels.
reference_set.head()

In [None]:
# Step 1: map each reply's ADR name to its reference-set condition through the
# MedDRA preferred term (`meddra_pt_term`; the mapping table has no
# `concept_name` column).
# Step 2: keep only the (drug, condition, cohort) triples in the reference set.
data = gpt_output.merge(
    meddra_reference_map, how='inner', left_on='adr_name', right_on='meddra_pt_term'
    ).merge(
        reference_set, on=['drug_name', 'condition_name', 'cohort_id'], how='inner'
        )
data.head()

In [None]:
# Count rows per (response, affect) pair, attach the per-`affect` total, and
# express each count as a share of that total. Note: `data` is overwritten
# with the aggregated table.
pair_counts = data.groupby(['response', 'affect']).size().reset_index(name='n')
affect_totals = (
    pair_counts.groupby('affect', as_index=False)['n'].sum()
    .rename(columns={'n': 'total'})
)
data = pair_counts.merge(affect_totals, on='affect')
data['perc'] = data['n'] / data.groupby('affect')['n'].transform('sum')
data.head()

In [None]:

# Keep the four columns of interest, then show one row per
# (drug, condition, affect) flagged True when any of its replies was 'true'.
pair_columns = ['drug_name', 'condition_name', 'affect', 'response']
data = data.loc[:, pair_columns]
(
    data
    .groupby(['drug_name', 'condition_name', 'affect'])
    .agg(lambda answers: (answers == 'true').any())
    .reset_index()
)

In [None]:
# TODO: group by drug_name and condition_name, taking max(affect) and
# response = true if any reply is true. The line below does not do that yet;
# it only shows the joint distribution of (response, affect) over all rows.
data[['response', 'affect']].value_counts(normalize=True)

In [None]:
# Count rows per (response, affect) pair, attach the per-`affect` total, and
# express each count as a share of that total. Note: `data` is overwritten
# with the aggregated table.
pair_counts = data.groupby(['response', 'affect']).size().reset_index(name='n')
affect_totals = (
    pair_counts.groupby('affect', as_index=False)['n'].sum()
    .rename(columns={'n': 'total'})
)
data = pair_counts.merge(affect_totals, on='affect')
data['perc'] = data['n'] / data.groupby('affect')['n'].transform('sum')

In [None]:
# Preview the aggregated (response, affect) table.
data.head()

In [None]:
# Show the first two reference-set rows.
reference_set.head(2)

In [None]:
def _run_reference_row(row):
    """Query the model for one ``[condition_name, drug_name]`` reference-set row.

    Returns ``[drug_name, condition_name, gpt_out]`` on success, or ``None``
    when the API call raises (the error is printed, not re-raised).
    """
    condition_name, drug_name = row
    try:
        gpt_out = gpt_call(api_source, config, gpt_model, system_content, prompt,
                           drug_name, condition_name, temperature)
        return [drug_name, condition_name, gpt_out]
    except Exception as err:
        print(f"Encountered an exception for row: {drug_name} {condition_name}. Error message below:")
        print(err)
        return None


# Each row is a 2-element [condition_name, drug_name] pair, so this run needs
# its own worker: the positional run_iteration expects 5-element rows.
rows_to_run = reference_set[['condition_name', 'drug_name']].values.tolist()
for i in range(1):
    results = list()

    # `executor` rather than `exec`, which would shadow the builtin.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        results.extend(list(tqdm(
            executor.map(_run_reference_row, rows_to_run),
            total=len(rows_to_run)
        )))

    # The three columns match the worker's return order; the condition name
    # is stored under 'adr_name'.
    gpt_output = pd.DataFrame(
        [r for r in results if r is not None],
        columns=['drug_name', 'adr_name', 'gpt_output']
    )
    gpt_output.to_csv(f'{system_name}_{prompt_name}_referenceset_run{i}.csv')

In [None]:
# Echo the reference-set output filename, built from the current
# system_name / prompt_name and the last value of the loop counter `i`.
print(f'{system_name}_{prompt_name}_referenceset_run{i}.csv')

In [None]:
# Join the reference labels to the model replies on (drug, condition); the
# replies store the condition under 'adr_name'.
ref_res = reference_set.merge(
    gpt_output,
    how='inner',
    left_on=['drug_name', 'condition_name'],
    right_on=['drug_name', 'adr_name'],
)
# Any reply containing "true" (case-insensitively) counts as 'true'.
ref_res['response'] = ref_res['gpt_output'].map(
    lambda reply: 'true' if 'true' in reply.lower() else 'false'
)
print(ref_res[['affect', 'response']].value_counts())

# Counts per (response, affect), the per-`affect` total, and the share.
ref_res = ref_res.groupby(['response', 'affect']).size().reset_index(name='n')
totals_by_affect = (
    ref_res.groupby('affect', as_index=False)['n'].sum()
    .rename(columns={'n': 'total'})
)
ref_res = ref_res.merge(totals_by_affect, on='affect')
ref_res['perc'] = ref_res['n'] / ref_res.groupby('affect')['n'].transform('sum')
print(ref_res)

In [None]:
# Display the summary ordered by reference label, then by model response.
ref_res.sort_values(by=['affect', 'response'])

In [None]:
# Preview the first rows of the model output table.
gpt_output.head()

## Combining with Reference Set

In [7]:
def gpt_call(api_source, config, gpt_model, system_content, prompt, drug, adr, temperature):
    """Send one true/false statement about a drug and an ADR to a chat model.

    Args:
        api_source: 'OpenAI' or 'Azure'; selects the client class and the
            section of ``config`` that holds its credentials.
        config: nested mapping with ``openai_api_key`` (and, for Azure,
            ``openai_api_endpoint``) under ``config[api_source]``.
        gpt_model: model name forwarded to the API.
        system_content: system message text.
        prompt: template with two positional ``{}`` slots, filled as (adr, drug).
        drug: drug name.
        adr: adverse-reaction term.
        temperature: sampling temperature forwarded to the API.

    Returns:
        The text content of the first choice returned by the API.

    Raises:
        Exception: when ``api_source`` is not one of the two supported values.
    """
    is_openai = api_source == 'OpenAI'
    is_azure = api_source == 'Azure'
    if not (is_openai or is_azure):
        raise Exception(f"Unexpected API source requested: {api_source}")

    if is_openai:
        client = OpenAI(api_key=config[api_source]['openai_api_key'])
    else:
        client = AzureOpenAI(
            api_key=config[api_source]['openai_api_key'],
            api_version="2023-12-01-preview",
            azure_endpoint=config[api_source]['openai_api_endpoint'],
        )

    # ADR goes into the first template slot, drug into the second.
    user_message = {"role": "user", "content": prompt.format(adr, drug)}
    system_message = {"role": "system", "content": system_content}

    chat_completion = client.chat.completions.create(
        messages=[system_message, user_message],
        model=gpt_model,
        temperature=temperature,
    )
    first_choice = chat_completion.choices[0]
    return first_choice.message.content

In [10]:
# Load credentials and model settings from the shared config file. The context
# manager closes the file handle as soon as parsing finishes.
with open('../config.json') as config_file:
    config = json.load(config_file)

organization = ""

# Section of config.json to read; gpt_call accepts 'OpenAI' or 'Azure'.
api_source = 'OpenAI'

api_key = config[api_source]['openai_api_key'] #constants.AZURE_OPENAI_KEY
api_endpoint = config[api_source]['openai_api_endpoint']

gpt_model = config[api_source]["gpt_model"]
# Passed to gpt_call as the sampling temperature.
temperature = 0

In [9]:
nruns = 3

# --- System messages (sent verbatim, whitespace included) ---
_LIE_DETECTOR_SYSTEM = """
    You are a lie detector. You will be given a statement and you have to determine if it is true or false.
    Do not include an explanation. Only respond with "true" or "false".
    """
_PHARMACO_EXPERT_SYSTEM = """
    You are a pharmacologist. You will be given a statement regarding a drug and potential adverse reaction
    and you have to determine state whether it is true or false.
    Do not include an explanation. Only respond with "true" or "false".
    """

# --- User-prompt templates; slots are filled as (adverse reaction, drug) ---
_SIDE_EFFECT_PROMPT = """
True or False: {} is a side effect caused by {}.
"""
_ADVERSE_REACTION_PROMPT = """
{} is an adverse reaction of {}.
"""
_CAUSED_BY_PROMPT = """
{} is caused by {}.
"""

system_options = {
    "lie-detector": _LIE_DETECTOR_SYSTEM,
    "pharmaco-expert": _PHARMACO_EXPERT_SYSTEM,
}

prompt_options = {
    "side-effect": _SIDE_EFFECT_PROMPT,
    "adverse-reaction": _ADVERSE_REACTION_PROMPT,
    "caused-by": _CAUSED_BY_PROMPT,
}

In [24]:
# Preview the condition -> MedDRA term mapping.
meddra_reference_map.head()

Unnamed: 0,cohort_id,condition_name,meddra_pt_id,meddra_pt_term,Positive Controls (N)
0,500000401,Acute kidney injury,10038435,Renal failure,20
1,500000401,Acute kidney injury,10062237,Renal impairment,8
2,500000401,Acute kidney injury,10069339,Acute kidney injury,6
3,500000301,Acute liver injury,10000804,Acute hepatic failure,5
4,500000301,Acute liver injury,10019663,Hepatic failure,45


In [26]:
# NOTE(review): absolute, user-specific path — this only runs on the author's machine.
adr_section = pd.read_csv('/Users/undinagisladottir/Documents/Columbia/Tatonetti_Lab/20231113_onsides/adverse_reactions_active_labels.csv')

# Build one row per (condition, MedDRA term, single ingredient):
#   1. keep only ADRs whose MedDRA id appears in the reference mapping,
#   2. drop rows with no ingredient name,
#   3. split comma-separated ingredient lists into one row per ingredient,
#   4. de-duplicate and keep the result, so repeated pairs are not sent to
#      the model again downstream.
adr_condition_name = (
    adr_section
    .merge(
        meddra_reference_map, how='inner',
        left_on='pt_meddra_id',
        right_on='meddra_pt_id'
    )
    .dropna(subset=['ingredients_names'])
    [['condition_name', 'cohort_id', 'pt_meddra_term', 'pt_meddra_id', 'ingredients_names']]
    .assign(ingredients_names=lambda frame: frame['ingredients_names'].str.split(','))
    .explode('ingredients_names')
    .drop_duplicates()
)
adr_condition_name

Unnamed: 0,condition_name,cohort_id,pt_meddra_term,pt_meddra_id,ingredients_names
0,GI bleed,500001001,Gastrointestinal haemorrhage,10017955,dantrolene
1,GI bleed,500001001,Gastrointestinal haemorrhage,10017955,ketorolac
2,GI bleed,500001001,Gastrointestinal haemorrhage,10017955,meloxicam
3,GI bleed,500001001,Gastrointestinal haemorrhage,10017955,flurbiprofen
4,GI bleed,500001001,Gastrointestinal haemorrhage,10017955,carvedilol
...,...,...,...,...,...
58494,Acute liver injury,500000301,Acute hepatic failure,10000804,abacavir
58494,Acute liver injury,500000301,Acute hepatic failure,10000804,lamivudine
58512,Acute liver injury,500000301,Acute hepatic failure,10000804,isopropyl alcohol
58540,Acute liver injury,500000301,Acute hepatic failure,10000804,rilpivirine


In [27]:
# Shape of the array of distinct ingredient names, i.e. how many there are.
adr_condition_name['ingredients_names'].unique().shape

(1020,)

In [None]:
# Preview the reference-set labels.
reference_set.head()

In [None]:
# Left-join the reference labels onto every (condition, cohort, ingredient)
# row; rows with no reference entry are kept with missing reference columns.
adr_side_keys = ['condition_name', 'cohort_id', 'ingredients_names']
reference_side_keys = ['condition_name', 'cohort_id', 'drug_name']
ref_adr_cond = adr_condition_name.merge(
    reference_set,
    how='left',
    left_on=adr_side_keys,
    right_on=reference_side_keys,
)

In [None]:
# One pandas Series per row of ref_adr_cond; run_iteration reads fields from
# each by column name.
rows_to_run = [row for _, row in ref_adr_cond.iterrows()]

In [None]:
def run_iteration(row):
    """Query the model for one ingredient / MedDRA-term row.

    Args:
        row: mapping-like record (e.g. a pandas Series) with the keys
            'ingredients_names', 'drug_name', 'condition_name', 'cohort_id',
            'pt_meddra_term', 'pt_meddra_id' and 'affect'.

    Returns:
        ``[ingredients_names, drug_name, condition_name, cohort_id,
        pt_meddra_term, pt_meddra_id, affect, gpt_out]`` on success, or ``None``
        when anything in the call raises (the error is printed, not re-raised).
    """
    try:
        gpt_out = gpt_call(api_source, config, gpt_model, system_content, prompt,
                            row['ingredients_names'], row['pt_meddra_term'], temperature)
        return [row['ingredients_names'], row['drug_name'], row['condition_name'],
                row['cohort_id'], row['pt_meddra_term'], row['pt_meddra_id'], row['affect'], gpt_out]
    except Exception as err:
        # Report using fields of `row`; there are no `drug` / `adr` locals in
        # this function, so referencing them would raise inside the handler.
        print(f"Encountered an exception for row: {row['ingredients_names']} {row['pt_meddra_term']}. Error message below:")
        print(err)
        return None

In [None]:
# Show the output filename (using the current value of `i`), then send every
# row to the model on a 15-thread pool and save the replies.
# NOTE(review): this filename pattern is also used by an earlier run cell, so
# this run will overwrite that file when the names and `i` coincide.
print(f'{system_name}_{prompt_name}_run{i}.csv')
for i in range(1):
    with concurrent.futures.ThreadPoolExecutor(max_workers=15) as pool:
        progress = tqdm(pool.map(run_iteration, rows_to_run), total=len(rows_to_run))
        results = list(progress)

    # Same order as the list returned by run_iteration; failed rows (None) are skipped.
    output_columns = [
        'ingredients_names', 'drug_name', 'condition_name',
        'cohort_id', 'pt_meddra_term', 'pt_meddra_id', 'affect', 'gpt_output',
    ]
    successful_rows = [record for record in results if record is not None]
    gpt_output = pd.DataFrame(successful_rows, columns=output_columns)
    gpt_output.to_csv(f'{system_name}_{prompt_name}_run{i}.csv')