In [48]:
import csv
import random
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import concurrent
from tqdm import tqdm
import json
import pickle

In [3]:
# Load the sentence-embedding model; every later `model.encode(...)` call in this notebook uses it.
model_name = 'llmrails/ember-v1'
model = SentenceTransformer(model_name)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


## Overlapping Dataset

In [4]:
# Read the overlap dataset, discard exact duplicate rows, then preview the first rows.
overlap_raw = pd.read_csv('data/overlap_data.csv')
overlap_data = overlap_raw.drop_duplicates()
overlap_data.head()

Unnamed: 0,drug,adverse_event,comp_term,is_overlapping
0,XEOMIN,Hypersensitivity,Precautions Dysphagia and,0
1,XEOMIN,Hypersensitivity,reactions to XEOMIN are discussed in greater d...,0
2,XEOMIN,Dysphagia,Dysphagia and Breathing Difficulties in Treatment,1
3,XEOMIN,Dysphagia,Dysphagia and,1
5,XEOMIN,Dysphagia,Dysphagia and Breathing,1


In [6]:
# Display (rows, columns) of the de-duplicated overlap table.
overlap_data.shape

(46840, 4)

In [7]:
# Smoke test on the first 11 rows only: embed the adverse event and the
# comparison term one at a time and keep their cosine similarity.
similarities = []
for row in overlap_data.head(11).itertuples():
    ade_embed = model.encode(row.adverse_event)
    comp_embed = model.encode(row.comp_term)
    similarities.append(cos_sim(ade_embed, comp_embed))

In [11]:
# Batch-encode the first unique adverse-event strings in one call.
# NOTE(review): the variable name suggests 100 terms, but the slice takes up to 1000 — confirm which is intended.
ade_100 = model.encode(overlap_data.adverse_event.unique()[:1000])

In [19]:
# Build {drug: {term: embedding}} by encoding, per drug, every unique adverse
# event followed by every unique comparison term in a single batch.
drug_ade_embed = {}
for drug in tqdm(overlap_data.drug.unique()):
    drug_rows = overlap_data[overlap_data.drug == drug]
    terms = [*drug_rows.adverse_event.unique(), *drug_rows.comp_term.unique()]
    vectors = model.encode(terms)
    # A term present in both lists is encoded twice; the later (identical key) entry wins.
    drug_ade_embed[drug] = {term: vector for term, vector in zip(terms, vectors)}
    

100%|██████████| 101/101 [46:31<00:00, 27.64s/it]


In [32]:
# Convert every embedding into a plain list of Python floats so the nested
# dictionary can be serialised with json.
new_embed = {}
for drug, embed_dict in tqdm(drug_ade_embed.items()):
    new_embed[drug] = {
        term: list(map(float, embed))
        for term, embed in embed_dict.items()
    }

  3%|▎         | 3/101 [00:00<00:09, 10.69it/s]

100%|██████████| 101/101 [00:05<00:00, 16.90it/s]


In [34]:
# Persist the JSON-serialisable embeddings. The context manager guarantees the
# file is flushed and closed instead of leaving the handle to the garbage collector.
with open('data/drug_ade_embed.json', 'w') as json_file:
    json.dump(new_embed, json_file)

In [40]:
# Persist the raw embedding dictionary. The context manager guarantees the
# file is flushed and closed instead of leaving the handle to the garbage collector.
with open('data/drug_ade_embed.pkl', 'wb') as pickle_file:
    pickle.dump(drug_ade_embed, pickle_file)

In [43]:
# Reload the pickled embeddings; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code — only load files this project wrote itself.
with open('data/drug_ade_embed.pkl', 'rb') as pickle_file:
    loaded_embeds = pickle.load(pickle_file)

In [35]:
# Reload the embeddings from JSON (values come back as plain lists of floats);
# the context manager closes the file handle.
# NOTE(review): this rebinds `loaded_embeds`, replacing whatever was loaded into it before — confirm which source is intended.
with open('data/drug_ade_embed.json') as json_file:
    loaded_embeds = json.load(json_file)

In [44]:
# Cosine similarity between each row's adverse event and comparison term,
# looked up in the per-drug embedding cache (one float per row, in row order).
similarities = [
    float(
        cos_sim(
            loaded_embeds[row.drug][row.adverse_event],
            loaded_embeds[row.drug][row.comp_term],
        )
    )
    for row in overlap_data.itertuples()
]

In [46]:
# Attach the per-row similarities as a new column (mutates overlap_data in place;
# `similarities` must have exactly one entry per row) and write the table without the index.
overlap_data['similarity'] = similarities
overlap_data.to_csv('data/overlap_data_w_similarity.csv', index=False)

## GPT output compared to manual

In [52]:
# Cosine-similarity cut-off above which a GPT term and a manual term count as a match.
# NOTE(review): the value's provenance is not shown in this notebook — presumably tuned on the overlap dataset; document where it comes from.
THRESHOLD = 0.6163216

In [49]:
# Load the GPT run and reshape it to one row per extracted term.
gpt_output = pd.read_csv('results/OpenAI_gpt-4-1106-preview_fatal-prompt-v2_pharmexpert-v1_temp0_train_run0.csv',
                         usecols = ['drug_name', 'section_name', 'gpt_output'])
# regex=False makes the period a literal character on every pandas version; as a
# regular expression '.' matches any character, and the default of `regex`
# differs between pandas releases.
gpt_output['gpt_output'] = gpt_output['gpt_output'].str.replace('.', '', regex=False).str.split(', ')
# One row per term, then drop terms repeated for the same drug/section.
gpt_output = gpt_output.explode('gpt_output').reset_index(drop = True).drop_duplicates()
# Reference annotations that the GPT terms are compared against.
manual_file = pd.read_csv('data/train_drug_label_text_manual_ades.csv',
                          usecols = ['drug_name', 'reaction_string', 'meddra_pt', 'section_name',
                                     'discontinuous_term', 'negated_term', 'meddra_exact_term']).drop_duplicates()
gpt_output.head()

Unnamed: 0,drug_name,section_name,gpt_output
0,KYPROLIS,adverse reactions,Cardiac Toxicities
1,KYPROLIS,adverse reactions,Acute Renal Failure
2,KYPROLIS,adverse reactions,Tumor Lysis Syndrome
3,KYPROLIS,adverse reactions,Pulmonary Toxicity
4,KYPROLIS,adverse reactions,Pulmonary Hypertension


In [50]:
# Embed every GPT term in one batch and store one vector per row.
gpt_terms = gpt_output['gpt_output'].tolist()
gpt_embeds = model.encode(gpt_terms)
gpt_output['embeds'] = [vector for vector in gpt_embeds]

In [51]:
# Embed every manual reaction string in one batch and store one vector per row.
manual_terms = manual_file['reaction_string'].tolist()
man_embeds = model.encode(manual_terms)
manual_file['embeds'] = [vector for vector in man_embeds]

In [30]:
def get_similarities(row):

Unnamed: 0,drug_name,section_name,gpt_output,embeds
0,KYPROLIS,adverse reactions,Cardiac Toxicities,"[-0.11013074, 0.6761941, 0.7869268, 0.3508221,..."
1,KYPROLIS,adverse reactions,Acute Renal Failure,"[0.010086697, 0.9340907, -0.20226005, -0.00631..."
2,KYPROLIS,adverse reactions,Tumor Lysis Syndrome,"[0.768426, -0.20337762, 0.062222198, 0.8446492..."
3,KYPROLIS,adverse reactions,Pulmonary Toxicity,"[-0.09318504, 0.15409195, 0.7109283, -0.237119..."
4,KYPROLIS,adverse reactions,Pulmonary Hypertension,"[-0.39612737, 0.2627319, -0.0540106, -0.066323..."


In [82]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Per-drug comparison of GPT terms against the manual terms.
#   TP: manual terms whose best GPT similarity exceeds THRESHOLD
#   FN: manual terms with no GPT term above THRESHOLD
#   FP: GPT terms with no manual term above THRESHOLD
for drug in manual_file.drug_name.unique():
    sub_man = manual_file[manual_file.drug_name == drug]
    sub_gpt = gpt_output[gpt_output.drug_name == drug]

    if len(sub_gpt) == 0:
        # No GPT terms for this drug: every manual term is a miss
        # (taking a max over an empty set would raise).
        TP, FP, FN = 0, 0, len(sub_man)
    else:
        # One (n_manual x n_gpt) similarity matrix instead of two nested
        # loops of pairwise cos_sim calls.
        sim_matrix = np.asarray(
            cos_sim(np.stack(list(sub_man.embeds)), np.stack(list(sub_gpt.embeds)))
        )
        # Same "match" rule on both sides (strictly above the threshold), so a
        # score exactly equal to THRESHOLD is never left uncounted.
        man_matched = sim_matrix.max(axis=1) > THRESHOLD
        gpt_matched = sim_matrix.max(axis=0) > THRESHOLD
        # print(drug, sub_man.reaction_string.values, sub_gpt.gpt_output.values[sim_matrix.argmax(axis=1)])
        TP = int(man_matched.sum())
        FN = int((~man_matched).sum())
        FP = int((~gpt_matched).sum())

    # Guarded ratios: report 0.0 instead of raising ZeroDivisionError.
    precision = _safe_ratio(TP, TP + FP)
    recall = _safe_ratio(TP, TP + FN)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)

    print(TP, FP, FN)
    print('precision:', precision)
    print('recall:', recall)
    print('f1:', f1)
    # NOTE(review): this break stops after the first drug — presumably a
    # debugging leftover; remove it to evaluate every drug.
    break
    

85 0 3
precision: 1.0
recall: 0.9659090909090909
f1: 0.9826589595375723


In [None]:
105,XEOMIN,all,46,41,41,0.0,5,1.0,0.8913043478260869,0.9425287356321839