In [48]:
import csv
import random
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import concurrent
from tqdm import tqdm
import json
import pickle

In [100]:
# Load the sentence-embedding model that every model.encode(...) call in this notebook uses.
# NOTE(review): the output recorded under this cell ends in an MPS out-of-memory
# RuntimeError — confirm the model fits on the target device before re-running.
model_name = 'Salesforce/SFR-Embedding-Mistral'
model = SentenceTransformer(model_name)

modules.json: 100%|██████████| 229/229 [00:00<00:00, 1.55MB/s]
config_sentence_transformers.json: 100%|██████████| 123/123 [00:00<00:00, 1.02MB/s]
README.md: 100%|██████████| 84.9k/84.9k [00:00<00:00, 2.05MB/s]
sentence_bert_config.json: 100%|██████████| 54.0/54.0 [00:00<00:00, 334kB/s]
config.json: 100%|██████████| 663/663 [00:00<00:00, 4.87MB/s]
model.safetensors.index.json: 100%|██████████| 22.2k/22.2k [00:00<00:00, 36.0MB/s]
model-00001-of-00003.safetensors: 100%|██████████| 4.94G/4.94G [24:27<00:00, 3.37MB/s]
model-00002-of-00003.safetensors: 100%|██████████| 5.00G/5.00G [21:26<00:00, 3.89MB/s]
model-00003-of-00003.safetensors: 100%|██████████| 4.28G/4.28G [09:52<00:00, 7.22MB/s]
Downloading shards: 100%|██████████| 3/3 [55:48<00:00, 1116.02s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:56<00:00, 18.70s/it]
tokenizer_config.json: 100%|██████████| 981/981 [00:00<00:00, 3.80MB/s]
tokenizer.model: 100%|██████████| 493k/493k [00:00<00:00, 2.31MB/s]
tokenizer.json: 100%|████

RuntimeError: MPS backend out of memory (MPS allocated: 12.97 GB, other allocations: 526.98 MB, max allowed: 13.57 GB). Tried to allocate 224.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

## Overlapping Dataset

In [86]:
# Read the overlap dataset and drop exact duplicate rows.
# drop_duplicates keeps the original row labels, so the index may have gaps afterwards.
overlap_data = pd.read_csv('data/overlap_data.csv').drop_duplicates()
overlap_data.head()

Unnamed: 0,drug,adverse_event,comp_term,is_overlapping
0,XEOMIN,Hypersensitivity,Precautions Dysphagia and,0
1,XEOMIN,Hypersensitivity,reactions to XEOMIN are discussed in greater d...,0
2,XEOMIN,Dysphagia,Dysphagia and Breathing Difficulties in Treatment,1
3,XEOMIN,Dysphagia,Dysphagia and,1
5,XEOMIN,Dysphagia,Dysphagia and Breathing,1


In [88]:
# (rows, columns) of the de-duplicated overlap dataset.
overlap_data.shape

(46840, 4)

In [89]:
# Embed every adverse-event term and every comparison term, one model.encode call per drug.
# Result: drug_ade_embed[drug][term] -> embedding returned by model.encode for that term.
drug_ade_embed = {}
for drug in tqdm(overlap_data.drug.unique()):
    sub_data = overlap_data[overlap_data.drug == drug]
    ade_list = list(sub_data.adverse_event.unique())
    comp_list = list(sub_data.comp_term.unique())
    # A string can occur both as an adverse event and as a comparison term.
    # dict.fromkeys drops those repeats (keeping first-seen order) so the model
    # does not encode the same text twice; the resulting dict has the same keys.
    all_list = list(dict.fromkeys(ade_list + comp_list))
    all_embed = model.encode(all_list)
    drug_ade_embed[drug] = dict(zip(all_list, all_embed))
    

100%|██████████| 101/101 [45:37<00:00, 27.11s/it]


In [90]:
# Convert each embedding into a plain list of Python floats so the nested
# dict can be written with json.dump.
new_embed = {
    drug_key: {
        term_key: [float(component) for component in vector]
        for term_key, vector in term_vectors.items()
    }
    for drug_key, term_vectors in tqdm(drug_ade_embed.items())
}

100%|██████████| 101/101 [00:05<00:00, 18.46it/s]


In [91]:
# Persist the float-list embeddings as JSON. The context manager guarantees the
# file is flushed and closed even if serialisation fails part-way.
# NOTE(review): the file name says 'modeluae' while the model cell currently loads
# 'Salesforce/SFR-Embedding-Mistral' — confirm which model produced these embeddings.
with open('data/drug_ade_embed_modeluae.json', 'w') as json_file:
    json.dump(new_embed, json_file)

In [92]:
# Persist the raw embeddings as a pickle. The context manager guarantees the
# file is flushed and closed even if pickling fails part-way.
with open('data/drug_ade_embed_modeluae.pkl', 'wb') as pickle_file:
    pickle.dump(drug_ade_embed, pickle_file)

In [97]:
# Reload the cached embeddings so the expensive encode loop can be skipped.
# NOTE: pickle.load can execute arbitrary code — only load files produced by this notebook.
with open('data/drug_ade_embed_modeluae.pkl', 'rb') as pickle_file:
    loaded_embeds = pickle.load(pickle_file)

In [None]:
# Alternative to the pickle cache: reload the embeddings from JSON, where each
# embedding is a list of floats. The context manager closes the file after reading.
with open('data/drug_ade_embed_modeluae.json') as json_file:
    loaded_embeds = json.load(json_file)

In [98]:
# Cosine similarity between the adverse-event embedding and the comparison-term
# embedding of every row, in row order, looked up in the per-drug embedding cache.
similarities = [
    float(
        cos_sim(
            loaded_embeds[pair.drug][pair.adverse_event],
            loaded_embeds[pair.drug][pair.comp_term],
        )
    )
    for pair in overlap_data.itertuples()
]

In [99]:
# Attach the similarities (a list in the same row order as overlap_data) as a new
# column, then write the table to CSV without the index column.
overlap_data['similarity'] = similarities
overlap_data.to_csv('data/overlap_data_w_similarity_modeluae.csv', index=False)

## GPT output compared to manual annotations

In [83]:
# Cosine-similarity cut-off used by the evaluation loop to decide whether a GPT term
# and a manually annotated term count as a match.
# NOTE(review): the notebook does not show how this value was derived — presumably
# tuned on the overlap dataset similarities; confirm and record the provenance.
THRESHOLD = 0.6681796

In [49]:
# Load the GPT extraction run, keeping only the columns needed for evaluation.
gpt_output = pd.read_csv('results/OpenAI_gpt-4-1106-preview_fatal-prompt-v2_pharmexpert-v1_temp0_train_run0.csv',
                         usecols = ['drug_name', 'section_name', 'gpt_output'])
# Strip periods and split the comma-separated answer into a list of terms.
# regex=False makes '.' a literal period on every pandas version; as a regular
# expression '.' would match any character.
gpt_output['gpt_output'] = gpt_output['gpt_output'].str.replace('.', '', regex=False).str.split(', ')
# One row per extracted term; de-duplicate first, then reset, so the final index is contiguous.
gpt_output = gpt_output.explode('gpt_output').drop_duplicates().reset_index(drop = True)
# Manually annotated adverse-event terms used as the reference set.
manual_file = pd.read_csv('data/train_drug_label_text_manual_ades.csv',
                          usecols = ['drug_name', 'reaction_string', 'meddra_pt', 'section_name',
                                     'discontinuous_term', 'negated_term', 'meddra_exact_term']).drop_duplicates()
gpt_output.head()

Unnamed: 0,drug_name,section_name,gpt_output
0,KYPROLIS,adverse reactions,Cardiac Toxicities
1,KYPROLIS,adverse reactions,Acute Renal Failure
2,KYPROLIS,adverse reactions,Tumor Lysis Syndrome
3,KYPROLIS,adverse reactions,Pulmonary Toxicity
4,KYPROLIS,adverse reactions,Pulmonary Hypertension


In [50]:
# Embed every GPT-extracted term and store one embedding per row.
# Assigning a list fills the column by position, so row order must match the encode input.
gpt_embeds = model.encode(gpt_output['gpt_output'].tolist())
gpt_output['embeds'] = list(gpt_embeds)

In [51]:
# Embed every manually annotated reaction string and store one embedding per row.
# Assigning a list fills the column by position, so row order must match the encode input.
man_embeds = model.encode(manual_file['reaction_string'].tolist())
manual_file['embeds'] = list(man_embeds)

In [84]:
# Per-drug evaluation of GPT-extracted terms against the manually annotated terms.
#   TP: manual terms whose best cosine similarity to any GPT term exceeds THRESHOLD
#   FN: manual terms with no GPT term above THRESHOLD
#   FP: GPT terms with no manual term above THRESHOLD
# The same "> THRESHOLD" rule is applied on both sides, so a similarity exactly equal
# to THRESHOLD is treated as "no match" for manual and GPT terms alike.
# NOTE(review): TP counts manual terms while FP counts GPT terms, so precision mixes
# the two sides — confirm this is the intended definition.
for drug in manual_file.drug_name.unique():
    sub_man = manual_file[manual_file.drug_name == drug]
    sub_gpt = gpt_output[gpt_output.drug_name == drug]

    if len(sub_gpt) == 0:
        # No GPT terms for this drug: every manual term is a miss and there is
        # nothing to take a maximum over.
        TP, FP, FN = 0, 0, len(sub_man)
    else:
        # One (n_manual x n_gpt) similarity matrix instead of a cos_sim call per pair.
        sim_matrix = cos_sim(np.stack(sub_man.embeds.tolist()),
                             np.stack(sub_gpt.embeds.tolist())).cpu().numpy()
        man_matched = sim_matrix.max(axis=1) > THRESHOLD   # best GPT match per manual term
        gpt_matched = sim_matrix.max(axis=0) > THRESHOLD   # best manual match per GPT term
        TP = int(man_matched.sum())
        FN = int((~man_matched).sum())
        FP = int((~gpt_matched).sum())

    # Guard each ratio so a drug with no matches reports 0.0 instead of raising ZeroDivisionError.
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

    print(TP, FP, FN)
    print('precision:', precision)
    print('recall:', recall)
    print('f1:', f1)
    # NOTE(review): this break stops after the first drug, so only one drug is ever
    # scored. It looks like a debugging leftover — remove it to evaluate every drug.
    break
    

77 0 11
precision: 1.0
recall: 0.875
f1: 0.9333333333333333
