In [2]:
import os
import time
import json

import concurrent
import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim


In [3]:
manual_file = 'data/TAC2017/test_drug_label_text_manual_ades.csv'
manual_ades = pd.read_csv(manual_file)
manual_ades.head()

Unnamed: 0,drug_name,section_id,reaction_string,meddra_pt,meddra_pt_id,meddra_llt,meddra_llt_id,section_name,section,str,discontinuous_term,negated_term,hypothetical_term,meddra_exact_term
0,IMPAVIDO,S1,nausea,Nausea,10028813.0,,,adverse reactions,S1,nausea,0,0.0,0.0,1
1,IMPAVIDO,S1,vomiting,Vomiting,10047700.0,,,adverse reactions,S1,vomiting,0,0.0,0.0,1
2,IMPAVIDO,S1,diarrhea,Diarrhoea,10012735.0,Diarrhea,10012727.0,adverse reactions,S1,diarrhea,0,0.0,0.0,1
3,IMPAVIDO,S1,headache,Headache,10019211.0,,,adverse reactions,S1,headache,0,0.0,0.0,1
4,IMPAVIDO,S1,decreased appetite,Decreased appetite,10061428.0,,,adverse reactions,S1,decreased appetite,0,0.0,0.0,1


In [4]:
# if using embeddings -- run this once:
# get embeddings for manual annotation --- this part is slow -- but should take <5 min
embed_model_name = 'llmrails/ember-v1'
embed_model = SentenceTransformer(embed_model_name)
man_embeds = embed_model.encode(manual_ades['reaction_string'].tolist())
manual_ades['react_embeds'] = list(man_embeds)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [14]:
for val in manual_ades['meddra_pt'].dropna():
    if type(val) != type(''):
        print(val)

In [None]:
man_embeds_meddra = embed_model.encode(manual_ades['meddra_pt'].dropna().tolist())

In [26]:
manual_meddra = manual_ades
manual_meddra.dropna(subset=['meddra_pt'], inplace=True)
manual_meddra['meddra_embed'] = list(man_embeds_meddra)

In [34]:
manual_ades['react_embeds'] = list(man_embeds)

In [29]:
generative_results = pd.read_csv('results/extract/OpenAI_gpt-4-1106-preview_gpt-written-prompt_pharmexpert-v0_temp0_test_run0.csv')
generative_results.head()
output = generative_results[['drug_name', 'section_name', 'gpt_output']]
output['gpt_output'] = output['gpt_output'].str.lower().str.replace('.', '').str.replace('\n-', ', ').str.split(', ')
output = output.explode('gpt_output').reset_index(drop = True).drop_duplicates()
output['gpt_output'] = output['gpt_output'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['gpt_output'] = output['gpt_output'].str.lower().str.replace('.', '').str.replace('\n-', ', ').str.split(', ')


In [30]:
gpt_embeds = embed_model.encode(output['gpt_output'].tolist())
output['extract_embeds'] = list(gpt_embeds)

In [31]:
output.head()

Unnamed: 0,drug_name,section_name,gpt_output,extract_embeds
0,IMPAVIDO,adverse reactions,nausea,"[0.36169887, 0.0702219, -0.45790783, 0.2367041..."
1,IMPAVIDO,adverse reactions,vomiting,"[-0.08356379, -0.042075437, -0.27159053, 0.207..."
2,IMPAVIDO,adverse reactions,diarrhea,"[-0.74565417, -0.39410535, 0.030682098, 0.1229..."
3,IMPAVIDO,adverse reactions,headache,"[-0.21452497, -0.23164745, -0.0380048, 0.42433..."
4,IMPAVIDO,adverse reactions,decreased appetite,"[-0.2951049, -0.5751922, -0.9559081, 0.7851779..."


In [73]:
count = 0
results = list()
for iter, row in output.iterrows():
    man_df = manual_ades[(manual_ades['drug_name'] == row.drug_name) &
                          (manual_ades['section_name'] == row.section_name)]
    if man_df.shape[0] == 0:
        print(f"no manual annotations for {row.drug_name} - {row.section_name}")
        continue
    sims = [float(cos_sim(row.extract_embeds, man_emb)) for man_emb in man_df.react_embeds]
    results.append([row.drug_name, row.section_name, row.gpt_output,
                    max(sims),
                    man_df['reaction_string'].iloc[np.argmax(sims)],
                    man_df['discontinuous_term'].iloc[np.argmax(sims)],
                    man_df['negated_term'].iloc[np.argmax(sims)],
                    man_df['hypothetical_term'].iloc[np.argmax(sims)],
                    man_df['meddra_exact_term'].iloc[np.argmax(sims)]])
    count += 1

no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual ann

In [76]:
pd.DataFrame(results, columns = ['drug_name', 'section_name', 'gpt_output', 'similarity',
                                 'manual_output',
       'discontinuous_term', 'negated_term', 'hypothetical_term',  'meddra_exact_term']
       ).to_csv('results/embed_cosine_similarity.csv', index = False)

In [27]:
manual_meddra.head()

Unnamed: 0,drug_name,section_id,reaction_string,meddra_pt,meddra_pt_id,meddra_llt,meddra_llt_id,section_name,section,str,discontinuous_term,negated_term,hypothetical_term,meddra_exact_term,react_embeds,meddra_embed
0,IMPAVIDO,S1,nausea,Nausea,10028813.0,,,adverse reactions,S1,nausea,0,0.0,0.0,1,"[0.36169887, 0.0702219, -0.45790783, 0.2367041...","[0.36169887, 0.0702219, -0.45790783, 0.2367041..."
1,IMPAVIDO,S1,vomiting,Vomiting,10047700.0,,,adverse reactions,S1,vomiting,0,0.0,0.0,1,"[-0.08356379, -0.042075437, -0.27159053, 0.207...","[-0.08356379, -0.042075437, -0.27159053, 0.207..."
2,IMPAVIDO,S1,diarrhea,Diarrhoea,10012735.0,Diarrhea,10012727.0,adverse reactions,S1,diarrhea,0,0.0,0.0,1,"[-0.74565417, -0.39410535, 0.030682098, 0.1229...","[-0.18300593, -0.011852082, -0.31014004, 0.226..."
3,IMPAVIDO,S1,headache,Headache,10019211.0,,,adverse reactions,S1,headache,0,0.0,0.0,1,"[-0.21452497, -0.23164745, -0.0380048, 0.42433...","[-0.21452497, -0.23164745, -0.0380048, 0.42433..."
4,IMPAVIDO,S1,decreased appetite,Decreased appetite,10061428.0,,,adverse reactions,S1,decreased appetite,0,0.0,0.0,1,"[-0.2951049, -0.5751922, -0.9559081, 0.7851779...","[-0.2951049, -0.5751922, -0.9559081, 0.7851779..."


In [32]:
count = 0
results = list()
for iter, row in output.iterrows():
    man_df = manual_meddra[(manual_meddra['drug_name'] == row.drug_name) &
                          (manual_meddra['section_name'] == row.section_name)]
    if man_df.shape[0] == 0:
        print(f"no manual annotations for {row.drug_name} - {row.section_name}")
        continue
    sims = [float(cos_sim(row.extract_embeds, man_emb)) for man_emb in man_df.meddra_embed]
    results.append([row.drug_name, row.section_name, row.gpt_output,
                    max(sims),
                    man_df['meddra_pt'].iloc[np.argmax(sims)],
                    man_df['discontinuous_term'].iloc[np.argmax(sims)],
                    man_df['negated_term'].iloc[np.argmax(sims)],
                    man_df['hypothetical_term'].iloc[np.argmax(sims)],
                    man_df['meddra_exact_term'].iloc[np.argmax(sims)]])
    count += 1

no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual annotations for ACTEMRA - all-concat
no manual ann

In [33]:
pd.DataFrame(results, columns = ['drug_name', 'section_name', 'gpt_output', 'similarity',
                                 'manual_output',
       'discontinuous_term', 'negated_term', 'hypothetical_term',  'meddra_exact_term']
       ).to_csv('results/embed_meddra_cosine_similarity.csv', index = False)