In [3]:
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
from glob import glob
import ast, json
from bs4 import BeautifulSoup
#specify home folder (all sub-folders, scripts, data in this folder)
external_data_folder = ''
data_folder = ''
api = ''
label_folder = data_folder+'data/raw/'

In [None]:
#implement openai key
openai_api = ''
import os
import openai
openai.organization = ""
openai.api_key = openai_api

In [4]:
##read in table for all drugs
kegg_df = pd.read_csv(data_folder+'data/kegg_rx_drug_data.csv')
print(kegg_df.shape[0])
kegg_df.head(1)

13382


Unnamed: 0,product,ingredient,indication,kegg_drug_id,kegg_product_id
0,オファコルカプセル50mg\n \n \n(レクメド),コール酸;Cholic Acid,先天性胆汁酸代謝異常症治療薬,,70830


## 1 - (Rx/OTC) Rule-based Extraction

### 1.1 - Rx (tabular, rule-based extraction)

input : rx_drug_ade_raw.csv\
output : rx_raw_ade_extraction.csv\
method : we use the meddra and msh japanese maps to directly extract exact-matched terms. for the direct extraction from tabular data, we can be reasonably confident. (TODO : better filtering algorithm)

In [5]:
ades_df = pd.read_csv(data_folder+'data/rx_drug_ade_raw.csv')
print(ades_df.shape, len(ades_df.japic_code.unique().tolist()))
ades_df.head(1)

(99132, 3) 11027


Unnamed: 0,japic_code,tags,ade
0,70793,"('過敏症', '1％未満')",発疹


In [6]:
#because we have strings of data table cells, we just look into the unique cell strings. (there is a lot of overlap)
unique_ades = ades_df[['ade']].drop_duplicates()
print(unique_ades.shape)
unique_ades.head(1)

(27151, 1)


Unnamed: 0,ade
0,発疹


In [None]:
#prep for mapping
#read in the meddra map file
umls_map = pd.read_csv(external_data_folder+'umls_meddra_jp.csv')
umls_mjp_str_sdui = dict(zip(umls_map.STR, umls_map.SDUI)) #dict of meddra str -> sdui
#read in the mesh-jp map file
umls_mshjp = pd.read_csv(external_data_folder+'umls_mshjpn.csv')
umls_mshjp = umls_mshjp[['CUI', 'STR']].merge(umls_map[['CUI', 'SDUI']], on = 'CUI', how = 'inner')
umls_mshjp_str_cui = dict(zip(umls_mshjp.STR, umls_mshjp.SDUI)) #dict of mesh-jp -> meddra sdui
umls_map.head(1)

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF
0,C0000727,JPN,P,L3471380,PF,S3998874,N,A11418933,,,10000647,MDRJPN,PT,10000647,急性腹症,3,N,


In [None]:
meddra_found_ades = []
mesh_found_ades = []
meddra_names = umls_map.STR.unique().tolist()
mesh_names = umls_mshjp.STR.unique().tolist()

for ade_text in tqdm(unique_ades.ade.tolist()):
  ar_text = ade_text
  meddra_found_terms = []
  mesh_found_terms = []
  #iterate through list of meddra concepts, if found - add to list
  for concept_name in meddra_names:
    if ar_text.find(concept_name) == -1:
      continue
    else:
      meddra_found_terms.append(concept_name)
  #check to see if term is in MeSH too (extra coverage)
  for concept_name in mesh_names:
    if ar_text.find(concept_name) == -1:
      continue
    else:
      mesh_found_terms.append(concept_name)
  mesh_found_ades.append(mesh_found_terms)
  meddra_found_ades.append(meddra_found_terms)

unique_ades['meddra_ade_list'] = meddra_found_ades
unique_ades['mesh_ade_list'] = mesh_found_ades
unique_ades['ade_combination_num'] = unique_ades.ade_list.apply(lambda x: len(x) if x!= None else None)
unique_ades.to_csv(data_folder+'data/ade/rx_raw_ade_extraction.csv', index=False)
unique_ades.head()

Unnamed: 0,ade,ade_list,ade_combination_num,meddra_ade_list,mesh_ade_list
0,発疹,['発疹'],6,[発疹],[発疹]
1,そう痒,['そう痒'],7,[そう痒],[そう痒]
2,悪心、嘔吐、下痢、腹部不快感,"['下痢', '悪心', '嘔吐', '腹部不快感', '不快感']",34,"[下痢, 悪心, 嘔吐, 腹部不快感, 不快感]","[下痢, 悪心, 嘔吐]"
3,頭痛,['頭痛'],6,[頭痛],"[頭痛, 痛]"
4,脂質異常症,"['脂質異常症', '脂質', '脂質異常']",23,"[脂質異常症, 脂質, 脂質異常]",[]


---

## 2. (Rx/OTC) OnSIDES method based extraction

some of the strings in both the free-text and table cells aren't MedDRA terms and are more complicated. We use the method we used to extract terms from fda labels for these strings. However, the labels are in Japanese - we first translate them using GPT-3.5-turbo, and then run the method on the translated strings.

### 2.0 - (Rx) extracting strings that need the onsides extraction

In [None]:
unique_ades = pd.read_csv(data_folder+'data/ade/rx_raw_ade_extraction.csv')
unique_ades['meddra_ade_list'] = unique_ades['meddra_ade_list'].apply(lambda x: ast.literal_eval(x))
unique_ades['mesh_ade_list'] = unique_ades['mesh_ade_list'].apply(lambda x: ast.literal_eval(x))
unique_ades['ade_combination_num'] = unique_ades.apply(lambda x: len(x.meddra_ade_list) + len(x.mesh_ade_list), axis = 1)
unique_ades = unique_ades.drop('ade_list', axis = 1)
unique_ades.head(1)

Unnamed: 0,ade,ade_combination_num,meddra_ade_list,mesh_ade_list
0,発疹,2,[発疹],[発疹]


In [None]:
#we will only do translation extraction for the text with no terms extracted for now
gpt_extraction = unique_ades[unique_ades.ade_combination_num == 0][['ade']]
gpt_extraction.head(1)

Unnamed: 0,ade
9,下垂体・副腎皮質系機能の抑制


### 2.1 translate terms to english using GPT

In [None]:
gpt_translation = []
for word in tqdm(gpt_extraction.ade.tolist()):
  completion = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=[
        {"role": "user", "content": "Translate the following text including medical terms into English."\
        "Reply with only the translation. The word is {}".format(word)}
      ]
    )
  term = completion.choices[0].message['content']
  gpt_translation.append(term)
gpt_extraction['translation'] = gpt_translation

100%|██████████| 224/224 [04:02<00:00,  1.08s/it]


In [None]:
gpt_extraction.to_csv(data_folder+'data/ade/rx_raw_ade_gpt_extraction.csv', index=False)

### 2.2 extract meddra terms from translation string

In [None]:
gpt_extraction = pd.read_csv(data_folder+'data/ade/rx_raw_ade_gpt_extraction.csv')
gpt_extraction.head(1)

Unnamed: 0,ade,translation
0,下垂体・副腎皮質系機能の抑制,Suppression of pituitary-adrenal cortical syst...


In [None]:
#read in the meddra map file
umls_map = pd.read_csv(external_data_folder+'umls_meddra_en.csv')
umls_map['STR'] = umls_map.STR.apply(lambda x: x.lower())
umls_men_str_sdui = dict(zip(umls_map.STR, umls_map.SDUI)) #dict of meddra str -> sdui
meddra_names = umls_map.STR.tolist()
umls_map.head(1)

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF
0,C0000727,ENG,P,L0000727,VCW,S0584932,N,A0639292,,,10000647,MDR,PT,10000647,acute abdomen,3,N,256.0


In [None]:
app_data = []
for ade_text in tqdm(gpt_extraction.translation.tolist()):
  ade_text = ade_text.lower()
  meddra_found_terms = []
  #iterate through list of meddra concepts, if found - add to list
  for concept_name in meddra_names:
    if ade_text.find(concept_name) == -1:
      continue
    else:
      i = ade_text.index(concept_name)
      meddra_found_terms.append([i, concept_name, umls_men_str_sdui[concept_name]])
  app_data.append([ade_text, meddra_found_terms])

100%|██████████| 224/224 [00:05<00:00, 37.64it/s]


In [None]:
app_data_df = pd.DataFrame(app_data, columns = ['string', 'list'])
app_data_df = app_data_df.explode('list')
app_data_df['index'] = app_data_df['list'].apply(lambda x: x[0] if str(x) != 'nan' else None)
app_data_df['found_term'] = app_data_df['list'].apply(lambda x: x[1] if str(x) != 'nan' else None)
app_data_df['meddra_id'] = app_data_df['list'].apply(lambda x: x[2] if str(x) != 'nan' else None)
app_data_df = app_data_df.drop(['list'], axis = 1)
app_data_df.to_csv(data_folder+'data/ade/rx_raw_ade_gpt_onsides_app.csv', index=False)
app_data_df.head(1)

Unnamed: 0,string,index,found_term,meddra_id
0,suppression of pituitary-adrenal cortical syst...,46.0,em,10015218.0


### 2.3 format to be used for onsides prediction, run onsides

In [None]:
app_data_df = pd.read_csv(data_folder+'data/ade/rx_raw_ade_gpt_onsides_app.csv')
app_data_df['term_len'] = app_data_df.found_term.apply(lambda x: len(x) if str(x) != 'nan' else None)
app_data_df = app_data_df[app_data_df.term_len >= 5]
app_data_df['section'] = 'AR'
app_data_df['drug'] = 'KEGG DRUG'
app_data_df['label_id'] = 'KEGG'
app_data_df['set_id'] = 'KEGG'
app_data_df['spl_version'] = 'v0609'
app_data_df['source_method'] = 'GPT'
app_data_df.head(1)

Unnamed: 0,string,index,found_term,meddra_id,term_len,section,drug,label_id,set_id,spl_version,source_method
5,suppression of pituitary-adrenal cortical syst...,27.0,renal,10038359.0,5.0,AR,KEGG DRUG,KEGG,KEGG,v0609,GPT


In [None]:
meddra_llt_pt = pd.read_csv(external_data_folder+'meddra_llt_pt_map.txt', delimiter = '|')
meddra_llt_pt_name_dict = dict(zip(meddra_llt_pt.llt_concept_name, meddra_llt_pt.pt_concept_name))
meddra_pt_pt_name_dict = dict(zip(meddra_llt_pt.pt_concept_name, meddra_llt_pt.pt_concept_name))
meddra_llt_pt_code_dict = dict(zip(meddra_llt_pt.llt_concept_name, meddra_llt_pt.pt_concept_code))
meddra_pt_pt_code_dict = dict(zip(meddra_llt_pt.pt_concept_name, meddra_llt_pt.pt_concept_code))
app_data_df['pt_meddra_term'] = app_data_df['found_term'].apply(lambda x: meddra_pt_pt_name_dict[x] if x in meddra_pt_pt_name_dict.keys() \
                                                                else (meddra_llt_pt_name_dict[x] if x in meddra_llt_pt_name_dict.keys() else \
                                                                      None) )
app_data_df['pt_meddra_id'] = app_data_df['found_term'].apply(lambda x: meddra_pt_pt_code_dict[x] if x in meddra_pt_pt_code_dict.keys() \
                                                                else (meddra_llt_pt_code_dict[x] if x in meddra_llt_pt_code_dict.keys() else \
                                                                      None) )
meddra_llt_pt.head(1)

Unnamed: 0,llt_concept_id,llt_concept_name,llt_concept_code,pt_concept_id,pt_concept_name,pt_concept_code
0,C0000727,acute abdomen,10000647,C0000727,acute abdomen,10000647


In [None]:
#section	drug	label_id	set_id	spl_version	meddra_id	pt_meddra_id	source_method	pt_meddra_term	found_term	string
app_data_df.to_csv(data_folder+'data/ade/rx_raw_ade_gpt_onsides_app.csv', index=False)

In [None]:
#prep to run onsides model
data = data_folder+'data/ade/rx_raw_ade_gpt_onsides_app.csv'
import os
os.chdir('')
!python3 -m pip install -r requirements.txt

In [None]:
#run onsides model
folder = ''
ar_model = folder + '/bestepoch-bydrug-PMB_14-AR-125-all_222_24_25_2.5e-05_256_32.pth'
bw_model = folder + '/bestepoch-bydrug-PMB_14-AR-125-all_222_24_25_2.5e-05_256_32.pth'
!python3 src/predict.py --model $ar_model --examples $data

### 2.4 filter onsides results, and parse into usable format

In [None]:
result_file = data_folder+'data/ade/rx_raw_ade_gpt_onsides_app_.csv'
import os
os.chdir('')
!python3 src/create_onsides_datafiles.py --release v2.0.0-AR --results $result

## 3. compile extracted ades
Reinsert unique ades into the original extracted table

In [None]:
#combine step 1 and 2 into a full table of extracted ades from the unique ades

In [None]:
#reinsert these ades into the original extracted table