In [5]:
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
from time import sleep
from glob import glob
import ast, re
from sklearn.metrics import classification_report, roc_auc_score, f1_score, precision_score, recall_score
from bs4 import BeautifulSoup
data_folder = ''
api = ''
date = ''

### extract potential specific matching terms

In [7]:
special_patients = pd.read_csv(data_folder+'rx_drug_special_patients_raw_v0120.csv')
special_patients = special_patients[special_patients.special_patients.notna()]
special_patients['special_patients'] = special_patients['special_patients'].apply(lambda x: BeautifulSoup(x, 'html.parser'))
print(special_patients.shape[0])
special_patients.head()

10513


Unnamed: 0,japic_code,special_patients
2,55997,"[[, [9. 特定の背景を有する患者に関する注意], , , [[9.1　合併症・既往歴等..."
16,54356,"[[, [9. 特定の背景を有する患者に関する注意], , , [[9.1　合併症・既往歴等..."
17,54345,"[[, [9. 特定の背景を有する患者に関する注意], , , [[9.1　合併症・既往歴等..."
18,54344,"[[, [9. 特定の背景を有する患者に関する注意], , , [[9.1　合併症・既往歴等..."
20,54341,"[[, [9. 特定の背景を有する患者に関する注意], , , [[9.1　合併症・既往歴等..."


In [None]:
subsections = []
for i, row in tqdm(special_patients.iterrows()):
  japic_code = row['japic_code']
  text = row['special_patients']
  content_blocks = list(zip(text.find_all(class_='contents-title'), text.find_all('div', class_='contents-block')))
  for block in content_blocks:
    if len(block[1].find_all(class_='contents-title')) == 0:
      subsections.append([japic_code, block[0], block[1]])
subsections_df = pd.DataFrame(subsections, columns = ['japic_code', 'subtitle', 'subcontent'])
subsections_df.to_csv(data_folder+'rx_drug_special_patients_subsections_all_v0120.csv', index=False)
subsections_df.head(1)

In [35]:
subsections_df = pd.read_csv(data_folder+'rx_drug_special_patients_subsections_all_v0120.csv')
subsections_df['subtitle'] = subsections_df['subtitle'].apply(lambda x: BeautifulSoup(x, 'html.parser').text)
subsections_df['subcontent'] = subsections_df['subcontent'].apply(lambda x: BeautifulSoup(x, 'html.parser').text)

In [47]:
ped_subsections = subsections_df[(subsections_df.subtitle.str.contains('小児'))|
                                 (subsections_df.subtitle.str.contains('新生児'))|
                                 (subsections_df.subtitle.str.contains('乳児'))|
                                 (subsections_df.subtitle.str.contains('幼児'))]
print(ped_subsections.shape, ped_subsections.japic_code.nunique())
ped_subsections.to_csv(data_folder+'rx_drug_special_patients_subsections_ped_v0120.csv', index=False)

(18967, 3) 5856


In [50]:
ped_subsections = pd.read_csv(data_folder+'rx_drug_special_patients_subsections_ped_v0120.csv')
print(ped_subsections.shape, ped_subsections.japic_code.nunique())
ped_subsections = ped_subsections[(ped_subsections.subcontent.str.contains('実施していない')==False)&
                                  (ped_subsections.subcontent.str.contains('安全性は確立していない')==False)].drop_duplicates()
print(ped_subsections.shape, ped_subsections.japic_code.nunique())
ped_subsections.head(1)

(18967, 3) 5856
(1208, 3) 1103


Unnamed: 0,japic_code,subtitle,subcontent
2,54356,9.7　小児等,長期連用により発育障害をきたすおそれがある。


## Extract and Exact Match w/MedDRA terms

In [52]:
#read in meddra jp database
externals = '/external_data/'
umls_map = pd.read_csv(externals+'umls_meddra_jp.csv')
umls_map['STR'] = umls_map.STR.apply(lambda x: x.lower())
umls_map = umls_map[['STR', 'TTY', 'SDUI']].drop_duplicates(subset='STR')
#umls_map = umls_map[umls_map.TTY.isin(['PT', 'LLT'])]
umls_map_dict = dict(zip(umls_map.STR, umls_map.SDUI))
umls_map.head(1)

Unnamed: 0,STR,TTY,SDUI
0,急性腹症,PT,10000647


In [54]:
exact_terms = []
for i, row in tqdm(ped_subsections.iterrows()):
  label_id = row['japic_code']
  title = row['subtitle']
  text = ''.join(row['subcontent']).lower()
  found_terms = list()
  for mdr_term in umls_map_dict.keys():
    if text.find(mdr_term) == -1:
      continue
    else:
      li = text.split(mdr_term)
      start_pos = 0
      for i in range(len(li)-1):
          # the occurrence of the word is at the end of the previous string
          start_pos = sum([len(li[j]) for j in range(i+1)]) + i*len(mdr_term)
          if not mdr_term == text[start_pos:(start_pos+len(mdr_term))]:
              raise Exception(f" mdr_term: '{mdr_term}', term_in_text: '{text[start_pos:(start_pos+len(mdr_term))]}'")
          found_terms.append((mdr_term, umls_map_dict[mdr_term], start_pos, len(mdr_term)))
  exact_terms.append([label_id, found_terms])

1208it [00:53, 22.52it/s]


In [55]:
umls_en = pd.read_csv(externals+'umls_meddra_en.csv')[['STR', 'TTY', 'SDUI']]
umls_en['STR'] = umls_en.STR.apply(lambda x: x.lower())
#umls_en = umls_en[['STR', 'TTY', 'SDUI']].drop_duplicates(subset='STR')#.drop_duplicates(subset='SDUI')
#if there is a PT for the code, keep it, if not, use an LLT term
umls_en.head(1)

Unnamed: 0,STR,TTY,SDUI
0,acute abdomen,PT,10000647


In [61]:
exact_terms_df = pd.DataFrame(exact_terms, columns=['label_id', 'found_terms'])
exact_terms_df = exact_terms_df.explode('found_terms')
exact_terms_df['found_term'] = exact_terms_df['found_terms'].apply(lambda x: x[0] if str(x) != 'nan' else None)
exact_terms_df['meddra_id'] = exact_terms_df['found_terms'].apply(lambda x: x[1] if str(x) != 'nan' else None)
exact_terms_df['location'] = exact_terms_df['found_terms'].apply(lambda x: x[2] if str(x) != 'nan' else None)
exact_terms_df['len'] = exact_terms_df['found_terms'].apply(lambda x: x[3] if str(x) != 'nan' else None)
exact_terms_df = exact_terms_df.drop('found_terms', axis = 1)
umls_filtered = umls_en[umls_en.TTY == 'PT']
exact_terms_df = exact_terms_df.merge(umls_filtered[['STR', 'SDUI']], left_on = 'meddra_id', right_on = 'SDUI', how = 'left')
exact_terms_df.to_csv(data_folder+'special_patients/ped_patients_parsed_exact_extract_v0120.csv', index=False)
exact_terms_df.head()

Unnamed: 0,label_id,found_term,meddra_id,location,len,STR,SDUI
0,54356,発育障害,10012559.0,7.0,4.0,developmental delay,10012559.0
1,54327,エナメル質形成不全,10044041.0,68.0,9.0,tooth hypoplasia,10044041.0
2,54327,発育不全,10016165.0,86.0,4.0,failure to thrive,10016165.0
3,54327,形成不全,10002961.0,73.0,4.0,aplasia,10002961.0
4,54326,アシドーシス,10000486.0,79.0,6.0,acidosis,10000486.0


---
## Translate Text
We will use gpt-3.5-turbo to translate the japanese drug label sentences. To aid the translation, we use the english MedDRA terms for the corresponding japanese MedDRA term.

In [62]:
filtered_ped_df = pd.read_csv(data_folder+'special_patients/ped_patients_parsed_exact_extract_v0120.csv')
filtered_ped_df.head(1)

Unnamed: 0,label_id,found_term,meddra_id,location,len,STR,SDUI
0,54356,発育障害,10012559.0,7.0,4.0,developmental delay,10012559.0


In [None]:
#implement openai key
openai_api = ''
!pip install openai -q
import os
import openai
from openai import OpenAI
openai.organization = ""
openai.api_key = openai_api

In [None]:
# function for running GPT
def extract_ade_terms(gpt_model, prompt, text, openai_api):
  client = OpenAI(api_key=openai_api,)
  chat_completion = client.chat.completions.create(
      messages=[
          {"role": "system", "content": "You are an expert in pharmacology."},
          {
              "role": "user",
              "content": prompt.format(text)
          }
      ],
      model=gpt_model,
  )
  term = chat_completion.choices[0].message.content
  return term

In [None]:
translations = []
for i, row in tqdm(filtered_ped_df.iterrows()):
  text = row['content']
  if row['japic_code'] in exact_terms_df.label_id:
    translation_keys = exact_terms_df[exact_terms_df.label_id == row['japic_code']]
    en_w = ','.join(translation_keys.STR.tolist())
    jp_w = ','.join(translation_keys.found_term.tolist())
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
          {"role": "user", "content": "You will translate the following excerpt from a drug label, but using the exact translation for the following words."\
          "The text is {t}, and the translation is {en_w} for {jp_w} respectively".format(t=text, en_w=en_w, jp_w=jp_w)}
        ]
      )
  else:
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": "You will translate the following excerpt from a drug label"\
             "The text is {t}.".format(t=text)}
            ])
  sleep(0.5)
  translations.append(completion.choices[0].message['content'])
filtered_ped_df['translation'] = translations
filtered_ped_df.to_csv(data_folder+'special_patients/ped_patients_parsed_translation_extract.csv', index=False)

## run OnSIDES method

In [9]:
ade_df = pd.read_csv(data_folder+'ade/rx_raw_ped_ade_translated.csv')
ade_df.head(1)

Unnamed: 0,drug,content,gpt_output
0,54356,長期連用により発育障害をきたすおそれがある。,There is a possibility of causing growth disor...


In [10]:
f = '/'
meddra_df = pd.read_csv(f+'external_data/umls_meddra_en.csv')
meddra_df['STR'] = meddra_df.STR.apply(lambda x: x.lower())
meddra_df['len'] = meddra_df.STR.apply(lambda x: len(x))
print(meddra_df.shape[0])
meddra_df = meddra_df[(meddra_df['len'] >= 5)]
meddra_df = meddra_df[(meddra_df.TTY.isin(['PT', 'LLT']))]
print(meddra_df.shape[0])
meddra_dict = dict(zip(meddra_df.STR, meddra_df.SDUI))
meddra_df.head(1)

116807
100897


Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF,len
0,C0000727,ENG,P,L0000727,VCW,S0584932,N,A0639292,,,10000647,MDR,PT,10000647,acute abdomen,3,N,256.0,13


In [12]:
app_data = []
meddra_names = meddra_df.STR.unique().tolist()
for ade_text in tqdm(ade_df.gpt_output.tolist()):
  if str(ade_text) != 'nan':
    ade_text = ade_text.lower()
    meddra_found_terms = []
    #iterate through list of meddra concepts, if found - add to list
    for concept_name in meddra_names:
      if ade_text.find(concept_name) == -1:
        continue
      else:
        i = ade_text.index(concept_name)
        meddra_found_terms.append([i, concept_name, meddra_dict[concept_name]])
    app_data.append([ade_text, meddra_found_terms])

100%|██████████| 1036/1036 [00:40<00:00, 25.55it/s]


In [15]:
app_data_df = pd.DataFrame(app_data, columns = ['string', 'list'])
app_data_df = app_data_df.explode('list')
app_data_df['location'] = app_data_df['list'].apply(lambda x: x[0] if str(x) != 'nan' else None)
app_data_df['found_term'] = app_data_df['list'].apply(lambda x: x[1] if str(x) != 'nan' else None)
app_data_df['meddra_id'] = app_data_df['list'].apply(lambda x: x[2] if str(x) != 'nan' else None)
app_data_df = app_data_df.drop(['list'], axis = 1)
app_data_df.to_csv(data_folder+'ade/rx_raw_ped_ade_matched_terms.csv', index=False)
app_data_df.head(1)

Unnamed: 0,string,location,found_term,meddra_id
0,there is a possibility of causing growth disor...,34.0,growth disorder,10081945.0


In [17]:
building_strings = []
app_data_df = app_data_df.rename(columns = {'string':'ar_text'})
for i, row in tqdm(app_data_df.iterrows()):
  if str(row['location']) != 'nan':
    term, start_pos, ar_text = row['found_term'], int(row['location']), row['ar_text']
    #default settings
    nwords, prop_before = 125, 0.125
    #pull the full text
    #ar_text = ade_text_table_dict[label_id]

    term_nwords = len(term)
    size_before = max(int((nwords-2*term_nwords)*prop_before), 1)
    size_after = max(int((nwords-2*term_nwords)*(1-prop_before)), 1)

    before_text = ar_text[:start_pos]
    after_text = ar_text[(start_pos+term_nwords):]

    before_parts = before_text.split()[-1*size_before:]
    after_parts = after_text.split()[:size_after]

    li = [term]
    li.extend(before_parts)
    li.append('EVENT')
    li.extend(after_parts)
    example_string = ' '.join(li)
    building_strings.append(example_string)
  else:
    building_strings.append(None)
app_data_df['string'] = building_strings
app_data_df.to_csv(data_folder+'ade/rx_raw_ped_ade_matched_terms.csv', index=False)

3093it [00:00, 8343.77it/s]


---

In [33]:
data_folder = ''
ade_df = pd.read_csv(data_folder+'ade/rx_raw_ped_ade_translated.csv')
ade_df['ar_text'] = ade_df['gpt_output'].apply(lambda x: x.lower())
app_data_df = pd.read_csv(data_folder+'ade/rx_raw_ped_ade_matched_terms.csv')
exact_terms_df = ade_df.merge(app_data_df, how='left', on='ar_text')
exact_terms_df['section'] = 'AR'
exact_terms_df['set_id'] = exact_terms_df['drug']
folder = '/external_data/'
llt_pt = pd.read_csv(folder+'meddra_llt_pt_map.txt', delimiter = '|')
llt_pt_id_dict = dict(zip(llt_pt.llt_concept_code, llt_pt.pt_concept_code))
llt_pt_term_dict = dict(zip(llt_pt.llt_concept_code, llt_pt.pt_concept_name))
exact_terms_df['pt_meddra_id'] = exact_terms_df.meddra_id.apply(lambda x: llt_pt_id_dict[x] if x in llt_pt_id_dict.keys() else None)
exact_terms_df['pt_meddra_term'] =  exact_terms_df.meddra_id.apply(lambda x: llt_pt_term_dict[x] if x in llt_pt_term_dict.keys() else None)
print(exact_terms_df.shape)
exact_terms_df = exact_terms_df.drop(columns = ['gpt_output', 'ar_text'])
exact_terms_df.to_csv(data_folder+'ped/sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv', index=False)
exact_terms_df.head()

(4327, 12)


Unnamed: 0,drug,content,location,found_term,meddra_id,string,section,set_id,pt_meddra_id,pt_meddra_term
0,54356,長期連用により発育障害をきたすおそれがある。,34.0,growth disorder,10081945.0,growth disorder there is a possibility of caus...,AR,54356,10081945.0,Growth disorder
1,54327,他の薬剤が使用できないか、無効の場合にのみ適用を考慮すること。小児等（特に歯牙形成期にある8...,185.0,tooth discoloration,10044032.0,tooth discoloration the age of 8 during tooth ...,AR,54327,10044032.0,Tooth discolouration
2,54326,9.7.2　特に必要とする場合には慎重に投与すること。外国において、ベンジルアルコールの静脈...,136.0,acidosis,10000486.0,"acidosis necessary, administer with caution. t...",AR,54326,10000486.0,Acidosis
3,54326,9.7.2　特に必要とする場合には慎重に投与すること。外国において、ベンジルアルコールの静脈...,192.0,infant,10021731.0,infant reports of toxicity symptoms (labored b...,AR,54326,10021731.0,Infant
4,54326,9.7.2　特に必要とする場合には慎重に投与すること。外国において、ベンジルアルコールの静脈...,146.0,seizures,10039906.0,seizures administer with caution. there have b...,AR,54326,10039906.0,Seizure


In [6]:
exact_terms_df = pd.read_csv(data_folder+'ped/sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv')
exact_terms_df = exact_terms_df[exact_terms_df.string.notna()]
exact_terms_df.to_csv(data_folder+'ped/sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv', index=False)

### run onsides model

In [None]:
#prep onsides materials
folder = 'onsides'
ar_model = folder + '/bestepoch-bydrug-PMB_14-AR-125-all_222_24_25_2.5e-05_256_32.pth'
bw_model = folder + '/bestepoch-bydrug-PMB_14-AR-125-all_222_24_25_2.5e-05_256_32.pth'
import os
os.chdir('')
!python3 -m pip install -r requirements.txt

In [7]:
#use onsides model
data_folder = 'data/'
f = data_folder+'ped/sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv'
!python3 src/predict.py --model $ar_model --examples $f

2024-01-27 02:36:56.953512: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-27 02:36:57.007245: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-27 02:36:57.007294: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-27 02:36:57.009020: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-27 02:36:57.017330: I tensorflow/core/platform/cpu_feature_guar