In [2]:
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
from glob import glob
import ast, json
from bs4 import BeautifulSoup
#specify home folder (all sub-folders, scripts, data in this folder)
data_folder = ''
api = ''

### extract potential ades

In [4]:
content_df = pd.read_csv(data_folder+'data/drug_content.csv')
content_df = content_df[content_df.section_title.str.contains('4.8 Undesirable effects')]
content_df = content_df[content_df.section_content.notna()]
content_df.to_csv(data_folder+'data/ade/drug_ue_text.csv', index=False)
content_df.head(1)

Unnamed: 0,product_id,section_id,section_content,section_title
16058,trogarzo,4.8 Undesirable effects,4.8 Undesirable effects \n \nSummary of the s...,4.8 Undesirable effects


In [5]:
#read in the meddra map file
f = '/content/drive/MyDrive/pop_pharmacogenomics/onsides_intl/'
meddra_df = pd.read_csv(f+'external_data/umls_meddra_en.csv')
meddra_df['STR'] = meddra_df.STR.apply(lambda x: x.lower())
meddra_df['len'] = meddra_df.STR.apply(lambda x: len(x))
print(meddra_df.shape[0])
meddra_df = meddra_df[(meddra_df['len'] >= 5)]
meddra_df = meddra_df[(meddra_df.TTY.isin(['PT', 'LLT']))]
print(meddra_df.shape[0])
meddra_dict = dict(zip(meddra_df.STR, meddra_df.SDUI))
meddra_names = meddra_df.STR.tolist()
meddra_df.head(1)

116807
100897


Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF,len
0,C0000727,ENG,P,L0000727,VCW,S0584932,N,A0639292,,,10000647,MDR,PT,10000647,acute abdomen,3,N,256.0,13


In [6]:
app_data = []
for ade_text in tqdm(content_df.section_content.tolist()):
  if str(ade_text) != 'nan':
    ade_text = ade_text.lower()
    meddra_found_terms = []
    #iterate through list of meddra concepts, if found - add to list
    for concept_name in meddra_names:
      if ade_text.find(concept_name) == -1:
        continue
      else:
        i = ade_text.index(concept_name)
        meddra_found_terms.append([i, concept_name, meddra_dict[concept_name]])
    app_data.append([ade_text, meddra_found_terms])

100%|██████████| 1055/1055 [08:47<00:00,  2.00it/s]


In [7]:
app_data_df = pd.DataFrame(app_data, columns = ['string', 'list'])
app_data_df = app_data_df.explode('list')
app_data_df['index'] = app_data_df['list'].apply(lambda x: x[0] if str(x) != 'nan' else None)
app_data_df['found_term'] = app_data_df['list'].apply(lambda x: x[1] if str(x) != 'nan' else None)
app_data_df['meddra_id'] = app_data_df['list'].apply(lambda x: x[2] if str(x) != 'nan' else None)
app_data_df = app_data_df.drop(['list'], axis = 1)
app_data_df.to_csv(data_folder+'data/ade/drug_ue_text_matched_terms.csv', index=False)
app_data_df.head(1)

Unnamed: 0,string,index,found_term,meddra_id
0,4.8 undesirable effects \n \nsummary of the s...,1669,contusion,10050584


### prepare onsides input

In [16]:
text_df = pd.read_csv(data_folder+'data/ade/drug_ue_text.csv')
text_df.head(1)

Unnamed: 0,product_id,section_id,section_content,section_title
0,trogarzo,4.8 Undesirable effects,4.8 Undesirable effects \n \nSummary of the s...,4.8 Undesirable effects


In [17]:
app_data_df = pd.read_csv(data_folder+'data/ade/drug_ue_text_matched_terms.csv')
print(app_data_df.shape)
app_data_df = app_data_df.drop_duplicates()
print(app_data_df.shape)
app_data_df.head(1)

(181814, 4)
(103574, 4)


Unnamed: 0,string,index,found_term,meddra_id
0,4.8 undesirable effects \n \nsummary of the s...,1669,contusion,10050584


In [18]:
building_strings = []
#ade_text_table_dict = dict(zip(text_df.product_id, text_df.section_content))
for i, row in tqdm(app_data_df.iterrows()):
  if str(row['index']) != 'nan':
    term, start_pos, ar_text = row['found_term'], int(row['index']), row['string']
    #default settings
    nwords, prop_before = 125, 0.125
    #pull the full text
    #ar_text = ade_text_table_dict[label_id]

    term_nwords = len(term)
    size_before = max(int((nwords-2*term_nwords)*prop_before), 1)
    size_after = max(int((nwords-2*term_nwords)*(1-prop_before)), 1)

    before_text = ar_text[:start_pos]
    after_text = ar_text[(start_pos+term_nwords):]

    before_parts = before_text.split()[-1*size_before:]
    after_parts = after_text.split()[:size_after]

    li = [term]
    li.extend(before_parts)
    li.append('EVENT')
    li.extend(after_parts)
    example_string = ' '.join(li)
    building_strings.append(example_string)
  else:
    building_strings.append(None)

103574it [00:17, 5764.38it/s]


In [19]:
text_df['section_content'] = text_df['section_content'].apply(lambda x: x.lower())
app_data_df['section_content'] = app_data_df['string']
app_data_df['string'] = building_strings
exact_terms_df = text_df.merge(app_data_df, on = 'section_content', how = 'left')
exact_terms_df = exact_terms_df.drop(columns = ['section_id', 'section_content', 'section_title'])
exact_terms_df.to_csv(data_folder+'data/ade/'\
                      'sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv', index=False)
exact_terms_df.head()

Unnamed: 0,product_id,string,index,found_term,meddra_id
0,trogarzo,contusion administration site conditions fatig...,1669,contusion,10050584
1,trogarzo,"dermatitis diarrhoea, nausea, vomiting common ...",1449,dermatitis,10012431
2,trogarzo,diarrhoea the safety profi le the most frequen...,127,diarrhoea,10012735
3,trogarzo,dizziness profi le the most frequently reporte...,146,dizziness,10013573
4,trogarzo,electrocardiogram system organ class adverse r...,1185,electrocardiogram,10014362


### additional formatting

In [21]:
#required columns : section, drug, label_id, set_id, spl_version, pt_meddra_id, pt_meddra_term
exact_terms_df = pd.read_csv(data_folder+'data/ade/'\
                             'sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv')
exact_terms_df['section'] = 'AR'
exact_terms_df['set_id'] = exact_terms_df['product_id']
exact_terms_df['location'] = exact_terms_df['index']
exact_terms_df = exact_terms_df.rename(columns = {'product_id':'label_id', 'onsides_string':'string'}).drop(columns = ['index'])
print(exact_terms_df.shape)
exact_terms_df['drug'] = exact_terms_df['label_id']
folder = '/content/drive/MyDrive/pop_pharmacogenomics/fda/onsides-2.0.0/data/'
llt_pt = pd.read_csv(folder+'meddra_llt_pt_map.txt', delimiter = '|')
llt_pt_id_dict = dict(zip(llt_pt.llt_concept_code, llt_pt.pt_concept_code))
llt_pt_term_dict = dict(zip(llt_pt.llt_concept_code, llt_pt.pt_concept_name))
exact_terms_df['pt_meddra_id'] = exact_terms_df.meddra_id.apply(lambda x: llt_pt_id_dict[x] if x in llt_pt_id_dict.keys() else None)
exact_terms_df['pt_meddra_term'] =  exact_terms_df.meddra_id.apply(lambda x: llt_pt_term_dict[x] if x in llt_pt_term_dict.keys() else None)
print(exact_terms_df.shape)
exact_terms_df = exact_terms_df[exact_terms_df.string.notna()]
exact_terms_df['spl_version'] = None
print(exact_terms_df.shape)
exact_terms_df.to_csv(data_folder+'data/ade/'\
                      'sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv', index=False)
exact_terms_df.head(1)

(103574, 7)
(103574, 10)
(103574, 11)


Unnamed: 0,label_id,string,found_term,meddra_id,section,set_id,location,drug,pt_meddra_id,pt_meddra_term,spl_version
0,trogarzo,contusion administration site conditions fatig...,contusion,10050584,AR,trogarzo,1669,trogarzo,10050584.0,Contusion,


### run onsides model

In [22]:
#prep onsides materials
folder = 'onsides'
ar_model = folder + '/bestepoch-bydrug-PMB_14-AR-125-all_222_24_25_2.5e-05_256_32.pth'
bw_model = folder + '/bestepoch-bydrug-PMB_14-AR-125-all_222_24_25_2.5e-05_256_32.pth'
import os
os.chdir('onsides-3.0.0/')
!python3 -m pip install -r requirements.txt

Collecting beautifulsoup4==4.11.1 (from -r requirements.txt (line 1))
  Downloading beautifulsoup4-4.11.1-py3-none-any.whl (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.2/128.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nltk==3.7 (from -r requirements.txt (line 2))
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m64.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy==1.23.4 (from -r requirements.txt (line 3))
  Downloading numpy-1.23.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m73.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas==1.5.1 (from -r requirements.txt (line 4))
  Downloading pandas-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1

In [23]:
#use onsides model
f = data_folder+'data/ade/sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv'
!python3 src/predict.py --model $ar_model --examples $f

2024-01-28 22:18:57.469146: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-28 22:18:57.522995: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-28 22:18:57.523042: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-28 22:18:57.524725: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-28 22:18:57.532774: I tensorflow/core/platform/cpu_feature_guar