In [2]:
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
import ast, re
from time import sleep
from bs4 import BeautifulSoup
data_folder = '/content/drive/MyDrive/pop_pharmacogenomics/onsides_intl/uk_drug_label/'
external_data_folder = '/content/drive/MyDrive/pop_pharmacogenomics/onsides_intl/external_data/'
api = 'bbf40496-e20e-4f79-9636-b63d4a3deea7'
import warnings
warnings.filterwarnings('ignore')

### extract special population sections and filter for pediatric sections


In [2]:
drug_df = pd.read_csv(data_folder+'data/final_UK/drug_data.csv')
print(drug_df.shape)
drug_df.head(1)

(11450, 6)


Unnamed: 0,ingredient_id,ingredient_name,product_id,product_name,active_ingredients,company_name
0,/emc/ingredient/1524,"2,4-dichlorobenzyl alcohol",/emc/product/1589/smpc,\n Boots Antiseptic Cream\n...,"allantoin, cetrimide, 2,4-dichlorobenzyl alcohol",THE BOOTS COMPANY PLC


In [None]:
special_pop_text = []
for product in tqdm(drug_df.product_id.unique().tolist()):
  f = (data_folder+'data/raw/{}.txt'.format(product.split('/')[-2]))
  with open(f) as fi:
      s = BeautifulSoup(fi, 'html.parser')
  for i in s.find_all('details'):
      text = i.text.lower()
      if '4.3 contraindications' in text or '4.4 special warnings and precautions for use' in text or '4.6 fertility, pregnancy and lactation' in i.text:
        special_pop_text.append([product.split('/')[-2], text])

100%|██████████| 9334/9334 [48:55<00:00,  3.18it/s]


In [None]:
special_pop_df = pd.DataFrame(special_pop_text, columns = ['product_id', 'drug_text'])
special_pop_df['section_title'] = special_pop_df.drug_text.apply(lambda x: x.split('\n')[0])
special_pop_df.to_csv(data_folder+'data/drug_special_pop_data_all_raw.csv', index=False)
special_pop_df.head()

Unnamed: 0,product_id,drug_text,section_title
0,1589,4.3 contraindications\n\nhypersensitivity to a...,4.3 contraindications
1,1589,4.4 special warnings and precautions for use\n...,4.4 special warnings and precautions for use
2,4223,4.3 contraindications\n\nstrepsils children 6+...,4.3 contraindications
3,4223,4.4 special warnings and precautions for use\n...,4.4 special warnings and precautions for use
4,5606,4.3 contraindications\n\nhypersensitivity to a...,4.3 contraindications


In [None]:
special_pop_df = pd.read_csv(data_folder+'data/drug_special_pop_data_all_raw.csv')
special_pop_df['drug_text'] = special_pop_df['drug_text'].apply(lambda x: [i.strip() for i in x.split('\n') if i != ''])
print(special_pop_df.shape)
special_pop_df = special_pop_df.explode('drug_text')
print(special_pop_df.shape)
special_pop_df = special_pop_df[(special_pop_df.drug_text.str.contains('children'))|
                                (special_pop_df.drug_text.str.contains('pediatric'))]
print(special_pop_df.shape)
special_pop_df.to_csv(data_folder+'data/drug_special_pop_data_all_raw_ped_str.csv', index=False)
special_pop_df.head()

(18602, 3)
(359586, 3)
(8321, 3)


Unnamed: 0,product_id,drug_text,section_title
1,1589,keep all medicines out of the reach of children.,4.4 special warnings and precautions for use
2,4223,strepsils children 6+ lozenges are contraindic...,4.3 contraindications
3,4223,not to be given to children under 6 years.,4.4 special warnings and precautions for use
5,5606,not to be given to children under 6 years.,4.4 special warnings and precautions for use
9,625,not to be given to children under 6 years,4.4 special warnings and precautions for use


### exact string matching

In [4]:
special_pop_df = pd.read_csv(data_folder+'data/drug_special_pop_data_all_raw_ped_str.csv')
print(special_pop_df.shape[0])
unique_terms = special_pop_df[['drug_text']].drop_duplicates()
print(unique_terms.shape[0])
unique_terms.head(1)

8321
2930


Unnamed: 0,drug_text
0,keep all medicines out of the reach of children.


In [5]:
f = '/content/drive/MyDrive/pop_pharmacogenomics/onsides_intl/'
meddra_df = pd.read_csv(f+'external_data/umls_meddra_en.csv')
meddra_df['STR'] = meddra_df.STR.apply(lambda x: x.lower())
meddra_df['len'] = meddra_df.STR.apply(lambda x: len(x))
print(meddra_df.shape[0])
meddra_df = meddra_df[(meddra_df['len'] >= 5)]
meddra_df = meddra_df[(meddra_df.TTY.isin(['PT', 'LLT']))]
print(meddra_df.shape[0])
meddra_dict = dict(zip(meddra_df.STR, meddra_df.SDUI))
meddra_df.head(1)

116807
100897


Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF,len
0,C0000727,ENG,P,L0000727,VCW,S0584932,N,A0639292,,,10000647,MDR,PT,10000647,acute abdomen,3,N,256.0,13


In [None]:
found_ades = []
meddra_names = meddra_df.STR.tolist()
for ade_text in tqdm(unique_terms.drug_text.tolist()):
  ar_text = ' '.join(ade_text.split()).lower()
  found_terms = []
  for concept_name in meddra_names:
    if ar_text.find(concept_name) == -1:
      continue
    else:
      i = ar_text.index(concept_name)
      found_terms.append([i, concept_name, meddra_dict[concept_name]])
  found_ades.append([ade_text, found_terms])

100%|██████████| 2930/2930 [02:40<00:00, 18.20it/s]


In [None]:
found_ades_df = pd.DataFrame(found_ades, columns = ['string', 'list'])
found_ades_df = found_ades_df.explode('list')
found_ades_df['index'] = found_ades_df['list'].apply(lambda x: x[0] if str(x) != 'nan' else None)
found_ades_df['found_term'] = found_ades_df['list'].apply(lambda x: x[1] if str(x) != 'nan' else None)
found_ades_df['meddra_id'] = found_ades_df['list'].apply(lambda x: x[2] if str(x) != 'nan' else None)
found_ades_df = found_ades_df.drop(['list'], axis = 1).drop_duplicates()
found_ades_df = found_ades_df[found_ades_df['found_term'].notna()]
found_ades_df.to_csv(data_folder+'data/drug_special_pop_data_all_raw_ped_matched.csv', index=False)
found_ades_df.head(1)

Unnamed: 0,string,index,found_term,meddra_id
1,strepsils children 6+ lozenges are contraindic...,88.0,hypersensitivity,10020751.0


---
### format strings for onsides input

In [39]:
found_ades_df = pd.read_csv(data_folder+'data/drug_special_pop_data_all_raw_ped_matched.csv')
special_pop_df = pd.read_csv(data_folder+'data/drug_special_pop_data_all_raw_ped_str.csv')
special_pop_df = special_pop_df.merge(found_ades_df, left_on = 'drug_text', right_on = 'string', how = 'inner')
print(special_pop_df.shape[0], special_pop_df.product_id.nunique(), special_pop_df.found_term.nunique())
special_pop_df.head(1)

24278 2809 1285


Unnamed: 0,product_id,drug_text,section_title,string,index,found_term,meddra_id
0,4223,strepsils children 6+ lozenges are contraindic...,4.3 contraindications,strepsils children 6+ lozenges are contraindic...,88.0,hypersensitivity,10020751.0


In [40]:
building_strings = []
for i, row in tqdm(special_pop_df.iterrows()):
  term, label_id, start_pos = row['found_term'], row['product_id'], int(row['index'])
  #default settings
  nwords, prop_before = 125, 0.125
  #pull the full text
  ar_text = row['drug_text']
  term_nwords = len(term)
  size_before = max(int((nwords-2*term_nwords)*prop_before), 1)
  size_after = max(int((nwords-2*term_nwords)*(1-prop_before)), 1)

  before_text = ar_text[:start_pos]
  after_text = ar_text[(start_pos+term_nwords):]

  before_parts = before_text.split()[-1*size_before:]
  after_parts = after_text.split()[:size_after]

  li = [term]
  li.extend(before_parts)
  li.append('EVENT')
  li.extend(after_parts)
  example_string = ' '.join(li)
  building_strings.append(example_string)
special_pop_df['string'] = building_strings
special_pop_df.head()

24278it [00:01, 14754.20it/s]


Unnamed: 0,product_id,drug_text,section_title,string,index,found_term,meddra_id
0,4223,strepsils children 6+ lozenges are contraindic...,4.3 contraindications,hypersensitivity children 6+ lozenges are cont...,88.0,hypersensitivity,10020751.0
1,1307,"because insufficient data are available, the u...",4.4 special warnings and precautions for use,hypersensitivity in children or adolescents is...,136.0,hypersensitivity,10020751.0
2,1307,"because insufficient data are available, the u...",4.4 special warnings and precautions for use,hypersensitivity reaction or adolescents is no...,136.0,hypersensitivity reaction,10020751.0
3,334,there is a theoretical concern that treatment ...,4.4 special warnings and precautions for use,multiple sclerosis for autoimmune processes in...,163.0,multiple sclerosis,10028245.0
4,505,keep out of reach and sight of children. if sy...,4.4 special warnings and precautions for use,fructose intolerance and glucose per pastille....,185.0,fructose intolerance,10072104.0


In [41]:
#required columns : section, drug, label_id, set_id, spl_version, pt_meddra_id, pt_meddra_term
exact_terms_df = special_pop_df[['product_id','string','found_term','meddra_id']].copy().rename(columns={'product_id':'label_id'})
exact_terms_df['section'] = 'AR'
exact_terms_df['set_id'] = exact_terms_df['label_id']
exact_terms_df['spl_version'] = '0120'

folder = '/content/drive/MyDrive/pop_pharmacogenomics/fda/onsides-2.0.0/data/'
llt_pt = pd.read_csv(folder+'meddra_llt_pt_map.txt', delimiter = '|')
llt_pt_id_dict = dict(zip(llt_pt.llt_concept_code, llt_pt.pt_concept_code))
llt_pt_term_dict = dict(zip(llt_pt.llt_concept_code, llt_pt.pt_concept_name))
exact_terms_df['pt_meddra_id'] = exact_terms_df.meddra_id.apply(lambda x: llt_pt_id_dict[x] if x in llt_pt_id_dict.keys() else None)
exact_terms_df['pt_meddra_term'] =  exact_terms_df.meddra_id.apply(lambda x: llt_pt_term_dict[x] if x in llt_pt_term_dict.keys() else None)

#exact_terms_df.columns = ['section', 'string', 'label_id', 'set_id', 'spl_version', 'pt_meddra_id', 'pt_meddra_term', 'found_term', 'meddra_id']
exact_terms_df.to_csv(data_folder+'data/ped/sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv', index=False)
exact_terms_df.head(1)

Unnamed: 0,label_id,string,found_term,meddra_id,section,set_id,spl_version,pt_meddra_id,pt_meddra_term
0,4223,hypersensitivity children 6+ lozenges are cont...,hypersensitivity,10020751.0,AR,4223,120,10020751.0,Hypersensitivity


In [58]:
drug_df = drug_df[['label_id', 'drug']].drop_duplicates().groupby('label_id')['drug'].apply(set).reset_index()
drug_df['drug'] = drug_df['drug'].apply(lambda x: str(x)[1:-1].replace("'",''))
drug_df.head()

Unnamed: 0,label_id,drug
0,3,light liquid paraffin
1,4,colistimethate sodium
2,5,desferrioxamine mesilate
3,6,"clindamycin phosphate, benzoyl peroxide"
4,8,protamine sulfate


In [60]:
exact_terms_df = pd.read_csv(data_folder+'data/ped/sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv')
print(exact_terms_df.shape)
drug_df = pd.read_csv(data_folder+'data/final_UK/drug_data.csv').rename(columns={'ingredient_name':'drug'})
drug_df['label_id'] = drug_df['product_id'].apply(lambda x: int(x.split('/')[-2]))
drug_df = drug_df[['label_id', 'drug']].drop_duplicates().groupby('label_id')['drug'].apply(set).reset_index()
drug_df['drug'] = drug_df['drug'].apply(lambda x: str(x)[1:-1].replace("'",''))
exact_terms_df = exact_terms_df.merge(drug_df[['label_id', 'drug']].drop_duplicates(), on = 'label_id', how = 'left')
print(exact_terms_df.shape)
exact_terms_df.to_csv(data_folder+'data/ped/sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv', index=False)

(24278, 9)
(24278, 10)


In [62]:
exact_terms_df.head()

Unnamed: 0,label_id,string,found_term,meddra_id,section,set_id,spl_version,pt_meddra_id,pt_meddra_term,drug
0,4223,hypersensitivity children 6+ lozenges are cont...,hypersensitivity,10020751.0,AR,4223,120,10020751.0,Hypersensitivity,"amylmetacresol, 2,4-dichlorobenzyl alcohol"
1,1307,hypersensitivity in children or adolescents is...,hypersensitivity,10020751.0,AR,1307,120,10020751.0,Hypersensitivity,"lamivudine, zidovudine, abacavir sulfate"
2,1307,hypersensitivity reaction or adolescents is no...,hypersensitivity reaction,10020751.0,AR,1307,120,10020751.0,Hypersensitivity,"lamivudine, zidovudine, abacavir sulfate"
3,334,multiple sclerosis for autoimmune processes in...,multiple sclerosis,10028245.0,AR,334,120,10028245.0,Multiple sclerosis,abatacept
4,505,fructose intolerance and glucose per pastille....,fructose intolerance,10072104.0,AR,505,120,10072104.0,Fructose intolerance,"pine oil sylvestris, menthol, abietis oil"


### run onsides

In [None]:
#prep onsides materials
folder = '/content/drive/MyDrive/pop_pharmacogenomics/onsides_intl/onsides'
ar_model = folder + '/bestepoch-bydrug-PMB_14-AR-125-all_222_24_25_2.5e-05_256_32.pth'
bw_model = folder + '/bestepoch-bydrug-PMB_14-AR-125-all_222_24_25_2.5e-05_256_32.pth'
import os
os.chdir('/content/drive/MyDrive/pop_pharmacogenomics/fda/onsides-2.0.0/')
!python3 -m pip install -r requirements.txt

In [None]:
#use onsides model
f = data_folder+'data/ped/sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv'
!python3 src/predict.py --model $ar_model --examples $f

In [61]:
#build tables
r = data_folder+'data/ped/bestepoch-bydrug-PMB-sentences-rx_ref14-AR-125-all_222_24_25_2.5e-05_256_32.csv.gz'
!python3 src/create_onsides_datafiles.py --release v2.0.0-AR --results $r --examples $f

 prefix: bestepoch-bydrug-PMB-sentences-rx
 refset: ref14-AR-125-all
 np_random_seed: 222
 split_method: 24
 EPOCHS: 25
 LR: 2.5e-05
 threshold: 0.4633
 max_length: 2.5e-05
 batch-size: 32.csv
 compiled file: /content/drive/MyDrive/pop_pharmacogenomics/onsides_intl/uk_drug_label/data/ped/compiled/v2.0.0/AR.csv.gz
Loading results file...
 res.shape: (24278, 2)
Loding examples file...
 ex.shape: (24278, 10)
Concatenating results file to examples file...
Grouping predictions by drug label and adverse event term, and taking the mean prediction score...
  df_grouped = df.groupby(by=['section', 'drug', 'label_id', 'set_id', 'spl_version', 'pt_meddra_id', 'pt_meddra_term']).mean().reset_index()
Applying the pre-determined threshold to the prediction values to get predictions...
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction