In [None]:
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
import ast, re
from time import sleep
from bs4 import BeautifulSoup
data_folder = ''
api = ''
import warnings
warnings.filterwarnings('ignore')

## Extract "Undesirable Effects" Section

### Extract from raw elements

In [None]:
drug_df = pd.read_csv(data_folder+'drug_data.csv')
print(drug_df.shape)
drug_df.head(1)

(11450, 6)


Unnamed: 0,ingredient_id,ingredient_name,product_id,product_name,active_ingredients,company_name
0,/emc/ingredient/1524,"2,4-dichlorobenzyl alcohol",/emc/product/1589/smpc,\n Boots Antiseptic Cream\n...,"allantoin, cetrimide, 2,4-dichlorobenzyl alcohol",THE BOOTS COMPANY PLC


In [None]:
drug_ade_text = []
for product in tqdm(drug_df.product_id.unique().tolist()):
  f = (data_folder+'raw/{}.txt'.format(product.split('/')[-2]))
  with open(f) as fi:
      s = BeautifulSoup(fi, 'html.parser')
  for i in s.find_all('details'):
      if '4.8 Undesirable effects' in i.text:
        ade_text = i
        drug_ade_text.append([product.split('/')[-2], ade_text])

100%|██████████| 9334/9334 [45:31<00:00,  3.42it/s]


In [None]:
drug_ade_df = pd.DataFrame(drug_ade_text, columns = ['product_id', 'drug_text'])
drug_ade_df.to_csv(data_folder+'drug_ade_data_raw.csv', index=False)
drug_ade_df.head()

Unnamed: 0,product_id,drug_text
0,1589,"[[4.8 Undesirable effects], \n, [\n, [Hypersen..."
1,4223,"[[4.8 Undesirable effects], \n, [\n, [The list..."
2,5606,"[[4.8 Undesirable effects], \n, [\n, [The list..."
3,2700,"[[4.8 Undesirable effects], \n, [\n, [The list..."
4,625,"[[4.8 Undesirable effects], \n, [\n, [The list..."


---
### Extract Individual ADEs from Section Tables
Frequencies :
- Very common : >=10%
- Common : >=1% and <10%
- Uncommon : >=0.1% and <1%
- Rare >0.01% and <0.1%
- Very Rare <0.01%

In [None]:
drug_ade_df = pd.read_csv(data_folder+'drug_ade_data_raw.csv')
print(drug_ade_df.shape)
drug_ade_df.head(1)

(9222, 2)


Unnamed: 0,product_id,drug_text
0,1589,"<details><summary data-evt=""smpcSectionOpen"" i..."


In [None]:
big_table_list = []
for p_id, i in tqdm(zip(drug_ade_df.product_id.tolist(), drug_ade_df.drug_text.tolist())):
  i = BeautifulSoup(i, 'html.parser')
  if len(i.find_all('table')) > 0:
    table = i.find('table')
    rows = table.find_all('tr')
    for row in rows:
      big_table_list.append([p_id, row])

9222it [01:28, 103.74it/s]


In [None]:
freqs = ['very common', 'common', 'uncommon', 'rare', 'very rare', 'not known']
socs =  ['blood and lymphatic system disorders','cardiac disorders', 'congenital, familial and genetic disorders',
         'ear and labyrinth disorders', 'endocrine disorders', 'eye disorders', 'gastrointestinal disorders',
         'general disorders and administration site conditions', 'hepatobiliary disorders', 'immune system disorders',
         'infections and infestations', 'injury, poisoning and procedural complications', 'investigations', 'metabolism and nutrition disorders',
         'musculoskeletal and connective tissue disorders', 'neoplasms benign, malignant and unspecified (incl cysts and polyps)',
         'nervous system disorders', 'pregnancy, puerperium and perinatal conditions', 'psychiatric disorders',
         'renal and urinary disorders', 'reproductive system and breast disorders', 'respiratory, thoracic and mediastinal disorders',
         'skin and subcutaneous tissue disorders', 'social circumstances', 'surgical and medical procedures', 'vascular disorders', 'product issues']
titles = ['system organ class', 'frequency', 'adverse events']

In [None]:
processed_list = []
for row in tqdm(big_table_list):
  drug = row[0]
  r = row[1]
  items = [i.text.replace('\n','').lower() for i in r.find_all('td')]
  if any([True for e in titles if e in items]):
    continue
  else:
    f, s, a = None, None, None
    for i in items:
      i = i.strip().replace('*','')
      if i in freqs: f = i
      elif i in socs: s = i
      else: a = i
    processed_list.append([drug, f, s, a])

100%|██████████| 158931/158931 [00:04<00:00, 36840.54it/s]


In [None]:
processed_df = pd.DataFrame(processed_list, columns = ['product_id', 'freq', 'soc', 'ade'])
processed_df['freq'] = processed_df.apply(lambda x: str(x.ade).split(':')[0] if str(x.ade) !='nan' and str(x.ade).split(':')[0] in freqs else x.freq , axis = 1)
processed_df.to_csv(data_folder+'drug_ade_data_parsed.csv', index=False)
processed_df.head()

Unnamed: 0,product_id,freq,soc,ade
0,4223,not known,immune system disorders,hypersensitivityab1
1,4223,not known,gastrointestinal disorders,"glossodyniaab, oral discomfortab"
2,5606,not known,immune system disorders,hypersensitivityab1
3,5606,not known,gastrointestinal disorders,"glossodyniaab, oral discomfortab"
4,2700,not known,immune system disorders,hypersensitivityab1


### Map Terms to MedDRA codes (section tables)

In [None]:
processed_df = pd.read_csv(data_folder+'drug_ade_data_parsed.csv')
processed_df = processed_df[['product_id', 'freq', 'soc', 'ade']]
processed_df.head()

Unnamed: 0,product_id,freq,soc,ade
0,4223,not known,immune system disorders,hypersensitivityab1
1,4223,not known,gastrointestinal disorders,"glossodyniaab, oral discomfortab"
2,5606,not known,immune system disorders,hypersensitivityab1
3,5606,not known,gastrointestinal disorders,"glossodyniaab, oral discomfortab"
4,2700,not known,immune system disorders,hypersensitivityab1


In [None]:
unique_terms = processed_df[['ade']].drop_duplicates()
unique_terms = unique_terms[unique_terms.ade.notna()]
unique_terms.head()

Unnamed: 0,ade
0,hypersensitivityab1
1,"glossodyniaab, oral discomfortab"
16,adverse reactions involving the treatment site...
17,"erythema, exfoliation, irritation, pain, pruri..."
18,"bleeding, desquamation, discharge, discomfort,..."


In [1]:
f = '/content/drive/MyDrive/pop_pharmacogenomics/onsides_intl/'
meddra_df = pd.read_csv(f+'external_data/umls_meddra_en.csv')
meddra_df['STR'] = meddra_df.STR.apply(lambda x: x.lower())
meddra_df['len'] = meddra_df.STR.apply(lambda x: len(x))
meddra_dict = dict(zip(meddra_df.STR, meddra_df.SDUI))
meddra_df = meddra_df[(meddra_df.TTY == 'PT')|(meddra_df['len'] > 5)]
meddra_df.head(1)

NameError: ignored

In [None]:
found_ades = []
meddra_names = meddra_df.STR.tolist()
for ade_text in tqdm(unique_terms.ade.tolist()):
  ar_text = ' '.join(ade_text.split()).lower()
  found_terms = []
  for concept_name in meddra_names:
    if ar_text.find(concept_name) == -1:
      continue
    else:
      found_terms.append(concept_name)
  found_ades.append(found_terms)

100%|██████████| 22912/22912 [10:54<00:00, 35.03it/s]


In [None]:
unique_terms['exact_match_list'] = found_ades
unique_terms.to_csv(data_folder+'drug_ade_data_parsed_text_unique.csv', index=False)
unique_terms.head()

Unnamed: 0,ade,exact_match_list
0,hypersensitivityab1,"[hypersensitivity, hypersensitivity]"
1,"glossodyniaab, oral discomfortab","[glossodynia, glossodynia, oral discomfort, or..."
16,adverse reactions involving the treatment site...,"[adverse reaction, adverse reaction, local rea..."
17,"erythema, exfoliation, irritation, pain, pruri...","[pain, pruritus, pruritus, erythema, erythema,..."
18,"bleeding, desquamation, discharge, discomfort,...","[oedema, oedema, bleeding, pigmentation, swell..."


In [None]:
processed_df = processed_df.merge(unique_terms, on = 'ade', how = 'left')
processed_df['matched_codes'] = processed_df.exact_match_list.apply(lambda x: [meddra_dict[i] for i in x] if str(x) != 'nan' else None)
processed_df.to_csv(data_folder+'drug_ade_data_parsed.csv', index=False)
processed_df.head(1)

Unnamed: 0,product_id,freq,soc,ade,exact_match_list,matched_codes
0,4223,not known,immune system disorders,hypersensitivityab1,"[hypersensitivity, hypersensitivity]","[10020751, 10020751]"


### Extract Individual ADEs from Section Text

In [None]:
drug_ade_df = pd.read_csv(data_folder+'drug_ade_data_raw.csv')
print(drug_ade_df.shape)
drug_ade_df.head(1)

(9222, 2)


Unnamed: 0,product_id,drug_text
0,1589,"<details><summary data-evt=""smpcSectionOpen"" i..."


In [None]:
text_list = []
for p_id, i in tqdm(zip(drug_ade_df.product_id.tolist(), drug_ade_df.drug_text.tolist())):
  i = BeautifulSoup(i, 'html.parser')
  if len(i.find_all('table')) > 0:
    i.find('table').decompose() #remove the tables
  text_list.append([p_id, i.text])

9222it [01:29, 103.18it/s]


In [None]:
processed_text_df = pd.DataFrame(text_list, columns = ['product_id', 'text'])
processed_text_df['text'] = processed_text_df['text'].apply(lambda x: str(x).lower().split('reporting of suspected adverse reactions')[0].split('4.8 undesirable effects')[-1].replace('\n',''))
processed_text_df.to_csv(data_folder+'drug_ade_text_parsed.csv', index=False)
processed_text_df.head()

Unnamed: 0,product_id,text
0,1589,hypersensitivity reactions may occasionally oc...
1,4223,the list of the following adverse effects rela...
2,5606,the list of the following adverse effects rela...
3,2700,the list of the following adverse effects rela...
4,625,the list of the following adverse effects rela...


---
## Special Populations

### Extract from raw elements

In [None]:
drug_df = pd.read_csv(data_folder+'drug_data.csv')
print(drug_df.shape)
drug_df.head(1)

(11450, 6)


Unnamed: 0,ingredient_id,ingredient_name,product_id,product_name,active_ingredients,company_name
0,/emc/ingredient/1524,"2,4-dichlorobenzyl alcohol",/emc/product/1589/smpc,\n Boots Antiseptic Cream\n...,"allantoin, cetrimide, 2,4-dichlorobenzyl alcohol",THE BOOTS COMPANY PLC


In [None]:
special_pop_text = []
for product in tqdm(drug_df.product_id.unique().tolist()):
  f = (data_folder+'raw/{}.txt'.format(product.split('/')[-2]))
  with open(f) as fi:
      s = BeautifulSoup(fi, 'html.parser')
  for i in s.find_all('details'):
      if '4.3 Contraindications' in i.text or '4.4 Special warnings and precautions for use' in i.text or '4.6 Fertility, pregnancy and lactation' in i.text:
        ade_text = i
        special_pop_text.append([product.split('/')[-2], ade_text])

100%|██████████| 9334/9334 [2:00:44<00:00,  1.29it/s]


In [None]:
special_pop_df = pd.DataFrame(special_pop_text, columns = ['product_id', 'drug_text'])
special_pop_df.to_csv(data_folder+'drug_special_pop_data_raw.csv', index=False)
special_pop_df.head()

Unnamed: 0,product_id,drug_text
0,1589,"[[4.3 Contraindications], \n, [\n, [Hypersensi..."
1,1589,[[4.4 Special warnings and precautions for use...
2,4223,"[[4.3 Contraindications], \n, [\n, [Strepsils ..."
3,4223,[[4.4 Special warnings and precautions for use...
4,4223,"[[4.6 Fertility, pregnancy and lactation], \n,..."
