In [None]:
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
import ast, re
from time import sleep
from bs4 import BeautifulSoup
data_folder = '/content/drive/MyDrive/pop_pharmacogenomics/onsides_intl/uk_drug_label/'
external_data_folder = '/content/drive/MyDrive/pop_pharmacogenomics/onsides_intl/external_data/'
api = 'bbf40496-e20e-4f79-9636-b63d4a3deea7'
import warnings
warnings.filterwarnings('ignore')

### Extract terms from text data here

In [None]:
text_df = pd.read_csv(data_folder+'data/drug_wp_data_raw.csv')
text_df['section_title'] = text_df['drug_text'].apply(lambda x: BeautifulSoup(x, 'html.parser').find('summary').text)
text_df['section_content'] = text_df.apply(lambda x: BeautifulSoup(x.drug_text, 'html.parser').text.replace(x.section_title, '').strip().lower(),
                                           axis = 1)
text_df.head(1)

Unnamed: 0,product_id,drug_text,section_title,section_content
0,1589,"<details><summary data-evt=""smpcSectionOpen"" i...",4.4 Special warnings and precautions for use,prolonged and repeated applications are inadvi...


In [None]:
unique_text = text_df[['section_content']].drop_duplicates()
print(unique_text.shape[0])

7607


In [None]:
#read in the meddra map file
f = '/content/drive/MyDrive/pop_pharmacogenomics/onsides_intl/'
meddra_df = pd.read_csv(f+'external_data/umls_meddra_en.csv')
meddra_df['STR'] = meddra_df.STR.apply(lambda x: x.lower())
meddra_df['len'] = meddra_df.STR.apply(lambda x: len(x))
print(meddra_df.shape[0])
meddra_df = meddra_df[(meddra_df['len'] >= 5)]
meddra_df = meddra_df[(meddra_df.TTY.isin(['PT', 'LLT']))]
print(meddra_df.shape[0])
meddra_dict = dict(zip(meddra_df.STR, meddra_df.SDUI))
meddra_names = meddra_df.STR.tolist()
meddra_df.head(1)

116807
100897


Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF,len
0,C0000727,ENG,P,L0000727,VCW,S0584932,N,A0639292,,,10000647,MDR,PT,10000647,acute abdomen,3,N,256.0,13


In [None]:
app_data = []
for ade_text in tqdm(text_df.section_content.tolist()):
  if str(ade_text) != 'nan':
    ade_text = ade_text.lower()
    meddra_found_terms = []
    #iterate through list of meddra concepts, if found - add to list
    for concept_name in meddra_names:
      if ade_text.find(concept_name) == -1:
        continue
      else:
        i = ade_text.index(concept_name)
        meddra_found_terms.append([i, concept_name, meddra_dict[concept_name]])
    app_data.append([ade_text, meddra_found_terms])

100%|██████████| 9532/9532 [1:01:18<00:00,  2.59it/s]


In [None]:
app_data_df = pd.DataFrame(app_data, columns = ['string', 'list'])
app_data_df = app_data_df.explode('list')
app_data_df['index'] = app_data_df['list'].apply(lambda x: x[0] if str(x) != 'nan' else None)
app_data_df['found_term'] = app_data_df['list'].apply(lambda x: x[1] if str(x) != 'nan' else None)
app_data_df['meddra_id'] = app_data_df['list'].apply(lambda x: x[2] if str(x) != 'nan' else None)
app_data_df = app_data_df.drop(['list'], axis = 1)
app_data_df.to_csv(data_folder+'data/drug_wp_text_matched_terms.csv', index=False)
app_data_df.head(1)

### prepare onsides input

In [None]:
text_df = pd.read_csv(data_folder+'data/drug_wp_data_raw.csv')
text_df['section_title'] = text_df['drug_text'].apply(lambda x: BeautifulSoup(x, 'html.parser').find('summary').text)
text_df['section_content'] = text_df.apply(lambda x: BeautifulSoup(x.drug_text, 'html.parser').text.replace(x.section_title, '').strip().lower(),
                                           axis = 1)
text_df.head(1)

Unnamed: 0,product_id,drug_text,section_title,section_content
0,1589,"<details><summary data-evt=""smpcSectionOpen"" i...",4.4 Special warnings and precautions for use,prolonged and repeated applications are inadvi...


In [None]:
app_data_df = pd.read_csv(data_folder+'data/drug_wp_text_matched_terms.csv')
print(app_data_df.shape)
app_data_df = app_data_df.drop_duplicates()
print(app_data_df.shape)
app_data_df.head(1)

(829969, 4)
(402882, 4)


Unnamed: 0,string,index,found_term,meddra_id
0,prolonged and repeated applications are inadvi...,55.0,hypersensitivity,10020751.0


In [None]:
building_strings = []
#ade_text_table_dict = dict(zip(text_df.product_id, text_df.section_content))
for i, row in tqdm(app_data_df.iterrows()):
  if str(row['index']) != 'nan':
    term, start_pos, ar_text = row['found_term'], int(row['index']), row['string']
    #default settings
    nwords, prop_before = 125, 0.125
    #pull the full text
    #ar_text = ade_text_table_dict[label_id]

    term_nwords = len(term)
    size_before = max(int((nwords-2*term_nwords)*prop_before), 1)
    size_after = max(int((nwords-2*term_nwords)*(1-prop_before)), 1)

    before_text = ar_text[:start_pos]
    after_text = ar_text[(start_pos+term_nwords):]

    before_parts = before_text.split()[-1*size_before:]
    after_parts = after_text.split()[:size_after]

    li = [term]
    li.extend(before_parts)
    li.append('EVENT')
    li.extend(after_parts)
    example_string = ' '.join(li)
    building_strings.append(example_string)
  else:
    building_strings.append(None)

402882it [00:53, 7471.02it/s]


In [None]:
app_data_df['section_content'] = app_data_df['string']
app_data_df['string'] = building_strings
exact_terms_df = text_df.merge(app_data_df, on = 'section_content', how = 'left')
exact_terms_df = exact_terms_df.drop(columns = ['drug_text', 'section_content', 'section_title'])
exact_terms_df.head()

Unnamed: 0,product_id,string,index,found_term,meddra_id
0,1589,hypersensitivity prolonged and repeated applic...,55.0,hypersensitivity,10020751.0
1,4223,"fever or have worsened after 3 days, or if sym...",155.0,fever,10037660.0
2,4223,fructose intolerance are taken a day. patients...,369.0,fructose intolerance,10072104.0
3,4223,"headache after 3 days, or if symptoms are acco...",164.0,headache,10019211.0
4,4223,fructose effect if several are taken a day. pa...,369.0,fructose,10063255.0


In [None]:
exact_terms_df.to_csv(data_folder+'data/serious/sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv', index=False)

---
### format dataframe for onsides model

In [None]:
#required columns : section, drug, label_id, set_id, spl_version, pt_meddra_id, pt_meddra_term
exact_terms_df = pd.read_csv(data_folder+'data/serious/sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv')
exact_terms_df['section'] = 'AR'
exact_terms_df['set_id'] = exact_terms_df['product_id']
exact_terms_df['location'] = exact_terms_df['index']
exact_terms_df = exact_terms_df.rename(columns = {'product_id':'label_id', 'onsides_string':'string'}).drop(columns = ['index'])
print(exact_terms_df.shape)
drug_df = pd.read_csv(data_folder+'data/final_UK/drug_data.csv').rename(columns={'ingredient_name':'drug'})
drug_df['label_id'] = drug_df['product_id'].apply(lambda x: int(x.split('/')[-2]))
drug_df = drug_df[['label_id', 'drug']].drop_duplicates().groupby('label_id')['drug'].apply(set).reset_index()
drug_df['drug'] = drug_df['drug'].apply(lambda x: str(x)[1:-1].replace("'",''))
exact_terms_df = exact_terms_df.merge(drug_df[['label_id', 'drug']].drop_duplicates(), on = 'label_id', how = 'left')
print(exact_terms_df.shape)
folder = '/content/drive/MyDrive/pop_pharmacogenomics/fda/onsides-2.0.0/data/'
llt_pt = pd.read_csv(folder+'meddra_llt_pt_map.txt', delimiter = '|')
llt_pt_id_dict = dict(zip(llt_pt.llt_concept_code, llt_pt.pt_concept_code))
llt_pt_term_dict = dict(zip(llt_pt.llt_concept_code, llt_pt.pt_concept_name))
exact_terms_df['pt_meddra_id'] = exact_terms_df.meddra_id.apply(lambda x: llt_pt_id_dict[x] if x in llt_pt_id_dict.keys() else None)
exact_terms_df['pt_meddra_term'] =  exact_terms_df.meddra_id.apply(lambda x: llt_pt_term_dict[x] if x in llt_pt_term_dict.keys() else None)
print(exact_terms_df.shape)
exact_terms_df.to_csv(data_folder+'data/serious/sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv', index=False)
exact_terms_df.head(1)

(515103, 7)
(515103, 8)
(515103, 10)


Unnamed: 0,label_id,string,found_term,meddra_id,section,set_id,location,drug,pt_meddra_id,pt_meddra_term
0,1589,hypersensitivity prolonged and repeated applic...,hypersensitivity,10020751.0,AR,1589,55.0,"allantoin, 2,4-dichlorobenzyl alcohol, cetrimide",10020751.0,Hypersensitivity


In [None]:
exact_terms_df = pd.read_csv(data_folder+'data/serious/sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv')
print(exact_terms_df.shape)
exact_terms_df = exact_terms_df[exact_terms_df.string.notna()]
exact_terms_df['spl_version'] = None
print(exact_terms_df.shape)
exact_terms_df.to_csv(data_folder+'data/serious/sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv', index=False)

(515103, 10)
(515026, 11)


### run onsides model

In [None]:
#prep onsides materials
folder = '/content/drive/MyDrive/pop_pharmacogenomics/onsides_intl/onsides'
ar_model = folder + '/bestepoch-bydrug-PMB_14-AR-125-all_222_24_25_2.5e-05_256_32.pth'
bw_model = folder + '/bestepoch-bydrug-PMB_14-AR-125-all_222_24_25_2.5e-05_256_32.pth'
import os
os.chdir('/content/drive/MyDrive/pop_pharmacogenomics/fda/onsides-2.0.0/')
!python3 -m pip install -r requirements.txt

Collecting beautifulsoup4==4.11.1 (from -r requirements.txt (line 1))
  Downloading beautifulsoup4-4.11.1-py3-none-any.whl (128 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/128.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.2/128.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nltk==3.7 (from -r requirements.txt (line 2))
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy==1.23.4 (from -r requirements.txt (line 3))
  Downloading numpy-1.23.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas==1.5.1 (from -r requirements.txt (line 4))
  Downloading pandas-1.5.1-cp310-cp310-manylinux_

In [None]:
#use onsides model
f = data_folder+'data/serious/sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0120.csv'
!python3 src/predict.py --model $ar_model --examples $f

2024-01-28 14:36:19.788859: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-28 14:36:19.788905: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-28 14:36:19.790214: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-28 14:36:19.797303: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Loading model from bestepoch-bydrug-PMB_14-AR-125-all

In [None]:
#build tables
r = data_folder+'data/serious/bestepoch-bydrug-PMB-sentences-rx_ref14-AR-125-all_222_24_25_2.5e-05_256_32.csv.gz'
!python3 src/create_onsides_datafiles.py --release v2.0.0-AR --results $r --examples $f

 prefix: bestepoch-bydrug-PMB-sentences-rx
 refset: ref14-AR-125-all
 np_random_seed: 222
 split_method: 24
 EPOCHS: 25
 LR: 2.5e-05
 threshold: 0.4633
 max_length: 2.5e-05
 batch-size: 32.csv
 compiled file: /content/drive/MyDrive/pop_pharmacogenomics/onsides_intl/uk_drug_label/data/serious/compiled/v2.0.0/AR.csv.gz
Loading results file...
 res.shape: (515026, 2)
Loding examples file...
 ex.shape: (515026, 11)
Concatenating results file to examples file...
Grouping predictions by drug label and adverse event term, and taking the mean prediction score...
  df_grouped = df.groupby(by=['section', 'drug', 'label_id', 'set_id', 'spl_version', 'pt_meddra_id', 'pt_meddra_term']).mean().reset_index()
Applying the pre-determined threshold to the prediction values to get predictions...
Predictions data frame created...
 predictions.shape: (0, 10)
Saving to gzipped file: /content/drive/MyDrive/pop_pharmacogenomics/onsides_intl/uk_drug_label/data/serious/compiled/v2.0.0/AR.csv.gz...


In [None]:
res = data_folder+'data/bestepoch-bydrug-PMB-sentences-rx_ref14-AR-125-all_222_24_25_2.5e-05_256_32.csv.gz'
ex = data_folder+'data/sentences-rx_method14_nwords125_clinical_bert_application_set_AR_v0924.csv'
#threshold = 0.463
res = pd.read_csv(res, header=None, names=['Pred0', 'Pred1'])
ex = pd.read_csv(ex)
df = pd.concat([ex, res], axis=1)
print(df.shape[0])
df = df[df.Pred0 > threshold]
print(df.shape[0])
df.head()

854253
625380


Unnamed: 0,label_id,found_term,meddra_id,location,string,section,set_id,drug,spl_version,pt_meddra_id,pt_meddra_term,Pred0,Pred1
0,1589,hypersensitivity,10020751,0,hypersensitivity EVENT ypersensitivity reactio...,AR,1589,,,10020751.0,Hypersensitivity,1.179923,0.0
1,1589,hypersensitivity reaction,10020751,0,hypersensitivity reaction EVENT persensitivity...,AR,1589,,,10020751.0,Hypersensitivity,1.194894,0.0
2,1589,hyper,10037211,0,hyper EVENT ypersensitivity reactions may occa...,AR,1589,,,10037211.0,Psychomotor hyperactivity,1.665707,0.0
3,4223,angioedema,10002424,729,angioedema presented in order of decreasing se...,AR,4223,,,10002424.0,Angioedema,1.494658,0.0
4,4223,edema,10030095,734,"edema in order of decreasing seriousness.a2,4-...",AR,4223,,,10030095.0,Oedema,2.323206,0.0


In [None]:
df.to_csv(data_folder+'data/ade_text_table_onsides_pred_v0924.csv', index=False)