In [None]:
import numpy as np 
import pandas as pd
import requests
from tqdm import tqdm
import ast, json
from glob import glob
from bs4 import BeautifulSoup
data_folder = ''
external_data_folder = ''
api = ''

### Pull raw table of kegg drugs

In [None]:
kegg_drug_rows = []
for i in tqdm(range(303)):
  url = 'https://www.kegg.jp/medicus-bin/search_drug?uid=1681797824120106&display=kegg_drug&page={}'.format(str(i+1))
  r = requests.get(url)
  r.encoding = r.apparent_encoding
  s = BeautifulSoup(r.text, 'html.parser')
  table = s.find(class_='list1')
  table_rows = table.find_all('tr')
  table_items = [i.find_all('td') for i in table_rows][1:]
  kegg_drug_rows.extend(table_items)

100%|██████████| 303/303 [05:55<00:00,  1.17s/it]


In [None]:
kegg_drug_df = pd.DataFrame(kegg_drug_rows, columns = ['kegg_id', 'general_name', 'product_name', 'indication'])
kegg_drug_df.to_csv(data_folder+'kegg_drug_info_raw.csv', index=False)
kegg_drug_df.head()

Unnamed: 0,kegg_id,general_name,product_name,indication
0,[[D00001]],"[常水 (JP18), [], \n精製水 (JP18), [], \n精製水 (容器入り)...","[大塚蒸留水(大塚製薬工場), [], \n注射用水(ニプロ, 光製薬, 扶桑薬品工業, 日...",[]
1,[[D00002]],"[ナダイド (JAN), [], \nニコチンアミドアデニンジヌクレオチド, [], \nN...",[],[]
2,[[D00003]],"[酸素 (JP18), [], \nOxygen (JP18/USP)]","[カノックス−L(カンサン), [], \nスターサンソL液体酸素(鈴商総合ガスセンター),...",[]
3,[[D00004]],"[二酸化炭素 (JP18), [], \n炭酸ガス, [], \nCarbon dioxid...","[二酸化炭素(Kist, エア・ウォーター, エア・ガシズ北九州, カネコ商会, コイケ酸商...",[]
4,[[D00005]],"[フラビンアデニンジヌクレオチド (JAN), [], \nFlavin adenine d...",[],[]


## Map kegg drugs to rxnorm directly (UMLS file) - 4841 mapped

In [None]:
kegg_drug_df = pd.read_csv(data_folder+'kegg_drug_info_raw.csv')
kegg_drug_df['kegg_id'] = kegg_drug_df.kegg_id.apply(lambda x: BeautifulSoup(x).text)
kegg_drug_df['general_name'] = kegg_drug_df.general_name.apply(lambda x: BeautifulSoup(x))
kegg_drug_df['product_name'] = kegg_drug_df.product_name.apply(lambda x: BeautifulSoup(x))
kegg_drug_df['indication'] = kegg_drug_df.indication.apply(lambda x: BeautifulSoup(x).text)
#parse only english names
kegg_drug_df['general_en_name'] = kegg_drug_df.general_name.apply(lambda x: [i.split(' (')[0].lower() for i in x.text.split('\n') if i.isascii()])
kegg_drug_df.head()

Unnamed: 0,kegg_id,general_name,product_name,indication,general_en_name
0,D00001,"[[[<td class=""data1"">常水 (JP18)<br/>\n精製水 (JP18...","[[[<td class=""data1"" style=""max-width:400px;"">...",,"[water, purified water, purified water in cont..."
1,D00002,"[[[<td class=""data1"">ナダイド (JAN)<br/>\nニコチンアミドア...","[[[<td class=""data1"" style=""max-width:400px;"">...",,"[nadide, nicotinamide adenine dinucleotide]"
2,D00003,"[[[<td class=""data1"">酸素 (JP18)<br/>\nOxygen (J...","[[[<td class=""data1"" style=""max-width:400px;"">...",,[oxygen]
3,D00004,"[[[<td class=""data1"">二酸化炭素 (JP18)<br/>\n炭酸ガス<b...","[[[<td class=""data1"" style=""max-width:400px;"">...",,[carbon dioxide]
4,D00005,"[[[<td class=""data1"">フラビンアデニンジヌクレオチド (JAN)<br/...","[[[<td class=""data1"" style=""max-width:400px;"">...",,[flavin adenine dinucleotide]


In [None]:
rxnorm = pd.read_csv(external_data_folder+'umls_rxnorm.csv')
rxnorm = rxnorm[['CODE', 'STR']]
rxnorm.STR = rxnorm.STR.apply(lambda x: x.lower())
rxnorm.head()

Unnamed: 0,CODE,STR
0,1926948,"1,2-dipalmitoylphosphatidylcholine"
1,38,parlodel
2,44,mesna
3,1489913,droxidopa
4,61,beta-alanine


In [None]:
df = kegg_drug_df[['kegg_id', 'general_en_name']].explode('general_en_name').merge(rxnorm, left_on = 'general_en_name', right_on = 'STR', how = 'left').groupby('kegg_id')['CODE'].apply(set).reset_index()
df['CODE'] = df.CODE.apply(lambda x: [i for i in list(x) if str(i) != 'nan'])
df = df.merge(kegg_drug_df[['kegg_id', 'general_en_name']], on = 'kegg_id', how = 'left')
df.head()

Unnamed: 0,kegg_id,CODE,general_en_name
0,D00001,[11295.0],"[water, purified water, purified water in cont..."
1,D00002,[],"[nadide, nicotinamide adenine dinucleotide]"
2,D00003,[7806.0],[oxygen]
3,D00004,[2034.0],[carbon dioxide]
4,D00005,[],[flavin adenine dinucleotide]


In [None]:
df.to_csv(data_folder+'kegg_drug_info_mapped.csv', index=False)

## Use UMLS API to fuzzy match

In [None]:
df = pd.read_csv(data_folder+'kegg_drug_info_mapped.csv')
df['CODE'] = df['CODE'].apply(lambda x: ast.literal_eval(x))
df['general_en_name'] = df['general_en_name'].apply(lambda x: ast.literal_eval(x))
df.head(1)

Unnamed: 0,kegg_id,CODE,general_en_name
0,D00001,[11295.0],"[water, purified water, purified water in cont..."


In [None]:
codes = []
for i, row in tqdm(df.iterrows()):
  try:
    if len(row['CODE']) == 0:
      drug_codes = []
      for drug in row['general_en_name']:
        url = 'https://rxnav.nlm.nih.gov/REST/rxcui.json?name={}&search=1'.format(drug)
        j = requests.get(url).json()
        try:
          drug_codes.extend(j['idGroup']['rxnormId'])
        except:
          pass
      drug_codes = list(set(drug_codes))
      codes.append([row['kegg_id'], drug_codes])
  except:
    continue

12115it [15:17, 13.21it/s]


In [None]:
codes_dict = {i[0]:i[1] for i in codes} 
df['CODE'] = df.apply(lambda x: codes_dict[x.kegg_id] if x.kegg_id in codes_dict.keys() else x.CODE, axis = 1)

In [None]:
df.to_csv(data_folder+'kegg_drug_info_mapped.csv', index=False)

In [None]:
df.head(1)

Unnamed: 0,kegg_id,CODE,general_en_name
0,D00001,[11295.0],"[water, purified water, purified water in cont..."


## Use athena

In [None]:
df = pd.read_csv(data_folder+'kegg_drug_info_mapped.csv')
df['CODE'] = df['CODE'].apply(lambda x: ast.literal_eval(x))
df['general_en_name'] = df['general_en_name'].apply(lambda x: ast.literal_eval(x))
df.head(1)

Unnamed: 0,kegg_id,CODE,general_en_name
0,D00001,[11295.0],"[water, purified water, purified water in cont..."


In [None]:
rx_ext = pd.read_csv(external_data_folder+'athena_rxnorm_extension/CONCEPT.csv', delimiter = '\t')
rx_ext = rx_ext[rx_ext.vocabulary_id == 'RxNorm Extension'][['concept_id', 'concept_code', 'concept_name']]
rx_ext['concept_name'] = rx_ext.concept_name.apply(lambda x: x.lower())
rx_ext.head(1)

  rx_ext = pd.read_csv(external_data_folder+'athena_rxnorm_extension/CONCEPT.csv', delimiter = '\t')


Unnamed: 0,concept_id,concept_code,concept_name
206017,36889853,OMOP1000000,acetaminophen 325 mg rectal suppository box of 10


In [67]:
found_dict = {}
for i, row in tqdm(df.iterrows()):
  if len(row['CODE']) == 0:
    try:
      name = row['general_en_name'][0]
      codes = rx_ext[rx_ext.concept_name == name].concept_code.tolist()[0]
      if len(codes) > 0:
        found_dict[row['kegg_id']] = codes
    except:
      continue

12115it [16:55, 11.93it/s]


In [None]:
df.head()

Unnamed: 0,kegg_id,CODE,general_en_name
0,D00001,[11295.0],"[water, purified water, purified water in cont..."
1,D00002,[1044975],"[nadide, nicotinamide adenine dinucleotide]"
2,D00003,[7806.0],[oxygen]
3,D00004,[2034.0],[carbon dioxide]
4,D00005,[1314412],[flavin adenine dinucleotide]
