In [2]:
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
from glob import glob
import ast, json
from bs4 import BeautifulSoup
#specify home folder (all sub-folders, scripts, data in this folder)
#data_folder is used as a plain string prefix (data_folder+'data/...'), so a non-empty value needs a trailing '/'
data_folder = ''
#NOTE(review): api is not referenced by any of the cells visible here - confirm whether it is still needed
api = ''
#version tag inserted into the rx file/folder names (kegg_rx_drug_data_v{date}.csv, raw_v{date}/)
date = ''

In [3]:
#if running on google colab
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


## Make datatable of all drugs available (RX)
Iterate through kegg drug list to make datatable of all rx drugs currently available.

Page : https://www.kegg.jp/medicus-bin/search_drug?uid=1685277724113953&display=med&page=1

Output : data/kegg_rx_drug_data_v{date}.csv

In [5]:
#fetch the first page of the rx search results to read the total number of hits
kegg_url_init = 'https://www.kegg.jp/medicus-bin/search_drug?display=med&page=1&uid=1685277724113953'
r = requests.get(kegg_url_init)
r.encoding = r.apparent_encoding #decode the body with the detected charset
soup = BeautifulSoup(r.text, 'html.parser')
#number of individual rx drugs: text after the last '(' of the <li class="med on"> element, without ')'
rx_label = soup.find('li', class_='med on').text
rx_count_text = rx_label.split('(')[-1]
rx_num = int(rx_count_text.replace(')', ''))
#number of rx drug pages to parse: rx_num divided by 40, rounded up (integer ceiling)
rx_page_num = (rx_num + 39) // 40
(rx_num, rx_page_num)

(13343, 334)

In [6]:
#collect, for every rx drug on every results page, the list of <td> cells of its table row
kegg_rx_drug_list = []
for page_num in tqdm(range(1, int(rx_page_num)+1)): #page numbers start at 1
  kegg_url_page = 'https://www.kegg.jp/medicus-bin/search_drug?display=med&page={}&uid=1685277724113953'.format(page_num)
  r = requests.get(kegg_url_page)
  r.encoding = r.apparent_encoding
  page_soup = BeautifulSoup(r.text, 'html.parser')
  drug_table = page_soup.find('table', class_='list1')
  data_rows = drug_table.find_all('tr')[1:] #skip header
  #split each row into its cells; one list entry per drug
  kegg_rx_drug_list.extend(row.find_all('td') for row in data_rows)

100%|██████████| 334/334 [05:10<00:00,  1.07it/s]


In [7]:
#one row per drug; at this point every column still holds the parsed <td> cell
rx_columns = ['product', 'ingredient', 'indication', 'kegg_drug_id']
kegg_rx_drug_df = pd.DataFrame(kegg_rx_drug_list, columns = rx_columns)
product_cells = kegg_rx_drug_df['product']
#the product id is the part of the first link's href that follows 'japic_code='
kegg_rx_drug_df['kegg_product_id'] = product_cells.apply(lambda cell: cell.find('a', href=True)['href'].split('japic_code=')[-1])
kegg_rx_drug_df['product'] = product_cells.apply(lambda cell: cell.text.strip())
#line breaks inside the ingredient cell become ';' separators
kegg_rx_drug_df['ingredient'] = kegg_rx_drug_df['ingredient'].apply(lambda cell: cell.text.replace('\n', ';'))
#the remaining columns only need the cell text
for col in ['indication', 'kegg_drug_id']:
  kegg_rx_drug_df[col] = kegg_rx_drug_df[col].apply(lambda cell: cell.text)
kegg_rx_drug_df.head()

Unnamed: 0,product,ingredient,indication,kegg_drug_id,kegg_product_id
0,生食溶解液キットH (後発品)\n \n \n(ニプロ),塩化ナトリウム;Sodium Chloride,両頭針付溶解剤,D05352,71167
1,クエン酸水和物「コザカイ」原末\n \n \n(小堺製薬),,調剤用薬,D01222,71166
2,クエン酸水和物「コザカイ」原末\n \n \n(小堺製薬),,調剤用薬,D01222,71165
3,重ソー静注8.4％「NS」\n \n \n(日新製薬－山形),炭酸水素ナトリウム;Sodium Bicarbonate,,D01203,71164
4,イムフレックスCPD−MAP\n \n \n(テルモ),,,D08745,71162


In [8]:
#save the rx drug table under a file name tagged with `date` (row index is not written)
rx_csv_name = 'data/kegg_rx_drug_data_v{}.csv'.format(date)
kegg_rx_drug_df.to_csv(data_folder + rx_csv_name, index=False)

## Make datatable of all drugs available (OTC) - currently not used
Iterate through kegg drug list to make datatable of all otc drugs currently available.

Page : https://www.kegg.jp/medicus-bin/search_drug?uid=1685277724113953&display=otc&page=1

Output : kegg_otc_drug_data.csv

In [None]:
#fetch the first page of the otc search results to read the total number of hits
kegg_url_init = 'https://www.kegg.jp/medicus-bin/search_drug?display=otc&page=1&uid=1685277724113953'
r = requests.get(kegg_url_init)
r.encoding = r.apparent_encoding #decode the body with the detected charset
soup = BeautifulSoup(r.text, 'html.parser')
#number of individual otc drugs: text after the last '(' of the <li class="otc on"> element, without ')'
otc_label = soup.find('li', class_='otc on').text
otc_count_text = otc_label.split('(')[-1]
otc_num = int(otc_count_text.replace(')', ''))
#number of otc drug pages to parse: otc_num divided by 40, rounded up (integer ceiling)
otc_page_num = (otc_num + 39) // 40
(otc_num, otc_page_num)

(10555, 264)

In [None]:
#collect, for every otc drug on every results page, the list of <td> cells of its table row
kegg_otc_drug_list = []
for page_num in tqdm(range(1, int(otc_page_num)+1)): #page numbers start at 1
  kegg_url_page = 'https://www.kegg.jp/medicus-bin/search_drug?display=otc&page={}&uid=1685277724113953'.format(page_num)
  r = requests.get(kegg_url_page)
  r.encoding = r.apparent_encoding
  page_soup = BeautifulSoup(r.text, 'html.parser')
  drug_table = page_soup.find('table', class_='list1')
  data_rows = drug_table.find_all('tr')[1:] #skip header
  #split each row into its cells; one list entry per drug
  kegg_otc_drug_list.extend(row.find_all('td') for row in data_rows)

100%|██████████| 264/264 [04:26<00:00,  1.01s/it]


In [None]:
#one row per drug; at this point every column still holds the parsed <td> cell
otc_columns = ['product', 'company', 'indication', 'risk_level']
kegg_otc_drug_df = pd.DataFrame(kegg_otc_drug_list, columns = otc_columns)
product_cells = kegg_otc_drug_df['product']
#the product id is the part of the first link's href that follows 'japic_code='
kegg_otc_drug_df['kegg_product_id'] = product_cells.apply(lambda cell: cell.find('a', href=True)['href'].split('japic_code=')[-1])
kegg_otc_drug_df['product'] = product_cells.apply(lambda cell: cell.text.strip())
#the remaining columns only need the cell text
for col in ['company', 'indication', 'risk_level']:
  kegg_otc_drug_df[col] = kegg_otc_drug_df[col].apply(lambda cell: cell.text)
kegg_otc_drug_df.head()

Unnamed: 0,product,company,indication,risk_level,kegg_product_id
0,ベンザルコニウム塩化物液10％「東海」（AP）,（株）アラクス,殺菌消毒薬（特殊絆創膏を含む）,3,K2304000020
1,消毒用エタノールIP「シオエ」,日本新薬（株）,殺菌消毒薬（特殊絆創膏を含む）,,K2304000019
2,アカナキュア,小林製薬（株）,鎮痛・鎮痒・収れん・消炎薬（パップ剤を含む）,2,K2304000017
3,タフメイト3000ライト,田村薬品工業（株）,ビタミン含有保健薬（ビタミン剤等）,,K2304000016
4,パンビタンエース,アリナミン製薬（株）,ビタミン含有保健薬（ビタミン剤等）,2*,K2304000010


In [None]:
#save the otc drug table (row index is not written)
otc_csv_path = data_folder+'data/kegg_otc_drug_data.csv'
kegg_otc_drug_df.to_csv(otc_csv_path, index=False)

## Download all raw html files of drugs (RX)
Takes about 5.5hrs for ~13,000 drug labels

Input : data/kegg_rx_drug_data_v{date}.csv

Output : folder (data/raw_v{date}/) filled with individual html drug label files

In [4]:
#load the rx drug table from the csv whose name carries the `date` tag
rx_csv_name = 'data/kegg_rx_drug_data_v{}.csv'.format(date)
kegg_df = pd.read_csv(data_folder + rx_csv_name)
kegg_df.head(1)

Unnamed: 0,product,ingredient,indication,kegg_drug_id,kegg_product_id
0,生食溶解液キットH (後発品)\n \n \n(ニプロ),塩化ナトリウム;Sodium Chloride,両頭針付溶解剤,D05352,71167


In [5]:
#folder the raw rx label files are written to and read from, tagged with `date`
raw_subfolder = 'data/raw_v{}/'.format(date)
raw_file_folder = data_folder + raw_subfolder

In [7]:
def format_code(code):
    """Return `code` as a string, left-padded with zeros to at least 8 characters.

    Values that are already 8 characters or longer are returned unchanged
    (as strings); nothing is ever truncated.
    """
    return str(code).zfill(8)

def download_html(code, out_folder, timeout=None):
  """Download the kegg rx label page for `code` and save it as '<out_folder><code>.txt'.

  code       : product id placed in the `japic_code` query parameter; also the file name
  out_folder : string prefix of the output path (concatenated as-is, so it should end with '/')
  timeout    : optional timeout in seconds passed to requests.get; the default None
               waits indefinitely, which is what the function did before

  The response is saved whatever its status code, so error pages end up on disk too.
  """
  #grab html from kegg website
  r = requests.get('https://www.kegg.jp/medicus-bin/japic_med?japic_code={}'.format(code), timeout=timeout)
  #encode (so japanese characters don't break)
  r.encoding = r.apparent_encoding
  #save raw html; the with-block closes the file even if the write fails
  #NOTE(review): the file is written with the platform default text encoding - confirm it matches the readers
  with open(out_folder+'{}.txt'.format(code), 'w') as f:
    f.write(r.text)

In [None]:
#iterate through codes, download each drug html file
product_ids = kegg_df['kegg_product_id'].tolist()
for code in tqdm(product_ids):
  japic_code = format_code(code) #zero-padded id, used for the request and the file name
  download_html(japic_code, raw_file_folder)

 92%|█████████▏| 12309/13343 [2:20:02<10:17,  1.68it/s]

In [8]:
#check if all files have been downloaded properly (sometimes the internet connection fails)
#error_codes collects every id whose saved page is the '403 Forbidden' error page
error_codes = []
for code in tqdm(kegg_df['kegg_product_id'].tolist()):
  japic_code = format_code(code)
  file_path = raw_file_folder+'{}.txt'.format(japic_code)
  with open(file_path) as f:
    s = BeautifulSoup(f, 'html.parser')
  title = s.find('title')
  #a page without a <title> is not the 403 page; checked explicitly instead of relying on a swallowed error
  if title is None or title.text != '403 Forbidden':
    continue
  error_codes.append(japic_code)
  #redownload, try to replace file with correct one
  try:
    download_html(japic_code, out_folder = raw_file_folder)
  except Exception as err:
    #keep the check running (best effort) but report the failure; unlike a bare except,
    #this does not swallow KeyboardInterrupt
    tqdm.write('re-download failed for {}: {}'.format(japic_code, err))
len(error_codes)

100%|██████████| 13343/13343 [1:01:36<00:00,  3.61it/s]


0

In [9]:
#check number of total files - expected to equal the number of rows in kegg_df when the product ids are unique
#(the earlier note expected 14,196 as of May 28th; the figure changes with the date of the run)
downloaded_files = glob(raw_file_folder+'/*.txt')
len(downloaded_files)

13343

## Download all raw html files of drugs (OTC)
Takes about 5.5hrs for ~13,000 drug labels

Input : kegg_otc_drug_data.csv

Output : folder (/raw_otc/) filled with individual html drug label files

In [None]:
#load the otc drug table; note that this re-binds kegg_df, which held the rx table in the section above
otc_csv_path = data_folder+'data/kegg_otc_drug_data.csv'
kegg_df = pd.read_csv(otc_csv_path)
kegg_df.head(1)

Unnamed: 0,product,company,indication,risk_level,kegg_product_id
0,ベンザルコニウム塩化物液10％「東海」（AP）,（株）アラクス,殺菌消毒薬（特殊絆創膏を含む）,3,K2304000020


In [None]:
#folder the raw otc label files are written to and read from (re-binds raw_file_folder from the rx section)
raw_subfolder = 'data/raw_otc/'
raw_file_folder = data_folder + raw_subfolder

In [None]:
def format_code(code):
    """Convert `code` to a string and zero-pad it on the left to a minimum width of 8.

    Longer values (e.g. 11-character ids) pass through unchanged.
    """
    padded = str(code).zfill(8)
    return padded

def download_html(code, out_folder, timeout=None):
  """Download the kegg otc label page for `code` and save it as '<out_folder><code>.txt'.

  code       : product id placed in the `japic_code` query parameter; also the file name
  out_folder : string prefix of the output path (concatenated as-is, so it should end with '/')
  timeout    : optional timeout in seconds passed to requests.get; the default None
               waits indefinitely, which is what the function did before

  The response is saved whatever its status code, so error pages end up on disk too.
  """
  #grab html from kegg website
  r = requests.get('https://www.kegg.jp/medicus-bin/japic_otc?japic_code={}'.format(code), timeout=timeout)
  #encode (so japanese characters don't break)
  r.encoding = r.apparent_encoding
  #save raw html under the `code` argument (previously the notebook-level `japic_code` was used,
  #which only matched when the caller happened to set that global first);
  #the with-block closes the file even if the write fails
  with open(out_folder+'{}.txt'.format(code), 'w') as f:
    f.write(r.text)

In [None]:
#iterate through codes, download each drug html file
product_ids = kegg_df['kegg_product_id'].tolist()
for code in tqdm(product_ids):
  japic_code = format_code(code) #id used for the request and the file name
  download_html(japic_code, raw_file_folder)

In [None]:
#check if all files have been downloaded properly (sometimes the internet connection fails)
#error_codes collects every id whose saved page is the '403 Forbidden' error page
error_codes = []
for code in tqdm(kegg_df['kegg_product_id'].tolist()):
  japic_code = format_code(code)
  file_path = raw_file_folder+'{}.txt'.format(japic_code)
  with open(file_path) as f:
    s = BeautifulSoup(f, 'html.parser')
  title = s.find('title')
  #a page without a <title> is not the 403 page; checked explicitly instead of relying on a swallowed error
  if title is None or title.text != '403 Forbidden':
    continue
  error_codes.append(japic_code)
  #redownload, try to replace file with correct one
  try:
    download_html(japic_code, out_folder = raw_file_folder)
  except Exception as err:
    #keep the check running (best effort) but report the failure; unlike a bare except,
    #this does not swallow KeyboardInterrupt
    tqdm.write('re-download failed for {}: {}'.format(japic_code, err))
len(error_codes)

100%|██████████| 2591/2591 [1:05:04<00:00,  1.51s/it]


0

In [None]:
#check number of total files - expected to equal the number of rows in kegg_df when the product ids are unique
#(the earlier note expected 14,196 as of May 28th; the figure changes with the date of the run)
downloaded_files = glob(raw_file_folder+'/*.txt')
len(downloaded_files)

10555