In [1]:
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
from glob import glob
import ast, json
from bs4 import BeautifulSoup
# Notebook-wide configuration.
# data_folder: home folder holding all sub-folders, scripts and data. It is
#   concatenated directly with relative paths (e.g. data_folder+'data/raw/'),
#   so a non-empty value must end with a path separator.
data_folder = ''
# api: not referenced in any visible cell -- presumably an API key/endpoint; TODO confirm it is still needed.
api = ''
# date: version tag interpolated into the CSV file names written and read below ('..._v{}.csv'.format(date)).
date = ''

In [2]:
# Path of the 'data/raw/' folder under data_folder.
# NOTE(review): not referenced by any later visible cell (they glob data_folder+'data/raw*' instead) -- confirm it is still needed.
label_folder = data_folder+'data/raw/'

In [None]:
##read in table for all rx drugs
# Each export file is read once and the frames are concatenated a single time at the end;
# concatenating inside the loop re-copies every previously loaded row on each iteration.
kegg_frames = []
kegg_row_count = 0
# sorted(): glob returns paths in arbitrary order, which would make the row order
# (and therefore which duplicate drop_duplicates keeps) differ between runs.
for f in sorted(glob(data_folder+'data/kegg_rx_drug_data*')):
  kegg_frame = pd.read_csv(f)
  kegg_frames.append(kegg_frame)
  kegg_row_count += kegg_frame.shape[0]
  print(f, kegg_row_count)  # running total of rows loaded so far
# With no matching files, fall back to an empty frame instead of letting pd.concat raise.
kegg_df = pd.concat(kegg_frames, axis = 0) if kegg_frames else pd.DataFrame()
kegg_df = kegg_df.drop_duplicates()
print(kegg_df.shape[0])
kegg_df.head(1)

In [4]:
def format_code(code):
    """Return ``code`` as a string left-padded with zeros to a width of 8.

    The value is converted with ``str`` first, so integers and strings are
    both accepted. Values whose string form is already 8 characters or longer
    come back unchanged.
    """
    return str(code).zfill(8)

----
### generate csv files
Parsing is slightly more complicated because the label pages come in two HTML layouts (new and old) that use different class names for their sections.

In [None]:
# Parse every raw label file into two row lists:
#   drug_info_list    -> [file id, {header text: cell text}] from the "drug-info clearfix" block
#   drug_content_list -> [file id, section heading text, list of section tags]
def _append_sections(soup, heading_tags, item_classes, drug_id, out_rows):
  """Split a parsed label into per-heading sections and append them to ``out_rows``.

  soup         -- BeautifulSoup document of one label file
  heading_tags -- tag names whose text is treated as a section heading
  item_classes -- CSS classes of the elements that make up the label body
  drug_id      -- id stored in the first field of every appended row
  out_rows     -- list extended in place with [drug_id, heading, tags] rows

  One row is appended per distinct heading text. Its tags are the body items
  from the heading's own position up to the position of the heading that
  follows its first occurrence; the last heading takes everything to the end.
  """
  labels = [tag.text for tag in soup.find_all(heading_tags)]
  items = soup.find_all(class_=item_classes)
  item_text = [item.text for item in items]
  # Position of each heading inside the item list; None when the heading text is not itself an item.
  # NOTE(review): a None bound makes the slice below open-ended (items[None:x] starts at 0),
  # so an unmatched heading captures far more than its own section -- confirm this is intended.
  label_dict = {label: (item_text.index(label) if label in item_text else None) for label in labels}
  for label, start in label_dict.items():
    if label != labels[-1]:
      next_label = labels[labels.index(label)+1]
      content = items[start:label_dict[next_label]]
    else:
      content = items[start:]
    out_rows.append([drug_id, label, content])

drug_info_list = []
drug_content_list = []
folders = glob(data_folder+'data/raw*')
for folder in folders:
  print(folder)
  for file_path in tqdm(glob(folder+'/*')):
    # the file name without its '.txt' suffix is used as the drug id
    japic_code = file_path.split('/')[-1].replace('.txt','')

    with open(file_path) as f:
      s = BeautifulSoup(f, 'html.parser')

    # Header/value pairs of the drug-info block; an empty dict when the block is absent.
    drug_info = s.find(class_="drug-info clearfix")
    if drug_info is None:
      drug_info_dict = {}
    else:
      drug_info_dict = dict(zip([i.text for i in drug_info.find_all('th')],
                                [i.text for i in drug_info.find_all('td')]))
    drug_info_list.append([japic_code, drug_info_dict])

    # `except Exception` (not a bare except) so Ctrl-C can still interrupt this long loop.
    # NOTE(review): the new-layout pass does not raise when it simply finds no headings,
    # so the old-layout fallback below appears to run only on an unexpected error -- confirm.
    try:
      # new layout
      _append_sections(s, ['h4', 'h5'], ['contents-title', 'contents-block'],
                       japic_code, drug_content_list)
    except Exception:
      try:
        # old layout
        _append_sections(s, ['h4', 'h5', 'p', 'div'], ['subtitle', 'block1'],
                         japic_code, drug_content_list)
      except Exception:
        drug_content_list.append([japic_code, None, None])

drug_info_raw_df = pd.DataFrame(drug_info_list, columns = ['drug_id', 'drug_info_dict'])
drug_info_raw_df.to_csv(data_folder+'drug_info_raw_all_v{}.csv'.format(date), index = False)

In [76]:
# Reload the raw drug-info table; the dict column comes back from CSV as text,
# so it is turned back into real dicts with ast.literal_eval.
drug_info_raw_df = pd.read_csv(data_folder+'drug_info_raw_all_v{}.csv'.format(date))
drug_info_raw_df['drug_info_dict'] = drug_info_raw_df['drug_info_dict'].apply(ast.literal_eval)

# Every header seen in at least one record becomes its own column (None where absent).
keys = [header for record in drug_info_raw_df['drug_info_dict'].tolist() for header in record.keys()]
keys = list(set(keys))
drug_info_df = drug_info_raw_df.copy()
for key in keys:
  drug_info_df[key] = drug_info_df['drug_info_dict'].apply(lambda record: record.get(key))

In [86]:
# Split on whether the 'JAPIC' header was captured, then re-parse the drug-info
# block for the drugs where it was not.
drug_info_non_df = drug_info_df[drug_info_df.JAPIC.notna()]
drug_info_nan_df = drug_info_df[drug_info_df.JAPIC.isna()]

def _label_files_for(drug_id):
  """Return the raw label files under data/raw* whose name matches ``drug_id``.

  The id is tried as-is and zero-padded (it is an integer after the CSV round
  trip), each with and without the '.txt' suffix.
  """
  matches = []
  for name in dict.fromkeys([str(drug_id), format_code(drug_id)]):
    for suffix in ('', '.txt'):
      matches.extend(sorted(glob(data_folder+'data/raw*/'+name+suffix)))
  return matches

drug_info_nan_list = []
# The file is looked up from each row's own drug_id. Previously the loop reused the
# `file_path` / `japic_code` variables left over from the parsing cell, so every row
# re-read the same (last) file and was recorded under the same id.
for drug_id in tqdm(drug_info_nan_df['drug_id'].tolist()):
  drug_info_dict = {}
  # NOTE(review): the same id can exist in several raw* folders; the first file that
  # contains a drug-info block wins -- confirm which copy should take precedence.
  for file_path in _label_files_for(drug_id):
    with open(file_path) as f:
      s = BeautifulSoup(f, 'html.parser')
    drug_info = s.find(class_="drug-info clearfix")
    if drug_info is not None:
      drug_info_dict = dict(zip([i.text for i in drug_info.find_all('th')],
                                [i.text for i in drug_info.find_all('td')]))
      break
  drug_info_nan_list.append([drug_id, drug_info_dict])
drug_info_nan_df = pd.DataFrame(drug_info_nan_list, columns = ['drug_id', 'drug_info_dict'])

# Expand the re-parsed dicts into one column per header, as was done for the full table.
keys = []
for i in drug_info_nan_df.drug_info_dict.tolist():
  keys.extend(list(i.keys()))
keys = list(set(keys))
drug_info_df = drug_info_nan_df.copy()
for key in keys:
  drug_info_df[key] = drug_info_df['drug_info_dict'].apply(lambda x: x[key] if key in x.keys() else None)
drug_info_df_all = pd.concat([drug_info_non_df, drug_info_df])
drug_info_df_all.to_csv(data_folder+'drug_info_processed_all_v{}.csv'.format(date), index=False)

25858it [16:22, 26.33it/s]


In [None]:
# Persist the per-section rows collected while parsing the raw label files.
drug_content_raw_df = pd.DataFrame(
    data=drug_content_list,
    columns=['drug_id', 'section_id', 'content'],
)
drug_content_raw_df.to_csv(
    data_folder + 'drug_content_raw_all_v{}.csv'.format(date),
    index=False,
)

---
### format csv files

| Column | Description | Status |
| --- | --- | --- |
| product_id | ID of the drug label. | done |
| drug_name | Name of the drug product. | done |
| ingredients | Comma-separated list of ingredients in the drug. | |
| about | Regulatory classification of the drug. | |
| last_updated | Date of the last update to the drug label. | |
| company_title | Marketing authorization holder / manufacturer of the drug. | |
| contact_items | A dictionary of contact information for the marketing authorization holder / manufacturer of the drug. | |

In [66]:
# Reload the raw section table from disk and show its size plus the first row.
drug_content_raw_df = pd.read_csv(
    data_folder + 'drug_content_raw_all_v{}.csv'.format(date)
)
print(drug_content_raw_df.shape)
drug_content_raw_df.head(1)

(612837, 3)


  drug_content_raw_df = pd.read_csv(data_folder + 'drug_content_raw_all_v0120.csv')


Unnamed: 0,drug_id,section_id,content
0,55998,\n商品情報\n組成・性状\n,[]


In [68]:
#drug info
# Pull the product table out of every section whose heading mentions '商品情報'.
# Frames are collected and concatenated once; concatenating inside the loop copies
# all previously collected rows on every iteration.
parsed_frames = []
for i, row in tqdm(drug_content_raw_df.iterrows()):
  if '商品情報' not in str(row['section_id']):
    continue
  try:
    # [1:-1] drops the first and last character of the stored content
    # (presumably the brackets of a list repr, e.g. "[<div ...>, ...]") before parsing.
    t = BeautifulSoup(row['content'][1:-1], 'html.parser').find('div', class_='contents-block').find('table')
    df = pd.read_html(str(t))[0]
  except Exception:
    # Best effort: sections without a parsable product table are skipped.
    # `Exception` (not a bare except) keeps Ctrl-C working during the long loop.
    continue
  df['drug_id'] = row['drug_id']
  parsed_frames.append(df)
drug_info_df_parsed = pd.concat(parsed_frames) if parsed_frames else pd.DataFrame()
drug_info_df_parsed = drug_info_df_parsed[['drug_id', '販売名', '規制区分', '製造会社']].drop_duplicates() #ingredients, last_updated, contact_items
drug_info_df_parsed.head()

Unnamed: 0,drug_id,販売名,規制区分,製造会社
0,55997,サイモグロブリン点滴静注用25mg,"生物由来製品, 劇薬, 処方箋医薬品",サノフィ
0,54356,アフタゾロン口腔用軟膏0.1％,,あゆみ製薬
0,54345,ソルアセトF輸液,処方箋医薬品注）,テルモ
0,54344,ソルアセトD輸液,処方箋医薬品注）,テルモ
0,54341,ソルラクトD輸液,処方箋医薬品注）,テルモ


In [None]:
# Collect the text of the 'revision' element from every raw label file.
updates = []
folders = glob(data_folder+'data/raw*')
for folder in folders:
  print(folder)
  for file_path in tqdm(glob(folder+'/*')):
    # the file name without its '.txt' suffix is used as the drug id
    japic_code = file_path.split('/')[-1].replace('.txt','')

    with open(file_path) as f:
      s = BeautifulSoup(f, 'html.parser')

    # Files without a revision element are skipped. An explicit check replaces the
    # bare `except:`, which also swallowed KeyboardInterrupt and any unrelated error.
    revision = s.find(class_='revision')
    if revision is None:
      continue
    updates.append([japic_code, revision.text])
updates_df = pd.DataFrame(updates, columns = ['drug_id', 'update'])
updates_df.to_csv(data_folder+'drug_info_update_raw_all_v{}.csv'.format(date), index=False)
updates_df.head(1)

/content/drive/MyDrive/jp_drug_label/data/raw


100%|██████████| 13382/13382 [22:40<00:00,  9.83it/s]


/content/drive/MyDrive/jp_drug_label/data/raw_otc


100%|██████████| 10555/10555 [12:57<00:00, 13.58it/s]


/content/drive/MyDrive/jp_drug_label/data/raw_v0120


100%|██████████| 13343/13343 [22:56<00:00,  9.69it/s]


Unnamed: 0,drug_id,update
0,55998,2021年8月 改訂 (第4版)


In [73]:
# Split the revision text into version / year / month and attach it to the product table.
updates_df = pd.read_csv(data_folder+'drug_info_update_raw_all_v{}.csv'.format(date))
# version: whatever follows '改訂' / '作成', with full-width parentheses normalised to ASCII
updates_df['version'] = updates_df['update'].apply(lambda x: x.split('改訂')[-1].split('作成')[-1].replace('（','(').replace('）',')').strip())
# year: text before '年'; month: text between '年' and '月'
updates_df['update_year'] = updates_df['update'].apply(lambda x: x.split('年')[0])
updates_df['update_month'] = updates_df['update'].apply(lambda x: x.split('月')[0].split('年')[-1])
# Zero-pad the ids on both sides so the merge keys compare equal as strings.
updates_df['drug_id'] = updates_df['drug_id'].apply(lambda x: format_code(x))
drug_info_df_parsed['drug_id'] = drug_info_df_parsed['drug_id'].apply(lambda x: format_code(x))
# Drop revision columns left by an earlier run of this cell so that re-running it
# does not produce suffixed duplicates (update_x / update_y, ...).
drug_info_df_parsed = drug_info_df_parsed.drop(columns=['update', 'version', 'update_year', 'update_month'], errors='ignore')
drug_info_df_parsed = drug_info_df_parsed.merge(updates_df, on = 'drug_id', how = 'left').drop_duplicates()
drug_info_df_parsed.head(1)

Unnamed: 0,drug_id,販売名,規制区分,製造会社,update,version,update_year,update_month
0,55997,サイモグロブリン点滴静注用25mg,"生物由来製品, 劇薬, 処方箋医薬品",サノフィ,2023年4月 改訂（第2版）,(第2版),2023,4


In [93]:
# Load the processed drug-info table, discard the columns that are not exported,
# and zero-pad the ids.
drug_info_proc_df = (
    pd.read_csv(data_folder+'drug_info_processed_all_v{}.csv'.format(date))
    .drop(columns=['drug_info_dict', 'KEGG DGROUP', 'JAPIC'])
)
drug_info_proc_df['drug_id'] = drug_info_proc_df['drug_id'].apply(format_code)
drug_info_proc_df.head()

Unnamed: 0,drug_id,総称名,一般名,欧文一般名,薬効分類名,ATCコード,製剤名,薬効分類番号,KEGG DRUG
0,55997,サイモグロブリン,抗ヒト胸腺細胞ウサギ免疫グロブリン,"Anti-human Thymocyte Immunoglobulin,Rabbit",免疫抑制剤,L04AA04,抗ヒト胸腺細胞ウサギ免疫グロブリン製剤,6399,\n\nD09190\n抗ヒト胸腺細胞ウサギ免疫グロブリン \n\n\n商品一覧\n米国の商...
1,54356,アフタゾロン,デキサメタゾン,Dexamethasone,口腔粘膜用剤,A01AC02,デキサメタゾン軟膏,2399,\n\nD00292\nデキサメタゾン \n\n\n商品一覧\n米国の商品\n\n
2,54345,ソルアセト,,,酢酸リンゲル液,,,3319,
3,54344,ソルアセト,,,ブドウ糖加酢酸リンゲル液,,,3319,
4,54341,ソルラクト,,,ブドウ糖加乳酸リンゲル液,,,3319,


In [97]:
# Join the header-derived fields with the product-table fields and give the
# columns English names.
drug_info_df_all = drug_info_proc_df.merge(drug_info_df_parsed, on = 'drug_id', how = 'left')
# Rename by source column name rather than by position: the header columns were
# created from list(set(keys)), whose order is not stable, so a positional rename
# can silently attach the wrong English name to a column.
drug_info_df_all = drug_info_df_all.rename(columns={
    '総称名': 'generic_name',
    '一般名': 'common_name',
    '欧文一般名': 'common_name_en',
    '薬効分類名': 'therapeutic_category',
    'ATCコード': 'atc_code',
    '製剤名': 'product_description',
    '薬効分類番号': 'therapeutic_category_code',
    'KEGG DRUG': 'kegg_drug_code',
    '販売名': 'product_name',
    '規制区分': 'regulatory_class',
    '製造会社': 'marketing_authorization_holder',
})
# Fix the output column order; a missing expected column raises KeyError here.
drug_info_df_all = drug_info_df_all[['drug_id', 'generic_name', 'common_name', 'common_name_en', 'therapeutic_category', 'atc_code', 'product_description', 'therapeutic_category_code',
                                     'kegg_drug_code', 'product_name', 'regulatory_class', 'marketing_authorization_holder', 'update', 'version', 'update_year', 'update_month']]
drug_info_df_all.head()

Unnamed: 0,drug_id,generic_name,common_name,common_name_en,therapeutic_category,atc_code,product_description,therapeutic_category_code,kegg_drug_code,product_name,regulatory_class,marketing_authorization_holder,update,version,update_year,update_month
0,55997,サイモグロブリン,抗ヒト胸腺細胞ウサギ免疫グロブリン,"Anti-human Thymocyte Immunoglobulin,Rabbit",免疫抑制剤,L04AA04,抗ヒト胸腺細胞ウサギ免疫グロブリン製剤,6399,\n\nD09190\n抗ヒト胸腺細胞ウサギ免疫グロブリン \n\n\n商品一覧\n米国の商...,サイモグロブリン点滴静注用25mg,"生物由来製品, 劇薬, 処方箋医薬品",サノフィ,2023年4月 改訂（第2版）,(第2版),2023,4
1,54356,アフタゾロン,デキサメタゾン,Dexamethasone,口腔粘膜用剤,A01AC02,デキサメタゾン軟膏,2399,\n\nD00292\nデキサメタゾン \n\n\n商品一覧\n米国の商品\n\n,アフタゾロン口腔用軟膏0.1％,,あゆみ製薬,2020年4月 改訂（第1版）,(第1版),2020,4
2,54345,ソルアセト,,,酢酸リンゲル液,,,3319,,ソルアセトF輸液,処方箋医薬品注）,テルモ,2023年4月 改訂（第1版）,(第1版),2023,4
3,54344,ソルアセト,,,ブドウ糖加酢酸リンゲル液,,,3319,,ソルアセトD輸液,処方箋医薬品注）,テルモ,2023年4月 改訂（第1版）,(第1版),2023,4
4,54341,ソルラクト,,,ブドウ糖加乳酸リンゲル液,,,3319,,ソルラクトD輸液,処方箋医薬品注）,テルモ,2023年4月 改訂（第1版）,(第1版),2023,4


In [98]:
# Save the merged drug-info table without the index column.
drug_info_df_all.to_csv(
    data_folder+'drug_info_all_v{}.csv'.format(date),
    index=False,
)

### make into XML files

In [6]:
# Inputs for the XML export: the per-section content table and the per-drug info table.
drug_content_df = pd.read_csv(
    data_folder + 'drug_content_raw_all_v{}.csv'.format(date)
)
drug_info_df = pd.read_csv(
    data_folder+'drug_info_all_v{}.csv'.format(date)
)

  drug_content_df = pd.read_csv(data_folder + 'drug_content_raw_all_v0120.csv')
  drug_info_df = pd.read_csv(data_folder+'drug_info_all_v0120.csv')


In [29]:
# Replace each section's stored HTML with its plain text.
content_list = [
    BeautifulSoup(raw_html, 'html.parser').text
    for raw_html in tqdm(drug_content_df['content'].tolist())
]
drug_content_df['content'] = content_list

100%|██████████| 612837/612837 [25:16<00:00, 404.20it/s]


In [None]:
# Zero-pad the ids in both tables so they compare equal as 8-character strings.
drug_info_df['drug_id'] = drug_info_df['drug_id'].apply(format_code)
drug_content_df['drug_id'] = drug_content_df['drug_id'].apply(format_code)
drug_content_df.head(1)

In [30]:
import xml.etree.ElementTree as ET
#format into individual drug label XML files
def drug_label_xml(drug_id, out_folder):
    """Write one drug label to ``out_folder + '<drug_id>.xml'``.

    drug_id    -- drug identifier; zero-padded with ``format_code`` before it is
                  matched against the ``drug_id`` columns of the module-level
                  ``drug_info_df`` and ``drug_content_df`` tables
    out_folder -- output folder path; it is concatenated directly with the file
                  name, so it must end with a path separator

    Layout of the file: a ``drug_label`` root holding ``drug_info`` (one child
    per ``drug_info_df`` column, taken from the first matching row) and
    ``drug_label_info`` (one child per matching content row, each with a
    ``title`` and a ``content`` child). Missing values are written as empty text.

    Raises IndexError when no ``drug_info_df`` row matches ``drug_id``.
    """
    def _xml_text(value):
        # None / NaN / '' all become empty element text; anything else is stringified.
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ''
        return str(value)

    drug_id = format_code(drug_id)
    drug_info = drug_info_df[drug_info_df.drug_id == drug_id]
    drug_content = drug_content_df[drug_content_df.drug_id == drug_id]

    root = ET.Element("drug_label")

    drug_i = ET.SubElement(root, "drug_info")
    for col in drug_info.columns:
        ET.SubElement(drug_i, col).text = _xml_text(drug_info[col].iloc[0])

    drug_label = ET.SubElement(root, "drug_label_info")
    for i, row in drug_content.iterrows():
        title = _xml_text(row['section_id'])
        # A missing section_id (NaN) cannot be used as a tag: ElementTree refuses to
        # serialize a non-string tag and the whole file write fails. Such rows get a
        # fixed 'section' tag instead.
        # NOTE(review): titles are still used verbatim as tag names; titles containing
        # whitespace or newlines produce output that is not well-formed XML -- consider
        # a fixed tag for every section if downstream readers allow it.
        subelement = ET.SubElement(drug_label, title if title != '' else 'section')
        ET.SubElement(subelement, 'title').text = title
        ET.SubElement(subelement, 'content').text = _xml_text(row['content'])

    tree = ET.ElementTree(root)

    with open(out_folder+'{}.xml'.format(str(drug_id)), "wb") as f:
        tree.write(f, encoding = 'utf-8')

In [26]:
!mkdir $data_folder'data/xml_data'

In [31]:
# Write one XML file per distinct drug id found in the drug-info table.
for drug_id in tqdm(drug_info_df['drug_id'].unique()):
  drug_label_xml(drug_id=drug_id, out_folder=data_folder+'data/xml_data/')

100%|██████████| 7899/7899 [07:14<00:00, 18.19it/s]
