# Header

In [None]:
import os

## Notebook Configuration

In [None]:
# Detect the runtime once: on Google Colab the project lives on the mounted
# Drive, anywhere else every path is relative to the current directory.
# NOTE: Replace the paths below with your own project / data locations if needed
_in_colab = "google.colab" in str(get_ipython())

# path to the folder holding this notebook
PROJECT_PATH = "/content/drive/My Drive/W210" if _in_colab else "."

# path to the data folder: a "data" sub-folder on Colab, the project root otherwise
if _in_colab:
    DATA_PATH = f"{PROJECT_PATH}/data"
else:
    DATA_PATH = PROJECT_PATH

# path to the raw data: the "data" sub-folder of the project on every runtime
# NOTE(review): an earlier comment said Colab should use /content to stay off
# Google Drive storage, but the value has always pointed inside PROJECT_PATH - confirm
RAW_DATA_PATH = f"{PROJECT_PATH}/data"

## Colab Setup

In [None]:
# On Colab the project folder lives on Google Drive, so the Drive has to be
# mounted before PROJECT_PATH can be used.
if "google.colab" in str(get_ipython()):
    from google.colab import drive
    drive.mount('/content/drive')

    # setup libraries used by notebook
    # (disabled) install of the kaggle client:
    #os.system("pip install -q kaggle")

# make the project folder the working directory for the rest of the notebook
os.chdir(PROJECT_PATH)

Mounted at /content/drive


## Library Import

In [None]:
import itertools
import json
import requests
import shutil
import typing
import zipfile

from io import BytesIO
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tqdm import tqdm

## Functions

In [None]:
def download_from_web(
    save_path: typing.Union[str, Path, typing.IO[bytes], typing.BinaryIO],
    url: str,
    block_size: int = 300 * 1024,
) -> typing.Union[typing.IO[bytes], typing.BinaryIO]:
    """
    Download a file from a public web address, streaming it block by block.

    :param save_path: destination of the download; either a path (str / Path),
        which is opened for binary writing after creating missing parent
        folders, or an already open writable binary buffer
    :param url: address where data is stored
    :param block_size: size in bytes of each chunk read from the response
    :return: the file object that received the data. A file opened by this
        function is returned closed (the data is on disk); a buffer supplied by
        the caller is flushed and returned still open so it can be read back
    :raises requests.HTTPError: if the server answers with an error status
    """
    # a path means this function opens the target and is responsible for closing it
    owns_file = isinstance(save_path, (str, Path))

    # request first: a failed request must not leave an empty file behind that
    # a later "does the file exist?" check would mistake for a finished download
    response = requests.get(url, stream=True)
    try:
        response.raise_for_status()
        total_size_in_bytes = int(response.headers.get("content-length", 0))

        if owns_file:
            Path(save_path).parent.mkdir(parents=True, exist_ok=True)
            file_path = open(save_path, "wb")
        else:
            file_path = save_path

        # stream the body into the target while reporting progress
        progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
        try:
            for data in response.iter_content(block_size):
                progress_bar.update(len(data))
                file_path.write(data)
        except BaseException:
            # an interrupted download leaves a truncated file: remove it so it
            # is never taken for a complete one
            if owns_file:
                file_path.close()
                Path(save_path).unlink()
            raise
        finally:
            progress_bar.close()
    finally:
        response.close()

    if owns_file:
        file_path.close()
    else:
        # the caller owns the buffer: keep it open so its content stays readable
        file_path.flush()

    # returns the buffer object
    return file_path

# Convert packaging NDC into 11 digits
def convert_packaging_ndc(input_string: str) -> str:
    if len(input_string) == 13:
      result_string = input_string.replace('-', '')
    elif len(input_string) == 12:
      # Split the input string into three parts based on hyphens
      parts = input_string.split('-')
      if len(parts[0]) == 4:
        # Apply zero-padding to the first part
        parts[0] = parts[0].zfill(5)
        # Join the parts back together with hyphens
        result_string = ''.join(parts)
      elif len(parts[1]) == 3:
        # Apply zero-padding to the second part
        parts[1] = parts[1].zfill(4)
        # Join the parts back together with hyphens
        result_string = ''.join(parts)
      elif len(parts[2]) == 1:
        # Apply zero-padding to the third part
        parts[2] = parts[2].zfill(2)
        # Join the parts back together with hyphens
        result_string = ''.join(parts)
      else:
        result_string = 'Padding Error'
    else:
      result_string = 'Length Error'

    return result_string

# Convert product ndc into 9 digits
def convert_product_ndc(input_string: str) -> str:
    """
    Turn a hyphenated product NDC into a 9-character code without hyphens.

    A 10-character input only has its hyphens removed. A 9-character input is
    split on hyphens and the first segment that is one character short is
    left-padded with a zero: segment 1 (4 -> 5), else segment 2 (3 -> 4).

    :param input_string: product NDC with a hyphen
    :return: the code without hyphens, 'Length Error' when the input is neither
        9 nor 10 characters long, or 'Padding Error' when no segment of a
        9-character input has the expected short length (including inputs
        without a hyphen)
    """
    if len(input_string) == 10:
        return input_string.replace('-', '')
    if len(input_string) != 9:
        return 'Length Error'

    parts = input_string.split('-')
    # (segment index, short length, padded length), checked in this order
    padding_rules = ((0, 4, 5), (1, 3, 4))
    for index, short_len, full_len in padding_rules:
        # a missing segment is a malformed code, not a crash
        if index < len(parts) and len(parts[index]) == short_len:
            parts[index] = parts[index].zfill(full_len)
            # join the segments back together without hyphens
            return ''.join(parts)

    return 'Padding Error'

---

# Configurations

In [None]:
# folder, inside the data folder, that holds the downloaded NDC snapshot archives
ROOT_NDC = f"{DATA_PATH}/drug-ndc"
# openFDA bulk download of the drug NDC directory (a single zipped JSON file)
NDC_URL = "https://download.open.fda.gov/drug/ndc/drug-ndc-0001-of-0001.json.zip"
# current date formatted as YYYYMMDD
TODAY = pd.to_datetime("today").strftime("%Y%m%d")

# Data Pre-Processing

## FDA Drug Directory

The Drug Listing Act of 1972 requires registered drug establishments to provide the Food and Drug Administration (FDA) with a current list of all drugs manufactured, prepared, propagated, compounded, or processed by it for commercial distribution.

The openFDA drug NDC Directory endpoint returns data from the NDC Directory, a database that contains information on the National Drug Code (NDC). FDA publishes the listed NDC numbers and the information submitted as part of the listing information in the NDC Directory which is updated daily.

The information submitted as part of the listing process, the NDC number, and the NDC Directory are used in the implementation and enforcement of the Act.

In [None]:
# today's snapshot is stored in the snapshot folder, prefixed with its download date
ndc_path = f"{ROOT_NDC}/{TODAY} - drug-ndc-0001-of-0001.json.zip"

# one list of per-snapshot data frames for every output table
all_ndc = list()
all_generic_name = list()
all_route = list()
all_active_ingredients = list()
all_packaging = list()
all_manufacturer = list()
all_rxcui = list()
all_spl_set = list()
all_upc = list()
all_nui = list()
all_pharm_class_cs = list()
all_pharm_class_epc = list()
all_pharm_class_pe = list()
all_pharm_class_moa = list()
all_unii = list()

# openfda key -> list collecting the frames built from that key
openfda_targets = (
    ("manufacturer_name", all_manufacturer),
    ("rxcui", all_rxcui),
    ("spl_set_id", all_spl_set),
    ("upc", all_upc),
    ("nui", all_nui),
    ("pharm_class_cs", all_pharm_class_cs),
    ("pharm_class_epc", all_pharm_class_epc),
    ("pharm_class_pe", all_pharm_class_pe),
    ("pharm_class_moa", all_pharm_class_moa),
    ("unii", all_unii),
)

# the snapshot folder does not exist on a first run; both the download and the
# directory listing below need it
os.makedirs(ROOT_NDC, exist_ok=True)

if not os.path.exists(ndc_path):
    download_from_web(save_path=ndc_path, url=NDC_URL)

# sorted so that snapshots are always processed in the same order (file names
# start with the download date)
for file_name in sorted(os.listdir(ROOT_NDC)):
    snapshot_path = f"{ROOT_NDC}/{file_name}"
    # ignore anything in the folder that is not a zip archive
    if not zipfile.is_zipfile(snapshot_path):
        continue

    with zipfile.ZipFile(snapshot_path) as z:
        try:
            with z.open("drug-ndc-0001-of-0001.json") as member:
                raw_ndc = json.load(member)
        except KeyError:
            # archive without the expected JSON member: nothing to parse
            continue

    ndc_list = list()
    gen_list = list()
    route_list = list()
    ai_list = list()
    pack_list = list()
    open_fda = dict()
    for r in tqdm(raw_ndc["results"]):
        # keys repeated on every child row so it can be joined back to its product
        id_dict = {"product_ndc": r["product_ndc"], "product_id": r["product_id"]}

        # product table: scalar fields only (nested values get their own tables,
        # generic_name is split into one row per name below)
        ndc_list.append({
            k: v
            for k, v in r.items()
            if not isinstance(v, (list, dict)) and k != "generic_name"
        })

        for gn in r.get("generic_name", "").split(","):
            gen_list.append({**id_dict, "generic_name": gn})

        for route in r.get("route", list()):
            route_list.append({**id_dict, "route": route})

        for i in r.get("active_ingredients", list()):
            ai_list.append({**id_dict, **i})

        for i in r.get("packaging", list()):
            pack_list.append({**id_dict, **i})

        # every openfda key becomes its own table with one row per listed value
        for k, oa in r.get("openfda", dict()).items():
            rows = open_fda.setdefault(k, list())
            for v in oa:
                rows.append({**id_dict, k: v})

    # create the data frames and add them to the list
    date = file_name[:8]
    all_ndc.append(pd.DataFrame(ndc_list).assign(date=date))
    all_generic_name.append(pd.DataFrame(gen_list).assign(date=date))
    all_route.append(pd.DataFrame(route_list).assign(date=date))
    all_active_ingredients.append(pd.DataFrame(ai_list).assign(date=date))
    all_packaging.append(pd.DataFrame(pack_list).assign(date=date))
    for key, target in openfda_targets:
        # a key absent from the whole snapshot yields an empty frame, not a KeyError
        target.append(pd.DataFrame(open_fda.get(key, list())).assign(date=date))

# parse everything into a single data frame
ndc_df = pd.concat(all_ndc)
generic_name_df = pd.concat(all_generic_name)
route_df = pd.concat(all_route)
active_ingredients_df = pd.concat(all_active_ingredients)
packaging_df = pd.concat(all_packaging)
manufacturer_df = pd.concat(all_manufacturer)
rxcui_df = pd.concat(all_rxcui)
spl_set_df = pd.concat(all_spl_set)
upc_df = pd.concat(all_upc)
nui_df = pd.concat(all_nui)
pharm_class_cs_df = pd.concat(all_pharm_class_cs)
pharm_class_epc_df = pd.concat(all_pharm_class_epc)
pharm_class_pe_df = pd.concat(all_pharm_class_pe)
pharm_class_moa_df = pd.concat(all_pharm_class_moa)
unii_df = pd.concat(all_unii)

100%|██████████| 26.5M/26.5M [00:00<00:00, 33.6MiB/s]
100%|██████████| 127802/127802 [00:03<00:00, 42396.08it/s]
100%|██████████| 119870/119870 [00:03<00:00, 36801.44it/s]
100%|██████████| 135522/135522 [00:03<00:00, 36836.29it/s]
100%|██████████| 134544/134544 [00:03<00:00, 35185.93it/s]
100%|██████████| 127733/127733 [00:03<00:00, 34424.84it/s]
100%|██████████| 127733/127733 [00:03<00:00, 33641.02it/s]
100%|██████████| 128782/128782 [00:03<00:00, 34201.84it/s]
100%|██████████| 128782/128782 [00:03<00:00, 33558.19it/s]
100%|██████████| 128782/128782 [00:03<00:00, 33714.07it/s]
100%|██████████| 128809/128809 [00:03<00:00, 33566.25it/s]


## NDC List

!pip install -q google-colab-selenium

import google_colab_selenium as gs
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

options = Options()
options.add_argument("user-agent==Mozilla/5.0 (iPhone; CPU iPhone OS 15_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/101.0.4951.44 Mobile/15E148 Safari/604.1")

driver = gs.Chrome(options=options)

driver.get("https://ndclist.com/ndc/0006-3941")
soup = BeautifulSoup(driver.page_source)

for info in soup.find("section", {"id": "product-information"}).find_all("div", {"class": "row"}):
  print(info)
  print()

In [None]:
from pathlib import Path
import os
import json
import requests
from bs4 import BeautifulSoup


def parse_pharamceutical_class(product_ndc, pc_string):
    """
    Split a comma-separated pharmaceutical class string into one record per class type.

    The string is stripped, split on "," and every fragment is checked for the
    tags [CS], [PE], [MoA] and [EPC]. A fragment containing a tag becomes the
    value of that type's record, exactly as it appears after the split (it is
    not stripped again). When several fragments carry the same tag only the
    last one is kept.

    :param product_ndc: product NDC stored in every returned record
    :param pc_string: comma-separated classes, or None
    :return: tuple (cs_info, pe_info, moa_info, epc_info); each is a dict with
        "product_ndc" plus, when a matching fragment was found, the
        corresponding "pharm_class_*" key
    """
    # (tag searched in the fragment, key used in the record), in return order
    tagged_columns = (
        ('[CS]', "pharm_class_cs"),
        ('[PE]', "pharm_class_pe"),
        ('[MoA]', "pharm_class_moa"),
        ('[EPC]', "pharm_class_epc"),
    )

    # every record starts with the product key only
    records = [{"product_ndc": product_ndc} for _ in tagged_columns]
    if pc_string is None:
        return tuple(records)

    for fragment in pc_string.strip().split(','):
        for position, (tag, column) in enumerate(tagged_columns):
            if tag in fragment:
                # later fragments with the same tag replace earlier ones
                records[position] = {"product_ndc": product_ndc, column: fragment}

    return tuple(records)


def _load_ndc_json(ndc: str):
    """Return the HIPAASpace JSON of `ndc` from the disk cache, downloading it on a miss (None when unavailable)."""
    path = Path(f"{DATA_PATH}/_search")
    path.mkdir(exist_ok=True, parents=True)
    cache_file = path / f"{ndc}.json"

    try:
        with open(cache_file, "r") as f:
            cached = json.load(f)
    except (OSError, ValueError):
        # missing, unreadable or corrupt cache entry: download it again
        cached = None
    if cached is not None:
        return cached

    # only hit the network when the cache could not answer
    url = f"https://www.hipaaspace.com/medical_billing/coding/national.drug.codes/{ndc}.json"
    response = requests.get(url)
    soup = BeautifulSoup(response.text)
    json_content = soup.find('code')
    try:
        ndc_data = json.loads(json_content.contents[0])
    except (AttributeError, IndexError, TypeError, ValueError):
        # no <code> element, an empty one, or content that is not valid JSON
        return None

    with open(cache_file, "w") as f:
        json.dump(ndc_data, f)
    return ndc_data


def get_ndc_data(ndc: str):
    """
    Look up a package NDC on HIPAASpace and reshape the answer into table records.

    The raw JSON is cached as ``{DATA_PATH}/_search/{ndc}.json``; the web page
    is requested only when no usable cache entry exists.

    :param ndc: package NDC, used in the request URL and as the cache file name
    :return: None when no JSON payload could be obtained, otherwise the tuple
        (ndc_info, generic_info, route_info, active_ingredient_info,
        manufacturer_info, unii_info, packaging_info, pc_cs_info, pc_pe_info,
        pc_moa_info, pc_epc_info) of dicts
    """
    ndc_data = _load_ndc_json(ndc)
    if ndc_data is None:
        return

    record = ndc_data['NDC']
    product_ndc = record['ProductNDC']

    ndc_info = {
        "product_ndc": product_ndc,
        "labeler_name": record['LabelerName'],
        "brand_name": record['ProprietaryName'],
        "brand_name_suffix": record['ProprietaryNameSuffix'],
        # "finished": not filled from this source
        "listing_expiration_date": record['ListingRecordCertifiedThrough'],
        "marketing_category": record['MarketingCategoryName'],
        "dosage_form": record['DosageFormName'],
        # "spl_id": not filled from this source
        "product_type": record['ProductTypeName'],
        "marketing_start_date": record['StartMarketingDate'],
        "application_number": record['ApplicationNumber'],
        "brand_name_base": record['ProprietaryName'],
        "marketing_end_date": record['EndMarketingDatePackage'],
        "dea_schedule": record['DEASchedule'],
        "status": record['Status'],
    }

    generic_info = {
        "product_ndc": product_ndc,
        "generic_name": record['NonProprietaryName'],
    }

    route_info = {
        "product_ndc": product_ndc,
        "route": record['RouteName'],
    }

    active_ingredient_info = {
        "product_ndc": product_ndc,
        "name": record['NonProprietaryName'],
    }
    # the strength is only added when both of its parts are present
    if record['StrengthNumber'] is not None and record['StrengthUnit'] is not None:
        active_ingredient_info["strength"] = record['StrengthNumber'] + " " + record['StrengthUnit']

    manufacturer_info = {
        "product_ndc": product_ndc,
        "manufacturer_name": record['LabelerName'],
    }

    unii_info = {
        "product_ndc": product_ndc,
        "unii": None,
        "substance_name": record['SubstanceName'],
    }

    packaging_info = {
        "product_ndc": product_ndc,
        "package_ndc": ndc,
        "description": record['PackageDescription'],
        "sample": "",
        "marketing_start_date": record['StartMarketingDate'],
        "marketing_end_date": record['EndMarketingDatePackage'],
    }

    pc_cs_info, pc_pe_info, pc_moa_info, pc_epc_info = parse_pharamceutical_class(product_ndc, record['Pharm_Classes'])

    return ndc_info, generic_info, route_info, active_ingredient_info, manufacturer_info, unii_info, packaging_info, pc_cs_info, pc_pe_info, pc_moa_info, pc_epc_info

In [None]:
# package NDC -> result of get_ndc_data for codes missing from the FDA snapshots
outputs = dict()
fda_list = list(set(packaging_df["package_ndc"].to_list()))
# set used for membership tests: checking each scraped code against the list
# would scan the whole list every time
fda_lookup = set(fda_list)

ndcs_to_parse = list()
for i in tqdm(range(100)):
    # get a list of all labelers (one directory page per two-digit index, 00 to 99)
    r = requests.get(f"https://www.hipaaspace.com/medical.coding.library/national.drug.code.directory/{i:02d}")
    links = [a["href"] for a in BeautifulSoup(r.text).find_all("a", {"class": "lookup_item_title"})]

    for href in links:
        # each labeler page links to its codes; the code is the last URL segment
        r = requests.get(href)
        ndcs = [a["href"].split("/")[-1] for a in BeautifulSoup(r.text).find_all("a", {"class": "lookup_item_title"})]

        # keep only the codes the FDA packaging table does not already contain
        ndcs_to_parse.extend(ndc for ndc in ndcs if ndc not in fda_lookup)

for ndc in tqdm(ndcs_to_parse):
    outputs[ndc] = get_ndc_data(ndc)

100%|██████████| 100/100 [5:25:03<00:00, 195.03s/it]
 25%|██▍       | 67107/271552 [18:30:28<34:38:11,  1.64it/s]

# Export

In [None]:
# the output folder is not created anywhere else; to_parquet fails without it
os.makedirs(f"{DATA_PATH}/preprocessed", exist_ok=True)

# file name (without extension) -> table to export, in export order
preprocessed_tables = {
    "ndc": ndc_df,
    "generic_name": generic_name_df,
    "route": route_df,
    "active_ingredients": active_ingredients_df,
    # packaging additionally gets the hyphen-free package code in an "ndc" column
    "packaging": packaging_df.assign(
        ndc=lambda f: f["package_ndc"].apply(convert_packaging_ndc)
    ),
    "manufacturer": manufacturer_df,
    "rxcui": rxcui_df,
    "spl_set": spl_set_df,
    "upc": upc_df,
    "nui": nui_df,
    "pharm_class_cs": pharm_class_cs_df,
    "pharm_class_epc": pharm_class_epc_df,
    "pharm_class_pe": pharm_class_pe_df,
    "pharm_class_moa": pharm_class_moa_df,
    "unii": unii_df,
}

for table_name, table in preprocessed_tables.items():
    table.to_parquet(f"{DATA_PATH}/preprocessed/{table_name}.parquet", index=False)

---