# Header: Back-fill NDC Metadata Missing from the Packaging Table (HIPAASpace lookup)

In [1]:
import os
import subprocess
import sys

## Notebook Configuration

In [2]:
# Detect the runtime once instead of re-evaluating the same expression for every path.
# get_ipython() is provided by the IPython/Jupyter kernel.
_IN_COLAB = "google.colab" in str(get_ipython())

# path to this notebook
# NOTE: Replace this with your project path if needed
PROJECT_PATH = "/content/drive/My Drive/Colab Notebooks" if _IN_COLAB else "."

# path to the data folder: "<project>/data" on Colab, the project folder itself otherwise
# NOTE: Replace this with your data path if needed
DATA_PATH = f"{PROJECT_PATH}/data" if _IN_COLAB else PROJECT_PATH

# path to the raw data folder
# NOTE(review): the previous conditional produced this same value in both environments,
# although its comment said Colab should use local "/content" storage to stay off
# Google Drive. The value is kept unchanged here; confirm which location is intended.
RAW_DATA_PATH = f"{PROJECT_PATH}/data"

## Colab Setup

In [3]:
if "google.colab" in str(get_ipython()):
    from google.colab import drive
    drive.mount("/content/drive")

    # setup libraries used by notebook
    # Run pip through the kernel's own interpreter so the package lands in the
    # environment this notebook imports from, and fail loudly (check=True) if the
    # install does not succeed instead of discovering it later at import time.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "beautifulsoup4"],
        check=True,
    )

# All relative paths used by the notebook are resolved from the project folder.
os.chdir(PROJECT_PATH)

Mounted at /content/drive


## Library Import

In [4]:
import itertools
import json
import requests
import shutil
import time
import typing
import zipfile

from io import BytesIO
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tqdm import tqdm

## Functions

In [5]:
def parse_pharamceutical_class(product_ndc, pc_string):
  """Split a comma-separated pharmaceutical-class string into one record per class type.

  Args:
    product_ndc: Product NDC stored under ``"product_ndc"`` in every returned record.
    pc_string: Comma-separated class entries, each expected to carry a ``[CS]``,
      ``[PE]``, ``[MoA]`` or ``[EPC]`` tag, or ``None`` when no classes are known.

  Returns:
    A 4-tuple of dicts ``(cs, pe, moa, epc)``. Each dict always holds
    ``"product_ndc"``; it additionally holds ``"pharm_class_<type>"`` when an
    entry with the matching tag was found. If several entries share a tag, the
    last one wins. Entries are stored exactly as split (only the whole string
    is stripped, so whitespace following a comma is kept).
  """
  # (tag searched for in the entry, key under which the entry is stored)
  tag_fields = (
      ('[CS]', "pharm_class_cs"),
      ('[PE]', "pharm_class_pe"),
      ('[MoA]', "pharm_class_moa"),
      ('[EPC]', "pharm_class_epc"),
  )

  # Start every class type with a record that only identifies the product.
  records = {field: {"product_ndc": product_ndc} for _, field in tag_fields}

  entries = [] if pc_string is None else pc_string.strip().split(',')
  for entry in entries:
    for tag, field in tag_fields:
      if tag in entry:
        records[field] = {"product_ndc": product_ndc, field: entry}

  return (
      records["pharm_class_cs"],
      records["pharm_class_pe"],
      records["pharm_class_moa"],
      records["pharm_class_epc"],
  )


def _fetch_ndc_payload(ndc, timeout=30):
  """Download and decode the HIPAASpace JSON payload for one formatted NDC.

  Returns the decoded payload (a dict), or ``None`` when the request fails, the
  page has no non-empty ``<code>`` element, or its content is not valid JSON.
  """
  # Imported lazily so beautifulsoup4 is only required when a lookup is performed.
  from bs4 import BeautifulSoup

  url = f"https://www.hipaaspace.com/medical_billing/coding/national.drug.codes/{ndc}.json"
  print(url)
  try:
    # Without a timeout a stalled connection blocks the whole run indefinitely.
    response = requests.get(url, timeout=timeout)
  except requests.RequestException as error:
    print(f"  skipped {ndc}: request failed ({error})")
    return None

  # The JSON document is embedded in the page's <code> element. Naming the parser
  # keeps the result independent of which optional parsers are installed.
  code_tag = BeautifulSoup(response.text, "html.parser").find('code')
  if code_tag is None or not code_tag.contents:
    return None
  try:
    return json.loads(code_tag.contents[0])
  except json.JSONDecodeError as error:
    print(f"  skipped {ndc}: payload is not valid JSON ({error})")
    return None


def _records_to_frame(records, columns, key='product_ndc'):
  """Build a DataFrame from row dicts, keeping the first row for each ``key`` value."""
  # pd.DataFrame([]) has no columns, so drop_duplicates(subset=[key]) would raise
  # KeyError when nothing was fetched; fall back to an empty frame with the
  # expected columns in that case.
  frame = pd.DataFrame(records) if records else pd.DataFrame(columns=columns)
  return frame.drop_duplicates(subset=[key], keep='first')


def get_missing_data(missing_ndcs_list):
  """Look up each NDC on HIPAASpace and return the results as per-table DataFrames.

  One request is made per entry of ``missing_ndcs_list`` (formatted package NDCs),
  with a one-second pause after each request. Entries whose lookup fails are
  skipped.

  Args:
    missing_ndcs_list: Iterable of formatted NDC strings used to build the
      lookup URL; each is also stored as ``package_ndc`` of its packaging row.

  Returns:
    An 11-tuple of DataFrames, in this order: ndc, generic name, route, active
    ingredient, manufacturer, unii, pharm class CS, pharm class PE, pharm class
    MoA, pharm class EPC, packaging. The packaging frame is de-duplicated on
    ``package_ndc``; all others on ``product_ndc`` (first occurrence kept).
    When nothing could be fetched, every frame is empty but still carries its
    expected columns.
  """
  ndc_data_list = []
  generic_list = []
  route_list = []
  active_ingredient_list = []
  manufacturer_list = []
  unii_list = []
  packaging_list = []
  pc_cs_list = []
  pc_pe_list = []
  pc_moa_list = []
  pc_epc_list = []

  for ndc in missing_ndcs_list:
    ndc_data = _fetch_ndc_payload(ndc)
    # Throttle to one request per second, also after a failed lookup.
    time.sleep(1)
    if ndc_data is None:
      continue

    product = ndc_data['NDC']
    product_ndc = product['ProductNDC']

    ndc_data_list.append({
        "product_ndc": product_ndc,
        "labeler_name": product['LabelerName'],
        "brand_name": product['ProprietaryName'],
        "brand_name_suffix": product['ProprietaryNameSuffix'],
        "listing_expiration_date": product['ListingRecordCertifiedThrough'],
        "marketing_category": product['MarketingCategoryName'],
        "dosage_form": product['DosageFormName'],
        "product_type": product['ProductTypeName'],
        "marketing_start_date": product['StartMarketingDate'],
        "application_number": product['ApplicationNumber'],
        "brand_name_base": product['ProprietaryName'],
        "marketing_end_date": product['EndMarketingDatePackage'],
        "dea_schedule": product['DEASchedule'],
    })

    generic_list.append({
        "product_ndc": product_ndc,
        "generic_name": product['NonProprietaryName'],
    })

    route_list.append({
        "product_ndc": product_ndc,
        "route": product['RouteName'],
    })

    active_ingredient_info = {
        "product_ndc": product_ndc,
        "name": product['NonProprietaryName'],
    }
    # The strength is only recorded when both its number and its unit are present.
    if product['StrengthNumber'] is not None and product['StrengthUnit'] is not None:
      active_ingredient_info["strength"] = product['StrengthNumber'] + " " + product['StrengthUnit']
    active_ingredient_list.append(active_ingredient_info)

    manufacturer_list.append({
        "product_ndc": product_ndc,
        "manufacturer_name": product['LabelerName'],
    })

    unii_list.append({
        "product_ndc": product_ndc,
        "unii": None,
        "substance_name": product['SubstanceName'],
    })

    packaging_list.append({
        "product_ndc": product_ndc,
        "package_ndc": ndc,
        "description": product['PackageDescription'],
        "sample": "",
        "marketing_start_date": product['StartMarketingDate'],
        "marketing_end_date": product['EndMarketingDatePackage'],
    })

    pc_cs_info, pc_pe_info, pc_moa_info, pc_epc_info = parse_pharamceutical_class(
        product_ndc, product['Pharm_Classes']
    )
    pc_cs_list.append(pc_cs_info)
    pc_pe_list.append(pc_pe_info)
    pc_moa_list.append(pc_moa_info)
    pc_epc_list.append(pc_epc_info)

  missing_ndc_df = _records_to_frame(
      ndc_data_list,
      [
          "product_ndc", "labeler_name", "brand_name", "brand_name_suffix",
          "listing_expiration_date", "marketing_category", "dosage_form",
          "product_type", "marketing_start_date", "application_number",
          "brand_name_base", "marketing_end_date", "dea_schedule",
      ],
  )
  missing_generic_df = _records_to_frame(generic_list, ["product_ndc", "generic_name"])
  missing_route_df = _records_to_frame(route_list, ["product_ndc", "route"])
  missing_active_ingredient_df = _records_to_frame(
      active_ingredient_list, ["product_ndc", "name", "strength"]
  )
  missing_manufacturer_df = _records_to_frame(
      manufacturer_list, ["product_ndc", "manufacturer_name"]
  )
  missing_unii_df = _records_to_frame(unii_list, ["product_ndc", "unii", "substance_name"])

  missing_pc_cs_df = _records_to_frame(pc_cs_list, ["product_ndc", "pharm_class_cs"])
  missing_pc_pe_df = _records_to_frame(pc_pe_list, ["product_ndc", "pharm_class_pe"])
  missing_pc_moa_df = _records_to_frame(pc_moa_list, ["product_ndc", "pharm_class_moa"])
  missing_pc_epc_df = _records_to_frame(pc_epc_list, ["product_ndc", "pharm_class_epc"])

  # Packaging rows are unique per package, not per product.
  missing_packaging_df = _records_to_frame(
      packaging_list,
      [
          "product_ndc", "package_ndc", "description", "sample",
          "marketing_start_date", "marketing_end_date",
      ],
      key='package_ndc',
  )

  return (
    missing_ndc_df,
    missing_generic_df,
    missing_route_df,
    missing_active_ingredient_df,
    missing_manufacturer_df,
    missing_unii_df,
    missing_pc_cs_df,
    missing_pc_pe_df,
    missing_pc_moa_df,
    missing_pc_epc_df,
    missing_packaging_df
  )

---

# Data Load

In [6]:
# Raw FDA export; every column is read as text.
fda_csv_path = f"{DATA_PATH}/FDA_data.csv"
fda = pd.read_csv(fda_csv_path, dtype=str)

# Pre-processed packaging and shortage tables.
package_df, shortages = (
    pd.read_parquet(f"{DATA_PATH}/preprocessed/{table_name}.parquet")
    for table_name in ("packaging", "shortages")
)

In [7]:
def _load_preprocessed(table_name):
  """Read ``<DATA_PATH>/preprocessed/<table_name>.parquet`` into a DataFrame."""
  return pd.read_parquet(f"{DATA_PATH}/preprocessed/{table_name}.parquet")


# One DataFrame per pre-processed table.
ndc_df = _load_preprocessed("ndc")
generic_name_df = _load_preprocessed("generic_name")
route_df = _load_preprocessed("route")
active_ingredients_df = _load_preprocessed("active_ingredients")
packaging_df = _load_preprocessed("packaging")
manufacturer_df = _load_preprocessed("manufacturer")
rxcui_df = _load_preprocessed("rxcui")
spl_set_df = _load_preprocessed("spl_set")
upc_df = _load_preprocessed("upc")
nui_df = _load_preprocessed("nui")
pharm_class_cs_df = _load_preprocessed("pharm_class_cs")
pharm_class_epc_df = _load_preprocessed("pharm_class_epc")
pharm_class_pe_df = _load_preprocessed("pharm_class_pe")
pharm_class_moa_df = _load_preprocessed("pharm_class_moa")
unii_df = _load_preprocessed("unii")

# Data Pre-Processing

In [8]:
# Matches a hyphenated NDC in one of four digit groupings, one capture group each:
# 4-4-2, 5-3-2, 5-4-1 and 5-4-2.
NDC_PATTERN = r"\b(?:(\d{4}-\d{4}-\d{2})|(\d{5}-\d{3}-\d{2})|(\d{5}-\d{4}-\d{1})|(\d{5}-\d{4}-\d{2}))\b"
PARSED_NDC_COLUMNS = ["parsed_ndc1", "parsed_ndc2", "parsed_ndc3", "parsed_ndc4"]

# Manual corrections, keyed by the unformatted NDC from the shortage data.
# A listed NDC always gets the value given here, whatever the regex extracted.
PARSED_NDC_OVERRIDES = {
    # leftovers that didn't parse properly
    "25021820010": "25021-820-10",
    "17478007031": "17478-070-31",
    # these packagings are not present in HIPAASpace so we replace the packaging
    # with another from the same product
    "00264705505": "0264-7055-10",
    "68462042560": "68462-425-60",
    "68462042630": "68462-426-30",
    "00338954950": "0338-9549-24",
    "00009001103": "0009-0011-04",
    "70860030204": "70860-302-04",
    "70860030242": "70860-302-04",
    "70860030243": "70860-302-04",
    "23155023541": "23155-235-01",
    "69543038625": "69543-386-25",
    "14789010708": "14789-107-05",
    "00338955350": "0338-9553-24",
    "23155034513": "23155-345-41",
    "14789010916": "14789-109-10",
    "00338381450": "0338-3814-24",
    "14789010808": "14789-108-05",
    "14789010816": "14789-108-05",
    "70860078441": "70860-784-05",
    "70121138901": "70121-1389-7",
    "23155023544": "23155-235-01",
    "14789010716": "14789-107-10",
    "64980058805": "64980-588-51",
    "72611075701": "72611-757-10",
    "00338954550": "0338-9545-24",
    "00338361250": "0338-3612-24",
    "55150022210": "55150-222-20",
    "70860030210": "70860-302-10",
    "72611075601": "72611-756-10",
    "70860030241": "70860-302-02",
    "70860030202": "70860-302-02",
    "70121138801": "70121-1388-8",
    "00093747308": "0093-7473-06",
    # NOTE(review): the 0781-… values below are hyphenated as 4-2-4, unlike every
    # other entry and unlike the groupings NDC_PATTERN accepts. They are kept
    # exactly as they were; confirm this is the form HIPAASpace expects.
    "00781139101": "0781-13-9101",
    "00781139113": "0781-13-9113",
    "00781139201": "0781-13-9201",
    "00781139213": "0781-13-9213",
    "00781139601": "0781-13-9601",
    "00781139610": "0781-13-9610",
    "00781139613": "0781-13-9613",
    "00781139713": "0781-13-9713",
    "00781139701": "0781-13-9701",
    "00781139801": "0781-13-9801",
    "00781839101": "0781-83-9101",
    "00781839201": "0781-83-9201",
    "00781839301": "0781-83-9301",
    "00781839601": "0781-83-9601",
    "00781839610": "0781-83-9610",
    "00781839701": "0781-83-9701",
}

# these products are not present in HIPAASpace so we just remove them
UNAVAILABLE_PARSED_NDCS = [
    "42852-014-15",
    "04306-6007-10",
    "0338-0535-03",
    "59762-040-11",
    "60595-6277-0",
    "42852-017-12",
    "42852-020-15",
    "09907-922-02",
    "09907-922-61",
    "42852-017-15",
    "09907-923-20",
    "09907-923-23",
    "66318-223-58",
    "09907-923-37",
    "42852-014-12",
    "42852-017-10",
    "65219-333-81",
    "70436-204-80",
    "04306-6013-10",
    "59762-040-15",
    "42852-014-10",
    "42852-020-10",
    "09907-923-13",
    "09907-923-36",
    "42852-020-12",
    "66318-223-23",
    "0338-3410-50",
]

# get the missing ndcs: shortage NDCs that have no row in the packaging table
unique_ndcs = list(package_df["ndc"].unique())
missing_ndcs = shortages.loc[lambda f: ~f["ndc"].isin(unique_ndcs), ["ndc"]].drop_duplicates()

# get additional data for each missing NDC - specifically we need the Presentation
# to extract the formatted NDC
missing_ndcs_merged = missing_ndcs.merge(
    fda[["NDC", "Presentation"]].rename(columns={"NDC": "ndc"}).drop_duplicates(),
    on="ndc",
    how="left"
)

# parse the formatted NDC from the Presentation column, one column per grouping
missing_ndcs_merged[PARSED_NDC_COLUMNS] = (
    missing_ndcs_merged["Presentation"].str.extract(NDC_PATTERN)
)

# combine the parsed_ndc# columns into a single series: the first non-null value per
# row. Starting from the first capture column (rather than a float NaN column that
# is then filled with strings) keeps the values as strings throughout.
parsed_ndc = missing_ndcs_merged[PARSED_NDC_COLUMNS[0]]
for column in PARSED_NDC_COLUMNS[1:]:
    parsed_ndc = parsed_ndc.fillna(missing_ndcs_merged[column])

# apply the manual corrections; rows without a correction keep the parsed value
manual_ndc = missing_ndcs_merged["ndc"].map(PARSED_NDC_OVERRIDES)
missing_ndcs_merged["parsed_ndc"] = parsed_ndc.where(manual_ndc.isna(), manual_ndc)

# drop the products HIPAASpace does not know (rows without a parsed NDC are kept
# here so that the assertion below reports them)
missing_ndcs_merged = missing_ndcs_merged[
    ~missing_ndcs_merged["parsed_ndc"].isin(UNAVAILABLE_PARSED_NDCS)
]

missing_ndcs_clean = missing_ndcs_merged.drop_duplicates(subset=["parsed_ndc"], keep="first")

# every remaining NDC must have a formatted counterpart before we query HIPAASpace
unparsed_ndcs = missing_ndcs_clean.loc[missing_ndcs_clean["parsed_ndc"].isna(), "ndc"].tolist()
assert len(unparsed_ndcs) == 0, f"no formatted NDC could be derived for: {unparsed_ndcs}"

missing_ndcs_list = list(missing_ndcs_clean["parsed_ndc"].unique())

In [9]:
# find dups in the missing_ndcs_list: count the rows per parsed_ndc and show the
# values that occur more than once
missing_ndcs_merged = (
    missing_ndcs_clean
    .groupby("parsed_ndc")
    .size()
    .rename("count")
    .reset_index()
)
missing_ndcs_merged.loc[lambda counts: counts["count"] > 1]

# missing_ndcs = missing_ndcs.groupby(['ndc']).agg('size').rename('count').reset_index()
# missing_ndcs[missing_ndcs['count'] > 1]

Unnamed: 0,parsed_ndc,count


In [10]:
# Query HIPAASpace for every missing NDC; the result is a tuple of 11 frames.
missing_frames = get_missing_data(missing_ndcs_list)

# Unpack in the order returned by get_missing_data.
(
    missing_ndc_df,
    missing_generic_df,
    missing_route_df,
    missing_active_ingredient_df,
    missing_manufacturer_df,
    missing_unii_df,
    missing_pc_cs_df,
    missing_pc_pe_df,
    missing_pc_moa_df,
    missing_ps_epc_df,
    missing_packaging_df,
) = missing_frames

https://www.hipaaspace.com/medical_billing/coding/national.drug.codes/0009-0011-04.json
https://www.hipaaspace.com/medical_billing/coding/national.drug.codes/0054-0548-44.json
https://www.hipaaspace.com/medical_billing/coding/national.drug.codes/0065-9204-07.json
https://www.hipaaspace.com/medical_billing/coding/national.drug.codes/0071-0222-23.json
https://www.hipaaspace.com/medical_billing/coding/national.drug.codes/0074-4378-05.json
https://www.hipaaspace.com/medical_billing/coding/national.drug.codes/0074-4380-10.json
https://www.hipaaspace.com/medical_billing/coding/national.drug.codes/0074-4382-20.json
https://www.hipaaspace.com/medical_billing/coding/national.drug.codes/0093-3193-01.json
https://www.hipaaspace.com/medical_billing/coding/national.drug.codes/0093-3195-01.json
https://www.hipaaspace.com/medical_billing/coding/national.drug.codes/0093-3195-05.json
https://www.hipaaspace.com/medical_billing/coding/national.drug.codes/00264-3173-11.json
https://www.hipaaspace.com/medi

KeyboardInterrupt: 

In [None]:
# Compute the run date once so every back-filled row carries the same stamp, even
# if the cell happens to run across midnight.
fetch_date = pd.to_datetime("today").strftime("%Y%m%d")


def _with_missing(existing_df, missing_df):
  """Return ``existing_df`` followed by ``missing_df`` with a ``date`` column set to the run date."""
  return pd.concat([existing_df, missing_df.assign(date=fetch_date)])


comp_ndc_df = _with_missing(ndc_df, missing_ndc_df)
comp_generic_name_df = _with_missing(generic_name_df, missing_generic_df)
comp_route_df = _with_missing(route_df, missing_route_df)
comp_active_ingredients_df = _with_missing(active_ingredients_df, missing_active_ingredient_df)

# Attach the original unformatted "ndc" to each fetched packaging row by matching
# its package_ndc against the parsed NDC it was looked up with, and mark the rows
# as non-samples.
comp_packaging_df = _with_missing(
    packaging_df,
    missing_packaging_df
    .merge(
        missing_ndcs_clean[["ndc", "parsed_ndc"]].rename(columns={"parsed_ndc": "package_ndc"}),
        on="package_ndc",
    )
    .assign(sample=False),
)

comp_manufacturer_df = _with_missing(manufacturer_df, missing_manufacturer_df)

# get_missing_data returns no rxcui / spl_set / upc / nui frames, so those tables
# are not extended:
# comp_rxcui_df = pd.concat([rxcui_df, missing_rxcui_df.assign(date=pd.to_datetime("today").strftime("%Y%m%d"))])
# comp_spl_set_df = pd.concat([spl_set_df, missing_spl_set_df.assign(date=pd.to_datetime("today").strftime("%Y%m%d"))])
# comp_upc_df = pd.concat([upc_df, missing_upc_df.assign(date=pd.to_datetime("today").strftime("%Y%m%d"))])
# comp_nui_df = pd.concat([nui_df, missing_nui_df.assign(date=pd.to_datetime("today").strftime("%Y%m%d"))])

comp_pharm_class_cs_df = _with_missing(pharm_class_cs_df, missing_pc_cs_df)
comp_pharm_class_epc_df = _with_missing(pharm_class_epc_df, missing_ps_epc_df)
comp_pharm_class_pe_df = _with_missing(pharm_class_pe_df, missing_pc_pe_df)
comp_pharm_class_moa_df = _with_missing(pharm_class_moa_df, missing_pc_moa_df)
comp_unii_df = _with_missing(unii_df, missing_unii_df)

# Export

In [None]:
# Write every completed table to "<DATA_PATH>/preprocessed/<name>.parquet".
# comp_route previously exported route_df (the table without the back-filled rows);
# it now exports comp_route_df like every other entry.
comp_frames_by_name = {
    "comp_ndc": comp_ndc_df,
    "comp_generic_name": comp_generic_name_df,
    "comp_route": comp_route_df,
    "comp_active_ingredients": comp_active_ingredients_df,
    "comp_packaging": comp_packaging_df,
    "comp_manufacturer": comp_manufacturer_df,
    "comp_pharm_class_cs": comp_pharm_class_cs_df,
    "comp_pharm_class_epc": comp_pharm_class_epc_df,
    "comp_pharm_class_pe": comp_pharm_class_pe_df,
    "comp_pharm_class_moa": comp_pharm_class_moa_df,
    "comp_unii": comp_unii_df,
}

for comp_name, comp_frame in comp_frames_by_name.items():
    comp_frame.to_parquet(f"{DATA_PATH}/preprocessed/{comp_name}.parquet", index=False)

---