# Header

In [None]:
import os

## Notebook Configuration

In [None]:
# Detect Google Colab once instead of re-evaluating the check for every path.
# `get_ipython` is injected by IPython, so it does not exist when this code
# runs as a plain Python script (e.g. after exporting the notebook); in that
# case we are not on Colab and fall back to the local paths.
try:
    _IN_COLAB = "google.colab" in str(get_ipython())
except NameError:
    _IN_COLAB = False

# path to this notebook's project folder
# NOTE: Replace this with your project path if needed
PROJECT_PATH = "/content/drive/My Drive/Colab Notebooks" if _IN_COLAB else "."

# path to the data folder
# (on Colab: the "data" folder inside the project; locally: the project folder itself)
# NOTE: Replace this with your data path if needed
DATA_PATH = f"{PROJECT_PATH}/data" if _IN_COLAB else PROJECT_PATH

# path to the raw data folder: "<project>/data" both on Colab and locally
# NOTE(review): an earlier comment said Colab should keep raw data outside
# Google Drive storage, but the value has always been under PROJECT_PATH in
# both environments; confirm which one is intended.
RAW_DATA_PATH = f"{PROJECT_PATH}/data"

## Colab Setup

In [None]:
# On Google Colab, mount Google Drive at /content/drive first: the Colab
# PROJECT_PATH lives under that mount point, so the mount has to happen
# before the chdir below.
if "google.colab" in str(get_ipython()):
    from google.colab import drive
    drive.mount('/content/drive')

    # setup libraries used by notebook
    # (currently disabled: no extra packages are installed)
    #os.system("pip install -q kaggle")

# Switch the working directory to the project folder (Colab and local runs alike).
os.chdir(PROJECT_PATH)

Mounted at /content/drive


## Library Import

In [None]:
import itertools
import json
import requests
import shutil
import typing
import zipfile

from io import BytesIO
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tqdm import tqdm

---

# Data Load

In [None]:
# Load the four preprocessed component tables, in this order:
# product-level NDC data, packaging, generic names, active ingredients.
ndc_df, packaging_df, generic_name_df, active_ingredients_df = map(
    pd.read_parquet,
    (
        f"{DATA_PATH}/preprocessed/comp_ndc.parquet",
        f"{DATA_PATH}/preprocessed/comp_packaging.parquet",
        f"{DATA_PATH}/preprocessed/comp_generic_name.parquet",
        f"{DATA_PATH}/preprocessed/comp_active_ingredients.parquet",
    ),
)

# Data Pre-Processing

In [None]:
# Build the NDC directory, starting from the distinct packaging identifiers
# and attaching dosage form, generic name and strength per product.
# Every join is a left join, so packages without a match are kept and get
# NaN in the joined columns. A product with several generic names or
# strengths yields one row per combination.
ndc_directory = (
    packaging_df[["product_ndc", "package_ndc", "ndc"]].drop_duplicates()
    # dosage form of the product (trimmed, upper-cased)
    .merge(
        ndc_df[["product_ndc", "dosage_form"]]
        .assign(dosage_form=lambda f: f["dosage_form"].str.strip().str.upper())
        .drop_duplicates(),
        on=["product_ndc"],
        how='left'
    )
    # generic name(s) of the product (trimmed, upper-cased)
    .merge(
        generic_name_df[["product_ndc", "generic_name"]]
        .assign(generic_name=lambda f: f["generic_name"].str.strip().str.upper())
        .drop_duplicates(),
        on=["product_ndc"],
        how='left'
    )
    # strength of the active ingredient whose (normalised) name equals the
    # generic name; rows without such an ingredient keep a null strength
    .merge(
        active_ingredients_df[["product_ndc", "name", "strength"]]
        .rename(columns={"name": "generic_name"})
        .assign(generic_name=lambda f: f["generic_name"].str.strip().str.upper())
        .drop_duplicates(),
        on=["product_ndc", "generic_name"],
        how='left'
    )
)

# adjust strength
# Rows that found no strength through the name-based join fall back to a join
# on product_ndc alone, which attaches every distinct strength listed for the
# product (one output row per strength).
# NOTE(review): for products with several active ingredients this pairs one
# generic name with the strengths of all of them; confirm this is intended.
nulls_to_add = ndc_directory.loc[lambda f: f["strength"].isnull()].drop(columns=["strength"]).merge(
    active_ingredients_df[["product_ndc", "strength"]].drop_duplicates(),
    on=["product_ndc"],
    how='left'
)
# ignore_index=True: the filtered frame keeps its old labels while the merged
# frame starts again at 0, so a plain concat would leave duplicate index
# labels (ambiguous label lookups, and index-aligned assignments can fail).
ndc_directory = pd.concat(
    [ndc_directory.loc[lambda f: f["strength"].notnull()], nulls_to_add],
    ignore_index=True,
)

# Build the grouping key "GENERIC NAME | DOSAGE FORM | STRENGTH".
# Missing parts are written as "N/A". The key is NOT unique per row: all rows
# (packages) sharing the same combination share the same key.
ndc_directory["gpi14"] = (
    ndc_directory['generic_name'].fillna("N/A")
    + " | "
    + ndc_directory['dosage_form'].fillna("N/A")
    + " | "
    + ndc_directory['strength'].fillna("N/A")
)

# Use factorize to assign one integer ID per distinct key. IDs follow the
# order in which keys first appear, so they depend on the row order above.
ndc_directory['gpi14_id'] = pd.factorize(ndc_directory['gpi14'])[0]

# The component columns are currently kept in the output; the drop is disabled.
#ndc_directory = ndc_directory.drop(columns=["dosage_form", "generic_name", "strength"])

In [None]:
# Spot-check: all directory rows for a single product.
ndc_directory[ndc_directory["product_ndc"] == "41701-013"]

Unnamed: 0,product_ndc,package_ndc,ndc,dosage_form,generic_name,strength,gpi14,gpi14_id
2913,41701-013,41701-013-76,41701001376,POWDER,LEUPROLIDE ACETATE,80 g/80g,LEUPROLIDE ACETATE | POWDER | 80 g/80g,1046
2914,41701-013,41701-013-76,41701001376,POWDER,LEUPROLIDE ACETATE,217 g/217g,LEUPROLIDE ACETATE | POWDER | 217 g/217g,1047
2915,41701-013,41701-013-76,41701001376,POWDER,LEUPROLIDE ACETATE,87.5 g/87.5g,LEUPROLIDE ACETATE | POWDER | 87.5 g/87.5g,1048
2916,41701-013,41701-013-76,41701001376,POWDER,LEUPROLIDE ACETATE,96 g/96g,LEUPROLIDE ACETATE | POWDER | 96 g/96g,1049
2917,41701-013,41701-013-76,41701001376,POWDER,LEUPROLIDE ACETATE,215 g/215g,LEUPROLIDE ACETATE | POWDER | 215 g/215g,1050
...,...,...,...,...,...,...,...,...
289714,41701-013,41701-013-68,41701001368,POWDER,LEUPROLIDE ACETATE,240 g/240g,LEUPROLIDE ACETATE | POWDER | 240 g/240g,1077
289715,41701-013,41701-013-68,41701001368,POWDER,LEUPROLIDE ACETATE,64 g/64g,LEUPROLIDE ACETATE | POWDER | 64 g/64g,1078
289716,41701-013,41701-013-68,41701001368,POWDER,LEUPROLIDE ACETATE,295 g/295g,LEUPROLIDE ACETATE | POWDER | 295 g/295g,1079
289717,41701-013,41701-013-68,41701001368,POWDER,LEUPROLIDE ACETATE,157 g/157g,LEUPROLIDE ACETATE | POWDER | 157 g/157g,1080


In [None]:
# Peek at 10 random packaging rows as raw values. The fixed random_state keeps
# the displayed sample identical across "Restart & Run All".
packaging_df.sample(10, random_state=0).values

array([['68001-412', '68001-412_c065a8d4-58d2-a61d-e053-2995a90a496c',
        '68001-412-03', '500 TABLET in 1 BOTTLE (68001-412-03)',
        '20190930', False, None, '20240127', '68001041203'],
       ['63187-578', '63187-578_df562b71-4edb-4e13-bf1e-23031eaf0975',
        '63187-578-90', '90 CAPSULE in 1 BOTTLE (63187-578-90)',
        '20181201', False, None, '20240211', '63187057890'],
       ['24385-598', '24385-598_05c0ff6b-c219-4968-8b8f-8354fe9fcc57',
        '24385-598-71',
        '50 BLISTER PACK in 1 CARTON (24385-598-71)  / 1 GUM, CHEWING in 1 BLISTER PACK',
        '20050831', False, None, '20240127', '24385059871'],
       ['82260-102', '82260-102_278d0147-c5b0-4d85-80f5-500ec567feed',
        '82260-102-10',
        '1 BOTTLE, DROPPER in 1 CARTON (82260-102-10)  / 10 mL in 1 BOTTLE, DROPPER',
        '20230331', False, None, '20240127', '82260010210'],
       ['72647-372', '72647-372_7b37f3e4-e86d-4ae0-83b1-432e19b04153',
        '72647-372-30',
        '30 PACKET in 1

In [None]:
# Spot-check: raw values of the first packaging row for a single product.
packaging_df[packaging_df["product_ndc"] == "41701-013"].values[0]

array(['41701-013', '41701-013_438f5f55-e8fd-4ce3-be9b-3a13b25ab9d9',
       '41701-013-76',
       '1 BOTTLE in 1 BOTTLE (41701-013-76)  / 80 g in 1 BOTTLE (41701-013-75)',
       '18-MAR-11', None, None, '20240213', '41701001376'], dtype=object)

# Export

In [None]:
# Persist the directory next to the other preprocessed tables; the DataFrame
# index is not written to the file.
ndc_directory.to_parquet(
    f"{DATA_PATH}/preprocessed/gpi14.parquet",
    index=False,
)

---