# Header

In [None]:
import os

## Notebook Configuration

In [None]:
# Detect the runtime once; every path below depends on the answer.
_IN_COLAB = "google.colab" in str(get_ipython())

# Root folder of the project and the folder holding the data.
# NOTE: Replace these with your own project / data paths if needed
if _IN_COLAB:
    PROJECT_PATH = "/content/drive/My Drive/W210"
    DATA_PATH = f"{PROJECT_PATH}/data"
else:
    # outside Colab the data folder is the project folder itself
    PROJECT_PATH = "."
    DATA_PATH = PROJECT_PATH

# Folder for raw data; it resolves to the same location on every runtime.
# NOTE(review): an earlier comment said Colab should use local "content" storage
# instead of Google Drive here, but the value never differed - confirm the intent.
RAW_DATA_PATH = f"{PROJECT_PATH}/data"

## Colab Setup

In [None]:
# On Colab, PROJECT_PATH points inside Google Drive, so the drive has to be
# mounted before the folder can be reached.
if "google.colab" in str(get_ipython()):
    from google.colab import drive

    drive.mount('/content/drive')

# Work from the project folder from here on.
os.chdir(PROJECT_PATH)

Mounted at /content/drive


## Library Import

In [None]:
import itertools
import json
import requests
import shutil
import typing
import zipfile

from io import BytesIO
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tqdm import tqdm

## Functions

In [None]:
def download_from_web(
    save_path: typing.Union[str, Path, typing.IO[bytes], typing.BinaryIO],
    url: str,
    block_size: int = 300 * 1024,
    timeout: typing.Optional[float] = None,
) -> typing.Union[typing.IO[bytes], typing.BinaryIO]:
    """
    Download a file from a public web address, streaming it in chunks.

    The request is issued and its status checked before the destination is
    opened, so a failed request never creates or truncates the target file.

    :param save_path: destination of the download; either a filesystem path
        (opened in binary write mode) or an already open binary buffer
    :param url: address where the data is stored
    :param block_size: size in bytes of each chunk of the incremental download
    :param timeout: seconds to wait for the server, forwarded to
        ``requests.get``; ``None`` (the default) waits indefinitely
    :return: the buffer the data was written to; it is closed when this
        function opened it from a path and left open (so its content stays
        readable) when the caller supplied the buffer
    :raises requests.HTTPError: if the server answers with an error status
    """
    # only a handle opened here is ours to close
    opened_here = isinstance(save_path, (str, Path))

    # generate a request to get the content; the context manager releases the
    # connection even if the download is interrupted
    with requests.get(url, stream=True, timeout=timeout) as response:
        # fail loudly instead of saving an error page as if it were the file
        response.raise_for_status()
        total_size_in_bytes = int(response.headers.get("content-length", 0))

        # make sure the destination is a buffer object
        file_path: typing.Union[typing.IO[bytes], typing.BinaryIO] = (
            open(save_path, "wb") if opened_here else save_path
        )
        try:
            # an unknown size (missing header) is passed as None so the bar
            # does not pretend the total is zero
            with tqdm(
                total=total_size_in_bytes or None, unit="iB", unit_scale=True
            ) as progress_bar:
                for data in response.iter_content(block_size):
                    if data:  # skip empty chunks
                        file_path.write(data)
                        progress_bar.update(len(data))
        finally:
            if opened_here:
                file_path.close()

    # returns the buffer object
    return file_path

---

# Data Load

In [None]:
# Read the three source workbooks, in this order: recalls, inspections,
# compliance actions.
recalls, inspections, compliance_actions = (
    pd.read_excel(workbook)
    for workbook in (
        f"{DATA_PATH}/other/recalls.xlsx",
        f"{DATA_PATH}/other/inspections.xlsx",
        f"{DATA_PATH}/other/compliance_actions.xlsx",
    )
)

# Data Pre-Processing

## Recalls

In [None]:
# Package NDC layouts searched for in the product description:
# 4-4-2, 5-3-2, 5-4-1 and 5-4-2.
# The digit lookarounds stop a match from starting or ending in the middle of a
# longer run of digits, so a 5-4-2 code is captured whole instead of being cut
# down to its 5-4-1 prefix.
NDC_PATTERN = r"(?<!\d)(\d{4}-\d{4}-\d{2}|\d{5}-\d{3}-\d{2}|\d{5}-\d{4}-\d{1,2})(?!\d)"

# Drug recalls classified from 2020 onwards whose description carries a
# package NDC, reduced to the columns that are exported.
recall_df = (
    recalls.loc[lambda f: f["Product Type"] == "Drugs"]
    # literal search; a missing description counts as "no NDC" instead of
    # breaking the boolean mask
    .loc[lambda f: f["Product Description"].str.contains("NDC ", regex=False, na=False)]
    # assumes the classification date was parsed as a datetime column - TODO confirm
    .loc[lambda f: f["Center Classification Date"].dt.year >= 2020]
    .reindex(
        columns=[
            "FEI Number",
            "Event ID",
            "Product ID",
            "Center Classification Date",
            "Reason for Recall",
            "Product Description",
        ]
    )
    .rename(
        columns={
            "FEI Number": "fei_number",
            "Event ID": "event_id",
            "Product ID": "product_id",
            "Center Classification Date": "recall_date",
            "Reason for Recall": "reason_for_recall",
            "Product Description": "product_description",
        }
    )
    # first NDC found in each description (NaN when none is present)
    .assign(
        package_ndc=lambda f: f["product_description"].str.extract(NDC_PATTERN, expand=False)
    )
    # keep only the recalls for which a code could be extracted
    .loc[lambda f: f["package_ndc"].notnull()]
)

In [None]:
# preview the first rows of the recalls table (head() shows 5 by default)
recall_df.head()

Unnamed: 0,fei_number,event_id,product_id,recall_date,reason_for_recall,product_description,package_ndc
40,3001581899,94066,206238,2024-03-01,Defective Container: Out of specification for ...,"Clindamycin Phosphate, Topical Solution USP, 1...",10135-0691-6
41,3001581899,94066,206239,2024-03-01,Defective Container: Out of specification for ...,"Clindamycin Phosphate Topical Solution USP, 1%...",71428-0003-6
93,3013438582,94071,206249,2024-02-29,Presence of Particulate Matter: glass vials fr...,"Moxifloxacin PF, 1mg/ml, in Sterile Balanced S...",71449-096-42
94,3013438582,94071,206250,2024-02-29,Presence of Particulate Matter: glass vials fr...,"Moxifloxacin 5mg/ml, 1 ml in a Single- Dose Vi...",71449-097-42
95,3013438582,94071,206252,2024-02-29,Presence of Particulate Matter: glass vials fr...,"Lidocaine HCL 1% (10mg/mL), PHENYLephrine HCL...",71449-090-42


## Inspections

In [None]:
# Drug inspections from fiscal year 2020 onwards, with one 0/1 flag per
# classification label of interest.
inspections_df = (
    inspections.loc[lambda f: f["Fiscal Year"] >= 2020]
    .loc[lambda f: f["Product Type"] == "Drugs"]
    # regex=False: the parentheses are part of the label, not a capture group
    # (as a regex they trigger the pandas "has match groups" warning);
    # na=False: a missing classification becomes 0 instead of failing the int cast
    .assign(
        in_vai=lambda f: f["Classification"]
        .str.contains("(VAI)", regex=False, na=False)
        .astype(int),
        in_oai=lambda f: f["Classification"]
        .str.contains("(OAI)", regex=False, na=False)
        .astype(int),
    )
    .reindex(
        columns=[
            "FEI Number",
            "Inspection ID",
            "Legal Name",
            "Inspection End Date",
            "in_vai",
            "in_oai",
        ]
    )
    .rename(
        columns={
            "FEI Number": "fei_number",
            "Inspection ID": "inspection_id",
            "Inspection End Date": "inspection_date",
            "Legal Name": "manufacturer",
        }
    )
)

  .assign(in_vai=lambda f: f["Classification"].str.contains("(VAI)").astype(int))
  .assign(in_oai=lambda f: f["Classification"].str.contains("(OAI)").astype(int))


In [None]:
# preview the first rows of the inspections table (head() shows 5 by default)
inspections_df.head()

Unnamed: 0,fei_number,inspection_id,manufacturer,inspection_date,in_vai,in_oai
61,3009710062,1229332,"PediFix, Inc.",2024-02-12,1,0
74,1000221288,1229516,HK Kolmar USA LLC,2024-02-09,0,0
85,3011238454,1228883,LabChemS Corp.,2024-02-08,1,0
123,3007640515,1228841,Sentio Biosciences LLC,2024-02-06,1,0
196,3014316138,1228754,"FUNDA MERIC-BERNSTAM, M.D.",2024-02-01,0,0


## Compliance Actions

In [None]:
# source column -> exported column, listed in output order
COMPLIANCE_COLUMN_MAP = {
    "FEI Number": "fei_number",
    "Case/Injunction ID": "case_id",
    "Legal Name": "manufacturer",
    "Action Taken Date": "case_date",
}

# Drug-related compliance actions taken since 2020-01-01, with one 0/1 flag
# for injunctions and one for seizures.
compliance_df = (
    compliance_actions.loc[
        lambda f: (f["Action Taken Date"] >= "2020-01-01") & (f["Product Type"] == "Drugs")
    ]
    .assign(
        in_injunction=lambda f: f["Action Type"].eq("Injunction").astype(int),
        in_seizure=lambda f: f["Action Type"].eq("Seizure").astype(int),
    )
    .reindex(columns=[*COMPLIANCE_COLUMN_MAP, "in_injunction", "in_seizure"])
    .rename(columns=COMPLIANCE_COLUMN_MAP)
)

In [None]:
# preview the first rows of the compliance table (head() shows 5 by default)
compliance_df.head()

Unnamed: 0,fei_number,case_id,manufacturer,case_date,in_injunction,in_seizure
0,1622491,671239,"Cosmetic Specialty Labs, Inc",2024-02-29,0,0
70,3014651598,674277,NATIVE SALTS LLC,2024-02-28,0,0
72,3003067499,672712,"Thai Nakorn Patana Co., Ltd,",2024-02-27,0,0
171,3002984417,669170,"Higley Industries, Inc.",2024-02-22,0,0
172,3010382813,672952,Innovative Formulations LLC,2024-02-22,0,0


# Export

In [None]:
# Write each pre-processed table to its parquet file, without the row index.
for frame, target in (
    (recall_df, f"{DATA_PATH}/preprocessed/recalls.parquet"),
    (compliance_df, f"{DATA_PATH}/preprocessed/compliance.parquet"),
    (inspections_df, f"{DATA_PATH}/preprocessed/inspections.parquet"),
):
    frame.to_parquet(target, index=False)

---