# Extract relevant information from the FDA product file

2019-05-07

In [1]:
import pandas as pd

# Read product info

In [2]:
# Load the raw FDA NDC product listing: a tab-delimited text file decoded as Latin-1.
product = pd.read_csv(
    "../../data/fda_ndc/product.txt",
    sep="\t",
    encoding="latin",
)

In [3]:
# Dimensions of the raw table as (rows, columns).
product.shape

(129085, 20)

In [4]:
# Preview the first five rows to see which columns the raw file provides.
product.head()

Unnamed: 0,PRODUCTID,PRODUCTNDC,PRODUCTTYPENAME,PROPRIETARYNAME,PROPRIETARYNAMESUFFIX,NONPROPRIETARYNAME,DOSAGEFORMNAME,ROUTENAME,STARTMARKETINGDATE,ENDMARKETINGDATE,MARKETINGCATEGORYNAME,APPLICATIONNUMBER,LABELERNAME,SUBSTANCENAME,ACTIVE_NUMERATOR_STRENGTH,ACTIVE_INGRED_UNIT,PHARM_CLASSES,DEASCHEDULE,NDC_EXCLUDE_FLAG,LISTING_RECORD_CERTIFIED_THROUGH
0,0002-0800_4bb5d1cb-0fa7-48c7-9f6d-8d45f9b91649,0002-0800,HUMAN OTC DRUG,Sterile Diluent,,diluent,"INJECTION, SOLUTION",SUBCUTANEOUS,19870710.0,,NDA,NDA018781,Eli Lilly and Company,WATER,1.0,mL/mL,,,N,20191231.0
1,0002-1200_957ee1b5-dfa7-4e3f-96e1-6bed1ffc0abe,0002-1200,HUMAN PRESCRIPTION DRUG,Amyvid,,Florbetapir F 18,"INJECTION, SOLUTION",INTRAVENOUS,20120601.0,,NDA,NDA202008,Eli Lilly and Company,FLORBETAPIR F-18,51.0,mCi/mL,"Radioactive Diagnostic Agent [EPC],Positron Em...",,N,20191231.0
2,0002-1407_14757f9d-f641-4836-acf3-229265588d1d,0002-1407,HUMAN PRESCRIPTION DRUG,Quinidine Gluconate,,Quinidine Gluconate,SOLUTION,INTRAVENOUS,19500712.0,,NDA,NDA007529,Eli Lilly and Company,QUINIDINE GLUCONATE,80.0,mg/mL,"Antiarrhythmic [EPC],Cytochrome P450 2D6 Inhib...",,N,20191231.0
3,0002-1433_4468578a-47d2-488e-9fd4-a8322070392f,0002-1433,HUMAN PRESCRIPTION DRUG,Trulicity,,Dulaglutide,"INJECTION, SOLUTION",SUBCUTANEOUS,20140918.0,,BLA,BLA125469,Eli Lilly and Company,DULAGLUTIDE,0.75,mg/.5mL,"GLP-1 Receptor Agonist [EPC],Glucagon-Like Pep...",,N,20201231.0
4,0002-1434_4468578a-47d2-488e-9fd4-a8322070392f,0002-1434,HUMAN PRESCRIPTION DRUG,Trulicity,,Dulaglutide,"INJECTION, SOLUTION",SUBCUTANEOUS,20140918.0,,BLA,BLA125469,Eli Lilly and Company,DULAGLUTIDE,1.5,mg/.5mL,"GLP-1 Receptor Agonist [EPC],Glucagon-Like Pep...",,N,20201231.0


### Look at missing data

In [5]:
# Per-column count of missing values, plus that count as a percentage of all rows.
(
    product.isna()
    .sum()
    .rename("num_null")
    .to_frame()
    .assign(pct_null=lambda counts: counts["num_null"] / len(product) * 100)
)

Unnamed: 0,num_null,pct_null
PRODUCTID,0,0.0
PRODUCTNDC,0,0.0
PRODUCTTYPENAME,0,0.0
PROPRIETARYNAME,4,0.003099
PROPRIETARYNAMESUFFIX,111674,86.511988
NONPROPRIETARYNAME,7,0.005423
DOSAGEFORMNAME,0,0.0
ROUTENAME,2277,1.763954
STARTMARKETINGDATE,5,0.003873
ENDMARKETINGDATE,125157,96.957044


For our purposes we can remove information about marketing dates.

#### Product type

In [6]:
# Number of products per product type, most frequent first.
product["PRODUCTTYPENAME"].value_counts()

HUMAN OTC DRUG                 67447
HUMAN PRESCRIPTION DRUG        58394
NON-STANDARDIZED ALLERGENIC     2640
PLASMA DERIVATIVE                325
STANDARDIZED ALLERGENIC          149
VACCINE                          123
CELLULAR THERAPY                   7
Name: PRODUCTTYPENAME, dtype: int64

#### Exclude flag

In [7]:
# Number of products per value of the NDC exclude flag.
product["NDC_EXCLUDE_FLAG"].value_counts()

N    97474
E    31611
Name: NDC_EXCLUDE_FLAG, dtype: int64

#### DEA schedules

In [8]:
# Preview the first products that have a DEA schedule recorded.
product.dropna(subset=["DEASCHEDULE"]).head()

Unnamed: 0,PRODUCTID,PRODUCTNDC,PRODUCTTYPENAME,PROPRIETARYNAME,PROPRIETARYNAMESUFFIX,NONPROPRIETARYNAME,DOSAGEFORMNAME,ROUTENAME,STARTMARKETINGDATE,ENDMARKETINGDATE,MARKETINGCATEGORYNAME,APPLICATIONNUMBER,LABELERNAME,SUBSTANCENAME,ACTIVE_NUMERATOR_STRENGTH,ACTIVE_INGRED_UNIT,PHARM_CLASSES,DEASCHEDULE,NDC_EXCLUDE_FLAG,LISTING_RECORD_CERTIFIED_THROUGH
135,0004-0058_5426a9bb-5c51-41b3-a957-ce7242d77bb9,0004-0058,HUMAN PRESCRIPTION DRUG,Klonopin,,Clonazepam,TABLET,ORAL,19750602.0,,NDA,NDA017533,"Genentech, Inc.",CLONAZEPAM,1.0,mg/1,"Benzodiazepine [EPC],Benzodiazepines [CS]",CIV,N,20191231.0
136,0004-0068_5426a9bb-5c51-41b3-a957-ce7242d77bb9,0004-0068,HUMAN PRESCRIPTION DRUG,Klonopin,,Clonazepam,TABLET,ORAL,19750602.0,,NDA,NDA017533,"Genentech, Inc.",CLONAZEPAM,0.5,mg/1,"Benzodiazepine [EPC],Benzodiazepines [CS]",CIV,N,20191231.0
137,0004-0098_5426a9bb-5c51-41b3-a957-ce7242d77bb9,0004-0098,HUMAN PRESCRIPTION DRUG,Klonopin,,Clonazepam,TABLET,ORAL,19750602.0,,NDA,NDA017533,"Genentech, Inc.",CLONAZEPAM,2.0,mg/1,"Benzodiazepine [EPC],Benzodiazepines [CS]",CIV,N,20191231.0
167,0006-0005_bb901df0-9843-4179-8d00-45d144ab7c82,0006-0005,HUMAN PRESCRIPTION DRUG,BELSOMRA,,suvorexant,"TABLET, FILM COATED",ORAL,20140829.0,,NDA,NDA204569,Merck Sharp & Dohme Corp.,SUVOREXANT,5.0,mg/1,"Orexin Receptor Antagonist [EPC],Orexin Recept...",CIV,N,20191231.0
171,0006-0033_bb901df0-9843-4179-8d00-45d144ab7c82,0006-0033,HUMAN PRESCRIPTION DRUG,BELSOMRA,,suvorexant,"TABLET, FILM COATED",ORAL,20140829.0,,NDA,NDA204569,Merck Sharp & Dohme Corp.,SUVOREXANT,10.0,mg/1,"Orexin Receptor Antagonist [EPC],Orexin Recept...",CIV,N,20191231.0


In [9]:
# Number of products in each DEA schedule. value_counts() skips missing values by
# default, so no explicit not-null filter (or chained indexing) is needed.
product["DEASCHEDULE"].value_counts()

CIV     2100
CII     2072
CIII     569
CV       201
Name: DEASCHEDULE, dtype: int64

This might be useful as an indicator of how "dangerous" a drug is to the patient, since scheduled drugs can cause dependency.

#### Suffix

In [10]:
# Preview the first products whose proprietary name has a suffix.
has_suffix = product["PROPRIETARYNAMESUFFIX"].notna()
product.loc[has_suffix].head()

Unnamed: 0,PRODUCTID,PRODUCTNDC,PRODUCTTYPENAME,PROPRIETARYNAME,PROPRIETARYNAMESUFFIX,NONPROPRIETARYNAME,DOSAGEFORMNAME,ROUTENAME,STARTMARKETINGDATE,ENDMARKETINGDATE,MARKETINGCATEGORYNAME,APPLICATIONNUMBER,LABELERNAME,SUBSTANCENAME,ACTIVE_NUMERATOR_STRENGTH,ACTIVE_INGRED_UNIT,PHARM_CLASSES,DEASCHEDULE,NDC_EXCLUDE_FLAG,LISTING_RECORD_CERTIFIED_THROUGH
8,0002-3004_f1404e9d-0c95-44e4-9eb0-7a2836a67bd8,0002-3004,HUMAN PRESCRIPTION DRUG,Prozac,Weekly,Fluoxetine hydrochloride,"CAPSULE, DELAYED RELEASE",ORAL,20010316.0,,NDA,NDA021235,Eli Lilly and Company,FLUOXETINE HYDROCHLORIDE,90,mg/1,"Serotonin Reuptake Inhibitor [EPC],Serotonin U...",,N,20191231.0
32,0002-4453_cd0ef401-5aed-4406-a26a-ab4c377409fe,0002-4453,HUMAN PRESCRIPTION DRUG,ZYPREXA,Zydis,Olanzapine,"TABLET, ORALLY DISINTEGRATING",ORAL,20000601.0,,NDA,NDA021086,Eli Lilly and Company,OLANZAPINE,5,mg/1,Atypical Antipsychotic [EPC],,N,20191231.0
33,0002-4454_cd0ef401-5aed-4406-a26a-ab4c377409fe,0002-4454,HUMAN PRESCRIPTION DRUG,ZYPREXA,Zydis,Olanzapine,"TABLET, ORALLY DISINTEGRATING",ORAL,20000601.0,,NDA,NDA021086,Eli Lilly and Company,OLANZAPINE,10,mg/1,Atypical Antipsychotic [EPC],,N,20191231.0
34,0002-4455_cd0ef401-5aed-4406-a26a-ab4c377409fe,0002-4455,HUMAN PRESCRIPTION DRUG,ZYPREXA,Zydis,Olanzapine,"TABLET, ORALLY DISINTEGRATING",ORAL,20010901.0,,NDA,NDA021086,Eli Lilly and Company,OLANZAPINE,15,mg/1,Atypical Antipsychotic [EPC],,N,20191231.0
35,0002-4456_cd0ef401-5aed-4406-a26a-ab4c377409fe,0002-4456,HUMAN PRESCRIPTION DRUG,ZYPREXA,Zydis,Olanzapine,"TABLET, ORALLY DISINTEGRATING",ORAL,20010901.0,,NDA,NDA021086,Eli Lilly and Company,OLANZAPINE,20,mg/1,Atypical Antipsychotic [EPC],,N,20191231.0


The suffix is not very useful for now; it only serves to form the full name of the drug.

### Simplify product table

In [11]:
# Simplified copy of the table without the name suffix and the two marketing-date
# columns; `product` itself is left unchanged.
gproduct = product.drop(
    columns=["PROPRIETARYNAMESUFFIX", "STARTMARKETINGDATE", "ENDMARKETINGDATE"]
)

In [12]:
# Dimensions of the simplified table: same row count, three fewer columns.
gproduct.shape

(129085, 17)

In [13]:
# Preview the first five rows of the simplified table.
gproduct.head()

Unnamed: 0,PRODUCTID,PRODUCTNDC,PRODUCTTYPENAME,PROPRIETARYNAME,NONPROPRIETARYNAME,DOSAGEFORMNAME,ROUTENAME,MARKETINGCATEGORYNAME,APPLICATIONNUMBER,LABELERNAME,SUBSTANCENAME,ACTIVE_NUMERATOR_STRENGTH,ACTIVE_INGRED_UNIT,PHARM_CLASSES,DEASCHEDULE,NDC_EXCLUDE_FLAG,LISTING_RECORD_CERTIFIED_THROUGH
0,0002-0800_4bb5d1cb-0fa7-48c7-9f6d-8d45f9b91649,0002-0800,HUMAN OTC DRUG,Sterile Diluent,diluent,"INJECTION, SOLUTION",SUBCUTANEOUS,NDA,NDA018781,Eli Lilly and Company,WATER,1.0,mL/mL,,,N,20191231.0
1,0002-1200_957ee1b5-dfa7-4e3f-96e1-6bed1ffc0abe,0002-1200,HUMAN PRESCRIPTION DRUG,Amyvid,Florbetapir F 18,"INJECTION, SOLUTION",INTRAVENOUS,NDA,NDA202008,Eli Lilly and Company,FLORBETAPIR F-18,51.0,mCi/mL,"Radioactive Diagnostic Agent [EPC],Positron Em...",,N,20191231.0
2,0002-1407_14757f9d-f641-4836-acf3-229265588d1d,0002-1407,HUMAN PRESCRIPTION DRUG,Quinidine Gluconate,Quinidine Gluconate,SOLUTION,INTRAVENOUS,NDA,NDA007529,Eli Lilly and Company,QUINIDINE GLUCONATE,80.0,mg/mL,"Antiarrhythmic [EPC],Cytochrome P450 2D6 Inhib...",,N,20191231.0
3,0002-1433_4468578a-47d2-488e-9fd4-a8322070392f,0002-1433,HUMAN PRESCRIPTION DRUG,Trulicity,Dulaglutide,"INJECTION, SOLUTION",SUBCUTANEOUS,BLA,BLA125469,Eli Lilly and Company,DULAGLUTIDE,0.75,mg/.5mL,"GLP-1 Receptor Agonist [EPC],Glucagon-Like Pep...",,N,20201231.0
4,0002-1434_4468578a-47d2-488e-9fd4-a8322070392f,0002-1434,HUMAN PRESCRIPTION DRUG,Trulicity,Dulaglutide,"INJECTION, SOLUTION",SUBCUTANEOUS,BLA,BLA125469,Eli Lilly and Company,DULAGLUTIDE,1.5,mg/.5mL,"GLP-1 Receptor Agonist [EPC],Glucagon-Like Pep...",,N,20201231.0


In [14]:
# Missing-value count for each column that remains in the simplified table.
gproduct.isna().sum()

PRODUCTID                                0
PRODUCTNDC                               0
PRODUCTTYPENAME                          0
PROPRIETARYNAME                          4
NONPROPRIETARYNAME                       7
DOSAGEFORMNAME                           0
ROUTENAME                             2277
MARKETINGCATEGORYNAME                    0
APPLICATIONNUMBER                    19611
LABELERNAME                              0
SUBSTANCENAME                         2760
ACTIVE_NUMERATOR_STRENGTH             2761
ACTIVE_INGRED_UNIT                    2760
PHARM_CLASSES                        73524
DEASCHEDULE                         124143
NDC_EXCLUDE_FLAG                         0
LISTING_RECORD_CERTIFIED_THROUGH      3930
dtype: int64

According to the documentation, the non-proprietary name and the substance name appear to be the most useful fields for identifying the active ingredient.

Documentation link: https://www.fda.gov/drugs/drug-approvals-and-databases/ndc-product-file-definitions

## Save to file

In [15]:
# Write the simplified table as a tab-separated file, omitting the row index.
gproduct.to_csv(
    "../../pipeline/fda_ndc/simple_product.tsv",
    sep="\t",
    index=False,
)