# Extract relevant information from the NDC package file

2019-06-03

Extract relevant information from the NDC package file.

In [1]:
import pandas as pd
import re

## Read package file

In [2]:
# Load the tab-separated NDC package listing into a DataFrame.
package = pd.read_csv("../../data/fda_ndc/package.txt", sep='\t')

In [3]:
# (rows, columns) of the raw package table.
package.shape

(246380, 8)

In [4]:
# Preview the first rows of the raw package table.
package.head()

Unnamed: 0,PRODUCTID,PRODUCTNDC,NDCPACKAGECODE,PACKAGEDESCRIPTION,STARTMARKETINGDATE,ENDMARKETINGDATE,NDC_EXCLUDE_FLAG,SAMPLE_PACKAGE
0,0002-0800_4bb5d1cb-0fa7-48c7-9f6d-8d45f9b91649,0002-0800,0002-0800-01,1 VIAL in 1 CARTON (0002-0800-01) > 10 mL in ...,19870710.0,,N,N
1,0002-1200_957ee1b5-dfa7-4e3f-96e1-6bed1ffc0abe,0002-1200,0002-1200-30,"1 VIAL, MULTI-DOSE in 1 CAN (0002-1200-30) > ...",20120601.0,,N,N
2,0002-1200_957ee1b5-dfa7-4e3f-96e1-6bed1ffc0abe,0002-1200,0002-1200-50,"1 VIAL, MULTI-DOSE in 1 CAN (0002-1200-50) > ...",20120601.0,,N,N
3,0002-1407_14757f9d-f641-4836-acf3-229265588d1d,0002-1407,0002-1407-01,10 mL in 1 VIAL (0002-1407-01),19510301.0,,N,N
4,0002-1433_4468578a-47d2-488e-9fd4-a8322070392f,0002-1433,0002-1433-61,2 SYRINGE in 1 CARTON (0002-1433-61) > .5 mL ...,20141107.0,,N,Y


### Examine missing data

In [5]:
# Count the missing values in each column.
package.isnull().sum()

PRODUCTID                  0
PRODUCTNDC                 0
NDCPACKAGECODE             0
PACKAGEDESCRIPTION         0
STARTMARKETINGDATE        27
ENDMARKETINGDATE      240806
NDC_EXCLUDE_FLAG           0
SAMPLE_PACKAGE             0
dtype: int64

Not sure we care about the marketing dates, so we can drop those.

### Excluded data

In [6]:
# Tally how many rows carry each value of the exclusion flag.
package["NDC_EXCLUDE_FLAG"].value_counts()

N    246380
Name: NDC_EXCLUDE_FLAG, dtype: int64

None of the rows here are flagged as excluded (every value is `N`), so this column carries no information and we can drop it.

### What does sample package mean?

The reference does not provide any information about this column either.

In [7]:
# Tally how many rows carry each value of the sample-package flag.
package["SAMPLE_PACKAGE"].value_counts()

N    245825
Y       555
Name: SAMPLE_PACKAGE, dtype: int64

In [8]:
# Look at the first few rows whose SAMPLE_PACKAGE flag is 'Y'.
package.query("SAMPLE_PACKAGE == 'Y'").head()

Unnamed: 0,PRODUCTID,PRODUCTNDC,NDCPACKAGECODE,PACKAGEDESCRIPTION,STARTMARKETINGDATE,ENDMARKETINGDATE,NDC_EXCLUDE_FLAG,SAMPLE_PACKAGE
4,0002-1433_4468578a-47d2-488e-9fd4-a8322070392f,0002-1433,0002-1433-61,2 SYRINGE in 1 CARTON (0002-1433-61) > .5 mL ...,20141107.0,,N,Y
6,0002-1434_4468578a-47d2-488e-9fd4-a8322070392f,0002-1434,0002-1434-61,2 SYRINGE in 1 CARTON (0002-1434-61) > .5 mL ...,20141107.0,,N,Y
9,0002-1436_ad6f74e8-b0ef-4a96-9249-c1225c5cd6a7,0002-1436,0002-1436-61,2 SYRINGE in 1 CARTON (0002-1436-61) > 1 mL i...,20180927.0,,N,Y
13,0002-1445_03dc46ee-3620-47a2-9293-90aa3e6c62cf,0002-1445,0002-1445-61,1 SYRINGE in 1 CARTON (0002-1445-61) > 1 mL i...,20160419.0,,N,Y
37,0002-4182_1606b529-d77d-41d8-9379-094caf0241c2,0002-4182,0002-4182-61,"30 TABLET, FILM COATED in 1 BOTTLE (0002-4182-...",20180531.0,,N,Y


Not really sure what the "sample package" column means. Will ignore it for now.

### Simplify package table

In [9]:
# Build the simplified table: remove the marketing dates and the two flag
# columns, leaving the identifier and description columns untouched.
gpackage = package.drop(columns=[
    "STARTMARKETINGDATE",
    "ENDMARKETINGDATE",
    "NDC_EXCLUDE_FLAG",
    "SAMPLE_PACKAGE",
])

In [10]:
# (rows, columns) of the simplified table after dropping columns.
gpackage.shape

(246380, 4)

In [11]:
# Preview the first rows of the simplified table.
gpackage.head()

Unnamed: 0,PRODUCTID,PRODUCTNDC,NDCPACKAGECODE,PACKAGEDESCRIPTION
0,0002-0800_4bb5d1cb-0fa7-48c7-9f6d-8d45f9b91649,0002-0800,0002-0800-01,1 VIAL in 1 CARTON (0002-0800-01) > 10 mL in ...
1,0002-1200_957ee1b5-dfa7-4e3f-96e1-6bed1ffc0abe,0002-1200,0002-1200-30,"1 VIAL, MULTI-DOSE in 1 CAN (0002-1200-30) > ..."
2,0002-1200_957ee1b5-dfa7-4e3f-96e1-6bed1ffc0abe,0002-1200,0002-1200-50,"1 VIAL, MULTI-DOSE in 1 CAN (0002-1200-50) > ..."
3,0002-1407_14757f9d-f641-4836-acf3-229265588d1d,0002-1407,0002-1407-01,10 mL in 1 VIAL (0002-1407-01)
4,0002-1433_4468578a-47d2-488e-9fd4-a8322070392f,0002-1433,0002-1433-61,2 SYRINGE in 1 CARTON (0002-1433-61) > .5 mL ...


We only really care about the full NDC code along with the package description and the unique product id.

### Check that NDCs are properly formatted

Check that the NDCs are in 4-4-2, 5-3-2, or 5-4-1 format.

In [12]:
def is_ndc_type_a(code):
    """Return True if ``code`` is an NDC package code in 4-4-2 layout.

    The entire string must be four digits, a dash, four digits, a dash and
    two digits (for example ``"0002-0800-01"``); anything else gives False.
    """
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, so "0002-0800-01\n" used to be accepted.
    # [0-9] instead of \d: \d also accepts non-ASCII Unicode digits.
    return re.fullmatch(r'[0-9]{4}-[0-9]{4}-[0-9]{2}', code) is not None

def is_ndc_type_b(code):
    """Return True if ``code`` is an NDC package code in 5-3-2 layout.

    The entire string must be five digits, a dash, three digits, a dash and
    two digits (for example ``"12345-123-12"``); anything else gives False.
    """
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, so "12345-123-12\n" used to be accepted.
    # [0-9] instead of \d: \d also accepts non-ASCII Unicode digits.
    return re.fullmatch(r'[0-9]{5}-[0-9]{3}-[0-9]{2}', code) is not None

def is_ndc_type_c(code):
    """Return True if ``code`` is an NDC package code in 5-4-1 layout.

    The entire string must be five digits, a dash, four digits, a dash and
    one digit (for example ``"12345-1234-1"``); anything else gives False.
    """
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, so "12345-1234-1\n" used to be accepted.
    # [0-9] instead of \d: \d also accepts non-ASCII Unicode digits.
    return re.fullmatch(r'[0-9]{5}-[0-9]{4}-[0-9]', code) is not None

In [13]:
def get_ndc_type(code):
    """Classify an NDC package code by its digit layout.

    Args:
        code: The dashed NDC package code string, e.g. ``"0002-0800-01"``.

    Returns:
        ``"A"`` when ``is_ndc_type_a`` accepts the code (4-4-2),
        ``"B"`` when ``is_ndc_type_b`` accepts it (5-3-2), or
        ``"C"`` when ``is_ndc_type_c`` accepts it (5-4-1).

    Raises:
        AssertionError: If the code is not 12 characters long, or if it does
            not match exactly one of the three layouts.
    """

    # includes two dashes
    NDC_CODE_LENGTH = 12

    # Raise explicitly rather than via ``assert`` statements: asserts are
    # removed when Python runs with -O, which would let malformed codes fall
    # through and be labelled None. The exception type is kept the same.
    if len(code) != NDC_CODE_LENGTH:
        raise AssertionError(
            "NDC package code {!r} has length {}, expected {}".format(
                code, len(code), NDC_CODE_LENGTH
            )
        )

    checks = (
        ("A", is_ndc_type_a),
        ("B", is_ndc_type_b),
        ("C", is_ndc_type_c),
    )
    matched = [label for label, check in checks if check(code)]

    if len(matched) != 1:
        raise AssertionError(
            "NDC package code {!r} matched {} formats, expected exactly 1".format(
                code, len(matched)
            )
        )

    return matched[0]

In [14]:
# Add an ndc_type column holding the format letter that get_ndc_type
# returns for each row's package code.
info = gpackage.assign(ndc_type=gpackage["NDCPACKAGECODE"].map(get_ndc_type))

In [15]:
# (rows, columns) of the table after adding the ndc_type column.
info.shape

(246380, 5)

In [16]:
# Preview the first rows, now including ndc_type.
info.head()

Unnamed: 0,PRODUCTID,PRODUCTNDC,NDCPACKAGECODE,PACKAGEDESCRIPTION,ndc_type
0,0002-0800_4bb5d1cb-0fa7-48c7-9f6d-8d45f9b91649,0002-0800,0002-0800-01,1 VIAL in 1 CARTON (0002-0800-01) > 10 mL in ...,A
1,0002-1200_957ee1b5-dfa7-4e3f-96e1-6bed1ffc0abe,0002-1200,0002-1200-30,"1 VIAL, MULTI-DOSE in 1 CAN (0002-1200-30) > ...",A
2,0002-1200_957ee1b5-dfa7-4e3f-96e1-6bed1ffc0abe,0002-1200,0002-1200-50,"1 VIAL, MULTI-DOSE in 1 CAN (0002-1200-50) > ...",A
3,0002-1407_14757f9d-f641-4836-acf3-229265588d1d,0002-1407,0002-1407-01,10 mL in 1 VIAL (0002-1407-01),A
4,0002-1433_4468578a-47d2-488e-9fd4-a8322070392f,0002-1433,0002-1433-61,2 SYRINGE in 1 CARTON (0002-1433-61) > .5 mL ...,A


In [17]:
# Count how many package codes fall into each format letter.
info["ndc_type"].value_counts()

B    143889
C     78454
A     24037
Name: ndc_type, dtype: int64

All NDCs in the package file are properly formatted, but they come in three different formats: A (4-4-2), B (5-3-2), and C (5-4-1).

### Number of unique NDCs

In [18]:
# Number of distinct package codes; fewer than the number of rows means some codes repeat.
info["NDCPACKAGECODE"].nunique()

246361

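There are 246361 unique package codes but 246380 rows, so some package codes appear on more than one row. These repeated codes are not removed here and are carried through to the saved file.

## Save to file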

In [19]:
# Write the simplified table as tab-separated text without the index column.
# Note: this saves gpackage, so the ndc_type column computed in info is not included.
gpackage.to_csv("../../pipeline/fda_ndc/simple_package.tsv", sep='\t', index=False)