# Clean up the NDC info we extracted out of `RXNSAT.RRF`

2019-04-18

Clean up the data so hopefully we can join it up with the FDA provided NDC info.

In [1]:
import pandas as pd
import re

## Read RxNorm NDC info

In [2]:
# Load the raw RxNorm NDC attributes.
# `code` and `atv` are identifiers, not numbers: pin them to text explicitly so
# leading zeros and dashes are preserved regardless of what dtype inference
# would decide for a given extract.
ndcs = pd.read_csv(
    "../../pipeline/rxnorm/raw_ndc_info.tsv",
    sep='\t',
    dtype={"code": str, "atv": str},
    low_memory=False,
)

In [3]:
ndcs.shape

(1656783, 10)

In [4]:
ndcs.head()

Unnamed: 0,rxcui,rxaui,stype,code,atui,atn,sab,atv,suppress,cvf
0,1356,1473211,AUI,4009746,,NDC,VANDF,395021301,O,
1,1356,1473211,AUI,4009746,,NDC,VANDF,395021391,O,
2,1356,1473211,AUI,4009746,,NDC,VANDF,17317004101,O,
3,1356,1473211,AUI,4009746,,NDC,VANDF,17317004105,O,
4,1356,1473211,AUI,4009746,,NDC,VANDF,49452815001,O,


## Basic info

In [5]:
ndcs.isnull().sum()

rxcui             0
rxaui             0
stype             0
code              0
atui        1656782
atn               0
sab               0
atv               0
suppress          0
cvf         1198256
dtype: int64

### Suppress

In [6]:
ndcs["suppress"].value_counts()

N    942717
O    659319
Y     54747
Name: suppress, dtype: int64

Not sure what to make of this column for now.

### ATUI column: only one value filled

In [7]:
ndcs[ndcs["atui"].notnull()]

Unnamed: 0,rxcui,rxaui,stype,code,atui,atn,sab,atv,suppress,cvf
498381,310818,1429432,AUI,4002728,AT54210078,NDC,VANDF,364224202,O,


Only a single row has a non-null ATUI, so the column carries essentially no information. We drop the `atui` column (the row itself is kept).

## Clean up NDC table

In [8]:
# Remove the (almost entirely empty) `atui` column; all rows are kept.
gndcs = ndcs.drop(columns=["atui"])

In [9]:
gndcs.shape

(1656783, 9)

In [10]:
gndcs.head()

Unnamed: 0,rxcui,rxaui,stype,code,atn,sab,atv,suppress,cvf
0,1356,1473211,AUI,4009746,NDC,VANDF,395021301,O,
1,1356,1473211,AUI,4009746,NDC,VANDF,395021391,O,
2,1356,1473211,AUI,4009746,NDC,VANDF,17317004101,O,
3,1356,1473211,AUI,4009746,NDC,VANDF,17317004105,O,
4,1356,1473211,AUI,4009746,NDC,VANDF,49452815001,O,


---

## Number of unique drugs

In [11]:
gndcs["rxcui"].nunique()

98256

In [12]:
gndcs["atv"].nunique()

1307212

Knowing that there are 250k NDCs, does that mean there are only 98k real unique drugs?

### Number of NDCs per RXCUI

In [13]:
# Distinct NDC strings per RXCUI, then how often each per-RXCUI count occurs.
ndcs_per_rxcui = gndcs.groupby("rxcui")["atv"].nunique()
ndcs_per_rxcui.value_counts().head()

1    38093
2    21382
3     6894
4     6035
6     3283
Name: atv, dtype: int64

Keep in mind that the data contains multiple NDC formats, so the same NDC may appear under several spellings; these counts are therefore only an estimate of the number of unique NDCs per RXCUI.

---

## Clean up NDC format

RxNorm has many different formats for the NDC.
We want to standardize the NDC format so that we can merge the RxNorm data with the FDA data.

In [14]:
# Add the character length of each NDC string as `atv_len`.
info = gndcs.assign(atv_len=gndcs["atv"].str.len())

In [15]:
info.shape

(1656783, 10)

In [16]:
info.head()

Unnamed: 0,rxcui,rxaui,stype,code,atn,sab,atv,suppress,cvf,atv_len
0,1356,1473211,AUI,4009746,NDC,VANDF,395021301,O,,12
1,1356,1473211,AUI,4009746,NDC,VANDF,395021391,O,,12
2,1356,1473211,AUI,4009746,NDC,VANDF,17317004101,O,,12
3,1356,1473211,AUI,4009746,NDC,VANDF,17317004105,O,,12
4,1356,1473211,AUI,4009746,NDC,VANDF,49452815001,O,,12


### NDC code lengths

In [17]:
info["atv_len"].value_counts()

11    671390
12    568741
13    416652
Name: atv_len, dtype: int64

### Code length from each data source

In [18]:
info.groupby(["atv_len", "sab"]).size()

atv_len  sab   
11       MMSL      191175
         NDDF      226094
         RXNORM    254120
         VANDF          1
12       MTHSPL    309278
         VANDF     259463
13       CVX          549
         GS        123074
         MMX       293027
         MTHSPL         2
dtype: int64

---

# Start with 12 digit codes

Since we know that the official standardized format of the NDC is 10 digits with two dashes (12 characters in total), we will start with the codes that are 12 characters long.

We will use the FDA NDC data to determine how well we have mapped the NDC space.

In [19]:
# Keep only the rows whose NDC string is exactly 12 characters long,
# renumbering the index from zero.
is_twelve_long = info["atv_len"] == 12
twelve = info.loc[is_twelve_long].reset_index(drop=True)

In [20]:
twelve.head()

Unnamed: 0,rxcui,rxaui,stype,code,atn,sab,atv,suppress,cvf,atv_len
0,1356,1473211,AUI,4009746,NDC,VANDF,395021301,O,,12
1,1356,1473211,AUI,4009746,NDC,VANDF,395021391,O,,12
2,1356,1473211,AUI,4009746,NDC,VANDF,17317004101,O,,12
3,1356,1473211,AUI,4009746,NDC,VANDF,17317004105,O,,12
4,1356,1473211,AUI,4009746,NDC,VANDF,49452815001,O,,12


### VANDF origin

In [21]:
twelve.query("sab == 'VANDF'").head()

Unnamed: 0,rxcui,rxaui,stype,code,atn,sab,atv,suppress,cvf,atv_len
0,1356,1473211,AUI,4009746,NDC,VANDF,395021301,O,,12
1,1356,1473211,AUI,4009746,NDC,VANDF,395021391,O,,12
2,1356,1473211,AUI,4009746,NDC,VANDF,17317004101,O,,12
3,1356,1473211,AUI,4009746,NDC,VANDF,17317004105,O,,12
4,1356,1473211,AUI,4009746,NDC,VANDF,49452815001,O,,12


### MTHSPL origin

In [22]:
twelve.query("sab == 'MTHSPL'").head()

Unnamed: 0,rxcui,rxaui,stype,code,atn,sab,atv,suppress,cvf,atv_len
435,91349,3507080,AUI,12745-202,NDC,MTHSPL,12745-202-01,N,4096.0,12
436,91349,3507080,AUI,12745-202,NDC,MTHSPL,12745-202-02,N,4096.0,12
437,91349,3507080,AUI,12745-202,NDC,MTHSPL,12745-202-03,N,4096.0,12
438,91349,3518228,AUI,34645-8030,NDC,MTHSPL,34645-8030-4,N,4096.0,12
439,91349,3520567,AUI,55316-871,NDC,MTHSPL,55316-871-43,N,4096.0,12


Notice that the MTHSPL codes are in the format we expect, while the VANDF codes are in an unknown 12 digit format.

HIPAA normalization converts an NDC to an 11-digit code, so it is unclear why the VANDF codes are 12 digits long.

## Verify NDC format consistency

In [23]:
# Count rows per (number of dashes in the NDC string, source vocabulary).
num_dashes = twelve["atv"].str.count("-").rename("num_dashes")
twelve.groupby([num_dashes, "sab"]).size()

num_dashes  sab   
0           VANDF     259463
2           MTHSPL    309278
dtype: int64

Seems like the MTHSPL source (FDA structured product labels) has the proper two dash format, while the VANDF (Veterans Health Administration National Drug File) has a weird 12 digit format, which is not the HIPAA format.

Originally I assumed that the VANDF entries were HIPAA normalized NDCs with a leading zero, but I verified that this was not the case.
Since I could find no documentation on the VANDF NDCs, I decided to ignore them for now because I cannot cross-reference them with the FDA's data.

## Verify format of 12 digit MTHSPL NDCs

In [24]:
# Restrict to the MTHSPL rows; the length column is no longer needed
# because every remaining code is 12 characters long.
from_mthspl = twelve["sab"] == "MTHSPL"
dashed = (
    twelve.loc[from_mthspl]
    .drop(columns=["atv_len"])
    .reset_index(drop=True)
)

In [25]:
dashed.shape

(309278, 9)

In [26]:
dashed.head()

Unnamed: 0,rxcui,rxaui,stype,code,atn,sab,atv,suppress,cvf
0,91349,3507080,AUI,12745-202,NDC,MTHSPL,12745-202-01,N,4096.0
1,91349,3507080,AUI,12745-202,NDC,MTHSPL,12745-202-02,N,4096.0
2,91349,3507080,AUI,12745-202,NDC,MTHSPL,12745-202-03,N,4096.0
3,91349,3518228,AUI,34645-8030,NDC,MTHSPL,34645-8030-4,N,4096.0
4,91349,3520567,AUI,55316-871,NDC,MTHSPL,55316-871-43,N,4096.0


### Verify NDC format

In [27]:
def is_ndc_type_a(code):
    """Return True if `code` is a dashed NDC in the 4-4-2 layout, e.g. "1234-5678-90"."""
    # fullmatch: the entire string must match. A `$` anchor would also accept
    # a trailing newline. [0-9] restricts the match to ASCII digits.
    return re.fullmatch(r'[0-9]{4}-[0-9]{4}-[0-9]{2}', code) is not None

def is_ndc_type_b(code):
    """Return True if `code` is a dashed NDC in the 5-3-2 layout, e.g. "12745-202-01"."""
    # fullmatch: the entire string must match. A `$` anchor would also accept
    # a trailing newline. [0-9] restricts the match to ASCII digits.
    return re.fullmatch(r'[0-9]{5}-[0-9]{3}-[0-9]{2}', code) is not None

def is_ndc_type_c(code):
    """Return True if `code` is a dashed NDC in the 5-4-1 layout, e.g. "34645-8030-4"."""
    # fullmatch: the entire string must match. A `$` anchor would also accept
    # a trailing newline. [0-9] restricts the match to ASCII digits.
    return re.fullmatch(r'[0-9]{5}-[0-9]{4}-[0-9]{1}', code) is not None

In [28]:
def get_ndc_type(code):
    """Classify a dashed NDC string as layout "A", "B" or "C".

    The layouts are those recognised by `is_ndc_type_a` (4-4-2),
    `is_ndc_type_b` (5-3-2) and `is_ndc_type_c` (5-4-1).

    Parameters
    ----------
    code : str
        NDC written with two dashes, 12 characters in total.

    Returns
    -------
    str
        "A", "B" or "C".

    Raises
    ------
    AssertionError
        If `code` is not 12 characters long, or if it does not match exactly
        one of the three layouts. The message names the offending code so a
        failure while mapping over a whole column can be traced to its value.
    """
    # 10 digits plus two dashes
    NDC_CODE_LENGTH = 12

    assert len(code) == NDC_CODE_LENGTH, (
        "NDC {!r} has {} characters, expected {}".format(
            code, len(code), NDC_CODE_LENGTH
        )
    )

    layout_checks = (
        ("A", is_ndc_type_a),
        ("B", is_ndc_type_b),
        ("C", is_ndc_type_c),
    )
    # Every check is evaluated so that an ambiguous code is detected too.
    matched = [label for label, check in layout_checks if check(code)]

    assert len(matched) == 1, (
        "NDC {!r} matches {} layouts ({}), expected exactly one".format(
            code, len(matched), ", ".join(matched) or "none"
        )
    )

    return matched[0]

In [29]:
# Label every NDC with its dash layout ("A", "B" or "C").
dashed = dashed.assign(ndc_type=dashed["atv"].map(get_ndc_type))

In [30]:
dashed.shape

(309278, 10)

In [31]:
dashed.head()

Unnamed: 0,rxcui,rxaui,stype,code,atn,sab,atv,suppress,cvf,ndc_type
0,91349,3507080,AUI,12745-202,NDC,MTHSPL,12745-202-01,N,4096.0,B
1,91349,3507080,AUI,12745-202,NDC,MTHSPL,12745-202-02,N,4096.0,B
2,91349,3507080,AUI,12745-202,NDC,MTHSPL,12745-202-03,N,4096.0,B
3,91349,3518228,AUI,34645-8030,NDC,MTHSPL,34645-8030-4,N,4096.0,C
4,91349,3520567,AUI,55316-871,NDC,MTHSPL,55316-871-43,N,4096.0,B


In [32]:
dashed["ndc_type"].value_counts()

B    171592
C    107389
A     30297
Name: ndc_type, dtype: int64

All of the 12-character MTHSPL codes are good and have a valid NDC format (the two 13-character MTHSPL entries seen earlier are not included here).

### Number of unique drugs

In [33]:
dashed["rxcui"].nunique()

47091

In [34]:
dashed["atv"].nunique()

280374

In [35]:
dashed["stype"].value_counts()

AUI    309278
Name: stype, dtype: int64

In [36]:
dashed["suppress"].value_counts()

N    299518
Y      7529
O      2231
Name: suppress, dtype: int64

This seems promising, and the number of NDCs is very close to the FDA's number.
We will see how well these NDCs match the FDA's data before using NDCs from other sources.

## Save NDC to RxNorm mappings to file

In [37]:
# The layout label was only needed for validation; leave it out of the saved mapping.
ndc_to_rxcui = dashed.drop(columns=["ndc_type"])
ndc_to_rxcui.to_csv("../../pipeline/rxnorm/ndc_to_rxcui.tsv", sep='\t', index=False)