# Cross reference the FDA's NDCs with RxNorm's NDCs

2019-06-03

Determine how well RxNorm's data allows us to map the FDA's NDCs to RXCUIs.

In [1]:
import pandas as pd

## Read FDA NDCs

In [2]:
# Load the FDA NDC listing from the pipeline's tab-separated export.
fda_ndc_path = "../../pipeline/fda_ndc/ndc_info.tsv"
ndc_info = pd.read_csv(fda_ndc_path, delimiter="\t")

In [3]:
ndc_info.shape

(246372, 18)

In [4]:
ndc_info.head()

Unnamed: 0,PRODUCTNDC,NDCPACKAGECODE,PACKAGEDESCRIPTION,PRODUCTTYPENAME,PROPRIETARYNAME,NONPROPRIETARYNAME,DOSAGEFORMNAME,ROUTENAME,MARKETINGCATEGORYNAME,APPLICATIONNUMBER,LABELERNAME,SUBSTANCENAME,ACTIVE_NUMERATOR_STRENGTH,ACTIVE_INGRED_UNIT,PHARM_CLASSES,DEASCHEDULE,NDC_EXCLUDE_FLAG,LISTING_RECORD_CERTIFIED_THROUGH
0,0002-0800,0002-0800-01,1 VIAL in 1 CARTON (0002-0800-01) > 10 mL in ...,HUMAN OTC DRUG,Sterile Diluent,diluent,"INJECTION, SOLUTION",SUBCUTANEOUS,NDA,NDA018781,Eli Lilly and Company,WATER,1.0,mL/mL,,,N,20191231.0
1,0002-1200,0002-1200-30,"1 VIAL, MULTI-DOSE in 1 CAN (0002-1200-30) > ...",HUMAN PRESCRIPTION DRUG,Amyvid,Florbetapir F 18,"INJECTION, SOLUTION",INTRAVENOUS,NDA,NDA202008,Eli Lilly and Company,FLORBETAPIR F-18,51.0,mCi/mL,"Radioactive Diagnostic Agent [EPC],Positron Em...",,N,20191231.0
2,0002-1200,0002-1200-50,"1 VIAL, MULTI-DOSE in 1 CAN (0002-1200-50) > ...",HUMAN PRESCRIPTION DRUG,Amyvid,Florbetapir F 18,"INJECTION, SOLUTION",INTRAVENOUS,NDA,NDA202008,Eli Lilly and Company,FLORBETAPIR F-18,51.0,mCi/mL,"Radioactive Diagnostic Agent [EPC],Positron Em...",,N,20191231.0
3,0002-1407,0002-1407-01,10 mL in 1 VIAL (0002-1407-01),HUMAN PRESCRIPTION DRUG,Quinidine Gluconate,Quinidine Gluconate,SOLUTION,INTRAVENOUS,NDA,NDA007529,Eli Lilly and Company,QUINIDINE GLUCONATE,80.0,mg/mL,"Antiarrhythmic [EPC],Cytochrome P450 2D6 Inhib...",,N,20191231.0
4,0002-1433,0002-1433-61,2 SYRINGE in 1 CARTON (0002-1433-61) > .5 mL ...,HUMAN PRESCRIPTION DRUG,Trulicity,Dulaglutide,"INJECTION, SOLUTION",SUBCUTANEOUS,BLA,BLA125469,Eli Lilly and Company,DULAGLUTIDE,0.75,mg/.5mL,"GLP-1 Receptor Agonist [EPC],Glucagon-Like Pep...",,N,20201231.0


## Read RxNorm info

In [5]:
# Load the RxNorm NDC -> RXCUI table from the pipeline's tab-separated export.
rxnorm_ndc_path = "../../pipeline/rxnorm/ndc_to_rxcui.tsv"
rxnorm = pd.read_csv(rxnorm_ndc_path, delimiter="\t")

In [6]:
rxnorm.shape

(314790, 9)

In [7]:
rxnorm.head()

Unnamed: 0,rxcui,rxaui,stype,code,atn,sab,atv,suppress,cvf
0,91349,3507080,AUI,12745-202,NDC,MTHSPL,12745-202-01,N,4096.0
1,91349,3507080,AUI,12745-202,NDC,MTHSPL,12745-202-02,N,4096.0
2,91349,3507080,AUI,12745-202,NDC,MTHSPL,12745-202-03,N,4096.0
3,91349,3518228,AUI,34645-8030,NDC,MTHSPL,34645-8030-4,N,4096.0
4,91349,3520567,AUI,55316-871,NDC,MTHSPL,55316-871-43,N,4096.0


---

# How many of the FDA NDCs can be mapped to RxNorm CUIs?

In [8]:
# Distinct values of the FDA NDCPACKAGECODE column and of the RxNorm atv
# column, held as sets so the overlap can be measured with set operators.
fdaids = {code for code in ndc_info["NDCPACKAGECODE"]}
rxids = {code for code in rxnorm["atv"]}

In [9]:
len(fdaids)

246357

In [10]:
len(rxids)

285653

In [11]:
fdaids <= rxids

False

Too optimistic: not every FDA NDC is present in RxNorm.

In [12]:
len(fdaids & rxids)

244989

In [13]:
len(fdaids & rxids) / len(fdaids) * 100

99.44470828919007

99% of the FDA NDCs can be mapped to RxNorm CUIs! This is excellent.

### Unmappable ids

In [14]:
len(fdaids - rxids)

1368

In [15]:
len(fdaids - rxids) / len(fdaids) * 100

0.5552917108099221

Only 0.56% of the FDA NDCs can't be matched to an RXCUI.

In [16]:
len(rxids - fdaids)

40664

Finally, there are 40,664 RxNorm NDCs that do not appear in the FDA data. These might be old drugs.

## Conclusion

We can map 99% of FDA NDCs to RXCUIs.
We will move forward onto determining the active ingredient.

---

# Merge tables and focus on matchable info

We will ignore any data that is not common to both sources for now.

In [17]:
# Keep only the RxNorm mappings whose NDC (the "atv" column) also appears in
# the FDA listing; the inner join discards rows found in just one source.
# Duplicate rows produced by the join are dropped afterwards.
common = (
    pd.merge(
        rxnorm[["atv", "rxcui", "rxaui", "suppress"]].rename(
            {"atv": "NDCPACKAGECODE"}, axis="columns"
        ),
        ndc_info[["NDCPACKAGECODE"]],
        how="inner",
        on="NDCPACKAGECODE",
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [18]:
common.shape

(270992, 4)

In [19]:
common.head()

Unnamed: 0,NDCPACKAGECODE,rxcui,rxaui,suppress
0,12745-202-01,91349,3507080,N
1,12745-202-02,91349,3507080,N
2,12745-202-03,91349,3507080,N
3,34645-8030-4,91349,3518228,N
4,55316-871-43,91349,3520567,N


In [20]:
common["suppress"].value_counts()

N    264037
Y      6947
O         8
Name: suppress, dtype: int64

## Verify data consistency

We want to ensure the following:
- Only one value of suppress for each (NDC, RXCUI) pair

In [21]:
# Count the distinct suppress values seen for each (NDC, RXCUI) pair.
# rxaui is removed first so rows that differ only by rxaui collapse into one
# row per (NDC, RXCUI, suppress) combination before counting.
nsup = (
    common
    .drop(columns="rxaui")
    .drop_duplicates()
    .groupby(["NDCPACKAGECODE", "rxcui"])
    .size()
    .reset_index(name="num_suppress")
)

In [22]:
nsup.shape

(246677, 3)

In [23]:
nsup.head()

Unnamed: 0,NDCPACKAGECODE,rxcui,num_suppress
0,0002-0800-01,540930,1
1,0002-1200-30,1297712,1
2,0002-1200-50,1297712,1
3,0002-1407-01,853004,1
4,0002-1433-61,1551300,1


### Number of unique suppress values per (NDC, RXCUI) pair

In [24]:
nsup["num_suppress"].value_counts()

1    246674
2         3
Name: num_suppress, dtype: int64

In [25]:
nsup.query("num_suppress > 1")

Unnamed: 0,NDCPACKAGECODE,rxcui,num_suppress
87830,46581-110-20,1793149,2
87831,46581-110-60,1793149,2
87832,46581-110-99,1793149,2


### Multiple suppress values for the same (NDC, RXCUI) pair

In [26]:
# Show the full rows (rxaui included) behind every (NDC, RXCUI) pair that
# carries more than one suppress value.
(
    nsup
    .loc[nsup["num_suppress"] > 1, ["NDCPACKAGECODE", "rxcui"]]
    .merge(common, how="left", on=["NDCPACKAGECODE", "rxcui"])
)

Unnamed: 0,NDCPACKAGECODE,rxcui,rxaui,suppress
0,46581-110-20,1793149,7241476,Y
1,46581-110-20,1793149,7241477,N
2,46581-110-20,1793149,7241478,N
3,46581-110-60,1793149,7241476,Y
4,46581-110-60,1793149,7241477,N
5,46581-110-60,1793149,7241478,N
6,46581-110-99,1793149,7241476,Y
7,46581-110-99,1793149,7241477,N
8,46581-110-99,1793149,7241478,N


We see here that despite the same NDC mapping to the same RXCUI, the suppress values may be different depending on which RXAUI the mapping used.
Since we do not care about the RXAUIs, we will harmonize the suppress value.

### Clean up multiple suppress values

In [27]:
# Harmonize suppress for (NDC, RXCUI) pairs that carry conflicting values:
# the pair is marked "N" as soon as any of its rows is "N", otherwise "Y".
#
# The rule is computed with a vectorised grouped `any()` instead of
# `groupby.apply` with a Python lambda. The result is the same, but it still
# yields a well-formed (empty) frame when no pair has conflicting values, and
# it avoids the pandas deprecation warning about `apply` operating on the
# grouping columns.
multi = (
    nsup
    .query("num_suppress > 1")
    .drop("num_suppress", axis=1)
    .merge(common, how="left", on=["NDCPACKAGECODE", "rxcui"])
    .assign(not_suppressed=lambda df: df["suppress"] == "N")
    .groupby(["NDCPACKAGECODE", "rxcui"])
    ["not_suppressed"]
    .any()
    .map({True: "N", False: "Y"})
    .to_frame("suppress")
    .reset_index()
)

In [28]:
multi.head()

Unnamed: 0,NDCPACKAGECODE,rxcui,suppress
0,46581-110-20,1793149,N
1,46581-110-60,1793149,N
2,46581-110-99,1793149,N


In [29]:
# (NDC, RXCUI) pairs with exactly one suppress value: fetch that value from
# `common`, then drop rxaui so each pair collapses to a single row.
single = (
    nsup
    .loc[nsup["num_suppress"] == 1, ["NDCPACKAGECODE", "rxcui"]]
    .merge(common, how="left", on=["NDCPACKAGECODE", "rxcui"])
    .drop(columns="rxaui")
    .drop_duplicates()
)

In [30]:
single.head()

Unnamed: 0,NDCPACKAGECODE,rxcui,suppress
0,0002-0800-01,540930,N
1,0002-1200-30,1297712,N
2,0002-1200-50,1297712,N
3,0002-1407-01,853004,N
4,0002-1433-61,1551300,N


## Recreate good dataframe

In [31]:
# Stack the already-consistent pairs (`single`) and the harmonized pairs
# (`multi`) into one table with a single suppress value per (NDC, RXCUI).
# `pd.concat` is used because `DataFrame.append` was deprecated in
# pandas 1.4 and removed in pandas 2.0.
clean = (
    pd.concat([single, multi])
    .sort_values(["NDCPACKAGECODE", "rxcui", "suppress"])
    .reset_index(drop=True)
)

In [32]:
clean.shape

(246677, 3)

In [33]:
clean.head()

Unnamed: 0,NDCPACKAGECODE,rxcui,suppress
0,0002-0800-01,540930,N
1,0002-1200-30,1297712,N
2,0002-1200-50,1297712,N
3,0002-1407-01,853004,N
4,0002-1433-61,1551300,N


In [34]:
clean["suppress"].value_counts()

N    240291
Y      6378
O         8
Name: suppress, dtype: int64

### Simplify the suppress column to a boolean

We don't care about the distinction between a term being suppressed by the original source vs by the RxNorm editors.

In [35]:
# Collapse suppress to a boolean: True for anything other than "N".
# NOTE: this rebinds `clean`, so the cell is not idempotent. Running it a
# second time compares booleans against "N" and would mark every row True.
clean = clean.assign(suppress=clean["suppress"].ne("N"))

In [36]:
clean["suppress"].value_counts()

False    240291
True       6386
Name: suppress, dtype: int64

In [37]:
clean.head()

Unnamed: 0,NDCPACKAGECODE,rxcui,suppress
0,0002-0800-01,540930,False
1,0002-1200-30,1297712,False
2,0002-1200-50,1297712,False
3,0002-1407-01,853004,False
4,0002-1433-61,1551300,False


### Data stats

In [38]:
clean["NDCPACKAGECODE"].nunique()

244989

In [39]:
clean["rxcui"].nunique()

43556

Only 43k unique RXCUIs for all 245k NDCs? There must be more package-level codes per drug than we originally thought.

### Number of unique RXCUIs per NDC

In [40]:
# Number of distinct RXCUIs that each NDC package code maps to.
ncuis = (
    clean
    .groupby("NDCPACKAGECODE")["rxcui"]
    .nunique()
    .rename("ncuis")
    .reset_index()
)

In [41]:
ncuis.head()

Unnamed: 0,NDCPACKAGECODE,ncuis
0,0002-0800-01,1
1,0002-1200-30,1
2,0002-1200-50,1
3,0002-1407-01,1
4,0002-1433-61,1


In [42]:
ncuis["ncuis"].value_counts()

1    243321
2      1648
3        20
Name: ncuis, dtype: int64

The vast majority of the NDCs only map to a single RXCUI, but a small subset map to multiple.

### Are there multiple not suppressed RXCUIs for each NDC?

In [43]:
# For NDCs that map to exactly one RXCUI, tally how many of those mappings
# are suppressed versus not.
(
    ncuis
    .loc[ncuis["ncuis"] == 1, ["NDCPACKAGECODE"]]
    .merge(clean, how="left", on="NDCPACKAGECODE")
    ["suppress"]
    .value_counts()
)

False    238645
True       4676
Name: suppress, dtype: int64

For the NDCs with only one RXCUI, a small percentage use mappings which are suppressed.

In [44]:
def _count_unsuppressed(group):
    """Return the number of rows in ``group`` whose suppress flag is False."""
    return (~group["suppress"]).sum()


# For NDCs that map to several RXCUIs, count the unsuppressed mappings per
# NDC and show how those per-NDC counts are distributed.
(
    ncuis
    .loc[ncuis["ncuis"] > 1, ["NDCPACKAGECODE"]]
    .merge(clean, how="left", on="NDCPACKAGECODE")
    .groupby("NDCPACKAGECODE")
    .apply(_count_unsuppressed)
    .value_counts()
)

1    1646
0      22
dtype: int64

Every NDC with multiple RXCUIs has at most one unsuppressed RXCUI; 22 of them have none at all.

## Conclusion

NDCs may map to multiple unique RXCUIs, but some or all of them might be suppressed (old or deprecated).
These multiple RXCUIs represent multiple chances to identify the active ingredient.
However the results we get will need to be harmonized prior to analysis.

---

## Create final file

In [45]:
# Attach the FDA product details to every cleaned (NDC, RXCUI) mapping.
# NOTE(review): ndc_info has 246372 rows but only 246357 distinct
# NDCPACKAGECODE values, so this left join can emit more rows than `clean`
# contains. Confirm whether the repeated FDA rows are intended.
final = pd.merge(clean, ndc_info, how="left", on="NDCPACKAGECODE")

In [46]:
final.shape

(246695, 20)

In [47]:
final.head()

Unnamed: 0,NDCPACKAGECODE,rxcui,suppress,PRODUCTNDC,PACKAGEDESCRIPTION,PRODUCTTYPENAME,PROPRIETARYNAME,NONPROPRIETARYNAME,DOSAGEFORMNAME,ROUTENAME,MARKETINGCATEGORYNAME,APPLICATIONNUMBER,LABELERNAME,SUBSTANCENAME,ACTIVE_NUMERATOR_STRENGTH,ACTIVE_INGRED_UNIT,PHARM_CLASSES,DEASCHEDULE,NDC_EXCLUDE_FLAG,LISTING_RECORD_CERTIFIED_THROUGH
0,0002-0800-01,540930,False,0002-0800,1 VIAL in 1 CARTON (0002-0800-01) > 10 mL in ...,HUMAN OTC DRUG,Sterile Diluent,diluent,"INJECTION, SOLUTION",SUBCUTANEOUS,NDA,NDA018781,Eli Lilly and Company,WATER,1.0,mL/mL,,,N,20191231.0
1,0002-1200-30,1297712,False,0002-1200,"1 VIAL, MULTI-DOSE in 1 CAN (0002-1200-30) > ...",HUMAN PRESCRIPTION DRUG,Amyvid,Florbetapir F 18,"INJECTION, SOLUTION",INTRAVENOUS,NDA,NDA202008,Eli Lilly and Company,FLORBETAPIR F-18,51.0,mCi/mL,"Radioactive Diagnostic Agent [EPC],Positron Em...",,N,20191231.0
2,0002-1200-50,1297712,False,0002-1200,"1 VIAL, MULTI-DOSE in 1 CAN (0002-1200-50) > ...",HUMAN PRESCRIPTION DRUG,Amyvid,Florbetapir F 18,"INJECTION, SOLUTION",INTRAVENOUS,NDA,NDA202008,Eli Lilly and Company,FLORBETAPIR F-18,51.0,mCi/mL,"Radioactive Diagnostic Agent [EPC],Positron Em...",,N,20191231.0
3,0002-1407-01,853004,False,0002-1407,10 mL in 1 VIAL (0002-1407-01),HUMAN PRESCRIPTION DRUG,Quinidine Gluconate,Quinidine Gluconate,SOLUTION,INTRAVENOUS,NDA,NDA007529,Eli Lilly and Company,QUINIDINE GLUCONATE,80.0,mg/mL,"Antiarrhythmic [EPC],Cytochrome P450 2D6 Inhib...",,N,20191231.0
4,0002-1433-61,1551300,False,0002-1433,2 SYRINGE in 1 CARTON (0002-1433-61) > .5 mL ...,HUMAN PRESCRIPTION DRUG,Trulicity,Dulaglutide,"INJECTION, SOLUTION",SUBCUTANEOUS,BLA,BLA125469,Eli Lilly and Company,DULAGLUTIDE,0.75,mg/.5mL,"GLP-1 Receptor Agonist [EPC],Glucagon-Like Pep...",,N,20201231.0


## Save to file

In [48]:
# Write the merged table as a tab-separated file without the row index.
merged_output_path = "../../pipeline/merged_ndc_info.tsv"
final.to_csv(merged_output_path, sep="\t", index=False)