# Create the final mapping from NDCs to active ingredients

2019-05-07

In [1]:
import pandas as pd
import itertools

## Read consolidated NDC to RXCUI active ingredient mappings

In [2]:
# Load the consolidated NDC -> RXCUI -> ingredient table.
# The NDC codes and the ingredient lists are identifiers, not numbers:
# `active_ingredients` mixes single ids ("11295", "-1") with comma-separated
# lists, and later cells call `.split(",")` on every value. Pinning both
# columns to `str` stops pandas' chunked type inference from leaving some
# values as ints, which would make that `.split` fail.
mapping = pd.read_csv(
    "../../pipeline/ingredients/ndc_tables/ndc_to_rxcui_map_version_2.tsv",
    sep='\t',
    dtype={"NDCPACKAGECODE": str, "active_ingredients": str},
)

In [3]:
# (rows, columns) of the loaded NDC mapping table.
mapping.shape

(242988, 4)

In [4]:
# Preview the first rows; "-1" in `active_ingredients` appears to be a
# placeholder for "no ingredient found" -- TODO confirm against the upstream step.
mapping.head()

Unnamed: 0,NDCPACKAGECODE,rxcui,suppress,active_ingredients
0,0002-0800-01,540930,False,11295
1,0002-1200-30,1297712,False,-1
2,0002-1200-50,1297712,False,-1
3,0002-1407-01,853004,False,35220
4,0002-1433-61,1551300,False,1551291


# Convert precise ingredients to regular ingredients

### Read term types

In [5]:
# RxNorm concept table: one row per atom, with its term type (`tty`).
conso = pd.read_csv(
    "../../pipeline/rxnorm/rxconso_info.tsv",
    delimiter="\t",
)

In [6]:
# Preview three rows to show the columns used below (`rxcui`, `tty`).
conso.head(3)

Unnamed: 0,rxcui,rxaui,tty,str,suppress,cvf
0,38,829,BN,Parlodel,N,4096.0
1,44,947,IN,Mesna,N,4096.0
2,61,1424,IN,beta-Alanine,N,4096.0


### Read RXCUI relationships

In [7]:
# RxNorm relationship table: directed edges between RXCUIs, labelled by `rela`.
rels = pd.read_csv(
    "../../pipeline/rxnorm/rxcui_rels.tsv",
    delimiter="\t",
)

In [8]:
# Preview three rows to show the edge columns (`rxcui1`, `rxcui2`, `rela`).
rels.head(3)

Unnamed: 0,rxcui1,rel,rxcui2,rela
0,38,RB,1760,has_tradename
1,38,RO,105050,has_ingredient
2,38,RO,105445,has_ingredient


### Check that precise ingredients have `form_of` edges

In [9]:
# Keep only the `form_of` edges; the checks below use these to link each
# precise ingredient (`rxcui2`) to its base ingredient (`rxcui1`).
form_of = rels[rels["rela"] == "form_of"]

In [10]:
# The set of PIN-typed RXCUIs must equal the set of `form_of` targets.
set(conso.loc[conso["tty"] == "PIN", "rxcui"]) == set(form_of["rxcui2"])

True

### Check that precise ingredients go to ingredient nodes

In [11]:
# Every `form_of` source must be an IN-typed (ingredient) RXCUI.
set(form_of["rxcui1"]).issubset(set(conso.loc[conso["tty"] == "IN", "rxcui"]))

True

### Check that there's only one `form_of` edge for each precise ingredient

In [12]:
# Count the distinct `rxcui1` values per `rxcui2`, then tabulate those counts.
# A single row with index 1 means every precise ingredient maps to exactly one
# base ingredient, so the dict built in the next section loses nothing.
form_of.groupby("rxcui2")["rxcui1"].nunique().value_counts()

1    2853
Name: rxcui1, dtype: int64

---

## Convert back to regular ingredients

In [13]:
# Lookup from precise-ingredient RXCUI to base-ingredient RXCUI, keyed and
# valued as strings so it can be applied to the comma-separated id lists.
conv = dict(
    zip(
        map(str, form_of["rxcui2"]),
        map(str, form_of["rxcui1"]),
    )
)

In [14]:
def convert(pins):
    """Translate a comma-separated id list through the `conv` lookup.

    Each token of `pins` is replaced by `conv[token]` when present and kept
    unchanged otherwise. Duplicates produced by the translation are collapsed,
    and the result is re-joined with commas in lexicographic (string) order.
    """
    translated = {conv.get(token, token) for token in pins.split(",")}
    ordered = sorted(translated)
    return ",".join(ordered)

# Keep the original ids under `precise_ingredients`, then append a new
# `active_ingredients` column holding the converted ids.
res = mapping.rename(columns={"active_ingredients": "precise_ingredients"})
res = res.assign(active_ingredients=res["precise_ingredients"].map(convert))

In [15]:
# Same row count as `mapping`, with one extra column for the converted ids.
res.shape

(242988, 5)

In [16]:
# Preview: rows whose two ingredient columns differ were rewritten by `convert`.
res.head()

Unnamed: 0,NDCPACKAGECODE,rxcui,suppress,precise_ingredients,active_ingredients
0,0002-0800-01,540930,False,11295,11295
1,0002-1200-30,1297712,False,-1,-1
2,0002-1200-50,1297712,False,-1,-1
3,0002-1407-01,853004,False,35220,9068
4,0002-1433-61,1551300,False,1551291,1551291


---

## Analyze changes

### Number of differences

In [17]:
# True = the conversion changed this row's ingredient list.
res["precise_ingredients"].ne(res["active_ingredients"]).value_counts()

False    179664
True      63324
dtype: int64

## Precise ingredients

### Number of original ingredients

In [18]:
# Per-NDC lists of original ingredient ids, skipping rows marked "-1".
pins = res.loc[
    res["precise_ingredients"] != "-1", "precise_ingredients"
].str.split(",")

In [19]:
# Number of distinct original ingredient ids across all NDCs.
len({ingredient for ingredient_list in pins for ingredient in ingredient_list})

2893

### Number of ingredients per NDC

In [20]:
# How many NDCs list 1, 2, 3, ... original ingredients (top rows only).
pins.apply(len).value_counts().head()

1    167757
2     15642
3      5238
4      2073
5       250
Name: precise_ingredients, dtype: int64

### Ingredients which show up in the greatest number of NDCs

In [21]:
# Flatten the per-NDC lists and rank the ids by how many NDCs contain them.
pd.Series(
    [ingredient for ingredient_list in pins for ingredient in ingredient_list]
).value_counts().head()

161     8319
7806    6754
448     4868
5640    3829
6750    3717
dtype: int64

## Simplified ingredients

### Number of simplified ingredients

In [22]:
# Per-NDC lists of converted ingredient ids, skipping rows marked "-1".
simp = res.loc[
    res["active_ingredients"] != "-1", "active_ingredients"
].str.split(",")

In [23]:
# Number of distinct converted ingredient ids across all NDCs.
len({ingredient for ingredient_list in simp for ingredient in ingredient_list})

2452

### Number of ingredients per NDC

In [24]:
# How many NDCs list 1, 2, 3, ... converted ingredients (top rows only).
simp.apply(len).value_counts().head()

1    168339
2     15482
3      5172
4      1798
5       273
Name: active_ingredients, dtype: int64

### Ingredients which show up in the greatest number of NDCs

In [25]:
# Flatten the per-NDC lists and rank the converted ids by NDC frequency.
pd.Series(
    [ingredient for ingredient_list in simp for ingredient in ingredient_list]
).value_counts().head()

161     8319
7806    6754
448     4868
5640    3832
6750    3726
dtype: int64

---

## Finalize table

In [26]:
# Use a short, lowercase name for the NDC column in the published table.
final = res.rename({"NDCPACKAGECODE": "ndc"}, axis="columns")

In [27]:
# The rename must not change the table's dimensions.
final.shape

(242988, 5)

In [28]:
# Preview the table exactly as it will be written out.
final.head()

Unnamed: 0,ndc,rxcui,suppress,precise_ingredients,active_ingredients
0,0002-0800-01,540930,False,11295,11295
1,0002-1200-30,1297712,False,-1,-1
2,0002-1200-50,1297712,False,-1,-1
3,0002-1407-01,853004,False,35220,9068
4,0002-1433-61,1551300,False,1551291,1551291


In [29]:
# Number of distinct NDC codes. NOTE(review): the displayed count (242966) is
# lower than the row count (242988), so some NDC codes appear on more than one
# row -- confirm whether downstream consumers expect `ndc` to be unique.
final["ndc"].nunique()

242966

## Save to file

In [30]:
# Write the finished table as a tab-separated file without the row index.
final.to_csv(
    "../../output/ndc_active_ingredients.tsv",
    sep="\t",
    index=False,
)