# Extract the term types (`tty`) of RxNorm concepts in the semantic network

2019-05-07

We will use the term types to further refine which nodes to include as ingredients in the active ingredient finding algorithm.

In [1]:
import csv
from collections import defaultdict

import pandas as pd

## Which concept sources contain all the RXCUIs in the semantic network?

The source that covers every RXCUI in the semantic network will be used to look up the term types.

## Read relationships

In [2]:
# Load the tab-separated relationship table; each row links `rxcui1` to `rxcui2`
# through a relationship (`rel`) and a more specific relationship attribute (`rela`).
rels = pd.read_csv("../../pipeline/rxnorm/rxcui_rels.tsv", sep='\t')

In [3]:
rels.shape

(1489730, 4)

In [4]:
rels.head()

Unnamed: 0,rxcui1,rel,rxcui2,rela
0,38,RB,1760,has_tradename
1,38,RO,105050,has_ingredient
2,38,RO,105445,has_ingredient
3,38,RO,105446,has_ingredient
4,38,RO,105447,has_ingredient


In [5]:
# Distinct RXCUIs that appear in the `rxcui1` column of at least one relationship.
# NOTE(review): `rxcui2` is not consulted here — presumably every concept in the
# network also occurs as `rxcui1` of some row; confirm against the relationship file.
rel_cuis = set(rels["rxcui1"])

In [6]:
len(rel_cuis)

195978

## Read `RXNCONSO.RRF` file

In [7]:
# RXNCONSO.RRF is read as a headerless, pipe-delimited file, so the column names
# are supplied by hand. The extra "temp" name presumably absorbs the empty field
# produced by a trailing '|' on each row (TODO confirm); it and every other
# all-empty column are removed by the final `dropna`.
rxconso = (pd
    .read_csv(
        "../../data/rxnorm/RXNCONSO.RRF",
        sep='|',
        names=[
            "rxcui", "lat", "ts", "lui", "stt",
            "sui", "ispref", "rxaui", "saui", "scui", "sdui",
            "sab", "tty", "code", "str", "srl",
            "suppress", "cvf",
            "temp"
        ],
        # Treat '"' as an ordinary character. With pandas' default quoting, a
        # field that starts with '"' opens a quoted section that can swallow the
        # following '|' delimiters and line breaks, silently merging fields/rows.
        quoting=csv.QUOTE_NONE,
        # Only a truly empty field counts as missing. By default pandas also maps
        # literal strings such as "NA", "NULL" or "nan" to NaN, which would
        # corrupt codes or names that happen to be spelled that way.
        keep_default_na=False,
        na_values=[""],
    )
    .dropna(axis=1, how="all")
)

In [8]:
rxconso.shape

(1028082, 12)

In [9]:
rxconso.head()

Unnamed: 0,rxcui,lat,rxaui,saui,scui,sdui,sab,tty,code,str,suppress,cvf
0,3,ENG,8717795,,58488005,,SNOMEDCT_US,PT,58488005,"1,4-alpha-Glucan branching enzyme",N,
1,3,ENG,8717796,,58488005,,SNOMEDCT_US,FN,58488005,"1,4-alpha-Glucan branching enzyme (substance)",N,
2,3,ENG,8717808,,58488005,,SNOMEDCT_US,SY,58488005,"Amylo-(1,4,6)-transglycosylase",N,
3,3,ENG,8718164,,58488005,,SNOMEDCT_US,SY,58488005,Branching enzyme,N,
4,19,ENG,10794494,,112116001,,SNOMEDCT_US,SY,112116001,17-hydrocorticosteroid,N,


In [10]:
rxconso.isnull().sum()

rxcui             0
lat               0
rxaui             0
saui         710554
scui         562008
sdui        1004564
sab               0
tty               0
code              1
str               0
suppress          0
cvf          782872
dtype: int64

## Number of unique RXCUIs in each source

In [11]:
# Count the distinct RXCUIs contributed by each source (`sab`), largest first.
(rxconso
    .groupby("sab")["rxcui"]
    .nunique()
    .sort_values(ascending=False)
)

sab
RXNORM         203436
MMSL            67387
MTHSPL          60443
MMX             55761
NDDF            43520
SNOMEDCT_US     36874
VANDF           34549
GS              30299
MSH              9102
DRUGBANK         7916
ATC              5485
USP              4359
CVX               439
MTHCMSFRF           8
Name: rxcui, dtype: int64

## Number of relationship RXCUIs in each source

In [12]:
# For every source (`sab`), count how many semantic-network RXCUIs it contains.
# The per-source values are gathered in plain lists so that `overlap` is only
# ever bound to the finished DataFrame, and so that the frame always has both
# columns (sorting no longer fails if there are no groups at all).
sab_names = []
common_counts = []
for sab, df in rxconso.groupby("sab"):
    sab_names.append(sab)
    common_counts.append(len(rel_cuis & set(df["rxcui"])))

overlap = (pd
    .DataFrame({"sab": sab_names, "num_common": common_counts})
    .sort_values("num_common", ascending=False)
    .reset_index(drop=True)
)

In [13]:
overlap

Unnamed: 0,sab,num_common
0,RXNORM,195978
1,MMSL,36549
2,MMX,27203
3,MTHSPL,23157
4,GS,20951
5,NDDF,19518
6,VANDF,16813
7,SNOMEDCT_US,13011
8,MSH,7722
9,DRUGBANK,3718


In [14]:
# True when every semantic-network RXCUI also occurs under the RXNORM source.
rxnorm_cuis = rxconso.loc[rxconso["sab"] == "RXNORM", "rxcui"]
rel_cuis.issubset(rxnorm_cuis)

True

We have confirmed that only the RXNORM subset of concepts contains all the concepts in the semantic network.

---

# Keep only RXNORM concepts

In [15]:
# Restrict to rows whose source is RXNORM, discard columns that are entirely
# empty within that subset, and renumber the rows from zero.
is_rxnorm = rxconso["sab"] == "RXNORM"
good = (
    rxconso[is_rxnorm]
    .dropna(axis="columns", how="all")
    .reset_index(drop=True)
)

In [16]:
good.shape

(317528, 11)

In [17]:
good.head()

Unnamed: 0,rxcui,lat,rxaui,saui,scui,sab,tty,code,str,suppress,cvf
0,38,ENG,829,829.0,38,RXNORM,BN,38,Parlodel,N,4096.0
1,44,ENG,947,947.0,44,RXNORM,IN,44,Mesna,N,4096.0
2,61,ENG,1424,1424.0,61,RXNORM,IN,61,beta-Alanine,N,4096.0
3,73,ENG,2458041,2458041.0,73,RXNORM,IN,73,Docosahexaenoate,N,4096.0
4,74,ENG,1684,1684.0,74,RXNORM,IN,74,4-Aminobenzoic Acid,N,4096.0


In [18]:
good.isnull().sum()

rxcui            0
lat              0
rxaui            0
saui             0
scui             0
sab              0
tty              0
code             0
str              0
suppress         0
cvf         184319
dtype: int64

In [19]:
good.dtypes

rxcui         int64
lat          object
rxaui         int64
saui        float64
scui         object
sab          object
tty          object
code         object
str          object
suppress     object
cvf         float64
dtype: object

### Verify data columns

In [20]:
# Does `saui` (stored as float) carry exactly the same identifier as `rxaui`?
good["saui"].astype("int64").eq(good["rxaui"]).all()

True

In [21]:
# Does `scui` (stored as object) carry exactly the same identifier as `rxcui`?
good["scui"].astype("int64").eq(good["rxcui"]).all()

True

In [22]:
good["lat"].value_counts()

ENG    317528
Name: lat, dtype: int64

In [23]:
# Does `code` (stored as object) carry exactly the same identifier as `rxcui`?
good["code"].astype("int64").eq(good["rxcui"]).all()

True

These columns are redundant: `saui` duplicates `rxaui`, `scui` and `code` duplicate `rxcui`, and `lat` is always `ENG`.

---

In [24]:
good["suppress"].value_counts()

N    187845
O    124236
E      5447
Name: suppress, dtype: int64

## Drop irrelevant information

In [25]:
# Keep only the identifier, term type, name, and flag columns; the columns
# shown above to be redundant (`lat`, `saui`, `scui`, `code`) and the constant
# `sab` are left out.
kept_columns = ["rxcui", "rxaui", "tty", "str", "suppress", "cvf"]
final = good.loc[:, kept_columns]

In [26]:
final.shape

(317528, 6)

In [27]:
final.head()

Unnamed: 0,rxcui,rxaui,tty,str,suppress,cvf
0,38,829,BN,Parlodel,N,4096.0
1,44,947,IN,Mesna,N,4096.0
2,61,1424,IN,beta-Alanine,N,4096.0
3,73,2458041,IN,Docosahexaenoate,N,4096.0
4,74,1684,IN,4-Aminobenzoic Acid,N,4096.0


In [28]:
final.isnull().sum()

rxcui            0
rxaui            0
tty              0
str              0
suppress         0
cvf         184319
dtype: int64

## Save to file

In [29]:
# Persist the selected columns as a tab-separated file; `index=False` keeps the
# positional row index out of the output.
final.to_csv("../../pipeline/rxnorm/rxconso_info.tsv", sep='\t', index=False)