# `DRUG_CONCEPT`

`DRUG_CONCEPT` is a table in the `effect_nsides` database that stores drugs.
This table only stores those drugs which appear in `DRUG_EXPOSURE`.
The schema for this table is the following:

```mysql
CREATE TABLE DRUG_CONCEPT (
    concept_id int,
    concept_name varchar(255),
    rxnorm_concept_id int,
    drugbank_concept_id varchar(255),
    chebi_concept_id int
)
```

Fields:
* `concept_id` is the OMOP CDM `concept_id` for each drug
* `concept_name` is the drug's OMOP CDM `concept_name`
* `rxnorm_concept_id` is the drug's ID from RxNorm
* `drugbank_concept_id` is the drug's ID from DrugBank
* `chebi_concept_id` is the drug's ID from ChEBI

Note that while all drugs are guaranteed to have RxNorm and OMOP CDM IDs, a sizable fraction of drugs could not be mapped to DrugBank or ChEBI identifiers.

In [1]:
import numpy as np
import pandas as pd

## Load RxNorm codes that were used

In [2]:
# RxNorm codes that were used, as stored on disk
drug_id_vector = np.load('../../data/meta_formatted/drug_id_vector.npy')

# Single-column frame of codes with repeated values removed
drug_code_df = pd.DataFrame(
    drug_id_vector, columns=['rxnorm_concept_id']
).drop_duplicates()

# Number of distinct codes
print(len(drug_code_df))

drug_code_df.head(2)

3453


Unnamed: 0,rxnorm_concept_id
0,314826
1,8167


## Load RxNorm - OMOP concept_id mapping (Map from ATHENA)

In [3]:
# ATHENA concept export (tab-separated). Every column is read as text first;
# only rows from the RxNorm vocabulary are kept, and their `concept_code`
# is then converted to int so it can be joined against integer RxNorm codes.
athena_rxnorm_df = (
    pd.read_csv('../../data/external_maps/RxNorm.csv', sep='\t', dtype=str)
    .loc[lambda df: df['vocabulary_id'] == 'RxNorm']
    .astype({'concept_code': int})
)

athena_rxnorm_df.head(2)

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
1,1510703,Helleborus extract,Drug,RxNorm,Ingredient,S,2047647,20180702,20991231,
2,1510746,{28 (Norethindrone 0.35 MG Oral Tablet) } Pack...,Drug,RxNorm,Branded Pack,S,2043465,20180702,20991231,


In [4]:
# Attach the OMOP concept_id / concept_name to every RxNorm code in use.
# A left join keeps every code, so unmapped codes would show up as nulls.
drug_concept = pd.merge(
    drug_code_df,
    athena_rxnorm_df,
    how='left',
    left_on='rxnorm_concept_id',
    right_on='concept_code',
).filter(items=['concept_id', 'concept_name', 'rxnorm_concept_id'])

# Every RxNorm ID must have been mapped to an OMOP CDM concept_id
assert drug_concept.notnull().all().all()

# Number of distinct OMOP concept_ids obtained
print(drug_concept['concept_id'].nunique())

drug_concept.head(2)

3453


Unnamed: 0,concept_id,concept_name,rxnorm_concept_id
0,19080523,"silicon dioxide, colloidal",314826
1,19132892,phenylhydrazine,8167


## Load RxNorm to DrugBank mapping (Map from RxNorm)

In [5]:
# Column layout of RXNCONSO.RRF, in file order. See:
# https://www.nlm.nih.gov/research/umls/rxnorm/docs/techdoc.html#s12_0
column_names = [
    'RXCUI', 'LAT', 'TS', 'LUI', 'STT', 'SUI',
    'ISPREF', 'RXAUI', 'SAUI', 'SCUI', 'SDUI', 'SAB',
    'TTY', 'CODE', 'STR', 'SRL', 'SUPPRESS', 'CVF',
]

# The file is pipe-delimited and has no header row. `index_col=False` stops
# pandas from treating the first column as the index (presumably needed
# because each RRF row ends with a trailing '|' -- TODO confirm).
map_df = pd.read_csv(
    '../../data/external_maps/RXNCONSO.RRF',
    sep='|',
    header=None,
    names=column_names,
    index_col=False,
)

In [6]:
# For each RXCUI, count its distinct DrugBank codes, then tabulate how many
# RXCUIs have 1, 2, 3, ... codes. Anything above 1 is an ambiguous mapping.
(
    map_df
    .loc[lambda df: df['SAB'] == 'DRUGBANK']
    .groupby('RXCUI')['CODE']
    .nunique()
    .value_counts()
)

1    8784
2      63
4       6
3       4
5       3
6       2
8       2
7       1
Name: CODE, dtype: int64

In [7]:
# Build a one-row-per-RXCUI map from RxNorm to DrugBank.
#
# Some RXCUIs carry several DrugBank rows, so one representative is picked per
# RXCUI: rows whose TTY is 'IN' are preferred over 'FSY'/'SY' rows, and any
# other TTY maps to NaN, which sorts last.
rxnorm_to_drugbank = (
    map_df
    .query('SAB == "DRUGBANK"')
    .drop(['LAT', 'TS', 'LUI', 'STT', 'SUI', 'ISPREF', 'SCUI', 'SDUI',
           'RXAUI', 'SAUI', 'SAB', 'SRL', 'SUPPRESS', 'CVF',], axis=1)
    .assign(sort_index=lambda df: df['TTY'].map({'IN': 1, 'FSY': 2, 'SY': 2}))
    # Use a stable sort: rows with the same priority keep their file order, so
    # the representative chosen below is reproducible instead of depending on
    # how the default (unstable) quicksort happens to arrange ties.
    .sort_values('sort_index', ascending=True, kind='mergesort')
    .groupby('RXCUI')
    # NOTE: `first()` takes the first non-null value of each column per group.
    .first()
    .reset_index()
    .drop(['sort_index', 'TTY'], axis=1)
    .rename(columns={'RXCUI': 'rxnorm_concept_id', 'CODE': 'drugbank_concept_id', 
                     'STR': 'drugbank_concept_name'})
)

rxnorm_to_drugbank.head(2)

Unnamed: 0,rxnorm_concept_id,drugbank_concept_id,drugbank_concept_name
0,60,DB01509,Tenamfetamine
1,74,DB02362,Aminobenzoic acid


## Load DrugBank links file (from DrugBank)

Of interest, this contains DrugBank IDs, ChEBI IDs, and drug names.

In [8]:
# DrugBank links file, reduced to the DrugBank ID, drug name and ChEBI ID,
# plus a lowercased copy of the name (`name_lower`) used for name-based joins.
db_df = (
    pd.read_csv('../../data/external_maps/drug links.csv')
    # `.str.lower()` propagates missing names as NaN instead of raising the
    # AttributeError that calling `.lower()` on a float NaN would produce.
    .assign(name_lower=lambda df: df['Name'].str.lower())
    .rename(columns={'DrugBank ID': 'drugbank_concept_id', 'Name': 'drugbank_concept_name', 
                     'ChEBI ID': 'chebi_concept_id'})
    .filter(items=['drugbank_concept_id', 'drugbank_concept_name', 'chebi_concept_id', 'name_lower'])
)

db_df.head(2)

Unnamed: 0,drugbank_concept_id,drugbank_concept_name,chebi_concept_id,name_lower
0,DB00001,Lepirudin,,lepirudin
1,DB00002,Cetuximab,,cetuximab


In [9]:
# Columns identifying a drug on the OMOP/RxNorm side, and the full column set
# of the final DRUG_CONCEPT table.
id_columns = ['concept_id', 'concept_name', 'rxnorm_concept_id']
table_columns = id_columns + ['drugbank_concept_id', 'chebi_concept_id']

# Map through RxNorm CUIs if possible
mapped_by_rxnorm = (
    drug_concept
    # Start from the base columns only: this cell rebinds `drug_concept`
    # below, so on a re-run the frame already carries DrugBank/ChEBI columns
    # that would otherwise collide with the ones merged in here.
    .filter(items=id_columns)
    .merge(rxnorm_to_drugbank, how='left', on='rxnorm_concept_id')
    .drop(['drugbank_concept_name'], axis=1)
    .merge(db_df, on='drugbank_concept_id', how='left')
    .filter(items=table_columns)
)

# Map remaining terms through their names (taken to lowercase)
# NOTE(review): both merges against `db_df` will multiply drug rows if
# `drugbank_concept_id` / `name_lower` are not unique in `db_df` -- verify.
unmapped_rows = mapped_by_rxnorm['drugbank_concept_id'].isnull()
mapped_by_name = (
    mapped_by_rxnorm
    .loc[unmapped_rows, id_columns]
    .assign(name_lower=lambda df: df['concept_name'].str.lower())
    .merge(db_df, how='left', on='name_lower')
    .filter(items=table_columns)
)

# Combine the two mapping paths: rows mapped via RxNorm first, then every row
# that went through name matching (whether or not a match was found).
drug_concept = pd.concat([mapped_by_rxnorm.dropna(subset=['drugbank_concept_id']), 
                          mapped_by_name], ignore_index=True, sort=False)

# `chebi_concept_id` is held as float because it contains missing values;
# `float_format='%.0f'` writes it as a plain integer (e.g. 30563, not 30563.0)
# to match the integer column in the table schema. Missing values stay empty.
drug_concept.to_csv('../../data/tables/drug_concept.csv.xz', index=False, compression='xz',
                    float_format='%.0f')

drug_concept.head(2)

Unnamed: 0,concept_id,concept_name,rxnorm_concept_id,drugbank_concept_id,chebi_concept_id
0,19080523,"silicon dioxide, colloidal",314826,DB11132,30563.0
1,42903427,Aldosterone,1312358,DB04630,27584.0


In [10]:
# Share of rows in the final table whose DrugBank ID is missing
db_un = int(drug_concept['drugbank_concept_id'].isnull().sum()) / len(drug_concept)

# Share of rows in the final table whose ChEBI ID is missing
ch_un = int(drug_concept['chebi_concept_id'].isnull().sum()) / len(drug_concept)

# Report both as percentages
print(f'DrugBank {100 * db_un :.2f}% unmapped, ChEBI {100 * ch_un :.2f}% unmapped')

DrugBank 20.79% unmapped, ChEBI 38.26% unmapped
