In [1]:
import pathlib
import xml.etree.ElementTree as ET

import mysql.connector
import numpy as np
import pandas as pd
import requests
import sqlalchemy

In [2]:
# Load MySQL password from file
with open('../mysql_password.txt') as f:
    password = f.readline().strip()

## Command used to build `drugs` table

```mysql
CREATE TABLE drugs (
    rxnorm_cui int NOT NULL,
    name varchar(255),
    PRIMARY KEY (rxnorm_cui)
);
```

# Format data for insertion into DB

In [3]:
# Mapping between the index of a drug in the vector (or matrix, depending on context)
drug_vector = np.load('/data1/home/rav7008/formike/data/unique_ingredients.npy').astype(int)

# Mapping between RxNorm CUI and concept_id and name
drug_concepts_df = pd.read_csv('/data1/home/rav7008/formike/data/drug_concept_names.csv')

# Merge these two sources of information. Want names but only for drugs that appear in the vector
drugs_df = (
    pd.DataFrame({'drug': drug_vector})
    .merge(drug_concepts_df, left_on='drug', right_on='concept_code', how='left')
)

drugs_df.head()

Unnamed: 0,drug,drug_concept_id,concept_name,concept_code
0,314826,19080523.0,"silicon dioxide, colloidal",314826.0
1,8167,19132892.0,phenylhydrazine,8167.0
2,1312358,42903427.0,Aldosterone,1312358.0
3,4024,745268.0,"ergoloid mesylates, USP",4024.0
4,4025,1145379.0,Ergotamine,4025.0


In [4]:
# There are missing values (drugs without a mapping to RxNorm CUI in drug_concept_names.csv)
drugs_df[drugs_df['concept_code'].isnull()]

Unnamed: 0,drug,drug_concept_id,concept_name,concept_code
847,100278,,,
1057,857953,,,
1058,1427022,,,
1059,1303851,,,
1060,1304122,,,
1213,386055,,,
1679,1731071,,,
1895,596723,,,
2804,1727875,,,
3281,644634,,,


In [5]:
def get_rxnorm_name(cui):
    '''Use the RxNav API to find concept names for RxNorm CUIs without a name in the file'''
    res = requests.get(f'https://rxnav.nlm.nih.gov/REST/rxcui/{cui}/status')
    return ET.fromstring(res.content).findtext('rxcuiStatus/minConceptGroup/minConcept/name')

# Fill missing values using the API
drugs_df.loc[drugs_df['concept_code'].isnull(), 'concept_name'] = drugs_df.loc[
    drugs_df['concept_code'].isnull(), 'drug'].apply(get_rxnorm_name)

# See mappings made using API. (`concept_name` is now filled in)
drugs_df[drugs_df['concept_code'].isnull()]

Unnamed: 0,drug,drug_concept_id,concept_name,concept_code
847,100278,,Lactobacillus casei rhamnosus,
1057,857953,,influenza B virus vaccine B/Brisbane/60/2008 a...,
1058,1427022,,"Influenza B virus vaccine, B-Massachusetts-2-2...",
1059,1303851,,"Influenza A virus vaccine, A-California-7-2009...",
1060,1304122,,"Influenza A virus vaccine, A-Victoria-361-2011...",
1213,386055,,lauromacrogols,
1679,1731071,,fosaprepitant,
1895,596723,,cerivastatin,
2804,1727875,,Tetanus immune globulin,
3281,644634,,alexitol,


There are duplicate drugs, but the primary key in the database of the drug table is the `rxnorm_cui`. Below I drop duplicates with respect to both `rxnorm_cui` and `name`. Since this results in the data being inserted into the `drugs` MySQL table, this indicates that there is at least a unique mapping between `rxnorm_cui` and `name` in the data.

In [6]:
drugs_df = (
    drugs_df
    .filter(items=['drug', 'concept_name'])
    .rename(columns={'drug': 'rxnorm_cui', 'concept_name': 'name'})
    .drop_duplicates(subset=['rxnorm_cui', 'name'])
)

drugs_df.head(2)

Unnamed: 0,rxnorm_cui,name
0,314826,"silicon dioxide, colloidal"
1,8167,phenylhydrazine


# Connect to DB and insert data

In [7]:
engine = sqlalchemy.create_engine(f"mysql+mysqlconnector://mnz2108:{password}@localhost/effect_nsides")

engine.execute('''
CREATE TABLE drugs (
    rxnorm_cui int NOT NULL,
    name varchar(255),
    PRIMARY KEY (rxnorm_cui)
);
''')

# Insert data into the database
(
    drugs_df
    .to_sql(
        name='drugs',
        con=engine,
        if_exists='replace',
        index=False,
    )
)

In [8]:
# Verify that the data can be re-extracted from the table
result_df = pd.read_sql(
    sql='SELECT * FROM drugs;',
    con=engine,
)

result_df.head()

Unnamed: 0,rxnorm_cui,name
0,314826,"silicon dioxide, colloidal"
1,8167,phenylhydrazine
2,1312358,Aldosterone
3,4024,"ergoloid mesylates, USP"
4,4025,Ergotamine
