In [1]:
import io
import pathlib
import re
import tarfile

import mysql.connector
import numpy as np
import pandas as pd
import scipy.io
import sqlalchemy
import tqdm

In [2]:
# Read the MySQL password: it is the first line of the credentials file,
# with surrounding whitespace (including the trailing newline) removed.
password_path = pathlib.Path('../mysql_password.txt')
with password_path.open() as f:
    first_line = f.readline()
password = first_line.strip()

## Command used to build `onesides` table

```mysql
CREATE TABLE onesides (
    drug_cui int NOT NULL,
    outcome_id int NOT NULL,
    PRR double,
    PRR_error double,
    PRIMARY KEY (drug_cui, outcome_id)
);
```

# Format data for insertion into DB

In [3]:
root = pathlib.Path('/data1/home/rav7008/formike/')

# NOTE: this archive handle is deliberately left open; later cells read
# individual members from `tf`.
tf = tarfile.open(root / 'all_PRRs.tar.gz', mode='r:gz')

# Load the outcomes vector to allow ID lookup from index
# NOTE(review): allow_pickle=True unpickles arbitrary objects from the file;
# only acceptable because this file is assumed to be trusted — confirm.
outcomes_vector = np.load((root / 'data/all_outcome_strings.npy'), allow_pickle=True)

# Load the ingredients vector to allow ID lookup from index
ingredients_vector = np.load((root / 'data/unique_ingredients.npy'), encoding='bytes').astype(int)

# Vector index to CUI, where only indices that are the first occurrence of the CUI are present.
# groupby(...).min() keeps the smallest position for every distinct CUI; the
# result is then flipped into a {position: drug_cui} dictionary.
index_to_cui = (
    pd.DataFrame(ingredients_vector, columns=['drug_cui'])
    .reset_index()
    .groupby('drug_cui')
    .min()
    .reset_index()
    .set_index('index')
    .loc[:, 'drug_cui']
    .to_dict()
)

# List of indices corresponding to unique RxNorm CUIs
unique_ingredients = sorted(set(index_to_cui.keys()))


# Find the unique drug indices with a PRR file present
# (only using index of first appearance for each RxNorm CUI)
subfiles = tf.getnames()
# Skip members whose name contains 'PRRs'; the remaining names are parsed as
# '<prefix>__<index>.npy' (the 4-character extension is stripped before int()).
prr_subfiles = [path for path in subfiles if 'PRRs' not in path]
drug_indices = [int(path.split('__')[1][:-4]) for path in prr_subfiles]
# Membership is tested against the dict (hash lookup) instead of the
# `unique_ingredients` list (linear scan per element); both hold the same keys.
drug_indices = sorted({drug_index for drug_index in drug_indices if drug_index in index_to_cui})

In [4]:
# Build one tidy frame per drug and concatenate them a single time at the end.
# Calling pd.concat inside the loop would re-copy every previously accumulated
# row on each iteration, making the total cost quadratic in the number of rows.
drug_frames = []

for drug_index in tqdm.tqdm_notebook(drug_indices):
    # Each archive member is loaded fully into memory and flattened so that the
    # position in the array can be used as the outcome index below.
    prr = np.load(io.BytesIO(tf.extractfile(f'PRR__{drug_index}.npy').read())).flatten()
    error = np.load(io.BytesIO(tf.extractfile(f'PRRs__{drug_index}.npy').read())).flatten()

    drug_df = (
        pd.DataFrame({'PRR': prr, 'PRR_error': error})
        .reset_index()
        .dropna()
        .rename(columns={'index': 'outcome_index'})
        # Drop rows where either value is >= 1e99
        # (presumably overflow/sentinel values — TODO confirm)
        .query('PRR < 1e99 & PRR_error < 1e99')
        .assign(
            drug_cui = index_to_cui[drug_index],
            # Translate the array position into the outcome identifier
            outcome_id = lambda df: df['outcome_index'].apply(lambda x: outcomes_vector[x]),
        )
        .filter(items=['drug_cui', 'outcome_id', 'PRR', 'PRR_error'])
    )

    drug_frames.append(drug_df)

# pd.concat raises on an empty list, so fall back to an empty frame when there
# were no drugs to process (same result the incremental version produced).
onesides_df = (
    pd.concat(drug_frames, ignore_index=True)
    if drug_frames
    else pd.DataFrame()
)

HBox(children=(IntProgress(value=0, max=2742), HTML(value='')))




# Connect to DB and insert data

In [5]:
engine = sqlalchemy.create_engine(f"mysql+mysqlconnector://mnz2108:{password}@localhost/effect_nsides")

# Rebuild the table from scratch so this cell can be re-run.
# IF EXISTS keeps the very first run (no table yet) from failing.
engine.execute('DROP TABLE IF EXISTS onesides;')
engine.execute('''
CREATE TABLE onesides (
    drug_cui int NOT NULL,
    outcome_id int NOT NULL,
    PRR double,
    PRR_error double,
    PRIMARY KEY (drug_cui, outcome_id)
);
''')

# Insert data into the database.
# if_exists='append' writes into the table created above. With 'replace',
# pandas would drop that table and recreate it from inferred dtypes, silently
# discarding the explicit column types and the primary key.
# NOTE: because the primary key is now kept, duplicate (drug_cui, outcome_id)
# pairs in `onesides_df` will make the insert fail instead of being stored.
(
    onesides_df
    .to_sql(
        name='onesides',
        con=engine,
        if_exists='append',
        index=False,
        chunksize=50_000,
    )
)

In [6]:
# Sanity check: pull a small sample of rows back out of the `onesides` table
# to confirm the data can be read from the database.
sample_query = 'SELECT * FROM onesides LIMIT 100;'
result_df = pd.read_sql(sample_query, engine)

result_df.head()

Unnamed: 0,drug_cui,outcome_id,PRR,PRR_error
0,314826,31967,2.5,1.014889
1,314826,196523,10.0,1.334166
2,314826,441408,3.333333,1.055146
3,314826,373474,10.0,1.334166
4,314826,4232311,5.0,1.131371
