In [1]:
import urllib.parse

import numpy as np
import pandas as pd
import sqlalchemy
import tqdm

In [2]:
# Build the RxNorm -> OMOP CDM lookup table: each key is a value from the
# `rxnorm_concept_id` column and each value is the `concept_id` on the same row.
drug_concepts = pd.read_csv('../../data/tables/drug_concept.csv.xz')
rxnorm_to_omop = dict(
    zip(drug_concepts['rxnorm_concept_id'], drug_concepts['concept_id'])
)

In [3]:
# Read the MySQL password from the first line of a separate file so the secret
# is never written into the notebook itself.
with open('../../mysql_password.txt') as f:
    password = f.readline().strip()

# Create the MySQL engine. The password is percent-encoded before it is placed
# in the connection URL: characters such as '@', ':', '/' or '%' would otherwise
# be interpreted as URL syntax and corrupt the credentials. `quote(..., safe='')`
# is used (rather than `quote_plus`) so that spaces become '%20', which every
# URL decoder reads back as a space.
engine = sqlalchemy.create_engine(
    f"mysql+mysqlconnector://mnz2108:{urllib.parse.quote(password, safe='')}@localhost/effect_nsides"
)

# Set up a reader that goes through the TWOSIDES file 1_000_000 lines at a time.
# The reader is a single-pass iterator: once the load loop has consumed it, this
# cell must be re-run before the file can be read again.
twosides_reader = pd.read_csv('../../data/tables/twosides.csv.xz', chunksize=1_000_000,
                              compression='xz')

In [4]:
# Recreate the TWOSIDES table from scratch, so that loading appends into an
# empty table rather than on top of rows left by a previous run.
#
# The statements are run on an explicit connection with `sqlalchemy.text`
# instead of `engine.execute(...)`, which is deprecated in SQLAlchemy 1.4 and
# removed in 2.0; this form works on both.
with engine.begin() as connection:
    connection.execute(sqlalchemy.text('drop table IF EXISTS TWOSIDES;'))

    connection.execute(sqlalchemy.text('''
CREATE TABLE TWOSIDES (
    drug_concept_id_1 int,
    drug_concept_id_2 int,
    condition_concept_id int,
    A int,
    B int,
    C int,
    D int,
    PRR float,
    PRR_error float,
    mean_reporting_frequency float
);'''))

    # Fetch inside the block, while the connection is still open.
    twosides_columns = connection.execute(sqlalchemy.text('describe TWOSIDES;')).fetchall()

# Display the resulting column definitions as a sanity check.
twosides_columns

[('drug_concept_id_1', 'int(11)', 'YES', '', None, ''),
 ('drug_concept_id_2', 'int(11)', 'YES', '', None, ''),
 ('condition_concept_id', 'int(11)', 'YES', '', None, ''),
 ('A', 'int(11)', 'YES', '', None, ''),
 ('B', 'int(11)', 'YES', '', None, ''),
 ('C', 'int(11)', 'YES', '', None, ''),
 ('D', 'int(11)', 'YES', '', None, ''),
 ('PRR', 'float', 'YES', '', None, ''),
 ('PRR_error', 'float', 'YES', '', None, ''),
 ('mean_reporting_frequency', 'float', 'YES', '', None, '')]

In [5]:
def _finite_or_nan(values):
    """Return ``values`` as a numeric Series with non-finite entries set to NaN.

    ``pd.to_numeric`` converts string cells (including 'inf' / '-inf') to
    numbers and raises ``ValueError`` on text that is not a number. Positive
    and negative infinity are then replaced by NaN, which ``to_sql`` writes as
    NULL. This is vectorised, unlike an element-wise ``.apply`` over every row.
    """
    numeric = pd.to_numeric(values)
    return numeric.where(np.isfinite(numeric))


# Stream the TWOSIDES file chunk by chunk and append each cleaned chunk to the
# TWOSIDES table. `total` is a hard-coded progress-bar length (presumably the
# number of chunks in the file; confirm if the input file changes).
for chunk in tqdm.tqdm_notebook(twosides_reader, total=3722):
    (
        chunk
        .assign(
            # Translate both drug columns through the RxNorm -> OMOP mapping;
            # ids that are missing from the mapping become NaN.
            drug_concept_id_1=lambda df: df['drug_1'].map(rxnorm_to_omop),
            drug_concept_id_2=lambda df: df['drug_2'].map(rxnorm_to_omop),
            mean_reporting_frequency=lambda df: df['A'] / (df['A'] + df['B']),
            # Infinite values are replaced by NaN before the insert.
            PRR=lambda df: _finite_or_nan(df['PRR']),
            PRR_error=lambda df: _finite_or_nan(df['PRR_error']),
        )
        # Drop rows with A and C both zero
        .query('~(A == 0 & C == 0)')
        .rename(columns={'outcome_id': 'condition_concept_id'})
        # Keep only the columns of the TWOSIDES table, in table order.
        .filter(items=['drug_concept_id_1', 'drug_concept_id_2', 'condition_concept_id',
                       'A', 'B', 'C', 'D', 'PRR', 'PRR_error', 'mean_reporting_frequency'])
        .to_sql(
            name='TWOSIDES',
            con=engine,
            if_exists='append',
            index=False,
            dtype={
                'drug_concept_id_1': sqlalchemy.types.Integer,
                'drug_concept_id_2': sqlalchemy.types.Integer,
                'condition_concept_id': sqlalchemy.types.Integer,
                'A': sqlalchemy.types.Integer,
                'B': sqlalchemy.types.Integer,
                'C': sqlalchemy.types.Integer,
                'D': sqlalchemy.types.Integer,
                'PRR': sqlalchemy.types.Float,
                'PRR_error': sqlalchemy.types.Float,
                'mean_reporting_frequency': sqlalchemy.types.Float,
            },
            # Write each chunk to the database in batches of 100_000 rows.
            chunksize=100_000,
        )
    )

HBox(children=(IntProgress(value=0, max=3722), HTML(value='')))


