In [1]:
from urllib.parse import quote

import numpy as np
import pandas as pd
import sqlalchemy
import tqdm

In [2]:
# Build the RxNorm -> OMOP CDM concept_id lookup from the drug concept table.
drug_concepts = pd.read_csv('../../data/tables/drug_concept.csv.xz')
# Series of OMOP concept_id values keyed by rxnorm_concept_id
omop_by_rxnorm = drug_concepts.set_index('rxnorm_concept_id')['concept_id']
rxnorm_to_omop = omop_by_rxnorm.to_dict()

In [3]:
# Load MySQL password from file (first line, surrounding whitespace removed)
with open('../../mysql_password.txt') as f:
    password = f.readline().strip()
# Create MySQL connector.
# The password is percent-encoded before it is placed in the URL so that
# characters such as '@', ':', '/' or '%' are not mistaken for URL delimiters.
# `safe=''` makes sure '/' is encoded as well.
engine = sqlalchemy.create_engine(
    f"mysql+mysqlconnector://mnz2108:{quote(password, safe='')}@localhost/effect_nsides"
)

In [4]:
# Column names to apply to the OFFSIDES table; the file is read with header=None.
columns = [
    'drug_concept_id',
    'condition_concept_id',
    'A',
    'B',
    'C',
    'D',
    'PRR',
    'PRR_error',
]
# dtype='object' disables type inference, so every value is kept as read
# and can be cleaned before it is cast to a numeric type.
offsides = pd.read_csv(
    '../../data/tables/offsides.csv.xz',
    names=columns,
    header=None,
    dtype='object',
    compression='xz',
)

In [5]:
# Show rows whose drug_concept_id is the literal string 'drug_id' (a header line read as data)
offsides[offsides['drug_concept_id'] == 'drug_id']

Unnamed: 0,drug_concept_id,condition_concept_id,A,B,C,D,PRR,PRR_error
333488,drug_id,outcome_id,A,B,C,D,PRR,PRR_error


In [6]:
# Remove the header row and map RxNorm to OMOP CDM codes.
# NOTE: this cell rebinds `offsides`, so it is meant to run once on the freshly
# loaded frame; running it again would push the already-mapped ids through the
# RxNorm lookup a second time.
offsides = (
    offsides
    .loc[lambda df: df['drug_concept_id'] != 'drug_id']
    .assign(
        drug_concept_id=lambda df: df['drug_concept_id']
                                   .astype(int)
                                   .map(rxnorm_to_omop)
    )
)

# `.map` yields NaN for any id missing from the lookup, and NaN cannot be cast
# to int. Fail here with a clear message instead of an opaque cast error below.
unmapped = offsides['drug_concept_id'].isna()
if unmapped.any():
    raise ValueError(
        f"{int(unmapped.sum())} rows have a drug_concept_id with no RxNorm -> OMOP mapping"
    )

# Columns to convert to integer. Some are like '0.0', so str -> float -> int
int_cols = ['drug_concept_id', 'condition_concept_id', 'A', 'B', 'C', 'D']
# Float columns. Want NaN instead of INF for SQL
float_cols = ['PRR', 'PRR_error']

# `astype` returns a new frame, so the casts always take effect. Writing the
# converted values back with `offsides.loc[:, cols] = ...` is an in-place set
# that can leave the columns as object dtype.
offsides = (
    offsides
    .astype({col: float for col in int_cols + float_cols})
    .astype({col: int for col in int_cols})
    .assign(
        # Infinities of either sign become NaN
        PRR=lambda df: df['PRR'].replace([np.inf, -np.inf], np.nan),
        PRR_error=lambda df: df['PRR_error'].replace([np.inf, -np.inf], np.nan),
        # Add mean reporting frequency
        mean_reporting_frequency=lambda df: df['A'] / (df['A'] + df['B']),
    )
)

offsides.head()

Unnamed: 0,drug_concept_id,condition_concept_id,A,B,C,D,PRR,PRR_error,mean_reporting_frequency
0,745268,35104065,0,132,0,1320,,,0.0
1,745268,35104066,0,132,0,1320,,,0.0
2,745268,35104067,0,132,0,1320,,,0.0
3,745268,35104069,0,132,0,1320,,,0.0
4,745268,35104070,0,132,3,1317,0.0,,0.0


In [7]:
# Columns to convert to integer. Some are like '0.0', so str -> float -> int
int_cols = ['drug_concept_id', 'condition_concept_id', 'A', 'B', 'C', 'D']
# Float columns. Want NaN instead of INF for SQL
float_cols = ['PRR', 'PRR_error']

# Everything is done in one chain that returns a new frame. This avoids
# assigning through `.loc` on the result of `.query()` (the
# SettingWithCopyWarning pattern) and guarantees the dtype casts take effect.
# The casts are no-ops when the columns are already numeric.
offsides = (
    offsides
    .astype({col: float for col in int_cols + float_cols})
    .astype({col: int for col in int_cols})
    # Drop rows with A and C both zero
    .query('~(A == 0 & C == 0)')
    .assign(
        # Infinities of either sign become NaN
        PRR=lambda df: df['PRR'].replace([np.inf, -np.inf], np.nan),
        PRR_error=lambda df: df['PRR_error'].replace([np.inf, -np.inf], np.nan),
        # Mean reporting frequency: A / (A + B)
        mean_reporting_frequency=lambda df: df['A'] / (df['A'] + df['B']),
    )
)

print(offsides.shape)

offsides.head()

Unnamed: 0,drug_concept_id,condition_concept_id,A,B,C,D,PRR,PRR_error,mean_reporting_frequency
4,745268,35104070,0,132,3,1317,0.0,,0.0
8,745268,35104074,6,126,21,1299,2.857143,0.45382,0.045455
19,745268,35104085,0,132,1,1319,0.0,,0.0
25,745268,35104091,0,132,1,1319,0.0,,0.0
33,745268,35104100,1,131,1,1319,10.0,1.411264,0.007576


In [8]:
# (Re)create the OFFSIDES table and show its schema.
# Statements run on an explicit connection inside `engine.begin()`, which
# commits on success and always releases the connection. Calling
# `engine.execute(...)` directly relies on connectionless execution, which is
# deprecated in SQLAlchemy 1.4 and removed in 2.0.
with engine.begin() as connection:
    connection.execute(sqlalchemy.text('DROP TABLE IF EXISTS OFFSIDES;'))

    connection.execute(sqlalchemy.text('''
CREATE TABLE OFFSIDES (
    drug_concept_id int,
    condition_concept_id int,
    A int,
    B int,
    C int,
    D int,
    PRR float,
    PRR_error float,
    mean_reporting_frequency float
);'''))
    # Fetch the rows while the connection is still open
    table_description = connection.execute(sqlalchemy.text('describe OFFSIDES;')).fetchall()

table_description

[('drug_concept_id', 'int(11)', 'YES', '', None, ''),
 ('condition_concept_id', 'int(11)', 'YES', '', None, ''),
 ('A', 'int(11)', 'YES', '', None, ''),
 ('B', 'int(11)', 'YES', '', None, ''),
 ('C', 'int(11)', 'YES', '', None, ''),
 ('D', 'int(11)', 'YES', '', None, ''),
 ('PRR', 'float', 'YES', '', None, ''),
 ('PRR_error', 'float', 'YES', '', None, ''),
 ('mean_reporting_frequency', 'float', 'YES', '', None, '')]

In [9]:
# SQL column types for the upload, grouped by kind.
integer_columns = ['drug_concept_id', 'condition_concept_id', 'A', 'B', 'C', 'D']
float_columns = ['PRR', 'PRR_error', 'mean_reporting_frequency']
sql_column_types = {
    **{column: sqlalchemy.types.Integer for column in integer_columns},
    **{column: sqlalchemy.types.Float for column in float_columns},
}

# Append the frame to the OFFSIDES table in batches of 100,000 rows,
# without writing the DataFrame index as a column.
offsides.to_sql(
    name='OFFSIDES',
    con=engine,
    if_exists='append',
    index=False,
    dtype=sql_column_types,
    chunksize=100_000,
)