In [1]:
import numpy as np
import pandas as pd
import sqlalchemy
import tqdm

In [2]:
# Load MySQL password from file
with open('../../mysql_password.txt') as f:
    password = f.readline().strip()
# Create MySQL connector
engine = sqlalchemy.create_engine(f"mysql+mysqlconnector://mnz2108:{password}@localhost/effect_nsides")

In [3]:
engine.execute('show tables;').fetchall()

[('CONDITION_CONCEPT',),
 ('CONDITION_OCCURRENCE',),
 ('DRUG_CONCEPT',),
 ('DRUG_EXPOSURE',),
 ('OFFSIDES',),
 ('REPORT',),
 ('TWOSIDES',)]

## CONDITION_CONCEPT

Only the final column, `condition_snomed_id` should have any `NULL` values.

In [4]:
pd.read_sql('SELECT * FROM CONDITION_CONCEPT LIMIT 5;', engine)

Unnamed: 0,condition_concept_id,condition_concept_name,condition_meddra_id,condition_snomed_id
0,36516812,Arthralgia,10003239,77074
1,35708093,Diarrhoea,10012735,196523
2,36718132,Headache,10019211,375527
3,35708208,Vomiting,10047700,441408
4,35708139,Dyspepsia,10013946,197913


In [5]:
# total number of rows
engine.execute('SELECT COUNT(*) FROM CONDITION_CONCEPT;').fetchall()

[(17552,)]

In [6]:
# No rows with a NULL value for ID, name, or MedDRA
engine.execute('''
SELECT COUNT(*) FROM CONDITION_CONCEPT 
WHERE condition_concept_id IS NULL OR 
      condition_concept_name IS NULL OR 
      condition_meddra_id IS NULL 
;''').fetchone()

(0,)

In [7]:
# number with a NULL for SNOMED ID (not all MedDRA map to SNOMED)
n_no_snomed = engine.execute('''
SELECT COUNT(*) FROM CONDITION_CONCEPT 
WHERE condition_snomed_id IS NULL;
''').fetchone()

print(n_no_snomed)

pd.read_sql('''
SELECT * FROM CONDITION_CONCEPT 
WHERE condition_snomed_id IS NULL
LIMIT 10;
''', engine)

(10047,)


Unnamed: 0,condition_concept_id,condition_concept_name,condition_meddra_id,condition_snomed_id
0,35809170,Unevaluable event,10062355,
1,36313799,Blood immunoglobulin E increased,10005591,
2,36211465,Incorrect dose administered,10064355,
3,42890329,Increased viscosity of nasal secretion,10071730,
4,36315029,CSF test abnormal,10059703,
5,35606974,Ocular icterus,10058117,
6,43053893,Periorbital contusion,10062515,
7,35809131,General physical health deterioration,10049438,
8,36313524,Alanine aminotransferase increased,10001551,
9,37522263,Stent placement,10048561,


## CONDITION_OCCURRENCE

There should be no `NULL` values at all.

In [8]:
pd.read_sql('SELECT * FROM CONDITION_OCCURRENCE LIMIT 5;', engine)

Unnamed: 0,report_id,condition_concept_id
0,100033001,36516812
1,100033001,35708093
2,100033001,36718132
3,100033001,35708208
4,100033011,35708139


In [9]:
# total number of rows
engine.execute('SELECT COUNT(*) FROM CONDITION_OCCURRENCE;').fetchall()

[(14748182,)]

In [10]:
# No rows with a NULL value for report ID, or condition ID
engine.execute('''
SELECT COUNT(*) FROM CONDITION_OCCURRENCE 
WHERE report_id IS NULL OR 
      condition_concept_id IS NULL
;''').fetchone()

(0,)

## DRUG_CONCEPT

In [11]:
pd.read_sql('SELECT * FROM DRUG_CONCEPT LIMIT 5;', engine)

Unnamed: 0,drug_concept_id,drug_concept_name,rxnorm_concept_id,drugbank_concept_id,chebi_concept_id
0,19080523,"silicon dioxide, colloidal",314826,DB11132,30563.0
1,42903427,Aldosterone,1312358,DB04630,27584.0
2,745268,"ergoloid mesylates, USP",4024,DB01049,34706.0
3,1145379,Ergotamine,4025,DB00696,64318.0
4,902251,hypromellose,27334,DB11075,


In [12]:
# total number of rows
engine.execute('SELECT COUNT(*) FROM DRUG_CONCEPT;').fetchall()

[(3453,)]

In [13]:
# No rows with a NULL value for drug ID, drug name, or RxNorm ID
engine.execute('''
SELECT COUNT(*) FROM DRUG_CONCEPT 
WHERE drug_concept_id IS NULL OR 
      drug_concept_name IS NULL OR
      rxnorm_concept_id IS NULL
;''').fetchone()

(0,)

In [14]:
# No rows with a NULL value for drug ID, drug name, or RxNorm ID
engine.execute('''
SELECT COUNT(*) FROM DRUG_CONCEPT 
WHERE drugbank_concept_id IS NULL OR 
      chebi_concept_id IS NULL
;''').fetchone()

(1321,)

## DRUG_EXPOSURE

In [15]:
pd.read_sql('SELECT * FROM DRUG_EXPOSURE LIMIT 5;', engine)

Unnamed: 0,report_id,drug_concept_id
0,6869784,19080523
1,4709196,19080523
2,4389224,19080523
3,102051661,19080523
4,90908801,19080523


In [16]:
# total number of rows
engine.execute('SELECT COUNT(*) FROM DRUG_EXPOSURE;').fetchall()

[(12526653,)]

In [17]:
# No rows with a NULL value for report ID, drug ID
engine.execute('''
SELECT COUNT(*) FROM DRUG_EXPOSURE 
WHERE report_id IS NULL OR 
      drug_concept_id IS NULL
;''').fetchone()

(0,)

## OFFSIDES

In [18]:
pd.read_sql('SELECT * FROM OFFSIDES LIMIT 5;', engine)

Unnamed: 0,drug_concept_id,condition_concept_id,A,B,C,D,PRR,PRR_error,mean_reporting_frequency
0,745268,35104070,0,132,3,1317,0.0,,0.0
1,745268,35104074,6,126,21,1299,2.85714,0.45382,0.045455
2,745268,35104085,0,132,1,1319,0.0,,0.0
3,745268,35104091,0,132,1,1319,0.0,,0.0
4,745268,35104100,1,131,1,1319,10.0,1.41126,0.007576


In [19]:
# total number of rows
engine.execute('SELECT COUNT(*) FROM DRUG_EXPOSURE;').fetchall()

[(12526653,)]

In [20]:
# No rows with a NULL value for report ID, drug ID
engine.execute('''
SELECT COUNT(*) FROM DRUG_EXPOSURE 
WHERE report_id IS NULL OR 
      drug_concept_id IS NULL
;''').fetchone()

(0,)

## REPORT

In [21]:
pd.read_sql('SELECT * FROM REPORT LIMIT 5;', engine)

Unnamed: 0,report_id,report_year,person_age,person_sex
0,4572294,2005,76.0,F
1,4440060,2004,78.0,M
2,4456349,2004,60.0,M
3,5148155,2006,46.0,M
4,6206938,2009,,M


In [22]:
# total number of rows
engine.execute('SELECT COUNT(*) FROM REPORT;').fetchall()

[(4694086,)]

In [23]:
# No rows with a NULL value for report ID, report year
engine.execute('''
SELECT COUNT(*) FROM REPORT 
WHERE report_id IS NULL OR
      report_year IS NULL
;''').fetchone()

(0,)

In [24]:
# No rows with a NULL value for report ID, drug ID
engine.execute('''
SELECT COUNT(*) FROM REPORT 
WHERE person_age IS NULL OR
      person_sex IS NULL
;''').fetchone()

(1837745,)

## TWOSIDES

In [25]:
pd.read_sql('SELECT * FROM TWOSIDES LIMIT 5;', engine)

Unnamed: 0,drug_concept_id_1,drug_concept_id_2,condition_concept_id,A,B,C,D,PRR,PRR_error,mean_reporting_frequency
0,932745,1326115,35104074,1,13,3,137,3.33333,1.12016,0.071429
1,932745,1326115,35104113,0,14,1,139,0.0,,0.0
2,932745,1326115,35104306,1,13,1,139,10.0,1.38616,0.071429
3,932745,1326115,35104351,0,14,1,139,0.0,,0.0
4,932745,1326115,35104746,0,14,1,139,0.0,,0.0


In [26]:
# total number of rows
engine.execute('SELECT COUNT(*) FROM TWOSIDES;').fetchall()

[(222155888,)]

In [27]:
# No rows with a NULL value for drug_concept_id_1, drug_concept_id_2, 
#  condition_concept_id, A, B, C, D
engine.execute('''
SELECT COUNT(*) FROM TWOSIDES 
WHERE drug_concept_id_1 IS NULL OR 
      drug_concept_id_2 IS NULL OR
      condition_concept_id IS NULL OR
      A IS NULL OR
      B IS NULL OR
      C IS NULL OR
      D IS NULL
;''').fetchone()

(0,)

In [28]:
# No rows with a NULL value for drug_concept_id_1, drug_concept_id_2, 
#  condition_concept_id, A, B, C, D
engine.execute('''
SELECT COUNT(*) FROM TWOSIDES 
WHERE PRR IS NULL OR
      PRR_error IS NULL OR
      mean_reporting_frequency IS NULL
;''').fetchone()

(179213227,)

In [29]:
# No rows with a NULL value for PRR but C > 0
engine.execute('''
SELECT COUNT(*) FROM TWOSIDES 
WHERE PRR IS NULL AND
      C > 0
;''').fetchone()

(0,)

## Overall

In [30]:
engine.execute('''
SELECT
    table_schema AS 'DB Name',
    ROUND(SUM(data_length + index_length) / 1024 / 1024, 1) AS 'DB Size in MB'
FROM
    information_schema.tables
WHERE
    table_schema = 'effect_nsides'
GROUP BY
    table_schema;
''').fetchall()

[('effect_nsides', Decimal('9543.3'))]