In [1]:
import pathlib
import re

import mysql.connector
import numpy as np
import pandas as pd
import sqlalchemy
import tqdm
import tqdm.auto

In [2]:
# Read the MySQL password from the first line of a file kept outside the
# notebook, so the credential itself never appears in the notebook source.
with open('../mysql_password.txt') as password_file:
    password = password_file.readline().strip()

## Command used to build `reports_drugs` table

```mysql
CREATE TABLE reports_drugs (
    report_id int NOT NULL,
    drug_cui int NOT NULL,
    PRIMARY KEY (report_id)
);
```

# Format data for insertion into DB

### 1. Combine report-drug exposure relationships from across 50 different files

Currently, the drug exposures for each report are split across separate files. For example, the first 64,951 reports are in `data/AEOLUS_all_reports_IN_0.npy` and the next 32,798 are in `data/AEOLUS_all_reports_IN_1.npy` and so on.

In [3]:
root = pathlib.Path('/data1/home/rav7008/formike/')

# All report-drug files with _IN_ in the name
reports_drug_files = list(root.glob('data/AEOLUS_all_reports_IN_[0-9]*.npy'))

# Matches the file number between `_IN_` and the `.npy` extension.
# This must be a raw string: in a normal string literal `\.` is an invalid
# escape sequence (a warning today, an error in future Python versions).
file_number_pattern = re.compile(r'(?<=_IN_)[0-9]+(?=\.npy)')

# Format paths as {0: 'data/AEOLUS_ALL_REPORTS_IN_0.npy'} to parse them in the correct order.
# The glob above is looser than the regex (`[0-9]*` is "one digit, then anything"),
# so names such as `..._IN_0_old.npy` can be globbed without matching the regex;
# those are skipped instead of crashing on `None.group()`.
name_matches = (
    (file_number_pattern.search(path.name), path)
    for path in reports_drug_files
)
number_to_path = {
    int(match.group()): path
    for match, path in name_matches
    if match is not None
}

In [4]:
all_relationships = list()
n_rel = 0
starting_row = 0

# Fail early with a clear message if the file discovery cell found nothing.
if not number_to_path:
    raise FileNotFoundError('No AEOLUS_all_reports_IN_*.npy files were found')

# Extract all report-drug relationships by iterating over all
# `AEOLUS_ALL_REPORTS_IN_*.npy` files and keeping track of indexes across files.
#
# The number of files is taken from `number_to_path` rather than hard-coded.
# Files must be numbered 0..N-1 without gaps; a missing number raises KeyError,
# which is intended because a gap would silently shift every later report_id.
#
# `tqdm.auto.trange` replaces the deprecated `tqdm.tnrange`.
#
# NOTE(review): `allow_pickle=True` unpickles arbitrary Python objects; only
# load files from a trusted source.
for file_number in tqdm.auto.trange(len(number_to_path)):
    # Next filepath
    report_drug_file = number_to_path[file_number]
    reports_to_drugs = np.load(report_drug_file, allow_pickle=True).item().tocoo()

    # Get relationships as (report_id, drug_cui) tuples. Row indexes are local
    # to each file, so offset them by the number of rows seen so far.
    all_relationships.extend(
        zip(reports_to_drugs.row + starting_row, reports_to_drugs.col)
    )
    starting_row += reports_to_drugs.shape[0]
    n_rel += reports_to_drugs.nnz

# Every stored entry of every matrix must have produced exactly one tuple.
assert len(all_relationships) == n_rel

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [5]:
# One row per (report, drug) pair collected from the `.npy` files.
report_drug_df = pd.DataFrame(
    data=all_relationships,
    columns=['report_id', 'drug_cui'],
)

# Preview the first two rows only
report_drug_df.head(n=2)

Unnamed: 0,report_id,drug_cui
0,0,61
1,0,740


### 2. Verify the procedure was successful

To check that the re-indexing scheme from the previous cell was successful, we investigate the junction between the first and second matrices. Below, we show that there are 64,951 rows in the first matrix (row indexes 0 through 64950) and that the second matrix begins with relationships in its first (indexed as 0-th) row. Specifically, the second matrix's first relationships are (indexed with respect to itself) $[(0, 382), (0, 2061), (1, 716), \ldots]$. When combining relationships across matrices, though, these should correspond to $[(64951, 382), (64951, 2061), (64952, 716), \ldots]$, since the first row in the second matrix is the row that follows the last row in the first matrix.

Below, we show that the scheme was successful.

In [6]:
# Reload file 0 on its own so its dimensions can be inspected directly.
first_matrix = (
    np.load(number_to_path[0], allow_pickle=True)
    .item()
    .tocsc()
)

# (number of reports, number of drugs) in the first file
first_matrix.shape

(64951, 4396)

In [7]:
# Number of stored entries in the first matrix, i.e. how many report-drug
# relationships the first file contributes to `all_relationships`.
first_matrix.nnz

473498

In [8]:
# Reload file 1 in COO form to expose its (row, col) coordinate arrays.
second_matrix = (
    np.load(number_to_path[1], allow_pickle=True)
    .item()
    .tocoo()
)

# First ten relationships of the second matrix, indexed relative to that matrix
list(zip(second_matrix.row[:10], second_matrix.col[:10]))

[(0, 382),
 (0, 2061),
 (1, 716),
 (1, 1907),
 (1, 2006),
 (1, 2061),
 (2, 2061),
 (2, 2883),
 (2, 4148),
 (3, 382)]

In [9]:
# The first matrix has 473,498 relationships, which occupy list indexes
# 0..473_497. The 473,499-th relationship (index 473_498) should therefore be
# the first relationship from the second matrix, re-indexed to row 64951.
# The slice below straddles that boundary: three entries before it, seven after.

all_relationships[473_495:473_505]

[(64949, 3502),
 (64949, 3552),
 (64949, 4028),
 (64951, 382),
 (64951, 2061),
 (64952, 716),
 (64952, 1907),
 (64952, 2006),
 (64952, 2061),
 (64953, 2061)]

# Connect to DB and insert data

In [None]:
engine = sqlalchemy.create_engine(f"mysql+mysqlconnector://mnz2108:{password}@localhost/effect_nsides")

# (Re)create the table with an explicit schema.
#
# * The primary key is the (report_id, drug_cui) pair. A report is usually
#   exposed to several drugs, so `report_id` alone is not unique and cannot
#   serve as the key of this junction table.
# * `DROP TABLE IF EXISTS` keeps the cell re-runnable: every run starts from an
#   empty table, as the previous `if_exists='replace'` behaviour did.
# * `engine.begin()` + `sqlalchemy.text()` is used instead of `engine.execute`,
#   which was removed in SQLAlchemy 2.0.
with engine.begin() as connection:
    connection.execute(sqlalchemy.text('DROP TABLE IF EXISTS reports_drugs'))
    connection.execute(sqlalchemy.text('''
CREATE TABLE reports_drugs (
    report_id int NOT NULL,
    drug_cui int NOT NULL,
    PRIMARY KEY (report_id, drug_cui)
);
'''))

# Insert data into the database.
# `if_exists='append'` inserts into the table created above. With 'replace',
# pandas would drop that table and recreate it from the DataFrame dtypes,
# silently discarding the NOT NULL constraints and the primary key.
(
    report_drug_df
    .to_sql(
        name='reports_drugs',
        con=engine,
        if_exists='append',
        index=False,
        chunksize=1_000_000,
    )
)

In [11]:
# Round-trip check: pull a small sample of rows back out of the new table to
# confirm that the inserted data is readable.
result_df = pd.read_sql('SELECT * FROM reports_drugs LIMIT 100;', engine)

# Show only the first five of the fetched rows
result_df.head(n=5)

Unnamed: 0,report_id,drug_cui
0,0,61
1,0,740
2,0,743
3,0,2035
4,0,2334
