In [1]:
import pathlib

import mysql.connector
import numpy as np
import pandas as pd
import scipy.io
import sqlalchemy

In [2]:
# Read the MySQL password from the first line of the credentials file
# (kept outside the notebook so the secret is never committed with it).
password_path = pathlib.Path('../mysql_password.txt')
with password_path.open() as password_file:
    password = password_file.readline().strip()

## Command used to build `reports_outcomes` table

```mysql
CREATE TABLE reports_outcomes (
    report_id int NOT NULL,
    outcome_id int NOT NULL,
    -- One row per (report, outcome) relationship. A report can have several
    -- outcomes, so report_id alone is not unique; the pair is the key.
    PRIMARY KEY (report_id, outcome_id)
);
```

# Format data for insertion into DB

In [3]:
# Base directory that holds the exported report/outcome arrays
root = pathlib.Path('/data1/home/rav7008/formike/')

# Report identifiers: position i is the report ID for row i of the outcomes matrix.
# NOTE(review): allow_pickle=True unpickles arbitrary objects — only load trusted files.
report_indices = np.load(root.joinpath('data', 'all_reportids_IN.npy'), allow_pickle=True)

In [4]:
# Lookup array for outcomes: position j is the outcome for matrix column j
# NOTE(review): allow_pickle=True unpickles arbitrary objects — only load trusted files.
outcomes_vector = np.load(root.joinpath('data', 'all_outcome_strings.npy'), allow_pickle=True)

# Sparse report-by-outcome matrix (rows are reports, columns are outcomes)
outcomes_matrix_path = root / 'data' / 'AEOLUS_all_reports_IN_alloutcomes.mtx'
outcomes = scipy.io.mmread(outcomes_matrix_path.as_posix())

outcomes

<4694086x7085 sparse matrix of type '<class 'numpy.int64'>'
	with 11854054 stored elements in COOrdinate format>

In [5]:
# Column 0 is mapped to the outcome `None`, so drop it. The slice is taken on a
# CSC copy and the result is converted back to COO for the later row/col access.
# NOTE: this rebinds `outcomes`; re-running the cell removes one more column.
csc_outcomes = outcomes.tocsc()
outcomes = csc_outcomes[:, 1:].tocoo()
del csc_outcomes  # release the intermediate CSC copy

outcomes

<4694086x7084 sparse matrix of type '<class 'numpy.int64'>'
	with 11854054 stored elements in COOrdinate format>

In [6]:
# Each stored element of the COO matrix is one report -> outcome relationship:
# `outcomes.row` / `outcomes.col` hold its row and column positions.
#
# The IDs are looked up with NumPy fancy indexing on the whole position arrays
# rather than by building millions of Python tuples and calling `.apply` with
# a lambda once per row.
reports_outcomes_df = (
    pd.DataFrame({
        'report_index': outcomes.row,
        'outcome_index': outcomes.col,
    })
    .assign(
        report_id=lambda df: report_indices[df['report_index'].values],
        # +1 undoes the shift caused by dropping column 0 (the `None` outcome)
        outcome_id=lambda df: outcomes_vector[df['outcome_index'].values + 1],
    )
    # The .npy arrays are loaded with allow_pickle=True and may be object arrays;
    # let pandas infer a concrete dtype (e.g. int64), as `.apply` used to do.
    .infer_objects()
)

reports_outcomes_df.head(2)

Unnamed: 0,report_index,outcome_index,report_id,outcome_id
0,1,0,4440060,316866
1,2,0,4456349,316866


# Connect to DB and insert data

In [7]:
# NOTE(review): the password is interpolated into the URL verbatim, so it must not
# contain URL-reserved characters ('@', '/', ':', ...) unless already percent-encoded.
engine = sqlalchemy.create_engine(f"mysql+mysqlconnector://mnz2108:{password}@localhost/effect_nsides")

# (Re)build the table from scratch so that this cell can be re-run safely.
# `engine.begin()` + `text()` is used instead of `engine.execute`, which was
# removed in SQLAlchemy 2.0; this form also works on older releases.
with engine.begin() as connection:
    connection.execute(sqlalchemy.text('DROP TABLE IF EXISTS reports_outcomes'))
    # A report can have several outcomes (there are more relationships than
    # reports), so the primary key must be the (report_id, outcome_id) pair.
    connection.execute(sqlalchemy.text('''
        CREATE TABLE reports_outcomes (
            report_id int NOT NULL,
            outcome_id int NOT NULL,
            PRIMARY KEY (report_id, outcome_id)
        )
    '''))

# Insert data into the database.
# `if_exists='append'` keeps the schema created above; `'replace'` would make
# pandas drop the table and recreate it without the NOT NULL / key constraints.
(
    reports_outcomes_df
    .filter(items=['report_id', 'outcome_id'])
    .to_sql(
        name='reports_outcomes',
        con=engine,
        if_exists='append',
        index=False,
        chunksize=500_000,  # rows sent per batch, to bound memory per round trip
    )
)

In [8]:
# Sanity check: read a small sample back out of the freshly written table
verification_query = 'SELECT * FROM reports_outcomes LIMIT 100;'
result_df = pd.read_sql(sql=verification_query, con=engine)

result_df.head()

Unnamed: 0,report_id,outcome_id
0,4440060,316866
1,4456349,316866
2,5148155,316866
3,6206938,316866
4,6684617,316866
