In [1]:
import pathlib
import xml.etree.ElementTree as ET

import mysql.connector
import numpy as np
import pandas as pd
import requests
import sqlalchemy

In [2]:
# Load MySQL password from file
with open('../mysql_password.txt') as f:
    password = f.readline().strip()

## Command used to build `events` table

```mysql
CREATE TABLE events (
    snomed_id int NOT NULL,
    name varchar(255),
    PRIMARY KEY (snomed_id)
);
```

# Format data for insertion into DB

In [3]:
# Outcomes with their vector positions (key for outcomes)
outcomes = np.load('/data1/home/rav7008/formike/data/all_outcome_strings.npy', allow_pickle=True)

# SNOMED CT concepts for each outcome
snomed_map_df = pd.read_csv('/data1/home/rav7008/formike/data/snomed_outcomes_to_strings.csv')

# Merge dataframes (want only SNOMED information for outcomes in the vector)
outcomes_df = (
    pd.DataFrame(outcomes, columns=['outcome'])
    .reset_index()
    .merge(snomed_map_df, left_on='outcome', right_on='snomed_outcome_concept_id', how='left')
)

outcomes_df.head(2)

Unnamed: 0,index,outcome,snomed_outcome_concept_id,concept_name
0,0,,,
1,1,316866.0,316866.0,Hypertensive disorder


In [4]:
# Only one row with a NaN value (first row)
# Since table has primary key "snomed_id", we cannot write a record with a NaN value in that field
outcomes_df.shape, outcomes_df.dropna().shape

((7085, 4), (7084, 4))

In [5]:
outcomes_df = (
    outcomes_df
    .dropna() 
    .rename(columns={'snomed_outcome_concept_id': 'snomed_id', 'concept_name': 'name'})
    .filter(items=['snomed_id', 'name'])
    .assign(snomed_id=lambda df: df['snomed_id'].astype(int))
)

outcomes_df.head(2)

Unnamed: 0,snomed_id,name
1,316866,Hypertensive disorder
2,4003185,Refractory anemia


# Connect to DB and insert data

In [6]:
engine = sqlalchemy.create_engine(f"mysql+mysqlconnector://mnz2108:{password}@localhost/effect_nsides")

engine.execute('''
CREATE TABLE events (
    snomed_id int NOT NULL,
    name varchar(255),
    PRIMARY KEY (snomed_id)
);
''')

# Insert data into the database
(
    outcomes_df
    .to_sql(
        name='events',
        con=engine,
        if_exists='replace',
        index=False,
    )
)

In [7]:
# Verify that the data can be re-extracted from the table
result_df = pd.read_sql(
    sql='SELECT * FROM events;',
    con=engine,
)

result_df.head()

Unnamed: 0,snomed_id,name
0,316866,Hypertensive disorder
1,4003185,Refractory anemia
2,439977,Poisoning by analgesic AND/OR antipyretic
3,4110705,Squamous cell carcinoma of lung
4,4184746,Left ventricular hypertrophy
