In [None]:
import pandas as pd
import numpy as np
import re
import math
import os 

from pubchempy import get_compounds, Compound
from molvs import validate_smiles, standardize_smiles

## Read MIMIC Extract Data

In [None]:
# Folder that holds the MIMIC-Extract pickles read by this notebook.
mimic_extract_path = "mimic-extract/"


def _load_extract(file_name):
    """Unpickle ``file_name`` from ``mimic_extract_path``.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files from a trusted source.
    """
    return pd.read_pickle(os.path.join(mimic_extract_path, file_name))


# Feature frames, one per split.
x_train = _load_extract("lvl2_imputer_train.pkl")
x_dev = _load_extract("lvl2_imputer_dev.pkl")
x_test = _load_extract("lvl2_imputer_test.pkl")

# Label frames, one per split.
y_train = _load_extract("Ys_train.pkl")
y_dev = _load_extract("Ys_dev.pkl")
y_test = _load_extract("Ys_test.pkl")

# Label frame stored without a split suffix.
ys = _load_extract("Ys.pkl")

In [None]:
# NOTE: this whole cell is disabled (commented out) and has no effect when run.
# If re-enabled it would, for each of y_train / y_dev / y_test, build a dict
# keyed by the first element of each row's index, holding that row's
# mort_hosp, mort_icu, los_3 and los_7 values, and then pickle the three
# dicts with pickle protocol 5.

# patient_train_dict = {}

# for i in y_train.itertuples():
#     patient_id = i.Index[0]
#     mort_hosp = i.mort_hosp
#     mort_icu = i.mort_icu
#     los_3 = i.los_3
#     los_7 = i.los_7
#     patient_train_dict[patient_id] = {"mort_hosp": mort_hosp, "mort_icu": mort_icu, "los_3": los_3, "los_7": los_7}
    
# patient_dev_dict = {}

# for i in y_dev.itertuples():
#     patient_id = i.Index[0]
#     mort_hosp = i.mort_hosp
#     mort_icu = i.mort_icu
#     los_3 = i.los_3
#     los_7 = i.los_7
#     patient_dev_dict[patient_id] = {"mort_hosp": mort_hosp, "mort_icu": mort_icu, "los_3": los_3, "los_7": los_7}
    
# patient_test_dict = {}

# for i in y_test.itertuples():
#     patient_id = i.Index[0]
#     mort_hosp = i.mort_hosp
#     mort_icu = i.mort_icu
#     los_3 = i.los_3
#     los_7 = i.los_7
#     patient_test_dict[patient_id] = {"mort_hosp": mort_hosp, "mort_icu": mort_icu, "los_3": los_3, "los_7": los_7}
    
# protocol_num = 5
# pd.to_pickle(patient_train_dict, "patient_train_dict_"+str(protocol_num)+".p", protocol=protocol_num)
# pd.to_pickle(patient_dev_dict, "patient_dev_dict_"+str(protocol_num)+".p", protocol=protocol_num)
# pd.to_pickle(patient_test_dict, "patient_test_dict_"+str(protocol_num)+".p", protocol=protocol_num)

## Read Patient PubChem Dict

In [None]:
# Load the mapping whose keys are used below as patient IDs and whose values
# are iterated for objects exposing a `canonical_smiles` attribute.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
patient_pubchem_dict = pd.read_pickle("patient_pubchem_dict.p")

## Create Time-Series Cohort

In [None]:
# Iterating a dict yields its keys, so this is the list of all patient IDs
# present in the PubChem mapping (in insertion order).
patient_ids = list(patient_pubchem_dict)

In [None]:
# Number of consecutive rows folded into one sample of the LSTM tensor.
# presumably the 24 hourly rows of one stay -- TODO confirm against the extract
TIMESTEPS_PER_SAMPLE = 24


def _select_patients(frame, ids):
    """Return the rows of ``frame`` whose first index level is in ``ids``.

    ``patient_ids`` holds patients from every split, while each split frame
    only contains its own patients.  ``frame.loc[ids]`` raises ``KeyError``
    on current pandas when any requested label is absent, so membership is
    tested explicitly instead; IDs missing from ``frame`` are simply skipped.
    Row order of ``frame`` is preserved (callers sort afterwards).
    """
    return frame.loc[frame.index.get_level_values(0).isin(ids)]


def _to_lstm_tensor(values, timesteps=TIMESTEPS_PER_SAMPLE):
    """Fold a 2-D ``(rows, features)`` array into ``(samples, timesteps, features)``.

    The feature count is taken from ``values`` rather than hard-coded.

    Raises:
        ValueError: if the row count is not a multiple of ``timesteps``.
    """
    n_rows, n_features = values.shape
    if n_rows % timesteps != 0:
        raise ValueError(
            f"row count {n_rows} is not a multiple of {timesteps} timesteps"
        )
    return values.reshape(n_rows // timesteps, timesteps, n_features)


# Restrict every split to the patients that have an entry in the PubChem mapping.
x_train_sub = _select_patients(x_train, patient_ids)
x_dev_sub = _select_patients(x_dev, patient_ids)
x_test_sub = _select_patients(x_test, patient_ids)

y_train_sub = _select_patients(y_train, patient_ids)
y_dev_sub = _select_patients(y_dev, patient_ids)
y_test_sub = _select_patients(y_test, patient_ids)
ys_sub = _select_patients(ys, patient_ids)

# Keep only the columns whose second column-level label is 'mean', then sort
# rows by index so features and labels follow the same ordering.
x_train_mean = x_train_sub.loc[:, pd.IndexSlice[:, 'mean']].sort_index()
x_dev_mean = x_dev_sub.loc[:, pd.IndexSlice[:, 'mean']].sort_index()
x_test_mean = x_test_sub.loc[:, pd.IndexSlice[:, 'mean']].sort_index()

y_train_sub = y_train_sub.sort_index()
y_dev_sub = y_dev_sub.sort_index()
y_test_sub = y_test_sub.sort_index()
ys_sub = ys_sub.sort_index()

sub_train = x_train_mean.values
sub_dev = x_dev_mean.values
sub_test = x_test_mean.values

# reshape the data for timeseries prediction
x_train_lstm = _to_lstm_tensor(sub_train)
x_dev_lstm = _to_lstm_tensor(sub_dev)
x_test_lstm = _to_lstm_tensor(sub_test)

In [None]:
# Display the shapes of each split's tensor and label frame (plus the
# unsuffixed label frame) as one tuple, for a visual consistency check.
(
    x_train_lstm.shape,
    y_train_sub.shape,
    x_dev_lstm.shape,
    y_dev_sub.shape,
    x_test_lstm.shape,
    y_test_sub.shape,
    ys_sub.shape,
)

In [None]:
# Output folder for the time-series tensors and their label frames.
timeseries_data_path = "data/timeseries/"

# pd.to_pickle does not create missing parent folders; make sure the target
# exists so a fresh checkout can run this cell.
os.makedirs(timeseries_data_path, exist_ok=True)

# (file name, object) pairs, written in the same order as before.
for file_name_, payload_ in (
    ("x_train_lstm.p", x_train_lstm),
    ("x_dev_lstm.p", x_dev_lstm),
    ("x_test_lstm.p", x_test_lstm),
    ("y_train.p", y_train_sub),
    ("y_dev.p", y_dev_sub),
    ("y_test.p", y_test_sub),
    ("ys.p", ys_sub),
):
    pd.to_pickle(payload_, os.path.join(timeseries_data_path, file_name_))

## Utils

In [None]:
# For each split, collect the first element of every label-index entry
# (used below as the patient ID) into a set for fast membership tests.
train_id_set = {index_key[0] for index_key in y_train_sub.index}
dev_id_set = {index_key[0] for index_key in y_dev_sub.index}
test_id_set = {index_key[0] for index_key in y_test_sub.index}

## Create Unique Drugs with Canonical SMILES

In [None]:
# patient ID -> list of canonical SMILES strings, for all patients and per split.
patient_unique_canonical_smiles_dict = {}

patient_unique_canonical_smiles_train_dict = {}
patient_unique_canonical_smiles_dev_dict = {}
patient_unique_canonical_smiles_test_dict = {}

# Pair each split's ID set with the dict that receives its patients.  The
# checks are independent, so an ID found in several sets lands in several dicts.
split_targets_ = (
    (train_id_set, patient_unique_canonical_smiles_train_dict),
    (dev_id_set, patient_unique_canonical_smiles_dev_dict),
    (test_id_set, patient_unique_canonical_smiles_test_dict),
)

for pid_, compounds_ in patient_pubchem_dict.items():
    # One SMILES string per entry, in the entry order of the source list.
    # presumably the entries are pubchempy Compound objects -- only the
    # `canonical_smiles` attribute is relied on here.
    smiles_list_ = [compound_.canonical_smiles for compound_ in compounds_]

    patient_unique_canonical_smiles_dict[pid_] = smiles_list_

    # The very same list object is stored in the split dict(s), not a copy.
    for id_set_, target_dict_ in split_targets_:
        if pid_ in id_set_:
            target_dict_[pid_] = smiles_list_

In [None]:
# Print the ID of every patient whose SMILES list is empty.
patients_without_smiles_ = [
    patient_
    for patient_, smiles_ in patient_unique_canonical_smiles_dict.items()
    if not smiles_
]
for patient_ in patients_without_smiles_:
    print(patient_)

In [None]:
# Output folder for the per-patient SMILES dictionaries.
unique_path = "data/drug_unique/"

# pd.to_pickle does not create missing parent folders; make sure the target
# exists so a fresh checkout can run this cell.
os.makedirs(unique_path, exist_ok=True)

# (file name, dict) pairs, written in the same order as before.
for file_name_, smiles_dict_ in (
    ("smiles_all.p", patient_unique_canonical_smiles_dict),
    ("smiles_train.p", patient_unique_canonical_smiles_train_dict),
    ("smiles_dev.p", patient_unique_canonical_smiles_dev_dict),
    ("smiles_test.p", patient_unique_canonical_smiles_test_dict),
):
    pd.to_pickle(smiles_dict_, os.path.join(unique_path, file_name_))