In [None]:
import os
import sys
import platform

import numpy as np
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit, KFold, StratifiedKFold

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Number of cross-validation splits generated per outcome.
n_splits = 5 # 10 (alternative value left by the author)
# When True, each fold stores (train, dev, test) index arrays instead of (train, test).
has_dev_set = False


def resolve_github_dir(system_name, environ):
    """Return the project checkout directory for the current machine.

    Parameters
    ----------
    system_name : str
        Value returned by ``platform.system()``.
    environ : mapping
        Environment mapping (normally ``os.environ``). A non-empty
        ``DVLOGGER_PROJECT_DIR`` entry takes precedence over the
        per-platform defaults.

    Returns
    -------
    str
        Path of the project directory.

    Raises
    ------
    RuntimeError
        If no override is set and the platform has no built-in default.
    """
    override = environ.get('DVLOGGER_PROJECT_DIR')
    if override:
        return override
    if system_name == 'Linux':
        return '/home/gor/codes/dVLogger-Project'
    if system_name == 'Darwin':
        return '/Users/gor/codes/dVLogger-Project'
    # Fail here with a clear message instead of a NameError on github_dir further down.
    raise RuntimeError(
        'No default project directory for platform {!r}; '
        'set the DVLOGGER_PROJECT_DIR environment variable.'.format(system_name))


system_name = platform.system()
github_dir = resolve_github_dir(system_name, os.environ)
# Input archives (features, clinical outcomes) and the output archive for the folds.
features_npz = os.path.join(github_dir, 'data', 'case_sum_0706_A_1.npz')
clinical_outcomes_npz = os.path.join(github_dir, 'data', 'clinical_outcomes_0707.npz')
folds_clinical_outcomes_npz = os.path.join(github_dir, 'data', 'folds_clinical_outcomes_0707.npz')

In [None]:
# Inject every array stored in the two archives into the notebook namespace
# under its stored key. Later cells use X_case, Y_case, Y_col, Y and
# Y_train_task_type, none of which is assigned elsewhere in the visible
# notebook, so they are presumably provided here -- TODO confirm the keys.
for _npz_path in (features_npz, clinical_outcomes_npz):
    # The context manager closes the archive's file handle after its arrays are read.
    with np.load(_npz_path) as _archive:
        # Materialise the arrays before the archive is closed; globals() is the
        # namespace that locals() refers to at notebook top level.
        globals().update({key: _archive[key] for key in _archive.files})

In [None]:
# X_case and Y_case must match element for element (presumably the case
# identifiers of the feature and outcome archives -- TODO confirm).
# np.array_equal also compares shapes, so a broadcastable mismatch such as a
# length-1 array cannot slip through the way it can with np.all(a == b).
assert np.array_equal(X_case, Y_case), 'X_case and Y_case differ in shape or content'
case = X_case

In [None]:
# Display Y_col. It is not assigned in the visible cells, so it presumably comes
# from one of the loaded .npz archives -- TODO confirm. Its length sets the
# number of targets for which folds are generated below.
Y_col

In [None]:
# Splitters used for every target: plain K-fold for regression targets and
# stratified K-fold for classification targets, both shuffled with a fixed seed.
#
# Disabled alternative (random 70% train / 15% test splits, which leave samples
# outside both train and test):
#   regression_splitter = ShuffleSplit(n_splits=n_splits, test_size=0.15, train_size=0.7, random_state=42)
#   classification_splitter = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.15, train_size=0.7, random_state=42)
kfold_options = dict(n_splits=n_splits, shuffle=True, random_state=42)
regression_splitter = KFold(**kfold_options)
classification_splitter = StratifiedKFold(**kfold_options)

In [None]:
# Object array with one cell per (target, split, partition); each cell is later
# filled with an index array. Partitions are (train, dev, test) when a dev set
# is requested and (train, test) otherwise.
n_partitions = 3 if has_dev_set else 2
folds = np.empty(shape=(len(Y_col), n_splits, n_partitions), dtype=object)

In [None]:
# Fill `folds` for every outcome column: split the rows that have a value for
# that outcome and store the resulting indices expressed as rows of Y.
for y_idx in range(len(Y_col)):
    y = Y[:, y_idx]
    # Only rows with an observed (non-NaN) value take part in this target's split.
    non_nan_idx = np.where(~np.isnan(y))[0]
    y = y[non_nan_idx]
    y_train_task_type = Y_train_task_type[y_idx]

    if y_train_task_type == 'classification':
        splitter = classification_splitter
    elif y_train_task_type == 'regression':
        splitter = regression_splitter
    else:
        raise ValueError(
            'Unknown task type {!r} for outcome {!r} (index {})'.format(
                y_train_task_type, Y_col[y_idx], y_idx))
    # A zero placeholder of the right length stands in for the features; only y is passed as the target.
    split_generator = splitter.split(np.zeros(len(y)), y)

    # zip with range(n_splits) takes at most n_splits splits, matching the size of `folds`.
    for split_idx, (train_idx, test_idx) in zip(range(n_splits), split_generator):
        partitions = [train_idx, test_idx]
        if has_dev_set:
            # Dev set = samples in neither train nor test. This is empty when the
            # splitter's train and test sets together cover all samples (K-fold).
            all_idx = np.arange(len(y))
            dev_idx = np.setdiff1d(all_idx, np.concatenate([train_idx, test_idx]))
            partitions.insert(1, dev_idx)
        # Assign cell by cell: assigning a list of equal-length arrays to the
        # slice would be converted to a 2-D array and fail to broadcast.
        for part_idx, local_idx in enumerate(partitions):
            # Map positions within the NaN-filtered vector back to row indices of Y.
            folds[y_idx, split_idx, part_idx] = non_nan_idx[local_idx]

In [None]:
# Write the fold indices and the dev-set flag to the output archive.
# `folds` is an object-dtype array, so it is pickled inside the archive and
# readers need np.load(..., allow_pickle=True) to get it back.
saved_arrays = dict(folds=folds, has_dev_set=has_dev_set)
np.savez_compressed(folds_clinical_outcomes_npz, **saved_arrays)