In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from preprocessing import preprocessing
from pca import pca_load
#from pca_all import pca_load_all
from baseline import baseline
from lr import LR
#from lr import LR_P
from rf import RF
from nn import nn_load
from ae import ae_load
from nn_rf import nn_separate_load
from roc import ROC

from sklearn.metrics import accuracy_score

# Code granularity used for the run: "short" or "full".
# code_type = "short"
code_type = "full"

# Raw dataset name for every supported code type.
dataset_by_code_type = {
    "short": "main_dataset_final_3",
    "full": "main_dataset_final_3_full_icd10",
}
if code_type not in dataset_by_code_type:
    # Fail here with a clear message; previously an unsupported value left
    # `filename` undefined and surfaced later as a NameError.
    raise ValueError(
        "code_type must be one of "
        + str(sorted(dataset_by_code_type))
        + ", got " + repr(code_type)
    )
filename = dataset_by_code_type[code_type]

# NOTE(review): presumably this writes the data/stroke_data_*.csv files that
# are read further down - confirm in preprocessing.py.
preprocessing(filename)

# Encoding of the input table: "binary" or "cont" (continuous).
datatype = "binary"
# datatype = "cont"

# 1 when the full-code dataset is selected, 0 otherwise; it decides whether
# the "_full" suffix is part of the CSV file name.
filename_add = int(code_type == "full")
file_suffix = "_full" if filename_add else ""
X = pd.read_csv("data/stroke_data_" + datatype + file_suffix + ".csv")
# X = pd.read_csv("data/stroke_data_binary_full.csv")

# Discard every row containing a missing value, then hold out 20% of the
# remaining rows as the test set (fixed seed so the split is reproducible).
X = X.dropna()
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# disease_mapping = {
#     "Obesity": ['E65','E66','E67','E68'], 
#     "hyperlipidemia": ['E78'],
#     "diabetes": ['E10','E11','E12','E13','E14'],
#     "hypertension": ['I10','I11','I12','I13','I15'],
#     "nicotine":['F17','Z72'],
#     "alcohol":['F10'],
#     "af":['I48'],
#     "heart_diseases":['I20','I21','I22','I24','I25'],
#     "strokes":['I63','I64','I60','G45','I61'],
# }

# reverse_mapping = {}
# for category, codes in disease_mapping.items():
#     for code in codes:
#         reverse_mapping[code] = category

# columns_by_category = {category: [] for category in disease_mapping}

# for column in X_train.columns:
#     for code in reverse_mapping.keys():
#         if column.startswith(code):
#             category = reverse_mapping[code]
#             columns_by_category[category].append(column)
#             break

# disease_mapping = columns_by_category
# for category, columns in columns_by_category.items():
#     print(f"Category '{category}' has columns: {columns}")
# ###

# last_key = next(reversed(columns_by_category))
# last_value = columns_by_category[last_key]
# X_train["stroke"] = X_train[last_value].sum(axis=1)
# X_test["stroke"] = X_test[last_value].sum(axis=1)

# Comorbidity group -> list of diagnosis-code prefixes belonging to that group.
# NOTE(review): the codes look like ICD-10 categories that the loaders match
# against column names - confirm in pca.py / baseline.py.
# Insertion order is kept exactly as before ("strokes" stays last).
disease_mapping = dict(
    Obesity=['E65', 'E66', 'E67', 'E68'],
    hyperlipidemia=['E78'],
    diabetes=['E10', 'E11', 'E12', 'E13', 'E14'],
    hypertension=['I10', 'I11', 'I12', 'I13', 'I15'],
    nicotine=['F17', 'Z72'],
    alcohol=['F10'],
    af=['I48'],
    heart_diseases=['I20', 'I21', 'I22', 'I24', 'I25'],
    strokes=['I63', 'I64', 'I60', 'G45', 'I61'],
)

# Build one (train, test) feature table per aggregation method, all from the
# same train/test split so the methods can be compared against each other.
split_frames = (X_train, X_test)

# PCA features; pc0_only=True presumably keeps only the first principal
# component per group - confirm in pca.py.
train, test = pca_load(datatype, disease_mapping, *split_frames, code_type, pc0_only=True)
# X_train, X_test = pca_load_all(datatype, X_train, X_test)

# Baseline aggregations ("max" and "sum").
max_train, max_test = baseline(datatype, disease_mapping, "max", *split_frames)
sum_train, sum_test = baseline(datatype, disease_mapping, "sum", *split_frames)

# Features produced by the nn and ae loaders.
nn_train, nn_test = nn_load(datatype, disease_mapping, *split_frames, code_type)
ae_train, ae_test = ae_load(datatype, disease_mapping, *split_frames, code_type)
# nn_rf_train, nn_rf_test = nn_separate_load(datatype, disease_mapping, X_train, X_test, code_type)

# Report the (rows, columns) of each train/test pair, one pair per line.
feature_tables = [
    (train, test),
    (max_train, max_test),
    (sum_train, sum_test),
    (nn_train, nn_test),
    (ae_train, ae_test),
    # (nn_rf_train, nn_rf_test),
]
for train_part, test_part in feature_tables:
    print(train_part.shape, test_part.shape)

# Binary

In [None]:
# # original data with all covariates

# X = pd.read_csv("data/stroke_data_binary.csv")
# X.dropna(inplace=True)
# #X= X[X['race'] != 'Black or African American']
# or_train, or_test = train_test_split(X, test_size=0.2, random_state=42)
# print("By Logistic Regression, result for all original predicitors:")
# LR(or_train,or_test,tolerance = 1e-4,iter = 6000, seed = 43)

# param_grid = {
#     'n_estimators': [100],  
#     'max_features': [0.2, 0.4],  
#     'max_depth': [10, 20],  
#     'min_samples_split': [5, 10],  
#     'min_samples_leaf': [2, 4],  
#     'bootstrap': [True, False]  
# }
# print("Result for Random Forest, result for all original predicitors:")
# RF(or_train,or_test,params=param_grid)

In [None]:
# Logistic Regression
# We select several comorbidities and demographics as the predictors.

# ## baseline: original data with the max_aggregation covariates
# print("By Logistic Regression, result for sum baseline:")
# LR(sum_train,sum_test,tolerance = 1e-4,iter = 4000, seed = 43)
# print("By Logistic Regression, result for max baseline:")
# LR(max_train,max_test,tolerance = 1e-4,iter = 4000, seed = 43)
# ## selected PCA covariates.
# print("By Logistic Regression, result for PCA:")
# LR(train,test,tolerance = 1e-4,iter = 4000, seed = 43)
# print("By Logistic Regression, result for NN:")
# LR(nn_train,nn_test,tolerance = 1e-4,iter = 4000, seed = 43)
# print("By Logistic Regression, result for Autoencoder:")
# LR(ae_train,ae_test,tolerance = 1e-4,iter = 4000, seed = 43)
# # print("By Logistic Regression, result for NN_RF:")
# # LR(nn_rf_train,nn_rf_test,tolerance = 1e-4,iter = 4000, seed = 43)

In [None]:
# Fit logistic regression on every feature table with the same settings; each
# call is unpacked as (scores, accuracy, recall).
lr_settings = dict(tolerance=1e-4, iter=4000, seed=43)

Y_scores_sum, accuracy_sum, recall_sum = LR(sum_train, sum_test, **lr_settings)
Y_scores_max, accuracy_max, recall_max = LR(max_train, max_test, **lr_settings)
Y_scores_pca, accuracy_pca, recall_pca = LR(train, test, **lr_settings)
Y_scores_nn, accuracy_nn, recall_nn = LR(nn_train, nn_test, **lr_settings)
Y_scores_ae, accuracy_ae, recall_ae = LR(ae_train, ae_test, **lr_settings)

# Method labels and score lists must stay in the same order.
method = ["sum", "max", "pca", "nn", "ae"]
Y_scores = [
    Y_scores_sum,
    Y_scores_max,
    Y_scores_pca,
    Y_scores_nn,
    Y_scores_ae,
]

# Ground-truth labels are taken from the PCA test table.
# NOTE(review): assumes every test table has the same rows in the same order - confirm.
Y_true = test["recur_30"]

ROC(Y_true, Y_scores, method, clf="LR", seed=1)

In [None]:
# file_path = 'data/'
# Y_scores = pd.DataFrame(Y_scores)
# Y_scores.to_csv(file_path + "scores.csv", index=False)
# Y_scores = Y_scores.values.tolist()

# Y_true = pd.DataFrame(Y_true)
# Y_true.to_csv(file_path + "true.csv", index=False)
# Y_true= Y_true.values.tolist()

# method = ["sum","max","pca","nn","ae"]
# ROC(Y_true, Y_scores, method, LR = True)

In [None]:
# Random Forest
# We select several comorbidities and demographics as the predictors.
# Since the number of predictors is not large, 'max_features' is kept large.
# The keys match scikit-learn's random-forest parameters (assuming RF forwards
# this grid to scikit-learn - confirm in rf.py). There a float max_features is
# a fraction of the features while an int is an absolute count, so "all
# features" has to be written as 1.0; an int 1 would mean one feature per split.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': [0.8, 0.9, 1.0],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True],
}

## baseline: original data with the max_aggregation covariates
# print("By Random Forest, result for sum baseline:")
# RF(sum_train,sum_test,params=param_grid)
# print("By Random Forest, result for max baseline:")
# RF(max_train,max_test,params=param_grid)
# ## selected PCA covariates.
# print("By Random Forest, result for PCA:")
# RF(train,test,params=param_grid)
# print("By Random Forest, result for NN:")
# RF(nn_train,nn_test,params=param_grid)
# print("By Random Forest, result for Autoencoder:")
# RF(ae_train,ae_test,params=param_grid)
# print("By Random Forest, result for NN_RF:")
# RF(nn_rf_train,nn_rf_test,params=param_grid)


# print("By Random Forest, result for max baseline:")
# RF(max_train,max_test,params=param_grid)
# ## selected PCA covariates.
# print("By Random Forest, result for PCA:")
# RF(train,test,params=param_grid)
# print("By Random Forest, result for NN:")
# RF(nn_train,nn_test,params=param_grid)
# # print("By Random Forest, result for NN_RF:")
# # RF(nn_rf_train,nn_rf_test,params=param_grid)
# print("By Random Forest, result for sum baseline:")
# RF(sum_train,sum_test,params=param_grid)
# print("By Random Forest, result for Autoencoder:")
# RF(ae_train,ae_test,params=param_grid)


In [None]:
# Hyper-parameter grid for the random forest. The keys match scikit-learn's
# random-forest parameters; there a float max_features is a fraction of the
# features while an int is an absolute count, so "all features" is 1.0
# (an int 1 would mean a single feature per split).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': [0.8, 0.9, 1.0],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True],
}

# NOTE(review): param_grid is defined above but never passed to RF, and the
# keyword arguments below mirror the LR calls (tolerance / iter / seed). The
# commented-out RF calls in the previous cell use RF(..., params=param_grid).
# Confirm against rf.py which call form is intended; the calls are left
# unchanged here because RF's signature is not visible from this notebook.
rf_settings = dict(tolerance=1e-4, iter=4000, seed=43)

# Each call is unpacked as (scores, accuracy, recall).
Y_scores_sum, accuracy_sum, recall_sum = RF(sum_train, sum_test, **rf_settings)
Y_scores_max, accuracy_max, recall_max = RF(max_train, max_test, **rf_settings)
Y_scores_pca, accuracy_pca, recall_pca = RF(train, test, **rf_settings)
Y_scores_nn, accuracy_nn, recall_nn = RF(nn_train, nn_test, **rf_settings)
Y_scores_ae, accuracy_ae, recall_ae = RF(ae_train, ae_test, **rf_settings)

# Score lists and method labels must stay in the same order.
Y_scores = [Y_scores_sum, Y_scores_max, Y_scores_pca, Y_scores_nn, Y_scores_ae]
# Ground-truth labels are taken from the PCA test table.
# NOTE(review): assumes every test table has the same rows in the same order - confirm.
Y_true = test["recur_30"]

# Optional export of scores and labels (disabled). Y_scores is a plain list,
# so it has to be wrapped in a DataFrame before it can be written.
# file_path = 'data/'
# pd.DataFrame(Y_scores).to_csv(file_path + "scores.csv", index=False)
# pd.DataFrame(Y_true).to_csv(file_path + "true.csv", index=False)

method = ["sum", "max", "pca", "nn", "ae"]
ROC(Y_true, Y_scores, method, clf="RF", seed=1)