In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from preprocessing import preprocessing
from pca import pca_load
from pca_all import pca_load_all
from baseline import baseline
from lr import LR
from rf import RF

# Dataset name. The short-code variant is active; the full-ICD-10 variant and
# the preprocessing call are kept below as commented-out switches.
filename = "main_dataset_final_3"
# filename = "main_dataset_final_3_full_icd10"
# preprocessing(filename)

# Feature encoding for this section: "binary" (active) or "cont".
datatype = "binary"
# datatype = "cont"

# Read the encoded table, discard every row that has a missing value, then
# hold out 20% of the rows with a fixed seed so the split is repeatable.
X = pd.read_csv(f"data/stroke_data_{datatype}.csv").dropna()
X_train, X_test = train_test_split(X, random_state=42, test_size=0.2)

# Risk-factor group -> ICD-10 code prefixes belonging to that group.
disease_mapping = dict(
    # wider BMI alternative: ['Z68','E65','E66','E67','E68']
    BMI=["E65", "E66"],
    blood_lipids=["E78"],
    blood_glucose=["E10", "E11"],
    # hypertensive alternative including I14:
    # ['I10','I11','I12','I13','I14','I15','I16']
    hypertensive=["I10", "I11", "I12", "I13", "I15", "I16"],
    nicotine=["F17", "Z72"],
    alcohol=["F10"],
)

# Two competing feature sets built from the same split: PCA vs. baseline.
train, test = pca_load(datatype, disease_mapping, X_train, X_test)
# train, test = pca_load_all(datatype, X_train, X_test)
base_train, base_test = baseline(datatype, disease_mapping, X_train, X_test)

# Sanity check: (rows, columns) of each train/test pair.
print(train.shape, test.shape)
print(base_train.shape, base_test.shape)

BMI PCA data loaded
blood_lipids PCA data loaded
blood_glucose PCA data loaded
hypertensive PCA data loaded
nicotine PCA data loaded
alcohol PCA data loaded
(16612, 22) (4153, 22)
(16612, 14) (4153, 14)


# Binary

In [None]:
# Reference run on the original data with all covariates
# (no pca_load / baseline step is applied to this table).

X = pd.read_csv("data/stroke_data_binary.csv").dropna()
# X = X[X['race'] != 'Black or African American']
or_train, or_test = train_test_split(X, random_state=42, test_size=0.2)

print("By Logistic Regression, result for all original predicitors:")
LR(or_train, or_test, tolerance=1e-4, iter=6000, seed=43)

# Candidate Random Forest settings passed to RF via `params`
# (presumably searched over inside RF -- confirm in rf.py).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_features=[0.2, 0.4, 0.6],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)
print("Result for Random Forest, result for all original predicitors:")
RF(or_train, or_test, params=param_grid)

In [None]:
# Logistic Regression on the reduced predictor sets
# (several selected comorbidities plus demographics).

# Same solver settings for both fits, so only the feature set differs.
lr_options = dict(tolerance=1e-4, iter=4000, seed=43)

# Baseline: original data with the max_aggregation covariates.
print("By Logistic Regression, result for baseline:")
LR(base_train, base_test, **lr_options)

# Selected PCA covariates.
print("By Logistic Regression, result for PCA:")
LR(train, test, **lr_options)

In [None]:
# Random Forest
# We select several comorbidities and demographics as the predictors.
# Because the number of predictors is small, the 'max_features' fractions
# tried here are deliberately large.
# The grid is passed to RF via `params` (presumably searched over inside RF
# -- confirm in rf.py).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': [0.8, 0.9, 0.95],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

## baseline: original data with the max_aggregation covariates
print("By Random Forest, result for baseline:")
RF(base_train, base_test, params=param_grid)
## selected PCA covariates.
# This run uses the PCA train/test pair, so the label must say PCA; otherwise
# the two result blocks in the output cannot be told apart.
print("By Random Forest, result for PCA:")
RF(train, test, params=param_grid)

# Continuous

In [None]:

# Dataset name. The short-code variant is active; the full-ICD-10 variant and
# the preprocessing call are kept below as commented-out switches.
filename = "main_dataset_final_3"
# filename = "main_dataset_final_3_full_icd10"
# preprocessing(filename)

# Feature encoding for this section: "cont" (active) or "binary".
# datatype = "binary"
datatype = "cont"

# Read the encoded table, discard every row that has a missing value, then
# hold out 20% of the rows with a fixed seed so the split is repeatable.
X = pd.read_csv(f"data/stroke_data_{datatype}.csv").dropna()
X_train, X_test = train_test_split(X, random_state=42, test_size=0.2)

# Risk-factor group -> ICD-10 code prefixes belonging to that group.
disease_mapping = dict(
    # wider BMI alternative: ['Z68','E65','E66','E67','E68']
    BMI=["E65", "E66"],
    blood_lipids=["E78"],
    blood_glucose=["E10", "E11"],
    # hypertensive alternative including I14:
    # ['I10','I11','I12','I13','I14','I15','I16']
    hypertensive=["I10", "I11", "I12", "I13", "I15", "I16"],
    nicotine=["F17", "Z72"],
    alcohol=["F10"],
)

# Two competing feature sets built from the same split: PCA vs. baseline.
train, test = pca_load(datatype, disease_mapping, X_train, X_test)
# train, test = pca_load_all(datatype, X_train, X_test)
base_train, base_test = baseline(datatype, disease_mapping, X_train, X_test)

# Sanity check: (rows, columns) of each train/test pair.
print(train.shape, test.shape)
print(base_train.shape, base_test.shape)

In [None]:
# Original data with all covariates (continuous encoding).
# The table is read straight from the CSV; no pca_load / baseline step is used.

X = pd.read_csv("data/stroke_data_cont.csv")
X.dropna(inplace=True)  # drop every row that has a missing value
#X= X[X['race'] != 'Black or African American']
or_train, or_test = train_test_split(X, test_size=0.2, random_state=42)
print("By Logistic Regression, result for LR:")
LR(or_train, or_test, tolerance=1e-4, iter=6000, seed=43)

# Candidate Random Forest settings passed to RF via `params`
# (presumably searched over inside RF -- confirm in rf.py).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': [0.2, 0.4, 0.6],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}
# The call below fits a Random Forest, so the label names that model rather
# than repeating the Logistic Regression prefix used for the LR run.
print("By Random Forest, result for RF:")
RF(or_train, or_test, params=param_grid)

In [None]:
# Logistic Regression on the reduced predictor sets
# (several selected comorbidities plus demographics).

# Same solver settings for both fits, so only the feature set differs.
lr_options = dict(tolerance=1e-4, iter=4000, seed=43)

# Baseline: original data with the max_aggregation covariates.
print("Result for baseline:")
LR(base_train, base_test, **lr_options)

# Selected PCA covariates.
print("Result for PCA:")
LR(train, test, **lr_options)

In [None]:
# Random Forest on the reduced predictor sets
# (several selected comorbidities plus demographics).
# With so few predictors, only large 'max_features' fractions are tried.
# The grid is passed to RF via `params` (presumably searched over inside RF
# -- confirm in rf.py).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_features=[0.8, 0.9, 0.95],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Baseline: original data with the max_aggregation covariates.
print("By Random Forest, result for baseline:")
RF(base_train, base_test, params=param_grid)

# Selected PCA covariates.
print("By Random Forest, result for PCA:")
RF(train, test, params=param_grid)