# Model Selection

In [None]:
import pandas as pd
import math
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from scipy.stats import randint
import numpy as np

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

# To serialize the model
import pickle


## Define reusable components

Function Definitions

In [3]:
def printPerformanceMetrics(in_y_test, in_predicitions):
    acc  = accuracy_score(in_y_test, in_predicitions)
    mse = mean_squared_error(in_y_test, in_predicitions)
    rmse = math.sqrt(mse)
    pscore = precision_score(in_y_test, in_predicitions)
    recall = recall_score(in_y_test, in_predicitions)
    print(f"""
            Accuracy Score: {acc:.2f}
            --- For linear models ---
            MSE: {mse:.2f}
            RMSE: {rmse:.2f}
            --- For classification models ---
            Precision: {pscore:.2f}
            Recall: {recall:.2f}

            """)

In [4]:
def randomForestHyperParamTuning(in_rf_classifier, in_X_train, in_y_train):

    # Define the parameter grid
    param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2', None]
    }

    # Create a RandomizedSearchCV object
    rscv = RandomizedSearchCV(
        in_rf_classifier, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=-1
        )

    # Fit the RandomizedSearchCV object to the data
    rscv.fit(in_X_train, in_y_train)

    # Print the best parameters
    print("Best Parameters: ", rscv.best_params_)

    #Get the best estimator
    best_rf = rscv.best_estimator_

    return best_rf

In [5]:
def serializeModel(in_model, in_save_as_dot_pkl_filename):
    with open(in_save_as_dot_pkl_filename, 'wb') as f:
        pickle.dump(in_model, f)

Constant Defiitions

In [6]:
# CONSTANTS
TARGET_COL = "Has CVD"
RAND_ST_VAL = 7
ROOT_RESOURCES_DIR = "./Repo/cardiovascular-prediction/resources/"

## Train and Test Model

### Retireve encoded data

In [7]:
## Read encoded split data

#X_train = pd.read_csv(ROOT_RESOURCES_DIR+"X_train_cv_by_age_race_gender_df_states_ENCODED.csv")
#X_test = pd.read_csv(ROOT_RESOURCES_DIR+"X_test_cv_by_age_race_gender_df_states_ENCODED.csv")
#y_train = pd.read_csv(ROOT_RESOURCES_DIR+"y_train_cv_by_age_race_gender_df_states_ENCODED.csv")
#y_test = pd.read_csv(ROOT_RESOURCES_DIR+"y_test_cv_by_age_race_gender_df_states_ENCODED.csv")

In [8]:
data_df = pd.read_csv(ROOT_RESOURCES_DIR+"cms_cvd_data.csv")

In [9]:
data_df.columns

Index(['Unnamed: 0', 'Year', 'Data_Value', 'Data_Value_Alt',
       'LowConfidenceLimit', 'HighConfidenceLimit', 'GeoLocation',
       'Category_Cardiovascular Diseases',
       'Topic_Acute Myocardial Infarction (Heart Attack)',
       'Topic_Coronary Heart Disease',
       'Topic_Diseases of the Heart (Heart Disease)', 'Topic_Heart Failure',
       'Topic_Major Cardiovascular Disease', 'Topic_Stroke',
       'Break_Out_Category_Age', 'Break_Out_Category_Gender',
       'Break_Out_Category_Overall', 'Break_Out_Category_Race',
       'Break_Out_65+', 'Break_Out_75+', 'Break_Out_Female',
       'Break_Out_Hispanic', 'Break_Out_Male', 'Break_Out_Non-Hispanic Black',
       'Break_Out_Non-Hispanic White', 'Break_Out_Other', 'Break_Out_Overall'],
      dtype='object')

In [10]:
data_df.shape

(42111, 27)

In [11]:
data_df["Topic_Diseases of the Heart (Heart Disease)"].head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Topic_Diseases of the Heart (Heart Disease), dtype: float64

In [12]:
def removeParens(x):
   return x.replace("(", "")
    

In [13]:
data_df_tmp = data_df.copy()
data_df_tmp[TARGET_COL] = data_df_tmp["Topic_Diseases of the Heart (Heart Disease)"].apply(lambda x: 1 if x==1.0 else 0)
data_df_tmp[TARGET_COL] = data_df_tmp["Topic_Heart Failure"].apply(lambda x: 1 if x==1.0 else x)

In [14]:
cols = ['Data_Value', 'Data_Value_Alt', 'LowConfidenceLimit',
       'HighConfidenceLimit',
       'Break_Out_Category_Age', 'Break_Out_Category_Gender',
       'Break_Out_Category_Overall', 'Break_Out_Category_Race',
       'Break_Out_65+', 'Break_Out_75+', 'Break_Out_Female',
       'Break_Out_Hispanic', 'Break_Out_Male', 'Break_Out_Non-Hispanic Black',
       'Break_Out_Non-Hispanic White', 'Break_Out_Other', 'Break_Out_Overall']

In [15]:

cols.append(TARGET_COL)
data_df = data_df_tmp[cols].copy()
#data_df_tmp["GeoLocation"] = data_df_tmp["GeoLocation"].apply("removeParens", axis = "columns")
data_df.value_counts()

Data_Value  Data_Value_Alt  LowConfidenceLimit  HighConfidenceLimit  Break_Out_Category_Age  Break_Out_Category_Gender  Break_Out_Category_Overall  Break_Out_Category_Race  Break_Out_65+  Break_Out_75+  Break_Out_Female  Break_Out_Hispanic  Break_Out_Male  Break_Out_Non-Hispanic Black  Break_Out_Non-Hispanic White  Break_Out_Other  Break_Out_Overall  Has CVD
3.90        3.9             3.8                 4.0                  0.0                     0.0                        0.0                         1.0                      0.0            0.0            0.0               0.0                 0.0             0.0                           1.0                           0.0              0.0                0.0        35
3.80        3.8             3.7                 3.9                  0.0                     0.0                        1.0                         0.0                      0.0            0.0            0.0               0.0                 0.0             0.0            

In [16]:
#split into train and test data sets
y = data_df[TARGET_COL]
#X = cv_by_age_race_gender_df_all.drop(columns=[TARGET_COL]).copy()
X = data_df.drop(columns=[TARGET_COL])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RAND_ST_VAL)


In [17]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)


(31583, 17)
(31583,)
(10528, 17)
(10528,)


In [18]:
y_train

41516    0.0
18356    0.0
39946    0.0
23662    1.0
36839    1.0
        ... 
34140    1.0
13927    0.0
919      0.0
38467    0.0
10742    0.0
Name: Has CVD, Length: 31583, dtype: float64

In [19]:
y_train_df = pd.DataFrame(y_train, columns=[TARGET_COL])
y_test_df = pd.DataFrame(y_test, columns=[TARGET_COL])

In [20]:
y_train_df

Unnamed: 0,Has CVD
41516,0.0
18356,0.0
39946,0.0
23662,1.0
36839,1.0
...,...
34140,1.0
13927,0.0
919,0.0
38467,0.0


In [21]:
X_train

Unnamed: 0,Data_Value,Data_Value_Alt,LowConfidenceLimit,HighConfidenceLimit,Break_Out_Category_Age,Break_Out_Category_Gender,Break_Out_Category_Overall,Break_Out_Category_Race,Break_Out_65+,Break_Out_75+,Break_Out_Female,Break_Out_Hispanic,Break_Out_Male,Break_Out_Non-Hispanic Black,Break_Out_Non-Hispanic White,Break_Out_Other,Break_Out_Overall
41516,3.70,3.7,3.7,3.8,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
18356,7.00,7.0,6.8,7.3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
39946,4.30,4.3,4.2,4.4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
23662,1.17,11.7,10.7,12.7,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
36839,3.11,31.1,30.0,32.1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34140,5.85,58.5,54.0,63.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
13927,11.10,11.1,0.0,25.6,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
919,25.00,25.0,24.3,25.7,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38467,3.90,3.9,3.8,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Scale

In [22]:
# Scale data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [23]:
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=[X_test.columns])
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=[X_train.columns])
X_train_scaled_df

Unnamed: 0,Data_Value,Data_Value_Alt,LowConfidenceLimit,HighConfidenceLimit,Break_Out_Category_Age,Break_Out_Category_Gender,Break_Out_Category_Overall,Break_Out_Category_Race,Break_Out_65+,Break_Out_75+,Break_Out_Female,Break_Out_Hispanic,Break_Out_Male,Break_Out_Non-Hispanic Black,Break_Out_Non-Hispanic White,Break_Out_Other,Break_Out_Overall
0,-0.495891,-0.835899,-0.784764,-0.744922,-0.413677,-0.575218,2.658804,-0.963341,-0.281045,-0.280454,-0.376876,-0.358284,-0.376657,-0.367447,-0.377315,-0.376219,2.658804
1,-0.062021,-0.587993,-0.514281,-0.549294,-0.413677,-0.575218,-0.376109,1.038054,-0.281045,-0.280454,-0.376876,-0.358284,-0.376657,-0.367447,-0.377315,2.658029,-0.376109
2,-0.417006,-0.790825,-0.741137,-0.711385,-0.413677,1.738473,-0.376109,-0.963341,-0.281045,-0.280454,-0.376876,-0.358284,2.654934,-0.367447,-0.377315,-0.376219,-0.376109
3,-0.828525,-0.234914,-0.173996,-0.247468,-0.413677,-0.575218,2.658804,-0.963341,-0.281045,-0.280454,-0.376876,-0.358284,-0.376657,-0.367447,-0.377315,-0.376219,2.658804
4,-0.573462,1.222475,1.509979,0.836869,-0.413677,-0.575218,-0.376109,1.038054,-0.281045,-0.280454,-0.376876,-0.358284,-0.376657,-0.367447,-0.377315,2.658029,-0.376109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31578,-0.213219,3.280849,3.604040,2.563984,-0.413677,-0.575218,-0.376109,1.038054,-0.281045,-0.280454,-0.376876,-0.358284,-0.376657,2.721478,-0.377315,-0.376219,-0.376109
31579,0.477029,-0.279988,-1.107598,0.473560,-0.413677,-0.575218,-0.376109,1.038054,-0.281045,-0.280454,-0.376876,2.791084,-0.376657,-0.367447,-0.377315,-0.376219,-0.376109
31580,2.304542,0.764224,1.012639,0.479150,2.417344,-0.575218,-0.376109,-0.963341,3.558155,-0.280454,-0.376876,-0.358284,-0.376657,-0.367447,-0.377315,-0.376219,-0.376109
31581,-0.469596,-0.820874,-0.776038,-0.733743,-0.413677,-0.575218,2.658804,-0.963341,-0.281045,-0.280454,-0.376876,-0.358284,-0.376657,-0.367447,-0.377315,-0.376219,2.658804


In [24]:
y_train_df.isnull().sum()

Has CVD    0
dtype: int64

## Train and test models

### DecisionTreeClassifier

In [25]:
dtclf = DecisionTreeClassifier(random_state=RAND_ST_VAL)

In [26]:
dtclf = dtclf.fit(X_train_scaled,y_train)

#### Serialize trained model

To deserialize do
    with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [27]:
serializeModel(dtclf, "./trained_models/dtclf.pkl")

In [28]:
# predict
predictions = dtclf.predict(X_test_scaled)

In [29]:
predictions

array([1., 1., 0., ..., 0., 0., 1.])

In [30]:
printPerformanceMetrics(y_test, predictions)



            Accuracy Score: 0.91
            --- For linear models ---
            MSE: 0.09
            RMSE: 0.31
            --- For classification models ---
            Precision: 0.90
            Recall: 0.89

            


### RandomForestClassifier

Start with any parameters

In [31]:
# RandomForestClassifier
n_est = 10
#max_depth = 20

rfc_clf = RandomForestClassifier(random_state=RAND_ST_VAL, n_estimators=n_est)
rfc_clf = rfc_clf.fit(X_train_scaled, y_train)

Serialize the trained model

In [32]:
serializeModel(rfc_clf, "./trained_models/rfc_clf.pkl")

In [33]:
# Evaluate the model
rfc_predictions = rfc_clf.predict(X_test_scaled)

In [34]:
printPerformanceMetrics(y_test, rfc_predictions)


            Accuracy Score: 0.91
            --- For linear models ---
            MSE: 0.09
            RMSE: 0.30
            --- For classification models ---
            Precision: 0.90
            Recall: 0.90

            


Try to find the optimal parameters

In [35]:
best_rfc_clf = RandomForestClassifier(random_state=RAND_ST_VAL)
best_rfc_clf = randomForestHyperParamTuning(best_rfc_clf,X_train_scaled, y_train)

Best Parameters:  {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 125}


In [36]:
# Evaluate the model
best_rfc_predictions = best_rfc_clf.predict(X_test_scaled)

In [37]:
printPerformanceMetrics(y_test, best_rfc_predictions)


            Accuracy Score: 0.92
            --- For linear models ---
            MSE: 0.08
            RMSE: 0.28
            --- For classification models ---
            Precision: 0.92
            Recall: 0.90

            


In [38]:
#Serialize the model
serializeModel(best_rfc_clf, "./trained_models/best_rfc_clf.pkl")