In [1]:
from sklearn.model_selection import train_test_split 
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pylab as plt
import random
import math
import os

In [2]:
df = pd.read_csv("../data/heart.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [18]:
is_categorical = ["Sex","ChestPainType","FastingBS","RestingECG","ExerciseAngina","ST_Slope"]
is_continuous = ["Age","RestingBP","Cholesterol","MaxHR", "Oldpeak"]

cat_ftrs = ["Sex","ChestPainType","FastingBS","RestingECG","ExerciseAngina","ST_Slope"]
ordinal_ftrs =[]
ordinal_cats = []
num_ftrs = ["Age","RestingBP","Cholesterol","MaxHR", "Oldpeak"]

In [4]:
len(df[df["Cholesterol"] ==0])
# here we see 172 people have cholesterol encoded as 0 which must be a missing value.

172

In [5]:
len(df[df["RestingBP"] == 0])
#and one patient does not have blood pressure. missing data point.

1

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
random_state = 73

In [20]:
y = df['HeartDisease'] # assign our attributes and target variable
X = df.loc[:, df.columns != 'HeartDisease']

In [40]:
#this creates dataframe splits (startingi from scratch due to df drop)
df = pd.read_csv("../data/heart.csv") #read in

df['Cholesterol'] = df['Cholesterol'].replace(0,np.nan) # replace junk values with nan
df['RestingBP'] = df['RestingBP'].replace(0,np.nan)

y = df['HeartDisease']
df.drop(columns=['HeartDisease'],inplace=True)
X = df.values
ftrs = df.columns

In [41]:
random_state = 73
X_train, X_other, y_train, y_other = train_test_split(df, y, train_size=0.7, random_state=random_state)
X_test, X_CV, y_test, y_CV = train_test_split(X_other, y_other, test_size=.5, random_state=random_state)
print(len(X_CV))
print(len(X_train))
print(len(X_test))

138
642
138


In [42]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# one-hot encoder
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant',fill_value='missing')),
    ('onehot', OneHotEncoder(sparse=False,handle_unknown='ignore'))])

# ordinal encoder
ordinal_transformer = Pipeline(steps=[
    ('imputer2', SimpleImputer(strategy='constant',fill_value='NA')),
    ('ordinal', OrdinalEncoder(categories = ordinal_cats))])

# standard scaler
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# collect the transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_ftrs),
        ('cat', categorical_transformer, cat_ftrs),
        ('ord', ordinal_transformer, ordinal_ftrs)])

In [69]:
# fit_transform the training set
X_prep = preprocessor.fit_transform(X_train)
# collect feature names
feature_names = preprocessor.transformers_[0][-1] + \
                list(preprocessor.named_transformers_['cat'][1].get_feature_names(cat_ftrs)) + \
                preprocessor.transformers_[2][-1]

df_train = pd.DataFrame(data=X_prep,columns=feature_names)
print(df_train.shape)



# transform the CV
df_CV = preprocessor.transform(X_CV)
df_CV = pd.DataFrame(data=df_CV,columns = feature_names)
print(df_CV.shape)

# transform the test
df_test = preprocessor.transform(X_test)
df_test = pd.DataFrame(data=df_test,columns = feature_names)
print(feature_names)

(642, 21)
(138, 21)
['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak', 'Sex_F', 'Sex_M', 'ChestPainType_ASY', 'ChestPainType_ATA', 'ChestPainType_NAP', 'ChestPainType_TA', 'FastingBS_0', 'FastingBS_1', 'RestingECG_LVH', 'RestingECG_Normal', 'RestingECG_ST', 'ExerciseAngina_N', 'ExerciseAngina_Y', 'ST_Slope_Down', 'ST_Slope_Flat', 'ST_Slope_Up']


In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor


imputer = IterativeImputer(estimator = RandomForestRegressor(n_estimators=10),max_iter=10000, random_state=1000)
X_impute = imputer.fit_transform(df_train)
df_train_imp = pd.DataFrame(data=X_impute, columns = df_train.columns)


df_CV_imp = pd.DataFrame(data=imputer.transform(df_CV), columns = df_train.columns)
df_test_imp = pd.DataFrame(data=imputer.transform(df_test), columns = df_train.columns)

In [58]:
np.nan in df_train_imp

False

In [61]:
### XGB using non-imputed

import xgboost
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

param_grid = {"learning_rate": [0.03],
              "n_estimators": [10000],
              "seed": [0],
              #"reg_alpha": [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
              #"reg_lambda": [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
              "missing": [np.nan], 
              #"max_depth": [1,3,10,30,100],
              "colsample_bytree": [0.9],              
              "subsample": [0.66]}

XGB = xgboost.XGBRegressor()
XGB.set_params(100)
XGB.fit(df_train,y_train,early_stopping_rounds=50,eval_set=[(df_CV, y_CV)], verbose=False)
y_CV_pred = XGB.predict(df_CV)
print('the CV RMSE:',np.sqrt(mean_squared_error(y_CV,y_CV_pred)))
y_test_pred = XGB.predict(df_test)
print('the test RMSE:',np.sqrt(mean_squared_error(y_test,y_test_pred)))
print('the test R2:',r2_score(y_test,y_test_pred))


the CV RMSE: 0.2954902023140127
the test RMSE: 0.3397674190978847
the test R2: 0.5347141743729409


In [64]:
###XGB usign imputed values

#yeah xgb does indeed turn out to be better here
XGB.fit(df_train_imp,y_train,early_stopping_rounds=50,eval_set=[(df_CV_imp, y_CV)], verbose=False)
y_CV_pred = XGB.predict(df_CV_imp)
print('the CV RMSE:',np.sqrt(mean_squared_error(y_CV,y_CV_pred)))
y_test_pred = XGB.predict(df_test_imp)
print('the test RMSE:',np.sqrt(mean_squared_error(y_test,y_test_pred)))
print('the test R2:',r2_score(y_test,y_test_pred))

the CV RMSE: 0.3029387775454234
the test RMSE: 0.3576579341746828
the test R2: 0.4844247186474986


In [66]:
df.corr()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak
Age,1.0,0.263084,0.058758,0.198039,-0.382045,0.258612
RestingBP,0.263084,1.0,0.095939,0.067823,-0.109693,0.174252
Cholesterol,0.058758,0.095939,1.0,0.054012,-0.019856,0.058488
FastingBS,0.198039,0.067823,0.054012,1.0,-0.131438,0.052698
MaxHR,-0.382045,-0.109693,-0.019856,-0.131438,1.0,-0.160691
Oldpeak,0.258612,0.174252,0.058488,0.052698,-0.160691,1.0



contains NaN:
df_train
df_CV
df_test

does not contain NaN:
df_train_imp   // y_train
df_test_imp    // y_test
df_CV_imp      // y_CV



In [None]:
from sklearn.neighbors import KNeighborsClassifier
KNN_val_scores= []
KNN_train_scores =[]
best_score = 100
best_n = 0
for i in range(1,40):
    KNN = KNeighborsClassifier(n_neighbors = i)
    KNN.fit(df_train_imp,y_train)
    pred_i = KNN.predict(df_CV_imp)
    val_score = np.mean(pred_i != y_CV)
    
    pred_i = KNN.predict(df_train_imp)
    train_score = np.mean(pred_i != y_train)
    if score < best_score:
        best_n = i
        best_score = score
    KNN_val_scores.append(val_score) #maybe better score?
    KNN_train_scores.append(train_score)
    

In [None]:
plt.plot(range(1,40),KNN_val_scores,label = "val score")
plt.plot(range(1,40),KNN_train_scores,lable = "train score")
plt.plot(best_n, best_score, "o", color = "red",label = f"best n val = {best_n}")
plt.legend()
plt.show()

In [92]:
KNN = KNeighborsClassifier(n_neighbors = 13)
KNN.fit(df_train_imp,y_train)
prediction = KNN.predict(df_test_imp)
print(np.mean(prediction != y_test))

0.14492753623188406


In [None]:
from sklearn.ensemble import RandomForestClassifier

val_scores = []
train_scores =[]
best_score = 100
best_n = 0
for n in range(1,150):
    forest= RandomForestClassifier(n_estimators =n, random_state = 0)
    forest.fit(df_train_imp, y_train)
    val_prediction = forest.predict(df_CV_imp)
    train_prediction = forest.predict(df_train_imp)
    val_score = np.mean(val_prediction != y_CV)
    train_score = np.mean(train_prediction != y_train)
    train_scores.append(train_score)
    val_scores.append(val_score)
    if val_score < best_score:
        best_score = val_score
        best_n = n

In [None]:
plt.plot(range(1,150), val_scores, label= "val score")
plt.plot(best_n, best_score, "o", color = "red", label = f"best n = {best_n}")
plt.plot(range(1,150),train_scores, label = "train score")
plt.legend()
plt.show()

In [103]:
forest= RandomForestClassifier(n_estimators =best_n, random_state = 0)
forest.fit(df_train_imp, y_train)
prediction = forest.predict(df_test_imp)
score = np.mean(prediction != y_test)
print(score)

0.13768115942028986


In [None]:
from sklearn.svm import SVR
param_grid = {'svr__kernel' : ('linear', 'poly', 'rbf', 'sigmoid'),'svr__C' : [1,5,10],'svr__degree' : [3,8],'svr__coef0' : [0.01,10,0.5],'svr__gamma' : ('auto','scale')}
