In [2]:
import pandas as pd
import scipy.stats as stats
import numpy as np
from sqlalchemy import create_engine

In [3]:
# Remote CSV that the notebook reads its data from (hosted on S3).
url = "https://heartdisease4.s3.us-east-2.amazonaws.com/heart.csv"

# Load the whole file into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
# One-hot encode 'cp', 'slope' and 'thal': each is replaced by indicator
# columns named <column>_<value> (e.g. cp_1, slope_2, thal_3).
df = pd.get_dummies(
    data=df,
    columns=['cp', 'slope', 'thal'],
)

In [5]:
# Preview the first five rows after one-hot encoding.
df.head(n=5)

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,ca,...,cp_1,cp_2,cp_3,slope_0,slope_1,slope_2,thal_0,thal_1,thal_2,thal_3
0,63,1,145,233,1,0,150,0,2.3,0,...,0,0,1,1,0,0,0,1,0,0
1,37,1,130,250,0,1,187,0,3.5,0,...,0,1,0,1,0,0,0,0,1,0
2,41,0,130,204,0,0,172,0,1.4,0,...,1,0,0,0,0,1,0,0,1,0
3,56,1,120,236,0,1,178,0,0.8,0,...,1,0,0,0,0,1,0,0,1,0
4,57,0,120,354,0,1,163,1,0.6,0,...,0,0,0,0,0,1,0,0,1,0


#### Min Max Scaling

In [6]:
from sklearn.preprocessing import MinMaxScaler

# Numeric columns rescaled to the [0, 1] range; every other column is left as is.
scaled_columns = ['age', 'trestbps', 'chol', 'oldpeak', 'thalach']

# MinMaxScaler scales each column independently, so one fit over all five
# columns produces the same values as fitting them one at a time, and it
# leaves `scaler` fitted on every scaled column (usable later for
# transform / inverse_transform) instead of only on the last one.
# NOTE(review): the scaler is fit on the full frame, before the train/test
# split performed further down, so test-set min/max influence the training
# features. Fit on the training rows only if a strict hold-out is required.
scaler = MinMaxScaler()
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

### Models

In [7]:
from sklearn.model_selection import train_test_split

# Features are every column except the label; the label is the 'target' column.
X = df.drop(columns='target')
y = df['target']

# test_size=0.25 is the library default, spelled out here; the fixed seed
# keeps the shuffle (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [8]:
from sklearn.neighbors import KNeighborsClassifier

# Accuracy on each split, appended in the same order as the k values tried.
train_scores, test_scores = [], []

# Sweep odd values of k from 1 to 19 and report train/test accuracy for each.
for k in range(1, 21, 2):
    # fit() returns the estimator itself, so construction and fitting chain.
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    train_score, test_score = knn.score(X_train, y_train), knn.score(X_test, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f'k: {k}, Train/Test Score: {train_score:0.3f} / {test_score:.3f}')

k: 1, Train/Test Score: 1.000 / 0.816
k: 3, Train/Test Score: 0.912 / 0.816
k: 5, Train/Test Score: 0.885 / 0.855
k: 7, Train/Test Score: 0.863 / 0.868
k: 9, Train/Test Score: 0.855 / 0.895
k: 11, Train/Test Score: 0.850 / 0.882
k: 13, Train/Test Score: 0.850 / 0.882
k: 15, Train/Test Score: 0.837 / 0.882
k: 17, Train/Test Score: 0.828 / 0.895
k: 19, Train/Test Score: 0.837 / 0.895


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV

# Grid-search tree depth and forest size with 5-fold cross-validation on the
# training split only.
# - scoring='accuracy' is a classification metric and matches what
#   classifier.score() reports; the previous 'neg_mean_squared_error' is a
#   regression metric. Note that best_score_ is now an accuracy in [0, 1]
#   rather than a negated error.
# - random_state=42 makes the search reproducible and consistent with the
#   seed used when the final forest is refit; without it the selected
#   best_params can change from run to run.
gsc = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid={
        'max_depth': range(3, 7),
        'n_estimators': (10, 50, 100, 1000),
    },
    cv=5, scoring='accuracy', verbose=0, n_jobs=-1)

# fit() returns the fitted search object; keep the winning parameter
# combination for the cell that trains the final model.
grid_result = gsc.fit(X_train, y_train)
best_params = grid_result.best_params_



In [13]:
# Refit a forest using the hyperparameters selected by the grid search, with
# a fixed seed so the reported scores are reproducible.
rfr = RandomForestClassifier(max_depth=best_params['max_depth'],
                            n_estimators=best_params['n_estimators'],
                            random_state=42, verbose=0)

rfr.fit(X_train, y_train)

# Only the last expression of a cell is displayed, so the train accuracy is
# printed explicitly (it was previously computed and silently discarded).
# The test accuracy stays as the final expression and remains the cell output.
print(f'Train accuracy: {rfr.score(X_train, y_train):.3f}')
rfr.score(X_test, y_test)

0.868421052631579