In [1]:
import pandas as pd
from pandas import DataFrame as df
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
dataset = pd.read_csv('Heart_Disease.csv')

In [3]:
dataset.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [4]:
dataset.isnull().sum()

Age          0
Sex          0
ChestPain    0
RestBP       0
Chol         0
Fbs          0
RestECG      0
MaxHR        0
ExAng        0
Oldpeak      0
Slope        0
Ca           4
Thal         2
AHD          0
dtype: int64

In [5]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')

In [6]:
missing = dataset['Ca'].values.reshape(-1,1)
dataset['Ca'] = imputer.fit_transform(missing)

In [7]:
missing1 = dataset['Thal'].values.reshape(-1,1)
dataset['Thal'] = imputer.fit_transform(missing1)

In [8]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
cat_mask = dataset.dtypes==object
cat_cols=dataset.columns[cat_mask].tolist()
dataset[cat_cols]=dataset[cat_cols].apply(lambda col: encoder.fit_transform(col))

In [9]:
dataset.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,63,1,3,145,233,1,2,150,0,2.3,3,0.0,0,0
1,67,1,0,160,286,0,2,108,1,1.5,2,3.0,1,1
2,67,1,0,120,229,0,2,129,1,2.6,2,2.0,2,1
3,37,1,1,130,250,0,0,187,0,3.5,3,0.0,1,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,1,0


In [10]:
numeric_features = dataset.select_dtypes(include=[np.number])
numeric_features.dtypes
corr =numeric_features.corr()

print(corr['AHD'].sort_values(ascending=False))

AHD          1.000000
Ca           0.460033
ExAng        0.431894
Oldpeak      0.424510
Thal         0.359397
Slope        0.339213
Sex          0.276816
Age          0.223120
RestECG      0.169202
RestBP       0.150825
Chol         0.085164
Fbs          0.025264
ChestPain   -0.414446
MaxHR       -0.417167
Name: AHD, dtype: float64


In [11]:
column = dataset.columns
x = dataset[column]
x.drop('AHD', axis=1,inplace=True)
y = dataset['AHD']

In [12]:
from sklearn.decomposition import PCA

pca = PCA(n_components=8)
x_r = pca.fit_transform(x)
print('Total Explained Variance: ',pca.explained_variance_ratio_.sum())

Total Explained Variance:  0.9997239570932908


In [13]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,
                    test_size=0.2,random_state=42)

In [14]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))
x_train = df(scaler.fit_transform(x_train))
x_test = df(scaler.transform(x_test))

In [15]:
x_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.666667,0.339623,0.268041,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.5
1,0.583333,1.0,0.333333,0.528302,0.0,1.0,0.0,0.778626,0.0,0.032258,0.0,0.333333,1.0
2,0.833333,1.0,1.0,0.622642,0.371134,1.0,1.0,0.458015,0.0,0.016129,0.5,0.333333,0.5
3,0.75,0.0,0.333333,0.622642,0.804124,0.0,1.0,0.610687,0.0,0.129032,0.0,0.0,0.5
4,0.479167,1.0,0.0,0.132075,0.367698,1.0,0.0,0.580153,0.0,0.016129,0.0,1.0,1.0


In [16]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=42).fit(x_train, y_train)

In [17]:
y_pred_lr = lr.predict(x_test)
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
print('Confusion Matrix:'+'\n' ,confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Confusion Matrix:
 [[27  2]
 [ 3 29]]
              precision    recall  f1-score   support

           0       0.90      0.93      0.92        29
           1       0.94      0.91      0.92        32

    accuracy                           0.92        61
   macro avg       0.92      0.92      0.92        61
weighted avg       0.92      0.92      0.92        61



In [18]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

In [19]:
y_pred_dt = dt.predict(x_test)
print('Confusion Matrix:'+'\n',confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

Confusion Matrix:
 [[23  6]
 [ 7 25]]
              precision    recall  f1-score   support

           0       0.77      0.79      0.78        29
           1       0.81      0.78      0.79        32

    accuracy                           0.79        61
   macro avg       0.79      0.79      0.79        61
weighted avg       0.79      0.79      0.79        61



In [20]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42).fit(x_train, y_train)

In [21]:
y_pred_rf = rf.predict(x_test)
print('Confusion Matrix:'+'\n' ,confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Confusion Matrix:
 [[27  2]
 [ 6 26]]
              precision    recall  f1-score   support

           0       0.82      0.93      0.87        29
           1       0.93      0.81      0.87        32

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.88      0.87      0.87        61



In [22]:
lr.get_params

<bound method BaseEstimator.get_params of LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)>

In [23]:
k_range = range(1, 200)
scores = []

for k in k_range:
    knn = RandomForestClassifier(random_state=42, n_estimators=106, max_depth= k)
    knn.fit(x_train,y_train)
    y_pred = knn.predict(x_test)
    scores.append(accuracy_score(y_test, y_pred))

max = np.array(scores).max()
m=[]
m = df(scores)
m[1] = k_range
print('Best Parameter: ')
print(m[m[0] == max].min())

Best Parameter: 
0    0.918033
1    7.000000
dtype: float64


In [24]:
rf = RandomForestClassifier(random_state=42, n_estimators=106, max_depth=7).fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)
print('Confusion Matrix:'+'\n' ,confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Confusion Matrix:
 [[28  1]
 [ 4 28]]
              precision    recall  f1-score   support

           0       0.88      0.97      0.92        29
           1       0.97      0.88      0.92        32

    accuracy                           0.92        61
   macro avg       0.92      0.92      0.92        61
weighted avg       0.92      0.92      0.92        61



In [25]:
df({'Predicted':y_pred_rf,'Actual':y_test}).head(10)

Unnamed: 0,Predicted,Actual
179,0,0
228,1,1
111,1,1
246,1,1
60,1,1
9,1,1
119,1,1
223,1,1
268,0,1
33,0,0
