# Heart disease prediction

1. #3 (age): age in years 
2. #4 (sex): sex (1 = male; 0 = female) 
3. #9 (cp): chest pain type | Value 0: typical angina | Value 1: atypical angina | Value 2: non-anginal pain | Value 3: asymptomatic 
4. #10 (trestbps): resting blood pressure (in mm Hg on admission to the hospital) 
5. #12 (chol): serum cholesterol in mg/dl 
6. #16 (fbs): (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 
7. #19 (restecg): resting electrocardiographic results | Value 0: normal | Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) | Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria 
8. #32 (thalach): maximum heart rate achieved 
9. #38 (exang): exercise induced angina (1 = yes; 0 = no) 
10. #40 (oldpeak): ST depression induced by exercise relative to rest 
11. #41 (slope): the slope of the peak exercise ST segment | Value 1: upsloping | Value 2: flat | Value 3: downsloping 
12. #44 (ca): number of major vessels (0-3) colored by fluoroscopy 
13. #51 (thal): 3 = normal; 6 = fixed defect; 7 = reversible defect 
14. #58 (num) (the predicted attribute): Value 0: < 50% diameter narrowing | Value 1: > 50% diameter narrowing 

# Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pp
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, log_loss, recall_score
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.cluster import KElbowVisualizer

In [None]:
%matplotlib inline

# Read in the Dataset

In [None]:
# Load the dataset from heart.csv (path is relative to the notebook's working directory).
df = pd.read_csv('heart.csv')

In [None]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

In [None]:
# (rows, columns) of the table before any cleaning.
df.shape

# EDA

In [None]:
# Automated profile of the whole frame.
# NOTE(review): `profile_report` is presumably the DataFrame method registered by the
# `pandas_profiling` import at the top of the notebook — confirm.
df.profile_report()

In [None]:
# Class balance of the label as fractions (normalize=True) rather than raw counts.
df.target.value_counts(normalize=True)

In [None]:
# Column dtypes and non-null counts.
df.info()

# Data Pre-processing

In [None]:
df.duplicated().sum()

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.duplicated().sum()

In [None]:
# Print the value distribution of every integer-typed column, one column at a time.
int_columns = df.select_dtypes('int').columns
for name in int_columns:
    counts = df[name].value_counts()
    print(counts, '\n\n')

In [None]:
df['thal'].value_counts()

In [None]:
# Drop 0's as they mean null
df.drop(df[df['thal']==0].index, inplace=True)

In [None]:
df['thal'].value_counts()

In [None]:
# Pairwise correlation of all columns, with the coefficient annotated in each cell.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, ax=ax)

In [None]:
X = df.drop('target', axis=1)

In [None]:
y = df.target

In [None]:
X

In [None]:
y

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Min-max scaler with default settings.
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the training split only, then scale that split.
X_train_scaled = scaler.fit_transform(X_train)

In [None]:
# Apply the already-fitted scaler to the test split (transform only, no refit).
X_test_scaled = scaler.transform(X_test)

# Implement kNN

In [None]:
knn = KNeighborsClassifier(n_neighbors=1)

In [None]:
knn.fit(X_train_scaled, y_train)

In [None]:
knn.fit(X_train_scaled, y_train)

In [None]:
knn_pred = knn.predict(X_test_scaled)

In [None]:
print(classification_report(y_test, knn_pred))

In [None]:
# Confusion matrix of the current `knn_pred` on the test split.
# fmt='d' prints the cells as integer counts; the default annotation format would
# switch to scientific notation once a count reaches three digits.
plt.figure(figsize=(10, 6))
ax = sns.heatmap(confusion_matrix(y_test, knn_pred), annot=True, fmt='d')
# sklearn's confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
# Confusion matrix for `knn` drawn with yellowbrick's ConfusionMatrix visualizer.
plt.figure(figsize=(10, 6))
cm = ConfusionMatrix(knn)
cm.fit(X_train_scaled, y_train)
# Scored on the test split; presumably this is what fills the matrix that show() renders — confirm.
cm.score(X_test_scaled, y_test)
cm.show()

In [None]:
# Sweep k = 1..39 and record 1 - recall (the miss rate on the positive class) for each k.
# The candidate model gets its own name so the sweep does not leave the global `knn`
# bound to the k=39 model, which would silently change any cell re-run afterwards.
# NOTE(review): each k is evaluated on the test split, so choosing k from this curve
# tunes on test data; cross-validating on the training split would avoid that.
error_rate = []
for k in range(1, 40):
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(X_train_scaled, y_train)
    pred_k = knn_k.predict(X_test_scaled)
    error_rate.append(1 - recall_score(y_test, pred_k))

In [None]:
# Plot the sweep result. `error_rate` holds 1 - recall, so the labels name that
# quantity instead of the generic "error rate" (which normally means 1 - accuracy).
plt.figure(figsize=(10,6))
# Derive the x values from the data so the plot cannot fall out of step with the sweep range.
k_values = range(1, len(error_rate) + 1)
plt.plot(k_values, error_rate, color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Miss Rate (1 - Recall) vs. K Value')
plt.xlabel('K')
plt.ylabel('Miss Rate (1 - Recall)');

In [None]:
# Rebuild kNN with k=13.
# NOTE(review): the value is hard-coded, presumably read off the K sweep plot — confirm.
knn = KNeighborsClassifier(n_neighbors=13)

In [None]:
knn.fit(X_train_scaled, y_train)

In [None]:
# Overwrites the earlier `knn_pred` with the k=13 predictions on the test split.
knn_pred = knn.predict(X_test_scaled)

In [None]:
print(classification_report(y_test, knn_pred))

In [None]:
# Recall with class 1 treated as the positive class.
recall_score(y_test, knn_pred, pos_label=1)

In [None]:
plt.figure(figsize=(10, 6))

# yellowbrick confusion-matrix visualizer around the k=13 model, scored on the test split.
cm = ConfusionMatrix(knn)
cm.fit(X_train_scaled, y_train)
cm.score(X_test_scaled, y_test)
cm.show()

In [None]:
# 10-fold cross-validated recall on the training split, using a fresh k=13 classifier.
# `scoring` is passed as a list here, which is why the result column read below is 'test_recall'.
scores = cross_validate(KNeighborsClassifier(n_neighbors=13), X_train_scaled, y_train, cv=10, n_jobs=-1, scoring=['recall'])

In [None]:
# Per-fold results as a table.
pd.DataFrame(scores)

In [None]:
# Mean recall across the folds.
pd.DataFrame(scores)['test_recall'].mean()

In [None]:
# Hyper-parameter grid for the kNN search.
# Only settings that change the fitted model are searched:
#   * `algorithm` and `leaf_size` choose and tune the neighbour-search data structure,
#     which affects speed and memory rather than which neighbours are found (apart from
#     possible tie-breaking between equidistant points), so they are left at defaults.
#   * metric='manhattan' and metric='euclidean' are the default Minkowski metric with
#     p=1 and p=2, so varying `p` alone already covers all three metric names.
# This keeps the 36 distinct models and drops the repeated fits of the same model.
grid_params = {'weights': ['uniform', 'distance'],
               'p': [1, 2],
               'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10]}

In [None]:
# Grid search over `grid_params`, selecting on recall.
# Parallelism is requested once, on the search itself; asking for it on the inner
# estimator as well would nest parallel workers inside parallel workers.
cv = GridSearchCV(KNeighborsClassifier(), param_grid=grid_params, n_jobs=-1, verbose=1, scoring='recall')

In [None]:
# Run the grid search on the scaled training split.
cv.fit(X_train_scaled, y_train)

In [None]:
# Best mean cross-validated score (recall, per the `scoring` argument) found by the search.
cv.best_score_

In [None]:
# Parameter combination that achieved it.
cv.best_params_

In [None]:
# Predict on the test split through the search object; this overwrites the k=13 `knn_pred`.
knn_pred = cv.predict(X_test_scaled)

In [None]:
print(classification_report(y_test, knn_pred))

In [None]:
plt.figure(figsize=(10, 6))
cm = ConfusionMatrix(knn)
cm.fit(X_train_scaled, y_train)
cm.score(X_test_scaled, y_test)
cm.show()

In [None]:
knn_recall = recall_score(y_test, knn_pred)

In [None]:
knn_recall

# Logistic Regression

In [None]:
# Logistic regression with default hyper-parameters.
# NOTE(review): n_jobs=-1 may have no effect for a two-class problem — confirm against
# the installed scikit-learn version.
log_model = LogisticRegression(n_jobs=-1)

In [None]:
log_model.fit(X_train_scaled, y_train)

In [None]:
# Predictions on the scaled test split.
log_pred = log_model.predict(X_test_scaled)

In [None]:
print(classification_report(y_test, log_pred))

In [None]:
# yellowbrick confusion matrix for the logistic model, scored on the test split.
cm = ConfusionMatrix(log_model)
cm.fit(X_train_scaled, y_train)
cm.score(X_test_scaled, y_test)
cm.show()

In [None]:
# 10-fold cross-validated recall on the training split.
# `scoring` is a single string here, which is why the result column read below is 'test_score'.
scores = cross_validate(log_model, X_train_scaled, y_train, scoring='recall', cv=10, n_jobs=-1)

In [None]:
# Per-fold results as a table.
pd.DataFrame(scores)

In [None]:
# Mean recall across the folds.
pd.DataFrame(scores)['test_score'].mean()

In [None]:
# Test-split recall of the logistic model, kept for the final model comparison.
log_recall = recall_score(y_test, log_pred)

In [None]:
log_recall

# SVClassifier

In [None]:
# Support-vector classifier with default hyper-parameters.
sv_model = SVC()

In [None]:
sv_model.fit(X_train_scaled, y_train)

In [None]:
# Predictions of the default SVC on the scaled test split.
sv_pred = sv_model.predict(X_test_scaled)

In [None]:
print(classification_report(y_test, sv_pred))

In [None]:
# yellowbrick confusion matrix for the default SVC, scored on the test split.
plt.figure(figsize=(10, 6))
cm = ConfusionMatrix(sv_model)
cm.fit(X_train_scaled, y_train)
cm.score(X_test_scaled, y_test)
cm.show()

In [None]:
# 10-fold cross-validated recall on the training split ('test_score' column, since `scoring` is a string).
scores = cross_validate(sv_model, X_train_scaled, y_train, cv=10, n_jobs=-1, scoring='recall')

In [None]:
# Per-fold results as a table.
pd.DataFrame(scores)

In [None]:
# Mean recall across the folds.
pd.DataFrame(scores)['test_score'].mean()

In [None]:
# Search grid for the SVC.
# NOTE(review): `degree` is only used by the 'poly' kernel and `gamma` is not used by
# 'linear', so many of these combinations presumably fit the same model more than once;
# per-kernel grids would shrink the search — confirm before changing, since it can alter
# which of several equally-scoring candidates is reported as best.
# NOTE(review): 'random_state' is also fixed on the SVC passed to GridSearchCV below,
# so listing it here looks redundant.
grid_param = {'C':[0.1, 0.3, 0.5, 0.7, 1, 1.5, 2, 2.5, 3],
             'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
             'degree': [1, 2, 3, 4],
             'gamma':[0.1, 0.3, 0.5, 0.7, 1, 1.5, 2, 2.5, 3, 'scale', 'auto'],
             'random_state':[42]}

In [None]:
# Grid search selecting on recall; this rebinds `cv`, replacing the kNN search object.
cv = GridSearchCV(SVC(random_state=42), param_grid=grid_param, n_jobs=-1, scoring='recall')

In [None]:
cv.fit(X_train_scaled, y_train)

In [None]:
# Best mean cross-validated recall found by the search.
cv.best_score_

In [None]:
# Parameter combination that achieved it.
cv.best_params_

In [None]:
# The refitted winning estimator.
cv.best_estimator_

In [None]:
# Predict on the test split through the search object; this overwrites the default-SVC `sv_pred`.
sv_pred = cv.predict(X_test_scaled)

In [None]:
print(classification_report(y_test, sv_pred))

In [None]:
# Build the tuned SVC from the grid-search result instead of hard-coding the winning
# values, so this model cannot drift away from `cv.best_params_` when the data or the
# grid changes. random_state stays pinned to 42 whether or not the grid lists it.
svc_tuned = SVC(random_state=42).set_params(**cv.best_params_)

In [None]:
# 10-fold cross-validated recall of `svc_tuned` on the training split.
scores = cross_validate(svc_tuned, X_train_scaled, y_train, cv=10, n_jobs=-1, scoring='recall')

In [None]:
# Per-fold results as a table.
pd.DataFrame(scores)

In [None]:
# yellowbrick confusion matrix for `svc_tuned`, fitted on the training split and scored on the test split.
plt.figure(figsize=(10, 6))
cm = ConfusionMatrix(svc_tuned)
cm.fit(X_train_scaled, y_train)
cm.score(X_test_scaled, y_test)
cm.show()

In [None]:
# Test-split recall kept for the final comparison; `sv_pred` here is the output of
# `cv.predict` from the grid search, not of `svc_tuned`.
sv_recall = recall_score(y_test, sv_pred)

In [None]:
sv_recall

# Visually Compare Model Scores

In [None]:
# One-row table holding the test-split recall of each model (one column per model).
# No figure is opened here: this cell only builds the table, and a stray plt.figure()
# would render an empty plot.
rec_df = pd.DataFrame({'knn_recall_score': knn_recall,
              'support_vector_recall_score': sv_recall,
              'log_reg_recall_score': log_recall}, index=[0])

In [None]:
rec_df

In [None]:
plt.figure(figsize=(15, 10))
sns.barplot(x=rec_df.columns, y=rec_df.iloc[0].values)