<a href="https://colab.research.google.com/github/vyshnavi9241/Stroke-Prediction-Dataset-K-Nearest-Neighbors/blob/main/Stroke_Prediction_Dataset_K_Nearest_Neighbors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [2]:
df=pd.read_csv("healthcare-dataset-stroke-data.csv")

In [6]:
if 'id' in df.columns:
    df = df.drop(['id'], axis=1)  # removes the specified column of 'id'
else:
    print("Column 'id' not found in DataFrame.")

df['age'] = df['age'].astype('int')
df

Column 'id' not found in DataFrame.


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,Female,81,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [7]:
df.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,43.215264,0.097456,0.054012,106.147677,28.893237,0.048728
std,22.633866,0.296607,0.226063,45.28356,7.854067,0.21532
min,0.0,0.0,0.0,55.12,10.3,0.0
25%,25.0,0.0,0.0,77.245,23.5,0.0
50%,45.0,0.0,0.0,91.885,28.1,0.0
75%,61.0,0.0,0.0,114.09,33.1,0.0
max,82.0,1.0,1.0,271.74,97.6,1.0


In [8]:

df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [9]:

df = df[df['gender'] != 'Other']

In [12]:
df['smoking_status'] = df['smoking_status'].map({'Yes': 1, 'No': 0})
df['ever_married'] = df['ever_married'].map({'Yes': 1, 'No': 0})
df['work_type'] = df['work_type'].map({'Private': 0, 'Self-employed': 1, 'Govt_job': 2, 'children': 3, 'Never_worked': 4})
df['Residence_type'] = df['Residence_type'].map({'Urban': 0, 'Rural': 1})

color_palette = ['#FFFCFE', '#B6E2D3', '#D8F9B1']

corr = df.corr()  # Now calculate correlation on numerical data
fig = go.Figure(data= go.Heatmap(z=corr,
                                 x=corr.index.values,
                                 y=corr.columns.values,
                                 colorscale=color_palette,
                                 text = corr.round(2), hovertemplate="%{text}", xgap=3, ygap=3
                                 )
                )

fig.update_layout(title_text='<b>Correlation Matrix<b>',
                  title_x=0.5,
                  titlefont={'size': 24, 'family': 'Courier New'},
                  width=900, height=600,
                  xaxis_showgrid=False,
                  yaxis_showgrid=False,
                  yaxis_autorange='reversed',
                  paper_bgcolor=None,
                  template='plotly_dark'
                  )

fig.show()

In [14]:
gender_stroke_df = df.groupby(['gender', 'stroke']).size().reset_index().rename(columns={0: 'count'})

hypertension_stroke_df = df.groupby(['hypertension', 'stroke']).size().reset_index().rename(columns={0: 'count'})

married_stroke_df = df.groupby(['ever_married', 'stroke']).size().reset_index().rename(columns={0: 'count'})

work_type_stroke_df = df.groupby(['work_type', 'stroke']).size().reset_index().rename(columns={0: 'count'})

residence_stroke_df = df.groupby(['Residence_type', 'stroke']).size().reset_index().rename(columns={0: 'count'})

smoking_stroke_df = df.groupby(['smoking_status', 'stroke']).size().reset_index().rename(columns={0: 'count'})

## Creating Sunburst Figures
sb1 = px.sunburst(gender_stroke_df, values='count', path=['gender', 'stroke'])
sb2 = px.sunburst(hypertension_stroke_df, values='count', path=['hypertension', 'stroke'])

sb3 = px.sunburst(married_stroke_df, values='count', path=['ever_married', 'stroke'])
sb4 = px.sunburst(work_type_stroke_df, values='count', path=['work_type', 'stroke'])

sb5 = px.sunburst(residence_stroke_df, values='count', path=['Residence_type', 'stroke'])
sb6 = px.sunburst(smoking_stroke_df, values='count', path=['smoking_status', 'stroke'])

## Subplots
fig = make_subplots(rows=3, cols=2, specs=[
    [{"type": "sunburst"}, {"type": "sunburst"}],
    [{"type": "sunburst"}, {"type": "sunburst"}],
    [{"type": "sunburst"}, {"type": "sunburst"}]
], subplot_titles=("Gender and Stroke", "Hypertension and Stroke",
                   "Married and Stroke", "Work Type and Stroke",
                   "Residence and Stroke"))

## Plotting Figures
fig.add_trace(sb1.data[0], row=1, col=1)
fig.add_trace(sb2.data[0], row=1, col=2)
fig.add_trace(sb3.data[0], row=2, col=1)
fig.add_trace(sb4.data[0], row=2, col=2)
fig.add_trace(sb5.data[0], row=3, col=1)
fig.add_trace(sb6.data[0], row=3, col=2)

fig.update_traces(textinfo="label+percent parent")

# Update title and height
fig.update_layout(title_text="Stroke Sunbursts", title_x=0.5, height=1300, template='plotly_dark', showlegend=False,
        font=dict(
            family="Rubik",
            size=14)
)

fig.show()

In [16]:
df['gender']=df['gender'].apply(lambda x : 1 if x=='Male' else 0)
df["ever_married"] = df["ever_married"].apply(lambda x: 1 if x=="Yes" else 0)
df["Residence_type"] = df["Residence_type"].apply(lambda x: 1 if x=="Urban" else 0)

In [17]:
df = pd.get_dummies(data=df, columns=['smoking_status'])
df = pd.get_dummies(data=df, columns=['work_type'])

In [18]:
df = df.dropna()

In [19]:

X = df.drop(['stroke'], axis=1).values
y = df['stroke'].values

In [20]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [22]:
len(X_train)

3926

In [23]:
len(X_test)

982

In [24]:
len(y_train)

3926

In [25]:

len(y_test)

982

In [26]:
knn = KNeighborsClassifier(n_neighbors=2, metric='euclidean')
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test) # model predict
knn_conf = confusion_matrix(y_test, knn_pred)
knn_report = classification_report(y_test, knn_pred)
knn_acc = round(accuracy_score(y_test, knn_pred)*100, ndigits = 2)
print(f"Confusion Matrix : \n\n{knn_conf}")
print(f"\nClassification Report : \n\n{knn_report}")
print(f"\nThe Accuracy of K Nearest Neighbors Classifier is {knn_acc} %")

Confusion Matrix : 

[[924  11]
 [ 46   1]]

Classification Report : 

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       935
           1       0.08      0.02      0.03        47

    accuracy                           0.94       982
   macro avg       0.52      0.50      0.50       982
weighted avg       0.91      0.94      0.93       982


The Accuracy of K Nearest Neighbors Classifier is 94.2 %


In [28]:
knn_score = knn.score(X_train, y_train)
knn_test = knn.score(X_test, y_test)
knn_cm = confusion_matrix(y_test,knn_pred)
print(f'Training Score: {knn_score.round(3)}')
print(f'Testing Score: {knn_test.round(3)}')

conf_matrix = pd.DataFrame(data=knn_cm,columns=['Predicted:0','Predicted:1'],index=['Actual:0','Actual:1'])
print(f'Accuracy Score: {accuracy_score(y_test,knn_pred).round(3)}')

Training Score: 0.965
Testing Score: 0.942
Accuracy Score: 0.942


In [29]:
z=knn_cm
fig = ff.create_annotated_heatmap(z,
                                  name=True,
                                  y=['Actual No Stroke','Actual Stroke'],
                                  x=['Predicted No Stroke','Predicted Stroke'],
                                  colorscale='RdPu',
                                  xgap=3,ygap=3)
fig['data'][0]['showscale'] = True

fig.update_layout(title_text='<b>Confusion Matrix<b>',
                  title_x=0.5,
                  titlefont={'size': 24, 'family': 'Courier New'},
                  width=700, height=400,
                  xaxis_showgrid=False,
                  yaxis_showgrid=False,
                  yaxis_autorange='reversed',
                  paper_bgcolor=None,
                  template='plotly_dark'
                  )

fig.show()