In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import numpy as np

df = pd.read_csv('heart_2022_Key_indicators.csv')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

In [3]:
df.describe()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,319795.0,319795.0,319795.0,319795.0
mean,28.325399,3.37171,3.898366,7.097075
std,6.3561,7.95085,7.955235,1.436007
min,12.02,0.0,0.0,1.0
25%,24.03,0.0,0.0,6.0
50%,27.34,0.0,0.0,7.0
75%,31.42,2.0,3.0,8.0
max,94.85,30.0,30.0,24.0


In [4]:
df["HeartDisease"] = (df["HeartDisease"] == 'Yes').astype(int)

In [5]:
df["Smoking"] = (df["Smoking"] == 'Yes').astype(int)

In [6]:
df["AlcoholDrinking"] = (df["AlcoholDrinking"] == 'Yes').astype(int)

In [7]:
df["Stroke"] = (df["Stroke"] == 'Yes').astype(int)

In [8]:
df["DiffWalking"] = (df["DiffWalking"] == 'Yes').astype(int)

In [9]:
df["Sex"] = (df["Sex"] == 'Male').astype(int)

In [10]:
df["PhysicalActivity"] = (df["PhysicalActivity"] == 'Yes').astype(int)

In [11]:
df["Asthma"] = (df["Asthma"] == 'Yes').astype(int)

In [12]:
df["KidneyDisease"] = (df["KidneyDisease"] == 'Yes').astype(int)

In [13]:
df["SkinCancer"] = (df["SkinCancer"] == 'Yes').astype(int)

In [14]:
df.groupby('AgeCategory')['HeartDisease'].mean()

AgeCategory
18-24          0.006172
25-29          0.007844
30-34          0.012051
35-39          0.014404
40-44          0.023136
45-49          0.034143
50-54          0.054487
55-59          0.073999
60-64          0.098765
65-69          0.120084
70-74          0.156028
75-79          0.188483
80 or older    0.225603
Name: HeartDisease, dtype: float64

In [15]:
df['AgeCategory'].replace([
        '18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59',
        '60-64', '65-69', '70-74', '75-79', '80 or older'
    ],
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
    inplace=True
)

In [16]:
df.groupby('GenHealth')['HeartDisease'].mean()

GenHealth
Excellent    0.022441
Fair         0.204285
Good         0.102632
Poor         0.341040
Very good    0.047261
Name: HeartDisease, dtype: float64

In [17]:
df['GenHealth'].replace([
        'Excellent', 'Very good', 'Good', 'Fair', 'Poor'
    ],
    [1, 2, 3, 4, 5],
    inplace=True
)

In [18]:
df['HeartDisease'].value_counts()

0    292422
1     27373
Name: HeartDisease, dtype: int64

In [19]:
df['Diabetic'].value_counts()

No                         269653
Yes                         40802
No, borderline diabetes      6781
Yes (during pregnancy)       2559
Name: Diabetic, dtype: int64

In [20]:
df.drop(df[df['Diabetic'] == 'No, borderline diabetes'].index, inplace=True)

In [21]:
df.drop(df[df['Diabetic'] == 'Yes (during pregnancy)'].index, inplace=True)

In [22]:
df["Diabetic"] = (df["Diabetic"] == 'Yes').astype(int)

In [23]:
df.reset_index(drop=True, inplace=True)

In [24]:
df.drop('Race', axis=1, inplace=True)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310455 entries, 0 to 310454
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      310455 non-null  int64  
 1   BMI               310455 non-null  float64
 2   Smoking           310455 non-null  int64  
 3   AlcoholDrinking   310455 non-null  int64  
 4   Stroke            310455 non-null  int64  
 5   PhysicalHealth    310455 non-null  float64
 6   MentalHealth      310455 non-null  float64
 7   DiffWalking       310455 non-null  int64  
 8   Sex               310455 non-null  int64  
 9   AgeCategory       310455 non-null  int64  
 10  Diabetic          310455 non-null  int64  
 11  PhysicalActivity  310455 non-null  int64  
 12  GenHealth         310455 non-null  int64  
 13  SleepTime         310455 non-null  float64
 14  Asthma            310455 non-null  int64  
 15  KidneyDisease     310455 non-null  int64  
 16  SkinCancer        31

In [29]:
df_features = df.drop('HeartDisease', axis=1)
df_features.to_csv(path_or_buf='features.csv', index=False, header=False)
pd.DataFrame(df['HeartDisease']).to_csv(path_or_buf='labels.csv', index=False, header=False)

In [26]:
df_features = df.drop('HeartDisease', axis=1)
scaler = StandardScaler()
scaler.fit(df_features)
scaled_features = scaler.transform(df_features)
scaled_data = pd.DataFrame(scaled_features, columns = df_features.columns)

In [27]:
x_train, x_test, y_train, y_test = train_test_split(
    scaled_data,
    df['HeartDisease'],
    test_size=0.33,
    random_state=42
)

In [28]:
print(len(x_train), len(x_test))

208004 102451


In [29]:
model = KNeighborsClassifier(n_neighbors = 5)
model.fit(x_train, y_train)

In [30]:
predictions = model.predict(x_test)

In [31]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95     93669
           1       0.35      0.13      0.19      8782

    accuracy                           0.90    102451
   macro avg       0.64      0.55      0.57    102451
weighted avg       0.87      0.90      0.88    102451



In [32]:
print(confusion_matrix(y_test, predictions))

[[91541  2128]
 [ 7626  1156]]


In [33]:
parametrs = { 'n_neighbors': [1, 3, 5, 7, 9] }

grid = GridSearchCV(KNeighborsClassifier(), parametrs, cv=5)
grid.fit(x_train, y_train)

In [34]:
grid.best_params_

{'n_neighbors': 9}

In [35]:
model = KNeighborsClassifier(n_neighbors = 2)
model.fit(x_train, y_train)
predictions = model.predict(x_test)

In [36]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.92      0.99      0.95     93669
           1       0.33      0.07      0.12      8782

    accuracy                           0.91    102451
   macro avg       0.62      0.53      0.54    102451
weighted avg       0.87      0.91      0.88    102451



In [37]:
print(confusion_matrix(y_test, predictions))

[[92320  1349]
 [ 8132   650]]
