In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix, precision_score, recall_score


In [2]:
# Load the diabetes dataset from the CSV file (path is relative to the notebook's
# working directory) and display the resulting frame as the cell output.
df = pd.read_csv(filepath_or_buffer="diabetes_dataset.csv")
df

Unnamed: 0,age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diabetes_stage,diagnosed_diabetes
0,58,Male,Asian,Highschool,Lower-Middle,Employed,Never,0,215,5.7,...,41,160,145,136,236,6.36,8.18,29.6,Type 2,1
1,48,Female,White,Highschool,Middle,Employed,Former,1,143,6.7,...,55,50,30,93,150,2.00,5.63,23.0,No Diabetes,0
2,60,Male,Hispanic,Highschool,Middle,Unemployed,Never,1,57,6.4,...,66,99,36,118,195,5.07,7.51,44.7,Type 2,1
3,74,Female,Black,Highschool,Low,Retired,Never,0,49,3.4,...,50,79,140,139,253,5.28,9.03,38.2,Type 2,1
4,46,Male,White,Graduate,Middle,Retired,Never,1,109,7.2,...,52,125,160,137,184,12.74,7.20,23.5,Type 2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,46,Male,Other,Graduate,Upper-Middle,Unemployed,Former,1,136,8.3,...,45,150,116,113,109,14.58,5.55,26.0,Pre-Diabetes,0
99996,41,Female,White,Graduate,Middle,Employed,Never,3,76,8.8,...,55,123,146,96,146,9.02,5.97,24.4,Pre-Diabetes,0
99997,57,Female,Black,No formal,Upper-Middle,Employed,Former,4,121,9.9,...,50,111,184,93,132,2.57,5.21,27.6,No Diabetes,0
99998,47,Female,Black,Highschool,Lower-Middle,Retired,Never,3,52,5.9,...,68,91,116,106,117,9.81,5.53,26.4,Pre-Diabetes,0


In [3]:
# List every column of the freshly loaded frame before any are dropped.
df.columns

Index(['age', 'gender', 'ethnicity', 'education_level', 'income_level',
       'employment_status', 'smoking_status', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'bmi', 'waist_to_hip_ratio', 'systolic_bp',
       'diastolic_bp', 'heart_rate', 'cholesterol_total', 'hdl_cholesterol',
       'ldl_cholesterol', 'triglycerides', 'glucose_fasting',
       'glucose_postprandial', 'insulin_level', 'hba1c', 'diabetes_risk_score',
       'diabetes_stage', 'diagnosed_diabetes'],
      dtype='object')

In [4]:
# Columns removed from the working frame before modelling.
cols_to_drop = [
    'education_level',
    'employment_status',
    'income_level',
    'waist_to_hip_ratio',
    'ldl_cholesterol',
    'hdl_cholesterol',
    'hba1c',
    'diabetes_risk_score',
    'smoking_status',
    'ethnicity',
    'cardiovascular_history',
]
# `df` is overwritten by this statement, so running the cell a second time used to
# raise KeyError (the columns were already gone). errors='ignore' makes the cell
# idempotent. NOTE: it also silently skips misspelled names, so confirm the result
# by inspecting df.columns afterwards.
df = df.drop(columns=cols_to_drop, errors='ignore')

In [5]:
# Check which columns remain in `df` after the drop above.
df.columns

Index(['age', 'gender', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day',
       'family_history_diabetes', 'hypertension_history', 'bmi', 'systolic_bp',
       'diastolic_bp', 'heart_rate', 'cholesterol_total', 'triglycerides',
       'glucose_fasting', 'glucose_postprandial', 'insulin_level',
       'diabetes_stage', 'diagnosed_diabetes'],
      dtype='object')

In [6]:
# Split the frame into the feature matrix X (every column except the target)
# and the target vector y, cast to boolean.
# NOTE(review): X still contains 'diabetes_stage', which looks like it is derived
# from the diagnosis - confirm it is never passed to a model as a feature.
target_col = "diagnosed_diabetes"
y = df[target_col].astype(bool)
X = df.drop(columns=[target_col])

In [7]:
def knn_model(features: list, model_num: int, *, n_neighbors: int = 5,
              test_size: float = 0.2, random_state: int = 42) -> None:
  '''Given a list of features, create a KNN model and evaluate it.

  Uses the notebook-level ``X`` (feature frame) and ``y`` (target). The selected
  columns are split into train/test sets, standardised, and fed to a
  k-nearest-neighbours classifier. Precision, recall and the confusion matrix
  on the held-out split are printed; nothing is returned.

  Parameters
  ----------
  features : list
      Column names of ``X`` to train on. All of them are passed through
      ``StandardScaler``, so they must be numerical.
  model_num : int
      Identifier used only to label the printed metrics.
  n_neighbors : int, default 5
      Number of neighbours for ``KNeighborsClassifier``.
  test_size : float, default 0.2
      Fraction of rows held out for evaluation.
  random_state : int, default 42
      Seed for the train/test split, so results are reproducible.

  Raises
  ------
  ValueError
      If ``features`` is empty.
  KeyError
      If a requested column is not present in ``X`` (raised by pandas).
  '''
  # Fail early with a clear message instead of an opaque estimator error.
  if len(features) == 0:
    raise ValueError("features must contain at least one column name")

  X_subset = X[features]

  # Split the set into test and train.
  X_train, X_test, y_train, y_test = train_test_split(
      X_subset, y, test_size=test_size, random_state=random_state)

  # Construct the pipeline: scale every selected column, then classify.
  preproc = make_column_transformer(
      (StandardScaler(), features),  # all candidate features are numerical
      remainder='drop'
  )
  pipeline = make_pipeline(
      preproc,
      KNeighborsClassifier(n_neighbors=n_neighbors)
  )

  pipeline.fit(X_train, y_train)
  predictions = pipeline.predict(X_test)

  # Pass the trained class labels explicitly so the matrix always has one row and
  # column per class, matching the index/columns below even if a class happens to
  # be missing from the test split. Rows are true labels, columns are predictions.
  labels = pipeline.classes_
  conf_mat = pd.DataFrame(
    confusion_matrix(y_test, predictions, labels=labels),
    columns = labels,
    index = labels)
  precision = precision_score(y_test, predictions, average='binary')
  recall = recall_score(y_test, predictions, average='binary')
  print(f"Precision for model {model_num}: {precision:.4f}\nRecall for model {model_num}: {recall:.4f}\n")
  print(f"Confusion Matrix:\n{conf_mat}")

In [8]:
# Model 1: the widest feature set tried - demographics, lifestyle, vitals,
# both glucose measurements and insulin level.
candidate_features_1 = [
    'age',
    'alcohol_consumption_per_week',
    'physical_activity_minutes_per_week',
    'bmi',
    'systolic_bp',
    'glucose_fasting',
    'glucose_postprandial',
    'insulin_level',
]
knn_model(features=candidate_features_1, model_num=1)

Precision for model 1: 0.8552
Recall for model 1: 0.8229

Confusion Matrix:
       False  True 
False   6416   1661
True    2112   9811


In [9]:
# Model 2: model 1's features without 'insulin_level'.
candidate_features_2 = [
    'age',
    'alcohol_consumption_per_week',
    'physical_activity_minutes_per_week',
    'bmi',
    'systolic_bp',
    'glucose_fasting',
    'glucose_postprandial',
]
knn_model(features=candidate_features_2, model_num=2)

Precision for model 2: 0.8656
Recall for model 2: 0.8224

Confusion Matrix:
       False  True 
False   6555   1522
True    2118   9805


In [10]:
# Model 3: model 2's features without 'age'.
candidate_features_3 = [
    'alcohol_consumption_per_week',
    'physical_activity_minutes_per_week',
    'bmi',
    'systolic_bp',
    'glucose_fasting',
    'glucose_postprandial',
]
knn_model(features=candidate_features_3, model_num=3)

Precision for model 3: 0.8631
Recall for model 3: 0.8209

Confusion Matrix:
       False  True 
False   6525   1552
True    2136   9787


In [11]:
# Model 4: model 3's features without 'alcohol_consumption_per_week'.
candidate_features_4 = [
    'physical_activity_minutes_per_week',
    'bmi',
    'systolic_bp',
    'glucose_fasting',
    'glucose_postprandial',
]
knn_model(features=candidate_features_4, model_num=4)

Precision for model 4: 0.8700
Recall for model 4: 0.8245

Confusion Matrix:
       False  True 
False   6608   1469
True    2093   9830


In [12]:
# Model 5: model 3's features with 'insulin_level' added back
# (equivalently, model 1 without 'age').
candidate_features_5 = [
    'alcohol_consumption_per_week',
    'physical_activity_minutes_per_week',
    'bmi',
    'systolic_bp',
    'glucose_fasting',
    'glucose_postprandial',
    'insulin_level',
]
knn_model(features=candidate_features_5, model_num=5)

Precision for model 5: 0.8580
Recall for model 5: 0.8185

Confusion Matrix:
       False  True 
False   6462   1615
True    2164   9759


In [13]:
# Model 6: model 5's features without 'bmi'.
candidate_features_6 = [
    'alcohol_consumption_per_week',
    'physical_activity_minutes_per_week',
    'systolic_bp',
    'glucose_fasting',
    'glucose_postprandial',
    'insulin_level',
]
knn_model(features=candidate_features_6, model_num=6)

Precision for model 6: 0.8649
Recall for model 6: 0.8204

Confusion Matrix:
       False  True 
False   6549   1528
True    2141   9782


In [14]:
# Model 7: model 4's features without 'bmi'.
candidate_features_7 = [
    'physical_activity_minutes_per_week',
    'systolic_bp',
    'glucose_fasting',
    'glucose_postprandial',
]
knn_model(features=candidate_features_7, model_num=7)

Precision for model 7: 0.8744
Recall for model 7: 0.8246

Confusion Matrix:
       False  True 
False   6665   1412
True    2091   9832


In [15]:
# Model 8: model 7's features without 'physical_activity_minutes_per_week'.
candidate_features_8 = [
    'systolic_bp',
    'glucose_fasting',
    'glucose_postprandial',
]
knn_model(features=candidate_features_8, model_num=8)

Precision for model 8: 0.8737
Recall for model 8: 0.8227

Confusion Matrix:
       False  True 
False   6659   1418
True    2114   9809


In [16]:
# Model 9: model 8's features with 'bmi' added.
candidate_features_9 = [
    'bmi',
    'systolic_bp',
    'glucose_fasting',
    'glucose_postprandial',
]
knn_model(features=candidate_features_9, model_num=9)

Precision for model 9: 0.8711
Recall for model 9: 0.8219

Confusion Matrix:
       False  True 
False   6627   1450
True    2124   9799


In [17]:
# Model 10: model 7's features with 'diet_score' added.
candidate_features_10 = [
    'diet_score',
    'physical_activity_minutes_per_week',
    'systolic_bp',
    'glucose_fasting',
    'glucose_postprandial',
]
knn_model(features=candidate_features_10, model_num=10)

Precision for model 10: 0.8670
Recall for model 10: 0.8208

Confusion Matrix:
       False  True 
False   6576   1501
True    2137   9786


In [20]:
# Model 11: model 4's features without 'systolic_bp'.
candidate_features_11 = [
    'physical_activity_minutes_per_week',
    'bmi',
    'glucose_fasting',
    'glucose_postprandial',
]
knn_model(features=candidate_features_11, model_num=11)

Precision for model 11: 0.8712
Recall for model 11: 0.8234

Confusion Matrix:
       False  True 
False   6625   1452
True    2106   9817
