In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
import joblib

In [2]:
df = pd.read_csv("diabetes.csv")


In [3]:
def category(bmi):
    if bmi < 18.5:
        return "Underweight"
    elif 18.5 <= bmi < 24.9:
        return "Normal weight"
    elif 25 <= bmi < 29.9:
        return "Overweight"
    else:
        return "Obesity"

df['BMI_category'] = df['BMI'].apply(category)

In [4]:
print(df.head())


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome   BMI_category  
0                     0.627   50        1        Obesity  
1                     0.351   31        0     Overweight  
2                     0.672   32        1  Normal weight  
3                     0.167   21        0     Overweight  
4                     2.288   33        1        Obesity  


In [5]:
train_data, val_data = train_test_split(df, test_size=0.2, random_state=123)


In [6]:
numeric_features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 
                    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
scaler = StandardScaler()
train_data.loc[:, numeric_features] = scaler.fit_transform(train_data[numeric_features])
val_data.loc[:, numeric_features] = scaler.transform(val_data[numeric_features])

In [7]:
categorical_features = ['BMI_category']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
train_categorical = encoder.fit_transform(train_data[categorical_features])
val_categorical = encoder.transform(val_data[categorical_features])

In [8]:
train_categorical_df = pd.DataFrame(train_categorical, index=train_data.index, columns=encoder.get_feature_names_out())
val_categorical_df = pd.DataFrame(val_categorical, index=val_data.index, columns=encoder.get_feature_names_out())

In [9]:
train_data = train_data.drop(columns=categorical_features)
val_data = val_data.drop(columns=categorical_features)

In [10]:
train_data = pd.concat([train_data, train_categorical_df], axis=1)
val_data = pd.concat([val_data, val_categorical_df], axis=1)


In [11]:
X_train = train_data.drop(columns=['Outcome'])
y_train = df.loc[train_data.index, 'BMI_category']

In [12]:
X_val = val_data.drop(columns=['Outcome'])
y_val = df.loc[val_data.index, 'BMI_category']

In [13]:
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")


X_train shape: (614, 12)
X_val shape: (154, 12)
y_train shape: (614,)
y_val shape: (154,)


In [14]:
best_knn_f1 = 0
best_knn = None

for k in [3, 5, 7]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    predictions = knn.predict(X_val)
    f1 = f1_score(y_val, predictions, average='weighted')
    print(f"k={k}, F1 Score={f1}")
    
    if f1 > best_knn_f1:
        best_knn_f1 = f1
        best_knn = knn

k=3, F1 Score=0.9329131803637732
k=5, F1 Score=0.932342789213097
k=7, F1 Score=0.9267833142382963


In [15]:
best_tree_f1 = 0
best_tree = None

for depth in [3, 5, 7]:
    tree = DecisionTreeClassifier(max_depth=depth, random_state=42)
    tree.fit(X_train, y_train)
    predictions = tree.predict(X_val)
    f1 = f1_score(y_val, predictions, average='weighted')
    print(f"max_depth={depth}, F1 Score={f1}")
    
    if f1 > best_tree_f1:
        best_tree_f1 = f1
        best_tree = tree


max_depth=3, F1 Score=1.0
max_depth=5, F1 Score=1.0
max_depth=7, F1 Score=1.0


In [16]:
if best_knn_f1 > best_tree_f1:
    best_model = best_knn
    print(f"Selected KNN as the best model with F1 Score: {best_knn_f1}")
else:
    best_model = best_tree
    print(f"Selected Decision Tree as the best model with F1 Score: {best_tree_f1}")


Selected Decision Tree as the best model with F1 Score: 1.0


In [17]:
joblib.dump(best_model, 'best_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(encoder, 'encoder.pkl')

['encoder.pkl']

In [18]:
best_model = joblib.load('best_model.pkl')
scaler = joblib.load('scaler.pkl')
encoder = joblib.load('encoder.pkl')

In [30]:
def predict(test_sample):
    if isinstance(test_sample, pd.Series):  
        test_sample = test_sample.to_frame().T

    missing_categorical = [col for col in categorical_features if col not in test_sample.columns]
    
    if missing_categorical:
        print(f"Warning: Missing categorical columns: {missing_categorical}")
        return None  

    numeric_scaled = scaler.transform(test_sample[numeric_features])
    categorical_encoded = encoder.transform(test_sample[categorical_features])  

    categorical_encoded_df = pd.DataFrame(categorical_encoded, columns=encoder.get_feature_names_out())
    
    X_test = np.hstack((numeric_scaled, categorical_encoded_df))
    return best_model.predict(X_test)


In [31]:
print(df.columns)


Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'BMI_category'],
      dtype='object')


In [32]:
for i in range(5):
    sample = df.iloc[[X_val.index[i]]]  
    predicted_class = predict(sample)
    print(f"Sample {i+1}: Predicted Class = {predicted_class}")

Sample 1: Predicted Class = ['Obesity']
Sample 2: Predicted Class = ['Overweight']
Sample 3: Predicted Class = ['Obesity']
Sample 4: Predicted Class = ['Overweight']
Sample 5: Predicted Class = ['Obesity']


