In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
import joblib

# Read the raw diabetes records from the CSV next to the notebook
data = pd.read_csv(filepath_or_buffer="diabetes.csv")

In [3]:
# Define BMI categories
def categorize_bmi(bmi):
    """Map a numeric BMI value to a weight-category label.

    The bands are contiguous and half-open, so every value lands in exactly
    one of them:

    * below 18.5      -> "Underweight"
    * 18.5 up to < 25 -> "Normal"
    * 25 up to < 30   -> "Overweight"
    * 30 and above    -> "Obese"

    Each test only needs an upper bound because the earlier branches have
    already returned for smaller values. A value that fails every comparison
    (30 or more, and also NaN, which compares False against any number)
    falls through to "Obese".
    """
    if bmi < 18.5:
        return "Underweight"
    if bmi < 25:
        return "Normal"
    if bmi < 30:
        return "Overweight"
    return "Obese"

# Label every row with its BMI band (one categorize_bmi call per value)
data['BMI_category'] = data['BMI'].map(categorize_bmi)

# Hold out 20% of the rows for validation; stratifying on Outcome keeps the
# class proportions the same in both parts, and the fixed seed makes the
# split repeatable.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    stratify=data['Outcome'],
    random_state=42,
)

# Identifying numerical and categorical columns
num_features = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]
cat_features = ['BMI_category']

# Preprocessing Pipelines
# Numeric columns are standardised; the BMI band is one-hot encoded.
num_pipeline = StandardScaler()
# handle_unknown='ignore': a band that was absent from the data the encoder
# was fitted on is encoded as all zeros at transform time instead of raising,
# so a rare category missing from the training split cannot break
# validation or later inference.
cat_pipeline = OneHotEncoder(handle_unknown='ignore')

# Only the columns named above are routed to a transformer.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

In [4]:
# Transforming data
# Targets are read straight from the split frames.
y_train, y_val = train_data['Outcome'], val_data['Outcome']

# The preprocessor is fitted on the training rows only; the validation rows
# are transformed with those already-fitted parameters.
X_train = preprocessor.fit_transform(train_data)
X_val = preprocessor.transform(val_data)

# Experiment with KNN classifier
# Fit one model per neighbourhood size, score it with F1 on the validation
# split, and keep the strongest. The running best starts at -inf rather than
# 0 so that a fitted model is always retained, even if every candidate
# scores an F1 of 0 (with a 0 start and a strict '>' that case would leave
# best_knn as None). Ties keep the earlier, smaller k.
best_knn, best_knn_f1 = None, float("-inf")
for k in [3, 5, 7]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    if f1 > best_knn_f1:
        best_knn_f1 = f1
        best_knn = knn

# Experiment with Decision Tree classifier
# Fit one tree per depth limit, score it with F1 on the validation split, and
# keep the strongest. The running best starts at -inf rather than 0 so that a
# fitted tree is always retained, even if every candidate scores an F1 of 0
# (with a 0 start and a strict '>' that case would leave best_dt as None).
# Ties keep the earlier, shallower tree.
best_dt, best_dt_f1 = None, float("-inf")
for depth in [3, 5, 7]:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    if f1 > best_dt_f1:
        best_dt_f1 = f1
        best_dt = dt

# Selecting best model
# KNN is chosen only when its validation F1 is strictly higher; on a tie the
# decision tree is kept.
if best_knn_f1 > best_dt_f1:
    best_model = best_knn
else:
    best_model = best_dt

# Save models and transformers
# Both artifacts are written to the current working directory.
joblib.dump(value=preprocessor, filename="preprocessor.pkl")
joblib.dump(value=best_model, filename="best_model.pkl")

# Inference function
def predict_sample(sample, preprocessor_path="preprocessor.pkl", model_path="best_model.pkl"):
    """Predict outcomes for ``sample`` using the artifacts persisted on disk.

    Parameters
    ----------
    sample
        Rows to score. They are passed unchanged to the loaded
        preprocessor's ``transform``.
        NOTE(review): presumably a DataFrame carrying the same feature
        columns the preprocessor was fitted on, including the derived
        'BMI_category' column - confirm with callers.
    preprocessor_path
        Location of the saved preprocessor. Defaults to the filename this
        notebook writes.
    model_path
        Location of the saved model. Defaults to the filename this notebook
        writes.

    Returns
    -------
    Whatever the loaded model's ``predict`` returns for the transformed rows.
    """
    # joblib.load unpickles arbitrary objects, so these paths must only ever
    # point at trusted files.
    fitted_preprocessor = joblib.load(preprocessor_path)
    fitted_model = joblib.load(model_path)
    return fitted_model.predict(fitted_preprocessor.transform(sample))
# Test inference on 5 samples
# Draw five validation rows with a fixed seed and strip the label column
# before scoring them.
samples = (
    val_data
    .sample(n=5, random_state=42)
    .drop(columns=['Outcome'])
)
predictions = predict_sample(samples)
print("Predictions:", predictions)


Predictions: [1 1 1 0 0]


In [6]:
# Check predictions against actual values
# Five validation rows are drawn with a fixed seed and the label column is
# removed before scoring.
samples = (
    val_data
    .sample(n=5, random_state=42)
    .drop(columns=['Outcome'])
)
predictions = predict_sample(samples)
# Look the true labels up by the sampled rows' index so they line up with
# the predictions.
actual_values = y_val.loc[samples.index].values

print("Predictions:", predictions)
print("Actual Values:", actual_values)

# Performance evaluation
# The score is computed on the validation split.
val_predictions = best_model.predict(X_val)
print("F1 Score:", f1_score(y_val, val_predictions))



Predictions: [1 1 1 0 0]
Actual Values: [1 1 1 0 0]
F1 Score: 0.7037037037037037
