In [56]:
import warnings
warnings.filterwarnings("ignore")

In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [58]:
df = pd.read_csv("../data/16-diabetes.csv")

In [59]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [60]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


#### Train-Test Split

In [61]:
from sklearn.model_selection import train_test_split

X = df.drop("Outcome", axis=1)
y = df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=15)

#### Data Organise

In [62]:
columns_to_check = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Bir önceki çalışmada 0 değerlerinin yerine sütunun medyan değerini koymuştuk.
# Şimdi 0 olan değerleri drop edip modelini başarısını ölçüceğim.

for col in columns_to_check:
    X_train = X_train[X_train[col] != 0]
    X_test  = X_test[X_test[col] != 0]

y_train = y_train.loc[X_train.index]
y_test  = y_test.loc[X_test.index]


#### Scaling

In [63]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

In [64]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

knn_params = {
    "n_neighbors": [3,5,7,9,15],
    "weights": ["uniform", "distance"]
}

logistic_params = {
    "penalty" : ["l1", "l2", "elasticnet", None],
    "C" : [100, 10, 1, 0.1, 0.01],
    "solver" : ["newton-cg", "lbfgs", "liblinear", " sag", "newton-cholesky"]
}

svc_params = {
    "C": [0.1, 1, 10, 100, 1000],
    "kernel" : ["rbf"],
    "gamma" : ["scale", "auto"]
}

dt_params = {
    "criterion" : ["gini", "entropy", "log_loss"],
    "splitter" : ["best", "random"],
    "max_depth" : [1,2,3,4,5,15,None],
    "max_features" : ["sqrt", "log2", None]
}

rf_params = {
    "n_estimators" : [100,200, 500, 1000],
    "max_depth" : [5,8,10,15,None],
    "max_features" : ["sqrt", "log2", None],
    "min_samples_split" : [2,8,15,20]
}

In [65]:
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

models = [
    ("Logistic Regression", LogisticRegression(max_iter=1000), logistic_params),
    ("KNN", KNeighborsClassifier(), knn_params),
    ("SVM", SVC(probability=True), svc_params),
    ("Decision Tree", DecisionTreeClassifier(random_state=42), dt_params),
    ("Random Forest", RandomForestClassifier(random_state=42), rf_params),
]

results = []

for name, model, params in models:
    grid = GridSearchCV(estimator=model, param_grid=params, n_jobs=-1, cv=5, scoring="recall")

    if name == "Decision Tree" or name == "Random Forest":
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        y_pred = best_model.predict(X_test)
        y_prob = best_model.predict_proba(X_test)[:, 1]
    else:
        grid.fit(X_train_scaled, y_train)
        best_model = grid.best_estimator_
        y_pred = best_model.predict(X_test_scaled)
        y_prob = best_model.predict_proba(X_test_scaled)[:, 1]

    results.append({
        "Model": name,
        "Best CV Recall": grid.best_score_,
        "Test Recall": recall_score(y_test, y_pred),
        "Test ROC-AUC": roc_auc_score(y_test, y_prob),
        "Best Params": grid.best_params_
    })

results_df = pd.DataFrame(results).sort_values(by="Test Recall", ascending=False)

In [66]:
results_df

Unnamed: 0,Model,Best CV Recall,Test Recall,Test ROC-AUC,Best Params
3,Decision Tree,0.733333,0.8,0.821509,"{'criterion': 'gini', 'max_depth': 3, 'max_fea..."
4,Random Forest,0.628571,0.68,0.873208,"{'max_depth': 10, 'max_features': None, 'min_s..."
1,KNN,0.552381,0.68,0.852453,"{'n_neighbors': 3, 'weights': 'uniform'}"
2,SVM,0.571429,0.64,0.82717,"{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}"
0,Logistic Regression,0.638095,0.6,0.895849,"{'C': 0.01, 'penalty': 'l2', 'solver': 'liblin..."
