In [18]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd


# Read the prepared data set and discard the 'Unnamed: 0' column
# (presumably an index that was written out with the CSV).
data = pd.read_csv('Data/chosen_data.csv')
data = data.drop(columns='Unnamed: 0')

np.random.seed(42)

# Target is 'diagnosed_diabetes'; every remaining column is a feature.
y = data['diagnosed_diabetes']
X = data.drop(columns='diagnosed_diabetes')

# First split: half of the rows are held out as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=1
)
# Second split: 20% of the remaining training rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1
)

X_train

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,screen_time_hours_per_day,waist_to_hip_ratio,diastolic_bp,ldl_cholesterol,triglycerides,family_history_diabetes
57105,35,1,134,7.9,7.7,0.82,70,84,97,0
660816,64,4,72,5.2,8.5,0.81,66,118,123,0
295318,50,2,91,6.2,6.7,0.85,77,80,156,0
447035,61,3,116,4.7,5.9,0.94,74,103,151,0
106260,31,3,65,6.7,10.2,0.82,73,129,85,0
...,...,...,...,...,...,...,...,...,...,...
525621,32,4,216,2.3,7.2,0.90,84,103,129,0
359213,54,2,89,6.2,7.0,0.87,71,81,127,0
196287,59,4,100,5.5,4.1,0.81,85,112,114,0
141263,60,1,65,6.2,3.5,0.91,78,120,151,0


## Metrics

In [19]:
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, accuracy_score, classification_report, ConfusionMatrixDisplay, \
    RocCurveDisplay, roc_auc_score


def metrics(preds:np.ndarray, probs:np.ndarray, lables:np.ndarray, method:str):
    """Print a classification report and draw a three-panel metrics figure.

    The figure contains, left to right: a text panel with ROC AUC and
    accuracy, the confusion matrix, and the ROC curve.

    Parameters
    ----------
    preds : np.ndarray
        Predicted class labels; compared with ``lables`` for the confusion
        matrix, accuracy and classification report.
    probs : np.ndarray
        Scores for the positive class; used for the ROC curve and ROC AUC.
    lables : np.ndarray
        Ground-truth labels.
    method : str
        Name of the evaluated model, shown in the title of the text panel.

    Returns
    -------
    dict
        ``{"accuracy": <accuracy>, "roc_score": <ROC AUC>}``.
    """
    cf_mtx = confusion_matrix(lables, preds)
    # The thresholds returned by roc_curve are not needed for plotting.
    fpr, tpr, _ = roc_curve(lables, probs)
    roc_area = roc_auc_score(lables, probs)
    accuracy = accuracy_score(lables, preds)

    print(classification_report(lables, preds))

    text = f"{roc_area=}\n{accuracy=}"

    fig, (ax_report, ax_matrix, ax_roc) = plt.subplots(1, 3, figsize=(16, 4))

    # Left panel: headline numbers rendered as text on an otherwise empty axes.
    ax_report.text(
        x=0.05,
        y=0.95,
        s=text,
        family='monospace',
        fontsize=16,
        verticalalignment='top',
        transform=ax_report.transAxes)
    ax_report.set_title(f'Metrics for {method}')
    ax_report.axis('off')

    # Middle panel: confusion matrix. 'gray' is the canonical colormap name;
    # the 'grey' spelling is only accepted by newer Matplotlib releases.
    ax_matrix.set_title("Confusion Matrix")
    ax_matrix.grid(False)
    ConfusionMatrixDisplay(confusion_matrix=cf_mtx).plot(ax=ax_matrix, cmap='gray')

    # Right panel: ROC curve. The legend makes the line label visible.
    ax_roc.set_title('ROC Curve')
    ax_roc.plot(fpr, tpr, linewidth=2, label='ROC Curve', color='red')
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.legend(loc='lower right')

    fig.tight_layout()

    return {"accuracy": accuracy, "roc_score": roc_area}

# Logistic and Poly features

In [None]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Baseline model: standardise the inputs, expand them into degree-2
# polynomial terms, then fit a logistic regression with default settings.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('clf', LogisticRegression()),
])

pipe.fit(X_train, y_train)

In [None]:
# Evaluate the fitted pipeline on the held-out test split. Column 1 of
# predict_proba is the score of the second class in pipe.classes_
# (presumably the positive "diagnosed" class - confirm labels are 0/1).
metrics(
    preds=pipe.predict(X_test),
    probs=pipe.predict_proba(X_test)[:, 1],
    lables=y_test,
    method="poly logistic",
)

# Optuna for poly logistic
Hyperparameters are tuned with cross-validation on the full data set (`X`, `y`).

In [None]:
import optuna
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated F1 of a polynomial logistic pipeline.

    Reads the notebook-level ``X`` and ``y`` (read-only, so no ``global``
    declaration is needed).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the polynomial degree and the logistic
        regression hyperparameters.

    Returns
    -------
    float
        Mean F1 score across the 5 cross-validation folds (to be maximised).
    """
    # 1. Suggest hyperparameters
    # Degree of polynomial features
    poly_degree = trial.suggest_int('poly__degree', 1, 4)

    # Logistic Regression hyperparameters. The parameter names follow the
    # pipeline's '<step>__<param>' convention so that study.best_params can
    # be passed straight to Pipeline.set_params.
    c_reg = trial.suggest_float('clf__C', 1e-5, 100, log=True)
    solver = trial.suggest_categorical('clf__solver', ['lbfgs'])
    tol = trial.suggest_float('clf__tol', 1e-5, 1e-1, log=True)
    max_iter = trial.suggest_int('clf__max_iter', 100, 1000)

    # 2. Define the pipeline with suggested params
    # Named 'candidate' so it is not confused with the notebook-level 'pipe'.
    candidate = Pipeline([
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=poly_degree, include_bias=False)),
        ('clf', LogisticRegression(C=c_reg, solver=solver, max_iter=max_iter, tol=tol, class_weight='balanced')),
    ])

    # 3. Perform Cross-Validation
    # F1 is the scoring metric.
    fold_scores = cross_val_score(candidate, X, y, cv=5, scoring='f1', n_jobs=-1)

    # Return the mean F1 across folds
    return fold_scores.mean()

# --- Execution ---
# Seed the sampler explicitly so that repeated runs propose the same sequence
# of trials. TPESampler is Optuna's default single-objective sampler, so the
# search algorithm itself is unchanged.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

# Results and saving the optimized pipeline

In [None]:
# Display the hyperparameter values of the best trial found by the study.
study.best_params

In [None]:
import pickle

# Configure the existing pipeline with the tuned hyperparameters plus the two
# fixed settings used during the search, then refit on the full data set.
pipe.set_params(
    **study.best_params,
    clf__class_weight='balanced',
    poly__include_bias=False,
)

pipe.fit(X, y)

## Saving

In [None]:
from pathlib import Path

name = 'Logistic_Poly_full_f1'
model_path = Path('models') / f'{name}.pkl'
# open(..., 'wb') does not create missing directories, so make sure the
# output folder exists before writing.
model_path.parent.mkdir(parents=True, exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(pipe, f)


## Test Predict

In [None]:
import os
import numpy as np

# Load the competition test features and discard the 'Unnamed: 0' column,
# mirroring what is done for the training data.
data_kaggle = pd.read_csv('Data/chosen_test.csv').drop(columns='Unnamed: 0')

submission_name = "Poly_Logistic_full_CV.csv"
ids = np.arange(700000, 1000000)

# Fail before the expensive prediction step, with an actionable message, if
# the hard-coded id range does not line up with the rows that were loaded.
if len(ids) != len(data_kaggle):
    raise ValueError(
        f"Expected {len(ids)} test rows for ids {ids[0]}..{ids[-1]}, "
        f"but 'Data/chosen_test.csv' has {len(data_kaggle)} rows."
    )

# Column 1 of predict_proba is the score of the second class in pipe.classes_.
probs = pipe.predict_proba(data_kaggle)[:, 1]
submission = pd.DataFrame({
    "id": ids,
    "diagnosed_diabetes": probs
})

PATH = os.path.join('results', submission_name)
# to_csv does not create missing directories.
os.makedirs(os.path.dirname(PATH), exist_ok=True)

submission.to_csv(PATH, index=False, float_format="%.6f")
submission.head()