In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the training table and discard its 'id' column in a single chained step.
df = pd.read_csv('train.csv').drop(columns='id')

In [3]:
# Columns that are expanded into integer 0/1 indicator columns.
one_hot_categories = ['gender', 'ethnicity', 'employment_status']

# Build the indicator columns first, then swap them in for the original
# columns; the indicators end up after all remaining columns.
indicator_columns = pd.get_dummies(df[one_hot_categories], dtype=int)
remaining_columns = df.drop(columns=one_hot_categories)
df = pd.concat([remaining_columns, indicator_columns], axis=1)

In [4]:
from sklearn.preprocessing import OrdinalEncoder

# Columns encoded as ordered numeric codes instead of indicator columns.
ordinal_categories = ['education_level', 'income_level', 'smoking_status']
# Explicit category order for each column above (same position in both lists);
# the encoder assigns codes following the order given here.
ordinal_columns = [
    ['No formal', 'Highschool', 'Graduate', 'Postgraduate'],
    ['Low', 'Lower-Middle', 'Middle', 'Upper-Middle', 'High'],
    ['Never', 'Former', 'Current']
]
ordinal_encoder = OrdinalEncoder(categories=ordinal_columns)

# fit_transform returns a bare array, so the frame wrapping it must reuse df's
# index explicitly. Without it the frame gets a fresh 0..n-1 index and the
# column-wise concat below would misalign rows (and introduce NaNs) whenever
# df does not carry a default RangeIndex.
encoded_ordinals = pd.DataFrame(
    ordinal_encoder.fit_transform(df[ordinal_categories]),
    columns=ordinal_categories,
    index=df.index,
)
df = pd.concat(
    [
        df.drop(ordinal_categories, axis=1),
        encoded_ordinals,
    ],
    axis=1,
)

In [5]:
# Separate the target column from the feature columns.
y = df['diagnosed_diabetes']
X = df.drop(columns='diagnosed_diabetes')

In [7]:
from sklearn.model_selection import train_test_split, cross_val_score

# Hold out 10% of the rows, stratified on the target so both parts keep the
# same class proportions. random_state pins the shuffle: without it every run
# draws a different split, so the cross-validation scores computed on X_train
# are not comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, stratify=y, random_state=42
)

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Score one sampled XGBoost configuration with 5-fold cross-validation.

    Args:
        trial: Trial object whose ``suggest_int`` / ``suggest_float`` methods
            supply the hyperparameter values for this evaluation.

    Returns:
        The mean of the five fold scores returned by ``cross_val_score`` on
        ``X_train`` / ``y_train`` (no ``scoring`` is passed, so the
        estimator's default scorer is used).
    """
    # Each entry names the hyperparameter and the inclusive range it is drawn from.
    search_space = dict(
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1),
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
        gamma=trial.suggest_float('gamma', 0, 5),
    )

    # Build the candidate classifier and evaluate it on the training split only.
    candidate = XGBClassifier(**search_space)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5)
    return fold_scores.mean()

In [11]:
import optuna

# Set up an in-memory study that looks for the highest objective value,
# then evaluate 100 trials using all available workers.
study = optuna.create_study(direction='maximize', study_name="example_xgboost_study")
study.optimize(objective, n_trials=100, n_jobs=-1, show_progress_bar=True)

# Keep the hyperparameters of the best-scoring trial for the final model.
best_params = study.best_params
print(f"\nBest parameters: {best_params}")

[I 2025-12-11 03:03:01,211] A new study created in memory with name: example_xgboost_study


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-12-11 03:04:43,070] Trial 2 finished with value: 0.6645793650793651 and parameters: {'max_depth': 7, 'learning_rate': 0.010493072492900365, 'n_estimators': 173, 'subsample': 0.7738842149588019, 'colsample_bytree': 0.6539986070124677, 'min_child_weight': 4, 'gamma': 4.092135395569783}. Best is trial 2 with value: 0.6645793650793651.
[I 2025-12-11 03:05:38,572] Trial 3 finished with value: 0.6816587301587301 and parameters: {'max_depth': 9, 'learning_rate': 0.0387494925001038, 'n_estimators': 222, 'subsample': 0.9915743233067356, 'colsample_bytree': 0.751537684304319, 'min_child_weight': 8, 'gamma': 1.2687532257780059}. Best is trial 3 with value: 0.6816587301587301.
[I 2025-12-11 03:06:59,801] Trial 1 finished with value: 0.6795968253968254 and parameters: {'max_depth': 4, 'learning_rate': 0.02960079212651546, 'n_estimators': 721, 'subsample': 0.673888078463168, 'colsample_bytree': 0.6173434547684189, 'min_child_weight': 8, 'gamma': 3.0048003683256734}. Best is trial 3 with valu

In [22]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Feature columns that are standardised before the final fit.
to_scale = ['age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
       'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
       'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol',
       'triglycerides', 'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history']

# Rebuild the feature matrix from df before scaling. This cell overwrites X,
# so fitting on the existing X would make a second execution fit the scaler on
# already-standardised values; the statistics it then applies to the test
# data would no longer match what the model was trained on. Starting from
# the unscaled frame makes the cell safe to re-run.
X = df.drop('diagnosed_diabetes', axis=1)
X[to_scale] = scaler.fit_transform(X[to_scale])

In [23]:
from sklearn.metrics import classification_report

# Final model: the best hyperparameters from the study, fitted on every
# labelled row (X, y) rather than only on the training split.
final_params = dict(best_params)
xgb_model = XGBClassifier(**final_params)
xgb_model.fit(X=X, y=y)

In [24]:
# Load the rows to predict; pop() removes the 'id' column from the feature
# frame and hands it back so it can be re-attached to the predictions later.
submission_data = pd.read_csv('test.csv')
ids = submission_data.pop('id')

In [25]:
# One-hot encode the same categorical columns that were encoded for training.
test_dummies = pd.get_dummies(submission_data[one_hot_categories], dtype=int)

# get_dummies only creates columns for categories present in this frame, so a
# category absent from (or new in) test.csv would give the model a different
# feature set than the one it was fitted on. Align to the indicator columns of
# the training features: missing ones are added as all-zero, unseen ones are
# dropped, and the training column order is kept.
dummy_prefixes = tuple(f'{category}_' for category in one_hot_categories)
train_dummy_columns = [column for column in X.columns if column.startswith(dummy_prefixes)]
test_dummies = test_dummies.reindex(columns=train_dummy_columns, fill_value=0)

submission_data = pd.concat(
    [submission_data.drop(one_hot_categories, axis=1), test_dummies],
axis=1)

In [26]:
# Encode the ordinal columns with the encoder already fitted on the training
# data. transform() (not fit_transform()) is used so the test set never refits
# the encoder; test rows are always mapped with the training-time encoding.
# index= keeps the encoded frame row-aligned with submission_data for the
# column-wise concat below.
encoded_test_ordinals = pd.DataFrame(
    ordinal_encoder.transform(submission_data[ordinal_categories]),
    columns=ordinal_categories,
    index=submission_data.index,
)
submission_data = pd.concat(
    [
        submission_data.drop(ordinal_categories, axis=1),
        encoded_test_ordinals,
    ],
    axis=1,
)

In [27]:
# Apply the already-fitted scaler to the test features (transform only, no refit).
scaled_test_features = scaler.transform(submission_data[to_scale])
submission_data[to_scale] = scaled_test_features

In [28]:
# Predict with the fitted model and wrap the result in a one-column frame
# named after the target.
preds = pd.DataFrame({'diagnosed_diabetes': xgb_model.predict(submission_data)})

In [29]:
# Put the ids next to their predictions, then use 'id' as the index.
id_with_prediction = pd.concat([ids, preds], axis='columns')
submission_result = id_with_prediction.set_index('id')

In [30]:
# Write the predictions to disk; the 'id' index is written as the first column.
submission_result.to_csv('submission.csv', index=True)