# Coding Block 2 - Hyperparameter Optimization

### Load the packages

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
import time

### Read the dataset

In [7]:
# Original dataset, read from 'diabetes.csv' in the working directory.
diab = pd.read_csv('diabetes.csv')

# Second CSV ('diabetes_cleaned.csv'); this is the frame the tuning cell trains on.
diab_cleaned = pd.read_csv('diabetes_cleaned.csv')

### Copy the code from your last successful classifiers (RF, XGBoost, ...)

In [8]:
def create_model(data, model_type="xgboost", test_size=0.2, random_state=42):
    """
    Split the data, train a classifier with default hyperparameters, and report
    its test-set accuracy and classification report.

    Parameters:
    -----------
    data : DataFrame
        The dataset containing the features and an 'Outcome' target column.
        Every column other than 'Outcome' is used as a feature.
    model_type : str
        The type of model to create: "xgboost" (default) or "random_forest".
    test_size : float
        Fraction of rows held out for the test split (default: 0.2).
    random_state : int
        Seed used for both the train/test split and the model (default: 42).

    Returns:
    --------
    dict
        Dictionary with the trained model ("model"), the full features and
        target ("X", "y"), and the train/test splits
        ("X_train", "X_test", "y_train", "y_test").

    Raises:
    -------
    ValueError
        If `model_type` is not one of the supported model types.
    """
    # Fail fast: reject an unknown model type before any splitting or training.
    if model_type not in ("random_forest", "xgboost"):
        raise ValueError(f"Unsupported model type: {model_type}")

    # Separate features and target
    X = data.drop('Outcome', axis=1)
    y = data['Outcome']

    # Hold out a test set; the fixed seed keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Build the requested classifier (both classes are imported at the top of
    # the notebook), then train it once on the training split.
    if model_type == "random_forest":
        model = RandomForestClassifier(random_state=random_state)
    else:
        model = xgb.XGBClassifier(random_state=random_state)
    model.fit(X_train, y_train)

    # Evaluate the model on the held-out split
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_type.title()} Model Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))

    # Return the model together with the data so callers can reuse the same split
    return {
        "model": model,
        "X": X,
        "y": y,
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test
    }

### Define the parameter grid for GridSearchCV or use RandomizedSearchCV

In [9]:
# Define a small parameter grid for XGBoost with GridSearchCV.
# GridSearchCV evaluates every combination, so the grid is kept deliberately small.
param_grid_xgb = {
    'n_estimators': [100, 200],            # Number of trees
    'max_depth': [3, 5, 7],                # Maximum tree depth
    'learning_rate': [0.01, 0.1],          # Learning rate / eta
    'subsample': [0.8],                    # Subsample ratio
    'colsample_bytree': [0.8, 1.0]         # Column sampling per tree
}

# Derive the number of combinations from the grid itself (product of the number
# of candidate values per parameter) so the printed count cannot drift out of
# sync when the grid is edited. Currently 2×3×2×1×2 = 24.
n_combinations = int(np.prod([len(values) for values in param_grid_xgb.values()]))
print(f"Total parameter combinations: {n_combinations}")

Total parameter combinations: 24


### Perform GridSearchCV or RandomizedSearchCV and tune the hyperparameters of the model
Note: the hyperparameter tuning may not finish in the time available. That is not a problem.

In [10]:
# Tune XGBoost hyperparameters with GridSearchCV and compare the result with
# the default-hyperparameter baseline on the same held-out test split.

# Drop the outlier-flag columns if present so they are not used as features.
# Rebinding the name (instead of inplace=True) avoids mutating the loaded frame object.
diab_cleaned = diab_cleaned.drop(columns=['outlier_z_score', 'outlier_Tukey'],
                                 errors='ignore')

# Baseline: XGBoost with default hyperparameters. Its train/test split is
# reused below so both models are evaluated on identical rows.
model_results = create_model(diab_cleaned, model_type="xgboost")
X_train = model_results["X_train"]
y_train = model_results["y_train"]
X_test = model_results["X_test"]
y_test = model_results["y_test"]

# Untrained XGBoost classifier that GridSearchCV clones for every candidate
xgb_model = xgb.XGBClassifier(random_state=42)

# Set up GridSearchCV with 5-fold cross-validation; candidates are ranked by F1
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=-1  # Use all available cores
)

# Time only the search itself (the baseline training above is excluded)
print("Starting GridSearchCV for XGBoost...")
start_time = time.time()
grid_search.fit(X_train, y_train)
end_time = time.time()
print(f"GridSearchCV completed in {end_time - start_time:.2f} seconds")

# Print the best parameters and best score.
# best_score_ is the mean cross-validated value of the `scoring` metric, i.e. F1.
print("\nBest parameters found:")
print(grid_search.best_params_)
print(f"Best cross-validation F1 score: {grid_search.best_score_:.4f}")

# GridSearchCV refits the best parameter combination on the full training data
# by default (refit=True), so the tuned model is already available.
best_xgb = grid_search.best_estimator_

# Evaluate on test set
y_pred = best_xgb.predict(X_test)
optimized_accuracy = accuracy_score(y_test, y_pred)
print("\nTest set performance with optimized hyperparameters:")
print(f"Accuracy: {optimized_accuracy:.4f}")
print(classification_report(y_test, y_pred))

# Compare with base model
print("\nImprovement in accuracy compared to default hyperparameters:")
base_accuracy = accuracy_score(y_test, model_results["model"].predict(X_test))
print(f"Base model accuracy: {base_accuracy:.4f}")
print(f"Optimized model accuracy: {optimized_accuracy:.4f}")
# The difference of two accuracies is expressed in percentage points, not percent.
print(f"Improvement: {(optimized_accuracy - base_accuracy) * 100:.2f} percentage points")

Xgboost Model Accuracy: 0.7468
              precision    recall  f1-score   support

           0       0.83      0.77      0.80        99
           1       0.63      0.71      0.67        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154

Starting GridSearchCV for XGBoost...
Fitting 5 folds for each of 24 candidates, totalling 120 fits
GridSearchCV completed in 13.79 seconds

Best parameters found:
{'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
Best cross-validation accuracy: 0.6527

Test set performance with optimized hyperparameters:
Accuracy: 0.7662
              precision    recall  f1-score   support

           0       0.83      0.80      0.81        99
           1       0.66      0.71      0.68        55

    accuracy                           0.77       154
   macro avg       0.75      0.75      0.75     