In [30]:
import numpy as np
import pandas as pd

from utils import get_train_data

from FeatureEngineering import _encode

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score

In [31]:
# Load the raw training features and target through the project helper.
X, y = get_train_data()

# Encode the features and target. The returned preprocessor is kept because
# the test set is encoded with it later in the notebook.
(X_encoded, y, preprocessor) = _encode(X, y=y)

In [32]:
# Hold out 20% of the encoded training data to score the baseline model.
# The split is seeded so the hold-out set is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Baseline model: standardised features fed to an XGBoost classifier with
# hand-picked hyperparameters (no tuning yet).
pipeline = make_pipeline(
    StandardScaler(),
    XGBClassifier(
        eta = 0.2,
        gamma = 0,
        max_depth = 6,
        min_child_weight = 1,
        subsample = 1,
        colsample_bytree = 1,
        seed = 42,
    )
)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# Score the baseline on the hold-out split; without this the predictions were
# computed but never used, leaving no reference point for the tuned model.
baseline_accuracy = accuracy_score(y_test, y_pred)
print(f"Baseline accuracy: {baseline_accuracy:.4f}")

## __Optuna__

In [33]:
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [43]:
def objective(trial):
    """Optuna objective: hold-out accuracy of a scaled XGBoost classifier.

    Reads the notebook-level ``X_encoded`` and ``y``, splits them 80/20 with a
    fixed seed, fits a ``StandardScaler`` + ``XGBClassifier`` pipeline using
    hyperparameters suggested by ``trial``, and scores it on the held-out part.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample ``eta``, ``gamma``, ``max_depth``, ``min_child_weight``,
        ``subsample`` and ``colsample_bytree``.

    Returns
    -------
    Accuracy of the fitted pipeline on the validation split (to be maximised).
    """
    # Seeded split, so every trial is scored on the same validation rows.
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_encoded, y, test_size=0.2, random_state=42
    )

    # Search space; the suggestion order is kept stable across trials.
    hyperparams = dict(
        eta=trial.suggest_float('eta', 0.01, 0.2),
        gamma=trial.suggest_float('gamma', 0.0, 5.0),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        min_child_weight=trial.suggest_int('min_child_weight', 3, 10),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
    )

    # Fit the candidate model on the training part of the split.
    candidate = make_pipeline(StandardScaler(), XGBClassifier(**hyperparams))
    candidate.fit(X_fit, y_fit)

    # Score on the validation part.
    holdout_preds = candidate.predict(X_holdout)
    return accuracy_score(y_holdout, holdout_preds)

In [44]:
# Create a study that maximises the objective (accuracy).
# The sampler is seeded explicitly: an unseeded sampler draws different
# hyperparameters on every run, so the best trial (and the submission built
# from it) would not be reproducible after a kernel restart.
# TPESampler is the sampler Optuna uses by default for single-objective
# studies, so only the seeding changes here, not the search algorithm.
study = optuna.create_study(
    direction='maximize',  # 'minimize' for RMSE, 'maximize' for accuracy, etc.
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Optimize: run 50 sequential trials of the objective.
study.optimize(objective, n_trials=50)

# Best parameters and score
print("Best trial:")
print(" Value: ", study.best_value)
print(" Params: ", study.best_params)

[I 2025-01-04 12:35:26,210] A new study created in memory with name: no-name-50005d9e-e480-45e2-bf29-4089eeb08fd5
[I 2025-01-04 12:35:29,502] Trial 0 finished with value: 0.8156424581005587 and parameters: {'eta': 0.17494786853214872, 'gamma': 4.6849034407554315, 'max_depth': 8, 'min_child_weight': 7, 'subsample': 0.9050605797533691, 'colsample_bytree': 0.5970826263307563}. Best is trial 0 with value: 0.8156424581005587.
[I 2025-01-04 12:35:32,185] Trial 1 finished with value: 0.8100558659217877 and parameters: {'eta': 0.015788676140434738, 'gamma': 1.3042841784810966, 'max_depth': 3, 'min_child_weight': 6, 'subsample': 0.6993429897017918, 'colsample_bytree': 0.51280214371911}. Best is trial 0 with value: 0.8156424581005587.
[I 2025-01-04 12:35:32,629] Trial 2 finished with value: 0.7988826815642458 and parameters: {'eta': 0.031209463097379007, 'gamma': 2.0715476511240913, 'max_depth': 9, 'min_child_weight': 9, 'subsample': 0.7343172084490812, 'colsample_bytree': 0.888771575982402}. Be

Best trial:
 Value:  0.8379888268156425
 Params:  {'eta': 0.13931600539184902, 'gamma': 1.6973396780852883, 'max_depth': 4, 'min_child_weight': 4, 'subsample': 0.5885034417642654, 'colsample_bytree': 0.9034020022420274}


In [45]:
# Take the winning hyperparameters from the finished study.
best_params = study.best_params

# Rebuild the same scaler + classifier pipeline with those hyperparameters
# and refit it on the complete encoded training set (no hold-out this time).
best_pipeline = make_pipeline(StandardScaler(), XGBClassifier(**best_params))
best_pipeline.fit(X_encoded, y)

## __Prediction__

In [46]:
# Read the competition test set.
test_data = pd.read_csv('data/test.csv')

# Encode it with the preprocessor obtained from the training data, then align
# the columns with the training matrix: columns absent from the encoded test
# frame are added and filled with 0, and the training column order is enforced.
test_data_encoded = _encode(test_data, preprocessor=preprocessor)
test_data_encoded = test_data_encoded.reindex(columns=X_encoded.columns, fill_value=0)

# Predict with the tuned pipeline.
predictions = best_pipeline.predict(test_data_encoded)

## __Output__

In [47]:
# Assemble the submission table: one row per test passenger with its prediction.
results = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions,
})

# Write the submission file without the DataFrame index column.
results.to_csv('submission_XGB_vOptuna.csv', index=False)