In [1]:
import pandas as pd
import numpy as np
import optuna
import logging
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder

# Constants
ENV = 'Kaggle'  # Set to 'Colab', 'Kaggle', or 'Sagemaker'
DEV = False  # Set to True to enable subsetting, False for full training data
SUBSET_SIZE = 1000  # Number of samples for the subset during development
TRIALS = 15  # Number of trials for Optuna

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Basic setup
ID_COL = 'id'
TARGET_COL = 'Depression'

def load_data():
    """Load data based on environment setting."""
    paths = {
        'Kaggle': '/kaggle/input/playground-series-s4e11/',
        'Sagemaker': '/home/ec2-user/SageMaker/data/PS4E11/',
        'Colab': '/content/drive/MyDrive/Kaggle_analysis/PS4E11/data/'
    }
    base_path = paths.get(ENV)
    if not base_path:
        raise ValueError("Invalid environment specified")
    
    train_data = pd.read_csv(base_path + 'train.csv')
    test_data = pd.read_csv(base_path + 'test.csv')
    sample_submission = pd.read_csv(base_path + 'sample_submission.csv')
    
    return train_data, test_data, sample_submission

def preprocess_data(train_data, test_data):
    """Preprocess the data for model training."""
    X = train_data.drop(columns=[ID_COL, TARGET_COL])
    y = train_data[TARGET_COL]
    X_test = test_data.drop(columns=[ID_COL])

    if DEV:
        subset_indices = np.random.choice(X.index, size=min(SUBSET_SIZE, len(X)), replace=False)
        X = X.loc[subset_indices]
        y = y.loc[subset_indices]
        X_test = X_test.iloc[:SUBSET_SIZE]

    # Handle categorical features
    cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    X[cat_features] = X[cat_features].fillna('missing')
    X_test[cat_features] = X_test[cat_features].fillna('missing')

    # Convert categorical features to category dtype
    for col in cat_features:
        X[col] = X[col].astype('category')
        X_test[col] = X_test[col].astype('category')

    # One-hot encode for XGBoost
    X_encoded = pd.get_dummies(X, columns=cat_features)
    X_test_encoded = pd.get_dummies(X_test, columns=cat_features)

    # Align columns
    X_encoded, X_test_encoded = X_encoded.align(X_test_encoded, join='left', axis=1, fill_value=0)

    return X, y, X_test, X_encoded, X_test_encoded, cat_features

def get_model(model_type, params):
    """Create model instance based on type."""
    if model_type == 'CatBoost':
        return CatBoostClassifier(**params)
    elif model_type == 'XGBoost':
        return XGBClassifier(**params)
    elif model_type == 'LightGBM':
        return LGBMClassifier(**params)
    else:
        raise ValueError(f"Invalid model type: {model_type}")

def objective(trial, model_type, X, y, cat_features=None, X_encoded=None):
    """Optuna objective function for hyperparameter optimization."""
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(X))

    # Suggest hyperparameters based on model type
    if model_type == 'CatBoost':
        params = {
            'iterations': trial.suggest_int('iterations', 100, 1000),
            'depth': trial.suggest_int('depth', 4, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
            'random_strength': trial.suggest_float('random_strength', 1, 20),
            'verbose': 0,
        }
    elif model_type == 'XGBoost':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 4, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'verbosity': 0,
        }
    elif model_type == 'LightGBM':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'num_leaves': trial.suggest_int('num_leaves', 20, 150),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        }

    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        if model_type == 'XGBoost':
            X_train, X_valid = X_encoded.iloc[train_idx], X_encoded.iloc[valid_idx]

        model = get_model(model_type, params)
        
        try:
            if model_type == 'CatBoost':
                model.fit(
                    X_train, y_train,
                    cat_features=cat_features,
                    eval_set=(X_valid, y_valid),
                    early_stopping_rounds=100,
                    verbose=False
                )
            elif model_type == 'XGBoost':
                model.fit(
                    X_train, y_train,
                    eval_set=[(X_valid, y_valid)],
                    early_stopping_rounds=100,
                    verbose=False
                )
            else:  # LightGBM
                model.fit(
                    X_train, y_train,
                    eval_set=[(X_valid, y_valid)],
                    callbacks=[lgb.early_stopping(stopping_rounds=100)]
                )

            oof_preds[valid_idx] = model.predict(X_valid)
        
        except Exception as e:
            logging.error(f"Error in model fitting: {str(e)}")
            raise

    return accuracy_score(y, oof_preds)

def tune_hyperparameters(X, y, model_type, cat_features=None, X_encoded=None):
    """Run Optuna study to find best hyperparameters."""
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, model_type, X, y, cat_features, X_encoded), n_trials=TRIALS)
    print(f"Best hyperparameters for {model_type}: {study.best_params}")
    return study.best_params

def train_and_predict_with_tuned_params(X, y, X_test, model_type, cat_features, params, X_encoded=None, X_test_encoded=None):
    """Train model with tuned parameters and make predictions."""
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))

    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y), 1):
        print(f"Training {model_type} - Fold {fold}/5")
        
        if model_type == 'XGBoost':
            X_train, X_valid = X_encoded.iloc[train_idx], X_encoded.iloc[valid_idx]
            X_test_current = X_test_encoded
        else:
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            X_test_current = X_test

        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = get_model(model_type, params)
        
        try:
            if model_type == 'CatBoost':
                model.fit(
                    X_train, y_train,
                    cat_features=cat_features,
                    eval_set=(X_valid, y_valid),
                    early_stopping_rounds=100,
                    verbose=False
                )
            elif model_type == 'XGBoost':
                model.fit(
                    X_train, y_train,
                    eval_set=[(X_valid, y_valid)],
                    early_stopping_rounds=100,
                    verbose=False
                )
            else:  # LightGBM
                model.fit(
                    X_train, y_train,
                    eval_set=[(X_valid, y_valid)],
                    callbacks=[lgb.early_stopping(stopping_rounds=100)]
                )

            oof_preds[valid_idx] = model.predict(X_valid)
            test_preds += model.predict(X_test_current) / skf.n_splits

            # Print fold performance
            fold_score = accuracy_score(y_valid, model.predict(X_valid))
            print(f"{model_type} Fold {fold} accuracy: {fold_score:.4f}")
            
        except Exception as e:
            logging.error(f"Error in fold {fold}: {str(e)}")
            raise

    return oof_preds, test_preds

def train_meta_model(oof_preds_list, y):
    """Train meta-model using base model predictions."""
    X_meta = np.column_stack(oof_preds_list)
    meta_model = LogisticRegression()
    meta_model.fit(X_meta, y)
    return meta_model

def generate_final_predictions(meta_model, test_preds_list):
    """Generate ensemble predictions using meta-model."""
    X_meta_test = np.column_stack(test_preds_list)
    final_preds = meta_model.predict(X_meta_test)
    return final_preds

# Load and preprocess data
print("Loading and preprocessing data...")
train_data, test_data, sample_submission = load_data()
X, y, X_test, X_encoded, X_test_encoded, cat_features = preprocess_data(train_data, test_data)

# Tune and train models
print("\nTuning LightGBM...")
best_params_lgbm = tune_hyperparameters(X, y, 'LightGBM', cat_features)

print("\nTuning CatBoost...")
best_params_catboost = tune_hyperparameters(X, y, 'CatBoost', cat_features)

print("\nTuning XGBoost...")
best_params_xgboost = tune_hyperparameters(X, y, 'XGBoost', None, X_encoded)

# Train models and get predictions
print("\nTraining models with tuned parameters...")
oof_lgbm, test_lgbm = train_and_predict_with_tuned_params(
    X, y, X_test, 'LightGBM', cat_features, best_params_lgbm
)
oof_catboost, test_catboost = train_and_predict_with_tuned_params(
    X, y, X_test, 'CatBoost', cat_features, best_params_catboost
)
oof_xgboost, test_xgboost = train_and_predict_with_tuned_params(
    X, y, X_test, 'XGBoost', None, best_params_xgboost, X_encoded, X_test_encoded
)

# Train meta model and generate final predictions
print("\nTraining meta model...")
oof_preds_list = [oof_catboost, oof_xgboost, oof_lgbm]
test_preds_list = [test_catboost, test_xgboost, test_lgbm]

meta_model = train_meta_model(oof_preds_list, y)
final_preds = generate_final_predictions(meta_model, test_preds_list)

# Create and save submission
submission = pd.DataFrame({
    ID_COL: test_data[ID_COL],
    TARGET_COL: final_preds
})
submission.to_csv('submission.csv', index=False)
print("\nFinal ensemble submission file has been saved.")

# Display sample of predictions
print("\nSample of final predictions:")
print(submission.head())

Loading and preprocessing data...


[I 2024-11-13 12:17:03,265] A new study created in memory with name: no-name-ccee4db9-d353-426b-b70d-b219952e18ad



Tuning LightGBM...
[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014611 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 93800, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181716 -> initscore=-1.504762
[LightGBM] [Info] Start training from score -1.504762
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[32]	valid_0's binary_logloss: 0.160791
[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018521 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_w

[I 2024-11-13 12:17:17,887] Trial 0 finished with value: 0.9366950959488273 and parameters: {'n_estimators': 476, 'num_leaves': 66, 'learning_rate': 0.1833218914547814, 'min_child_samples': 31, 'subsample': 0.883142706467505}. Best is trial 0 with value: 0.9366950959488273.


[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 93800, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181716 -> initscore=-1.504762
[LightGBM] [Info] Start training from score -1.504762
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[179]	valid_0's binary_logloss: 0.166161
[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018414 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=tr

[I 2024-11-13 12:17:39,415] Trial 1 finished with value: 0.9375408670931059 and parameters: {'n_estimators': 179, 'num_leaves': 62, 'learning_rate': 0.017528844084858095, 'min_child_samples': 26, 'subsample': 0.9739808361617948}. Best is trial 1 with value: 0.9375408670931059.


[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 93800, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181716 -> initscore=-1.504762
[LightGBM] [Info] Start training from score -1.504762
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[47]	valid_0's binary_logloss: 0.159954
[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018610 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM

[I 2024-11-13 12:17:58,586] Trial 2 finished with value: 0.937498223169865 and parameters: {'n_estimators': 167, 'num_leaves': 93, 'learning_rate': 0.12280902810592578, 'min_child_samples': 34, 'subsample': 0.5975023415837712}. Best is trial 1 with value: 0.9375408670931059.


[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018508 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 93800, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181716 -> initscore=-1.504762
[LightGBM] [Info] Start training from score -1.504762
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[20]	valid_0's binary_logloss: 0.16297
[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018970 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM]

[I 2024-11-13 12:18:14,097] Trial 3 finished with value: 0.9364605543710022 and parameters: {'n_estimators': 594, 'num_leaves': 88, 'learning_rate': 0.2383540182973698, 'min_child_samples': 9, 'subsample': 0.5028601163884551}. Best is trial 1 with value: 0.9375408670931059.


Early stopping, best iteration is:
[23]	valid_0's binary_logloss: 0.157903
[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018305 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 93800, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181716 -> initscore=-1.504762
[LightGBM] [Info] Start training from score -1.504762
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[120]	valid_0's binary_logloss: 0.157356
[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018700 seconds.
You can set `force_row_wise=true` to remove the overhea

[I 2024-11-13 12:18:41,710] Trial 4 finished with value: 0.9377683013503909 and parameters: {'n_estimators': 738, 'num_leaves': 58, 'learning_rate': 0.058665406319911975, 'min_child_samples': 38, 'subsample': 0.925368339112632}. Best is trial 4 with value: 0.9377683013503909.


[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018325 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 93800, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181716 -> initscore=-1.504762
[LightGBM] [Info] Start training from score -1.504762
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[69]	valid_0's binary_logloss: 0.157832
[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018425 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM

[I 2024-11-13 12:18:59,738] Trial 5 finished with value: 0.937590618336887 and parameters: {'n_estimators': 835, 'num_leaves': 43, 'learning_rate': 0.11043127926484399, 'min_child_samples': 7, 'subsample': 0.8929522176299234}. Best is trial 4 with value: 0.9377683013503909.


[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018353 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 93800, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181716 -> initscore=-1.504762
[LightGBM] [Info] Start training from score -1.504762
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[88]	valid_0's binary_logloss: 0.158548
[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018435 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM

[I 2024-11-13 12:19:23,391] Trial 6 finished with value: 0.93727078891258 and parameters: {'n_estimators': 188, 'num_leaves': 78, 'learning_rate': 0.07245006715574497, 'min_child_samples': 50, 'subsample': 0.7771385784352492}. Best is trial 4 with value: 0.9377683013503909.


[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 93800, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181716 -> initscore=-1.504762
[LightGBM] [Info] Start training from score -1.504762
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[144]	valid_0's binary_logloss: 0.158313
[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018401 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGB

[I 2024-11-13 12:19:56,610] Trial 7 finished with value: 0.937498223169865 and parameters: {'n_estimators': 407, 'num_leaves': 91, 'learning_rate': 0.03968573585284045, 'min_child_samples': 28, 'subsample': 0.9512773645727439}. Best is trial 4 with value: 0.9377683013503909.


[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018306 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 93800, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181716 -> initscore=-1.504762
[LightGBM] [Info] Start training from score -1.504762
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[376]	valid_0's binary_logloss: 0.155517
[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018644 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=tr

[I 2024-11-13 12:20:36,447] Trial 8 finished with value: 0.9382942430703625 and parameters: {'n_estimators': 389, 'num_leaves': 22, 'learning_rate': 0.027305208961937385, 'min_child_samples': 30, 'subsample': 0.9967126337962093}. Best is trial 8 with value: 0.9382942430703625.


[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018329 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 93800, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181716 -> initscore=-1.504762
[LightGBM] [Info] Start training from score -1.504762
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[40]	valid_0's binary_logloss: 0.16011
[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018388 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM]

[I 2024-11-13 12:20:55,550] Trial 9 finished with value: 0.9372423596304194 and parameters: {'n_estimators': 771, 'num_leaves': 126, 'learning_rate': 0.1388954450320937, 'min_child_samples': 47, 'subsample': 0.7039224060824176}. Best is trial 8 with value: 0.9382942430703625.


[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018239 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 93800, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181716 -> initscore=-1.504762
[LightGBM] [Info] Start training from score -1.504762
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[349]	valid_0's binary_logloss: 0.164186
[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018428 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=tr

[I 2024-11-13 12:21:31,341] Trial 10 finished with value: 0.9372992181947406 and parameters: {'n_estimators': 349, 'num_leaves': 27, 'learning_rate': 0.010836864218003754, 'min_child_samples': 18, 'subsample': 0.7831650173712091}. Best is trial 8 with value: 0.9382942430703625.


[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 93800, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181716 -> initscore=-1.504762
[LightGBM] [Info] Start training from score -1.504762
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[231]	valid_0's binary_logloss: 0.155533
[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018343 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGB

[I 2024-11-13 12:22:03,657] Trial 11 finished with value: 0.9380099502487562 and parameters: {'n_estimators': 630, 'num_leaves': 23, 'learning_rate': 0.04143779884878618, 'min_child_samples': 40, 'subsample': 0.9973030557945183}. Best is trial 8 with value: 0.9382942430703625.


[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 93800, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181716 -> initscore=-1.504762
[LightGBM] [Info] Start training from score -1.504762
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[414]	valid_0's binary_logloss: 0.155122
[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018343 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGB

[I 2024-11-13 12:22:52,430] Trial 12 finished with value: 0.9384008528784649 and parameters: {'n_estimators': 971, 'num_leaves': 20, 'learning_rate': 0.024639430998757615, 'min_child_samples': 41, 'subsample': 0.837124717086883}. Best is trial 12 with value: 0.9384008528784649.


[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 93800, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181716 -> initscore=-1.504762
[LightGBM] [Info] Start training from score -1.504762
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[335]	valid_0's binary_logloss: 0.156626
[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018442 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGB

[I 2024-11-13 12:23:41,462] Trial 13 finished with value: 0.9379246624022743 and parameters: {'n_estimators': 989, 'num_leaves': 40, 'learning_rate': 0.023254018027190716, 'min_child_samples': 21, 'subsample': 0.8303807284245661}. Best is trial 12 with value: 0.9384008528784649.


[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018161 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 93800, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181716 -> initscore=-1.504762
[LightGBM] [Info] Start training from score -1.504762
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[216]	valid_0's binary_logloss: 0.158053
[LightGBM] [Info] Number of positive: 17045, number of negative: 76755
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018532 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGB

[I 2024-11-13 12:24:27,302] Trial 14 finished with value: 0.9377185501066098 and parameters: {'n_estimators': 968, 'num_leaves': 145, 'learning_rate': 0.026045706393390454, 'min_child_samples': 43, 'subsample': 0.675576851639372}. Best is trial 12 with value: 0.9384008528784649.
[I 2024-11-13 12:24:27,305] A new study created in memory with name: no-name-48fb76e0-0fcb-4f26-b0b9-78cac1418c1a


Best hyperparameters for LightGBM: {'n_estimators': 971, 'num_leaves': 20, 'learning_rate': 0.024639430998757615, 'min_child_samples': 41, 'subsample': 0.837124717086883}

Tuning CatBoost...


[I 2024-11-13 12:28:38,198] Trial 0 finished with value: 0.9369722814498934 and parameters: {'iterations': 821, 'depth': 6, 'learning_rate': 0.012179959702313095, 'l2_leaf_reg': 0.04276463637189424, 'random_strength': 10.839807682662931}. Best is trial 0 with value: 0.9369722814498934.
[I 2024-11-13 12:30:06,865] Trial 1 finished with value: 0.9301137171286425 and parameters: {'iterations': 280, 'depth': 6, 'learning_rate': 0.014323486114627648, 'l2_leaf_reg': 0.09801798572705687, 'random_strength': 17.781804354542146}. Best is trial 0 with value: 0.9369722814498934.
[I 2024-11-13 12:33:51,102] Trial 2 finished with value: 0.9388343994314143 and parameters: {'iterations': 573, 'depth': 7, 'learning_rate': 0.019583662827699706, 'l2_leaf_reg': 0.00388112710472163, 'random_strength': 3.5919341139814627}. Best is trial 2 with value: 0.9388343994314143.
[I 2024-11-13 12:37:00,499] Trial 3 finished with value: 0.9356076759061833 and parameters: {'iterations': 411, 'depth': 9, 'learning_rate'

Best hyperparameters for CatBoost: {'iterations': 857, 'depth': 5, 'learning_rate': 0.28569741642557295, 'l2_leaf_reg': 0.06341815841556608, 'random_strength': 7.19129656774867}

Tuning XGBoost...


[I 2024-11-13 13:26:33,799] Trial 0 finished with value: 0.9393461265103056 and parameters: {'n_estimators': 429, 'max_depth': 10, 'learning_rate': 0.027287957478457112, 'subsample': 0.7406349519396822, 'colsample_bytree': 0.587878878379606}. Best is trial 0 with value: 0.9393461265103056.
[I 2024-11-13 13:39:07,932] Trial 1 finished with value: 0.9385856432125089 and parameters: {'n_estimators': 169, 'max_depth': 8, 'learning_rate': 0.03004582318848185, 'subsample': 0.789573312755669, 'colsample_bytree': 0.8787558332875016}. Best is trial 0 with value: 0.9393461265103056.
[I 2024-11-13 13:51:22,643] Trial 2 finished with value: 0.9384434968017058 and parameters: {'n_estimators': 446, 'max_depth': 10, 'learning_rate': 0.13586983370956826, 'subsample': 0.8506205367683435, 'colsample_bytree': 0.6909443776019566}. Best is trial 0 with value: 0.9393461265103056.
[I 2024-11-13 14:36:47,820] Trial 3 finished with value: 0.9394669509594883 and parameters: {'n_estimators': 710, 'max_depth': 7,

Best hyperparameters for XGBoost: {'n_estimators': 999, 'max_depth': 5, 'learning_rate': 0.03797952254718775, 'subsample': 0.624820499801507, 'colsample_bytree': 0.6916170018267218}

Training models with tuned parameters...
Training LightGBM - Fold 1/5
[LightGBM] [Info] Number of positive: 20454, number of negative: 92106
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021910 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 690
[LightGBM] [Info] Number of data points in the train set: 112560, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181716 -> initscore=-1.504762
[LightGBM] [Info] Start training from score -1.504762
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[466]	valid_0's binary_logloss: 0.152173
LightGBM Fold 1 accuracy: 0.9381
Training LightGBM 



XGBoost Fold 1 accuracy: 0.9400
Training XGBoost - Fold 2/5




XGBoost Fold 2 accuracy: 0.9388
Training XGBoost - Fold 3/5




XGBoost Fold 3 accuracy: 0.9409
Training XGBoost - Fold 4/5




XGBoost Fold 4 accuracy: 0.9414
Training XGBoost - Fold 5/5




XGBoost Fold 5 accuracy: 0.9389

Training meta model...

Final ensemble submission file has been saved.

Sample of final predictions:
       id  Depression
0  140700           0
1  140701           0
2  140702           0
3  140703           1
4  140704           0
