In [None]:
import pandas as pd 


In [None]:
# train.csv and test.csv are read with the default comma separator.
df_train, df_test = (pd.read_csv(path) for path in ('data/train.csv', 'data/test.csv'))
# bank-full.csv uses ';' as its field separator.
df_aug = pd.read_csv('data/bank-full.csv', sep=';')

# Quick sanity checks: schema of the training frame, first rows of the extra data.
df_train.info()
df_aug.head()

In [None]:
# Encode the augmentation target as 0/1. Series.map turns every value missing
# from the mapping into NaN, so re-running this cell on already-encoded labels
# would silently wipe the target; only map while raw labels are still present.
if not df_aug['y'].isin([0, 1]).all():
    df_aug['y'] = df_aug['y'].map({'yes': 1, 'no': 0})
if df_aug['y'].isna().any():
    raise ValueError("df_aug['y'] contains labels other than 'yes'/'no'")

# Rename the augmentation columns positionally to the training schema (minus
# 'id'); this relies on both files listing their columns in the same order.
df_aug.columns = df_train.columns.drop('id')

In [None]:
# Set the identifier and target columns aside before they are removed below.
df_train_ids, df_train_y = df_train['id'], df_train['y']
df_test_ids = df_test['id']
df_aug_y = df_aug['y']

# Drop the non-feature columns before the column transformation.
df_train = df_train.drop(['id', 'y'], axis=1)
df_test = df_test.drop('id', axis=1)
df_aug = df_aug.drop('y', axis=1)



In [None]:
# Stack train, test and augmentation rows (in that order) into one frame with a
# fresh 0..n-1 index so they can be encoded together.
df_all = pd.concat((df_train, df_test, df_aug), ignore_index=True)
# Columns stored with object dtype are the ones treated as categorical.
categorical_cols = list(df_all.select_dtypes(include='object').columns)

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# One-hot encode the categorical columns and pass the remaining ones through.
# sparse_threshold=0 forces a dense result: with the default threshold the
# output can come back as a scipy sparse matrix, which StandardScaler (with its
# default mean-centering) refuses to fit.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), categorical_cols),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

scaler = StandardScaler()

df_all_transformed = ct.fit_transform(df_all)
df_all_transformed = scaler.fit_transform(df_all_transformed)

# NOTE(review): the following cell reassigns df_all_transformed from
# pd.get_dummies, so this scaled matrix is not what the models below consume.
df_all_transformed = pd.DataFrame(df_all_transformed)

In [None]:
# Dummy-encode every categorical column; missing values get no indicator column.
# This rebinds df_all_transformed, replacing whatever it held before.
df_all_transformed = pd.get_dummies(
    df_all,
    columns=categorical_cols,
    dummy_na=False,
)

In [None]:
# df_all was stacked as train | test | aug, so slice it back apart by position.
train_end = len(df_train)
test_end = train_end + len(df_test)

df_train_transformed = df_all_transformed.iloc[:train_end]
df_test_transformed = df_all_transformed.iloc[train_end:test_end]
df_aug_transformed = df_all_transformed.iloc[test_end:]

In [None]:
# Modelling set = training rows followed by the augmentation rows; features and
# targets are concatenated in the same order so they stay aligned by position.
x_all = pd.concat((df_train_transformed, df_aug_transformed))
y_all = pd.concat((df_train_y, df_aug_y))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the combined rows for evaluation; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x_all,
    y_all,
    test_size=0.2,
    random_state=42,
)

#### RF + LightGBM

In [None]:
import lightgbm as lgb
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Base learner 1: random forest with unrestricted tree depth.
rfClassifier = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
)

# Base learner 2: LightGBM. Row subsampling only takes place when
# subsample_freq is positive; with its default of 0 the subsample=0.8 setting
# is ignored, so request bagging at every boosting iteration.
lgbmClassifier = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

# A logistic regression combines the base learners' predicted probabilities;
# passthrough=False keeps the raw features away from the meta-learner.
stack_model = StackingClassifier(
    estimators=[('rf', rfClassifier), ('lgbm', lgbmClassifier)],
    final_estimator=LogisticRegression(),
    stack_method="predict_proba",
    n_jobs=-1,
    passthrough=False,
)

stack_model.fit(x_train, y_train)
# Second predict_proba column = probability of class 1.
y_pred = stack_model.predict_proba(x_test)[:, 1]

from sklearn.metrics import roc_auc_score
roc = roc_auc_score(y_test, y_pred)
print(f'ROC AUC: {roc}')

#### RF + LightGBM with Optuna

In [None]:
import optuna
import numpy as np
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

class StackingClassifierOptimizer:
    """
    Optuna-based hyperparameter optimizer for a two-model stacking classifier.

    Tunes a Random Forest and a LightGBM base estimator together with the
    Logistic Regression meta-learner that combines them. Trials are scored by
    accuracy, either cross-validated or on a caller-supplied validation set.
    """

    def __init__(self, x_train, y_train, X_val=None, y_val=None, cv_folds=5, random_state=42):
        """
        Store the data and decide how trials are scored.

        Parameters:
        -----------
        x_train : array-like
            Training features
        y_train : array-like
            Training labels
        X_val : array-like, optional
            Validation features (if None, uses cross-validation)
        y_val : array-like, optional
            Validation labels (if None, uses cross-validation)
        cv_folds : int
            Number of cross-validation folds if validation set not provided
        random_state : int
            Random state for reproducibility
        """
        self.X_train = x_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.cv_folds = cv_folds
        self.random_state = random_state
        # Cross-validation is used unless BOTH validation arrays were supplied.
        self.use_cv = X_val is None or y_val is None

        # Initialize cross-validation if needed
        if self.use_cv:
            self.cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)

    @staticmethod
    def _strip_prefix(params, prefix):
        """Return the entries of `params` whose key starts with `prefix`, with that prefix removed."""
        # Slice the leading prefix off instead of str.replace, which would also
        # delete any later occurrence of the prefix inside the key.
        return {
            key[len(prefix):]: value
            for key, value in params.items()
            if key.startswith(prefix)
        }

    @staticmethod
    def _build_stack(rf_params, lgbm_params, lr_params):
        """Assemble the RF + LightGBM stack with a Logistic Regression meta-learner."""
        return StackingClassifier(
            estimators=[
                ('rf', RandomForestClassifier(**rf_params)),
                ('lgbm', lgb.LGBMClassifier(**lgbm_params))
            ],
            final_estimator=LogisticRegression(**lr_params),
            cv=5,  # Internal cross-validation for generating meta-features
            stack_method='predict_proba',
            n_jobs=-1,
            passthrough=False
        )

    def objective(self, trial):
        """
        Objective function for Optuna optimization.
        Defines the hyperparameter search space, builds the stack for the
        sampled configuration and returns the metric to optimize.

        Parameters:
        -----------
        trial : optuna.Trial
            Optuna trial object for suggesting hyperparameters

        Returns:
        --------
        float
            Accuracy score to be maximized (0.0 if the configuration failed)
        """

        # ============================================================================
        # STEP 1: Define hyperparameter search spaces for Random Forest
        # ============================================================================
        rf_params = {
            'n_estimators': trial.suggest_int('rf_n_estimators', 100, 500, step=50),
            'max_depth': trial.suggest_categorical('rf_max_depth', [None, 10, 20, 30, 50]),
            'min_samples_split': trial.suggest_int('rf_min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('rf_min_samples_leaf', 1, 10),
            'max_features': trial.suggest_categorical('rf_max_features', ['sqrt', 'log2', None]),
            'bootstrap': trial.suggest_categorical('rf_bootstrap', [True, False]),
            'random_state': self.random_state,
            'n_jobs': -1
        }

        # ============================================================================
        # STEP 2: Define hyperparameter search spaces for LightGBM
        # ============================================================================
        lgbm_params = {
            'n_estimators': trial.suggest_int('lgbm_n_estimators', 100, 1000, step=50),
            'learning_rate': trial.suggest_float('lgbm_learning_rate', 0.01, 0.3, log=True),
            'max_depth': trial.suggest_int('lgbm_max_depth', -1, 50),
            'num_leaves': trial.suggest_int('lgbm_num_leaves', 10, 300),
            'subsample': trial.suggest_float('lgbm_subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('lgbm_colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('lgbm_reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('lgbm_reg_lambda', 1e-8, 10.0, log=True),
            'min_child_samples': trial.suggest_int('lgbm_min_child_samples', 5, 100),
            # Without a positive subsample_freq LightGBM ignores `subsample`,
            # which would make the lgbm_subsample search dimension a no-op.
            'subsample_freq': 1,
            'random_state': self.random_state,
            'n_jobs': -1,
            'verbose': -1  # Suppress LightGBM output
        }

        # ============================================================================
        # STEP 3: Define hyperparameter search spaces for Logistic Regression (Meta-learner)
        # ============================================================================
        lr_params = {
            'C': trial.suggest_float('lr_C', 1e-5, 100, log=True),
            'solver': trial.suggest_categorical('lr_solver', ['liblinear', 'lbfgs']),
            'max_iter': trial.suggest_int('lr_max_iter', 100, 1000),
            'random_state': self.random_state
        }

        # Handle penalty parameter based on solver
        if lr_params['solver'] == 'liblinear':
            lr_params['penalty'] = trial.suggest_categorical('lr_penalty', ['l1', 'l2'])
        else:  # lbfgs
            lr_params['penalty'] = 'l2'  # lbfgs only supports l2

        # ============================================================================
        # STEP 4: Create the stacking classifier from the suggested hyperparameters
        # ============================================================================
        stack_model = self._build_stack(rf_params, lgbm_params, lr_params)

        # ============================================================================
        # STEP 5: Evaluate the model using cross-validation or validation set
        # ============================================================================
        try:
            if self.use_cv:
                # Use cross-validation for evaluation
                scores = cross_val_score(
                    stack_model, self.X_train, self.y_train,
                    cv=self.cv, scoring='accuracy', n_jobs=-1
                )
                accuracy = scores.mean()
            else:
                # Use separate validation set
                stack_model.fit(self.X_train, self.y_train)
                y_pred = stack_model.predict(self.X_val)
                accuracy = accuracy_score(self.y_val, y_pred)

            return accuracy

        except Exception as e:
            # Deliberate best-effort: report the failure and give the trial the
            # lowest score so the study keeps going instead of aborting.
            print(f"Trial failed with error: {e}")
            return 0.0

    def optimize(self, n_trials=100, timeout=None, show_progress=True):
        """
        Run the hyperparameter optimization process.

        Parameters:
        -----------
        n_trials : int
            Number of optimization trials
        timeout : int, optional
            Maximum time in seconds for optimization
        show_progress : bool
            Whether to show optimization progress

        Returns:
        --------
        tuple
            (study object, best parameters grouped per model, best score)
        """

        print("="*80)
        print("STARTING OPTUNA HYPERPARAMETER OPTIMIZATION")
        print("="*80)
        print(f"Optimization Strategy: {'Cross-Validation' if self.use_cv else 'Validation Set'}")
        print(f"Number of trials: {n_trials}")
        print(f"Timeout: {timeout if timeout else 'None'}")
        print("="*80)

        # ============================================================================
        # STEP 6: Create and configure the Optuna study
        # ============================================================================
        study = optuna.create_study(
            direction='maximize',  # We want to maximize accuracy
            # NOTE: objective() never reports intermediate values or calls
            # trial.should_prune(), so this pruner is configured but never consulted.
            pruner=optuna.pruners.MedianPruner(
                n_startup_trials=10,
                n_warmup_steps=5
            ),
            sampler=optuna.samplers.TPESampler(seed=self.random_state)  # Tree-structured Parzen Estimator
        )

        # ============================================================================
        # STEP 7: Run the optimization
        # ============================================================================
        if show_progress:
            # Add callback to show progress
            def callback(study, trial):
                if trial.number % 10 == 0:
                    print(f"Trial {trial.number}: Best score so far = {study.best_value:.4f}")
        else:
            callback = None

        study.optimize(
            self.objective,
            n_trials=n_trials,
            timeout=timeout,
            callbacks=[callback] if callback else None,
            show_progress_bar=show_progress
        )

        # ============================================================================
        # STEP 8: Extract and organize best parameters
        # ============================================================================
        best_params = study.best_params

        # Separate parameters for each model
        rf_best_params = self._strip_prefix(best_params, 'rf_')
        lgbm_best_params = self._strip_prefix(best_params, 'lgbm_')
        lr_best_params = self._strip_prefix(best_params, 'lr_')

        # Add the fixed (non-searched) parameters used by objective()
        rf_best_params.update({'random_state': self.random_state, 'n_jobs': -1})
        lgbm_best_params.update({
            'random_state': self.random_state,
            'n_jobs': -1,
            'verbose': -1,
            'subsample_freq': 1,  # keeps the tuned `subsample` effective
        })
        lr_best_params.update({'random_state': self.random_state})

        organized_params = {
            'random_forest': rf_best_params,
            'lightgbm': lgbm_best_params,
            'logistic_regression': lr_best_params,
            'best_score': study.best_value
        }

        print("\n" + "="*80)
        print("OPTIMIZATION COMPLETED!")
        print("="*80)
        print(f"Best accuracy: {study.best_value:.4f}")
        print(f"Number of completed trials: {len(study.trials)}")
        print("="*80)

        return study, organized_params, study.best_value

    def create_optimized_model(self, best_params):
        """
        Create the optimized stacking classifier using best parameters.

        Parameters:
        -----------
        best_params : dict
            Dictionary with the keys 'random_forest', 'lightgbm' and
            'logistic_regression', each mapping to that model's keyword arguments
            (the structure returned by optimize())

        Returns:
        --------
        StackingClassifier
            Unfitted stacking classifier configured with the best parameters
        """
        return self._build_stack(
            best_params['random_forest'],
            best_params['lightgbm'],
            best_params['logistic_regression'],
        )

# ============================================================================
# USAGE EXAMPLE
# ============================================================================

def run_optimization_example(x_train, y_train, X_val=None, y_val=None):
    """
    Example function showing how to use the optimizer.

    Runs the Optuna search, prints the best configuration, and fits the
    resulting stacking classifier on the training data.

    Parameters:
    -----------
    x_train, y_train : Training data
    X_val, y_val : Optional validation data

    Returns:
    --------
    tuple
        (fitted optimized model, study object, best parameters grouped per model)
    """

    # Initialize the optimizer
    optimizer = StackingClassifierOptimizer(
        x_train=x_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        cv_folds=5,
        random_state=42
    )

    # Run optimization
    study, best_params, best_score = optimizer.optimize(
        n_trials=50,  # Adjust based on computational budget
        timeout=3600,  # 1 hour timeout
        show_progress=True
    )

    # Create the optimized model
    optimized_model = optimizer.create_optimized_model(best_params)

    # Print detailed results
    print("\nBEST PARAMETERS:")
    print("-" * 50)
    print("Random Forest:")
    for param, value in best_params['random_forest'].items():
        print(f"  {param}: {value}")

    print("\nLightGBM:")
    for param, value in best_params['lightgbm'].items():
        print(f"  {param}: {value}")

    print("\nLogistic Regression (Meta-learner):")
    for param, value in best_params['logistic_regression'].items():
        print(f"  {param}: {value}")

    print(f"\nBest Cross-Validation Accuracy: {best_score:.4f}")

    # Train the final model. The parameter is spelled `x_train`; referring to
    # `X_train` here raised NameError after the whole search had already run.
    print("\nTraining optimized model on full training set...")
    optimized_model.fit(x_train, y_train)

    return optimized_model, study, best_params

In [None]:
# Run the Optuna search on the training split and get back the tuned model.
optimized_model, study, best_params = run_optimization_example(x_train, y_train)

# Hard class predictions for the held-out split.
predictions = optimized_model.predict(x_test)

#### RF + GB

In [None]:
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import numpy as np

class TQDMStack(StackingClassifier):
    """StackingClassifier whose fit() shows a tqdm bar while fitting each base estimator."""

    def fit(self, X, y, **fit_params):
        """
        Fit every base estimator under a progress bar, then run the parent's fit.

        NOTE(review): the parent fit() presumably fits the base estimators again
        on its own and replaces `estimators_` -- confirm whether this pre-fit
        pass only adds the progress bar at the cost of doubled training time.
        """
        progress = tqdm(self.estimators, desc="Fitting base estimators")

        # Each (name, estimator) pair is fitted in place on the full data and
        # collected as soon as its fit() returns.
        self.estimators_ = []
        self.estimators_.extend(
            base_est.fit(X, y, **fit_params) for _label, base_est in progress
        )

        # Repack the fitted estimators as an object array before delegating.
        self.estimators_ = np.array(self.estimators_, dtype=object)

        # The parent class handles the meta-learner fit.
        return super().fit(X, y, **fit_params)


# Seed both base learners (42, like every other model in this notebook) so the
# reported ROC AUC is reproducible from one run to the next.
estimators = [
    ('rf', RandomForestClassifier(n_estimators=200, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=200, random_state=42))
]
stack = TQDMStack(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    stack_method="predict_proba"
)

stack.fit(x_train, y_train)
# Second predict_proba column = probability of class 1, scored on the hold-out split.
stack_y_pred_prob = stack.predict_proba(x_test)[:, 1]
stack_roc_auc = roc_auc_score(y_test, stack_y_pred_prob)
print(f"Stacking Classifier ROC AUC: {stack_roc_auc}")

#### Create submission file

In [None]:
# Score the test rows with stack_model (the RF + LightGBM stack) and keep the
# second predict_proba column, i.e. the probability of class 1.
final = stack_model.predict_proba(df_test_transformed)[:, 1]

# Pair every test id with its predicted probability and write the CSV without
# the DataFrame index.
submission = pd.DataFrame({'id': df_test_ids, 'y': final})
submission.to_csv('submission.csv', index=False)
print("submission.csv file has been created.")

