In [26]:
import pandas as pd
# Read the four pre-split CSV files (train/test features and labels) from the
# current working directory, in the same order as the names on the left.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'X_train_norm_pca.csv',
        'X_test_norm_pca.csv',
        'y_train.csv',
        'y_test.csv',
    )
)

In [27]:
def optimize_catboost(X_train, X_test, y_train, y_test, number_of_trials=None, param_bounds=None, random_seed=8):
    """Tune a CatBoostClassifier with Optuna and return the refit best model.

    Every trial samples ``iterations``, ``depth``, ``learning_rate``,
    ``l2_leaf_reg`` and ``border_count`` inside their bounds. The trial's
    objective value (which Optuna maximizes) is the mean 5-fold
    cross-validated accuracy on the training data. The same candidate is then
    fitted on the full training set and scored on the test set; that test
    accuracy is only recorded in the trial log and never fed back to Optuna.

    Parameters
    ----------
    X_train, X_test
        Feature tables handed unchanged to scikit-learn / CatBoost.
    y_train, y_test
        Labels. Anything ``numpy.asarray`` accepts (DataFrame, Series,
        ndarray, list); they are flattened to 1-D before use.
    number_of_trials : int, optional
        Number of Optuna trials. ``None`` means 50.
    param_bounds : dict, optional
        Overrides for the default search bounds, keyed by hyperparameter name.
        Each value is indexed as ``[low, high]``; only the first two entries
        are read. Keys other than the five tuned hyperparameters are ignored.
    random_seed : int, default 8
        Seed given to every CatBoost model and to the Optuna sampler.

    Returns
    -------
    best_model : CatBoostClassifier
        Model built from the study's best parameters and fitted on
        ``X_train`` / ``y_train``.
    trials_df : pandas.DataFrame
        One row per trial with columns ``trial``, ``cv_accuracy``,
        ``test_accuracy`` and ``parameters`` (a dict of the sampled values).
    """
    import numpy as np
    import pandas as pd
    from catboost import CatBoostClassifier
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import accuracy_score
    import optuna

    # Default [low, high] search range per hyperparameter; caller overrides win.
    bounds = {
        'iterations': [50, 1000],
        'depth': [2, 16],
        'learning_rate': [0.01, 0.3],
        'l2_leaf_reg': [1, 10],
        'border_count': [32, 255],
    }
    if param_bounds:
        bounds.update(param_bounds)
    if number_of_trials is None:
        number_of_trials = 50

    # Flatten the labels once instead of on every trial; np.asarray also lets
    # callers pass plain arrays or lists rather than only pandas objects.
    y_train_flat = np.asarray(y_train).ravel()
    y_test_flat = np.asarray(y_test).ravel()

    def bound(name):
        """Return the (low, high) pair configured for one hyperparameter."""
        return bounds[name][0], bounds[name][1]

    def build_model(params):
        """Create an unfitted, silent classifier for one hyperparameter set."""
        return CatBoostClassifier(**params, random_seed=random_seed, verbose=0)

    # Trial log collected as plain records and turned into a DataFrame once at
    # the end, so the numeric columns get numeric dtypes instead of object.
    trial_records = []

    def objective(trial):
        # Dict literals evaluate in order, so the parameters are suggested in
        # the same sequence on every trial.
        params = {
            'iterations': trial.suggest_int('iterations', *bound('iterations')),
            'depth': trial.suggest_int('depth', *bound('depth')),
            'learning_rate': trial.suggest_float('learning_rate', *bound('learning_rate')),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', *bound('l2_leaf_reg')),
            'border_count': trial.suggest_int('border_count', *bound('border_count')),
        }
        model = build_model(params)

        cv_scores = cross_val_score(
            model, X_train, y_train_flat, cv=5, scoring='accuracy', n_jobs=-1
        )
        cv_accuracy = float(np.mean(cv_scores))

        # Extra fit on the full training set purely to log test accuracy.
        # NOTE(review): this value must not drive hyperparameter selection,
        # otherwise the test set stops being an unbiased estimate.
        model.fit(X_train, y_train_flat)
        test_accuracy = accuracy_score(y_test_flat, model.predict(X_test))

        trial_records.append({
            'trial': trial.number,
            'cv_accuracy': cv_accuracy,
            'test_accuracy': test_accuracy,
            'parameters': params,
        })
        return cv_accuracy

    # Seed the sampler as well as the models; without this the proposed
    # hyperparameters differ between runs even when random_seed is fixed.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=random_seed),
    )
    study.optimize(objective, n_trials=number_of_trials)

    print('Best hyperparameters:', study.best_params)
    print('Best cross-validation accuracy:', study.best_value)

    # Refit a fresh model with the winning parameters on all training data.
    best_model = build_model(study.best_params)
    best_model.fit(X_train, y_train_flat)

    trials_df = pd.DataFrame(
        trial_records,
        columns=['trial', 'cv_accuracy', 'test_accuracy', 'parameters'],
    )
    return best_model, trials_df


In [None]:
# Short 4-trial search; optimize_catboost falls back to 50 trials when
# number_of_trials is left as None.
best_model, trials_df = optimize_catboost(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    number_of_trials=4,
)

In [None]:
# Lift the column-width limit so long cell values are displayed in full.
pd.set_option('display.max_colwidth', None)

# Combined score: 40% cross-validation accuracy plus 60% test accuracy.
# NOTE(review): ranking trials by a score that includes test accuracy lets the
# test set influence model choice — confirm this is intended.
scored = trials_df.assign(
    total_acc=(0.4 * trials_df['cv_accuracy']) + (0.6 * trials_df['test_accuracy'])
)

# Place total_acc in the second column, keeping the other columns in order.
other_cols = [col for col in scored.columns if col != 'total_acc']
ordered_cols = other_cols[:1] + ['total_acc'] + other_cols[1:]

# Best combined score first.
trials_df = scored[ordered_cols].sort_values(by='total_acc', ascending=False)
trials_df