In [None]:
import pandas as pd
# Load the train/test feature and label CSV files (paths are relative to the
# notebook's working directory).
X_train = pd.read_csv('X_train.csv')  # was 'X_traincsv' (missing '.'), unlike the sibling files
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

In [None]:
def optimize_random_forest_with_feature_removal(X_train, X_test, y_train, y_test, thresholds, number_of_trials=None, param_bounds=None, random_seed=8):
    """Tune a random forest with Optuna at several feature-importance thresholds.

    A default ``RandomForestClassifier`` is fitted once on the full training
    set to rank the features. For every threshold, the columns whose
    importance is below the threshold are dropped, an Optuna study maximises
    the mean 5-fold cross-validated accuracy on the remaining columns, and the
    best configuration is refitted on the training set and scored once on the
    test set.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices. ``X_test`` must contain every column of ``X_train``.
    y_train, y_test : array-like
        Labels (single-column DataFrame, Series or array); flattened to 1-D.
    thresholds : iterable of float
        Minimum feature importance a column needs in order to be kept.
    number_of_trials : int, optional
        Optuna trials per threshold. Defaults to 50 when ``None``.
    param_bounds : dict, optional
        Overrides for the default search space. Integer parameters take
        ``[low, high]``; ``'max_features'`` takes the list of choices.
    random_seed : int, default 8
        Seed used for every forest and for the Optuna sampler.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated threshold with the columns ``threshold``,
        ``best_params``, ``cv_accuracy``, ``test_accuracy`` and
        ``reduced_features``. A threshold that leaves no feature is reported
        on stdout and skipped.
    """
    # Imports are kept local so the function stays self-contained in the notebook.
    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import accuracy_score
    import optuna

    search_space = {
        'n_estimators': [10, 300],  # Number of trees
        'max_depth': [1, 30],  # Maximum depth of trees
        'min_samples_split': [2, 20],  # Minimum samples required to split
        'min_samples_leaf': [1, 20],  # Minimum samples per leaf node
        'max_features': ['sqrt', 'log2', None]  # Number of features to consider at each split
    }
    if param_bounds:
        search_space.update(param_bounds)
    if number_of_trials is None:
        number_of_trials = 50

    integer_params = ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
    result_columns = ['threshold', 'best_params', 'cv_accuracy', 'test_accuracy', 'reduced_features']

    thresholds = list(thresholds)
    if not thresholds:
        # Nothing to evaluate: skip the ranking fit but keep a stable schema.
        return pd.DataFrame(columns=result_columns)

    # scikit-learn expects 1-D label vectors; flatten once and reuse everywhere.
    y_train_flat = np.asarray(y_train).ravel()
    y_test_flat = np.asarray(y_test).ravel()

    # The ranking depends only on the training data and the seed, so it is
    # computed once instead of once per threshold.
    ranking_model = RandomForestClassifier(random_state=random_seed, n_jobs=-1)
    ranking_model.fit(X_train, y_train_flat)
    feature_importances = pd.Series(ranking_model.feature_importances_, index=X_train.columns)

    def build_forest(params):
        # Single place where tuned hyperparameters are turned into a model.
        return RandomForestClassifier(**params, random_state=random_seed, n_jobs=-1)

    all_results = []

    for threshold in thresholds:
        print(f"Running for feature importance threshold: {threshold}")

        selected_features = feature_importances[feature_importances >= threshold].index
        if len(selected_features) == 0:
            # Fitting on zero columns would raise and discard earlier results.
            print(f"Skipping threshold {threshold}: no feature reaches this importance")
            continue

        X_train_reduced = X_train[selected_features]
        X_test_reduced = X_test[selected_features]

        def objective(trial):
            params = {}
            for name in integer_params:
                bounds = search_space[name]
                params[name] = trial.suggest_int(name, bounds[0], bounds[1])
            params['max_features'] = trial.suggest_categorical('max_features', search_space['max_features'])

            # Only cross-validation on the training data drives the search;
            # the test set is scored once, after the study has finished.
            cv_scores = cross_val_score(
                build_forest(params), X_train_reduced, y_train_flat,
                cv=5, scoring='accuracy', n_jobs=-1,
            )
            return float(np.mean(cv_scores))

        # Seed the sampler so the search honours random_seed.
        study = optuna.create_study(
            direction='maximize',
            sampler=optuna.samplers.TPESampler(seed=random_seed),
        )
        study.optimize(objective, n_trials=number_of_trials)

        print('Best hyperparameters:', study.best_params)
        print('Best cross-validation accuracy:', study.best_value)

        best_model = build_forest(study.best_params)
        best_model.fit(X_train_reduced, y_train_flat)

        all_results.append({
            'threshold': threshold,
            'best_params': study.best_params,
            'cv_accuracy': study.best_value,
            'test_accuracy': accuracy_score(y_test_flat, best_model.predict(X_test_reduced)),
            'reduced_features': len(selected_features)
        })

    return pd.DataFrame(all_results, columns=result_columns)
# Importance cut-offs to compare; each one triggers its own tuning run.
thresholds = [0.0, 0.01, 0.015, 0.02]
results_df = optimize_random_forest_with_feature_removal(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    thresholds=thresholds,
)
