In [None]:
import optuna
import lightgbm as lgb
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Columns that find_best_parameter one-hot encodes (via OneHotEncoder) before
# fitting each candidate model.
cat_features = ["FLAT_TYPE", "FLAT_MODEL", "SUBZONE", "TOWN"]

# Model names accepted by find_best_parameter, in default tuning order.
_SUPPORTED_MODELS = ("LightGBM", "XGBoost", "RandomForest", "Ridge")


def _build_regressor(trial, model_name):
    """Sample hyperparameters from ``trial`` and build the matching regressor.

    Args:
        trial: Optuna trial used to suggest hyperparameter values.
        model_name: One of the names in ``_SUPPORTED_MODELS``.

    Returns:
        An unfitted estimator configured with the suggested hyperparameters.

    Raises:
        ValueError: If ``model_name`` is not a supported model.
    """
    if model_name == "LightGBM":
        params = {
            # NOTE(review): the model is trained with an L1 objective while
            # trials are ranked by RMSE; confirm this mismatch is intended.
            'objective': 'regression_l1',
            'metric': 'rmse',
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 20, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            # LightGBM only applies `subsample` (bagging_fraction) when bagging
            # is enabled through a non-zero `subsample_freq` (bagging_freq);
            # without this the tuned `subsample` value would have no effect.
            'subsample_freq': 1,
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            'random_state': 42,
            "verbosity": -1,
            'n_jobs': -1,
        }
        return lgb.LGBMRegressor(**params)

    if model_name == "XGBoost":
        params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            'random_state': 42,
            'n_jobs': -1,
        }
        return xgb.XGBRegressor(**params)

    if model_name == "RandomForest":
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 5, 50),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
            'random_state': 42,
            'n_jobs': -1,
        }
        return RandomForestRegressor(**params)

    if model_name == "Ridge":
        params = {
            'alpha': trial.suggest_float('alpha', 1e-5, 100.0, log=True),
            'random_state': 42,
        }
        return Ridge(**params)

    raise ValueError(f"Model {model_name} is not supported for tuning.")


def find_best_parameter(X, y, models_to_tune=None, n_trials_per_model=50):
    """Tune each requested regressor with Optuna and collect the best results.

    Every trial builds a pipeline (column preprocessing + regressor), scores it
    with 3-fold cross-validation, and reports the mean RMSE, which the study
    minimizes.

    Args:
        X: Feature DataFrame. Must contain every column in ``cat_features``.
        y: Target values passed to ``cross_val_score``.
        models_to_tune: Iterable of model names to tune. Defaults to all of
            "LightGBM", "XGBoost", "RandomForest" and "Ridge".
        n_trials_per_model: Number of Optuna trials to run for each model.

    Returns:
        Dict mapping each model name to
        ``{"best_value": <lowest mean CV RMSE>, "best_params": <dict>}``.

    Raises:
        ValueError: If a requested model is unsupported or ``X`` lacks one of
            the ``cat_features`` columns.
    """
    if models_to_tune is None:
        models_to_tune = list(_SUPPORTED_MODELS)
    else:
        models_to_tune = list(models_to_tune)

    # Fail before any tuning starts rather than partway through the loop.
    for model_name in models_to_tune:
        if model_name not in _SUPPORTED_MODELS:
            raise ValueError(f"Model {model_name} is not supported for tuning.")

    missing = [col for col in cat_features if col not in X.columns]
    if missing:
        raise ValueError(f"X is missing categorical columns: {missing}")

    # Keep numerically encoded categorical columns out of the numeric group so
    # they are only one-hot encoded, not also fed to the model as raw codes.
    categorical = set(cat_features)
    numerical_features = [
        col
        for col in X.select_dtypes(include=np.number).columns
        if col not in categorical
    ]
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ],
        # Columns that are neither numeric nor listed in cat_features are
        # forwarded to the model unchanged.
        remainder='passthrough',
    )

    def objective(trial, model_name):
        """Return the mean 3-fold CV RMSE for one sampled configuration."""
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', _build_regressor(trial, model_name)),
        ])
        # Folds run serially: the tree models already use n_jobs=-1, and
        # parallelising the folds as well would oversubscribe the CPU.
        scores = cross_val_score(
            pipeline, X, y, cv=3, scoring='neg_root_mean_squared_error'
        )
        # The scorer is negated RMSE; flip the sign so lower is better.
        return -scores.mean()

    best_params_all_models = {}
    for model_name in models_to_tune:
        print(f"Starting tuning for {model_name}")
        study = optuna.create_study(
            direction="minimize",
            # Seeded so the search is repeatable, in line with the fixed
            # random_state used for the models.
            sampler=optuna.samplers.TPESampler(seed=42),
        )
        # Bind the current name as a default to avoid late-binding surprises.
        study.optimize(
            lambda trial, name=model_name: objective(trial, name),
            n_trials=n_trials_per_model,
        )
        print(f"Finished tuning {model_name}")
        best_params_all_models[model_name] = {
            "best_value": study.best_value,
            "best_params": study.best_params,
        }

    return best_params_all_models