In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool

from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge

import optuna
import joblib
import optuna.visualization as vis
import plotly.io as pio

import warnings
warnings.filterwarnings('ignore')

In [None]:
def save_study(study, study_file_name = 'studies/generic_study.pkl', report_file_name = 'studies/generic_study.csv'):
    """Persist a study object and a CSV report of its trials.

    Parameters
    ----------
    study
        Object exposing ``trials_dataframe()`` (e.g. an ``optuna.Study``).
    study_file_name
        Destination handed to ``joblib.dump``.
    report_file_name
        Destination handed to ``DataFrame.to_csv`` (written without the index).

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``study.trials_dataframe()``, in its original
        (unsorted) order.
    """
    import os  # local import: `os` is not imported at the top of the notebook

    # The default targets live under 'studies/'; create missing parent folders
    # so the dump does not fail on a fresh working directory. Non-path targets
    # (e.g. open file objects) are passed through untouched.
    for target in (study_file_name, report_file_name):
        if isinstance(target, (str, os.PathLike)):
            parent = os.path.dirname(os.fspath(target))
            if parent:
                os.makedirs(parent, exist_ok=True)

    # Save the study
    joblib.dump(study, study_file_name)

    # The previous `sort_values(...).head(3)` expression was discarded, so it
    # had no effect on the saved report; it has been removed.
    study_df = study.trials_dataframe()
    study_df.to_csv(report_file_name, index=False)

    return study_df

def load_study(study_file_name=None, report_file_name=None):
    """Load a previously saved study and/or its CSV report.

    Parameters
    ----------
    study_file_name
        Path given to ``joblib.load``; skipped when ``None``.
    report_file_name
        Path given to ``pandas.read_csv``; skipped when ``None``.

    Returns
    -------
    tuple
        ``(study, study_df)``; each element is ``None`` when the matching
        argument was not supplied.
    """
    study = None
    study_df = None

    # Identity comparison is the idiomatic None check and cannot be fooled by
    # objects that override ``__eq__`` / ``__ne__``.
    if study_file_name is not None:
        # SECURITY: joblib.load unpickles the file, which can execute arbitrary
        # code. Only load study files that come from a trusted source.
        study = joblib.load(study_file_name)
    if report_file_name is not None:
        study_df = pd.read_csv(report_file_name)

    return study, study_df

In [None]:
# Read the competition files (test first, then train).
test_df = pd.read_csv("/kaggle/input/playground-series-s4e9/test.csv")
train_df = pd.read_csv("/kaggle/input/playground-series-s4e9/train.csv")

# Training side: 'price' is the target, 'id' is dropped from the features.
y_train = train_df['price']
X_train = train_df.drop(columns=['id', 'price'])

# Test side: keep the ids aside for the submission file, drop them from the features.
id_column = test_df['id']
X_test = test_df.drop(columns=['id'])

In [None]:
# Stack both feature frames under an outer index level ('train' / 'test') so
# they can be preprocessed together and separated again with `.xs(...)`.
combined_data = pd.concat(
    [X_train, X_test],
    keys=['train', 'test'],
)

In [None]:
# Columns stored as Python objects are treated as categorical features.
categorical = combined_data.select_dtypes(include='object').columns

# Fill missing categories with the most frequent value of the column.
# The result is assigned back to the frame: calling `fillna(..., inplace=True)`
# on `combined_data[col]` operates on an intermediate object and, with pandas
# Copy-on-Write, leaves `combined_data` unchanged.
for col in categorical:
    if combined_data[col].isna().any():
        mode = combined_data[col].mode()[0]
        combined_data[col] = combined_data[col].fillna(mode)

# Integer-encode every categorical column; the fitted encoders are kept per
# column so the mapping can be inspected or inverted later.
encoders = {}
for var in categorical:
    le = LabelEncoder()
    combined_data[var] = le.fit_transform(combined_data[var])
    encoders[var] = le

In [None]:
# Recover the preprocessed train / test feature frames from the stacked frame.
X_train = combined_data.xs('train')
X_test = combined_data.xs('test')

# `stratify` expects class labels. Passing the raw price makes every distinct
# price its own class, and scikit-learn raises as soon as one of them occurs
# only once. Stratify on price quantile bins instead so the hold-out keeps a
# similar target distribution.
price_bins = pd.qcut(y_train, q=10, labels=False, duplicates='drop')

x1, x2, y1, y2 = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=price_bins)

In [None]:
def lgb_objective(trial):
    """Optuna objective: fit a LightGBM regressor and return its hold-out RMSE.

    Parameters
    ----------
    trial
        Optuna trial used to sample the hyper-parameters.

    Returns
    -------
    float
        RMSE of the fitted model on the notebook-level hold-out split
        ``(x2, y2)``; the model is trained on ``(x1, y1)``.
    """
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',  # Gradient Boosting Decision Tree
        'num_leaves': trial.suggest_int('num_leaves', 20, 120),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 700),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 0.8),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 0.8),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 120),
        'device': 'gpu',
        'gpu_use_dp': False,
        'verbose': -1
    }

    model = lgb.LGBMRegressor(**param)
    # The hold-out set doubles as the early-stopping monitor.
    model.fit(x1, y1, eval_set=[(x2, y2)], callbacks=[lgb.early_stopping(stopping_rounds=5)])

    y_pred = model.predict(x2)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
    # and removed in 1.6; taking the square root of the MSE works on every version.
    rmse = float(np.sqrt(mean_squared_error(y2, y_pred)))

    return rmse

# Seed the sampler so the sequence of suggested hyper-parameters is repeatable
# across kernel restarts (an unseeded sampler draws a different search each run).
lgb_study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

lgb_study.optimize(lgb_objective, n_trials=150)

In [None]:
# Summary of the LightGBM search.
print(f"Best Hyperparameters: {lgb_study.best_params}")
print(f"Best RMSE: {lgb_study.best_value}")

In [None]:
def cat_objective(trial):
    """Optuna objective: fit a CatBoost regressor and return its hold-out RMSE.

    Parameters
    ----------
    trial
        Optuna trial used to sample the hyper-parameters.

    Returns
    -------
    float
        RMSE of the fitted model on the notebook-level hold-out split
        ``(x2, y2)``; the model is trained on ``(x1, y1)``.
    """
    # `suggest_float` replaces the deprecated `suggest_loguniform` /
    # `suggest_uniform` helpers and matches the API already used by `lgb_objective`.
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1500),
        'depth': trial.suggest_int('depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.5, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 100, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'random_seed': 42,
        'loss_function': 'RMSE',
        'task_type': 'GPU'
    }

    cat_model = CatBoostRegressor(**params, early_stopping_rounds=20, verbose=0)

    # The hold-out pool doubles as the early-stopping monitor.
    cat_model.fit(Pool(x1, y1), eval_set=Pool(x2, y2))

    y_pred = cat_model.predict(x2)

    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
    # and removed in 1.6; taking the square root of the MSE works on every version.
    rmse = float(np.sqrt(mean_squared_error(y2, y_pred)))

    return rmse

# Seed the sampler so the sequence of suggested hyper-parameters is repeatable
# across kernel restarts (an unseeded sampler draws a different search each run).
cat_study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
cat_study.optimize(cat_objective, n_trials=250)

In [None]:
# Summary of the CatBoost search.
print(f"Best Hyperparameters: {cat_study.best_params}")
print(f"Best RMSE: {cat_study.best_value}")

In [None]:
def xgb_objective(trial):
    """Optuna objective: fit an XGBoost regressor and return its hold-out RMSE.

    Parameters
    ----------
    trial
        Optuna trial used to sample the hyper-parameters.

    Returns
    -------
    float
        RMSE of the fitted model on the notebook-level hold-out split
        ``(x2, y2)``; the model is trained on ``(x1, y1)``.
    """
    # `suggest_float` replaces the deprecated `suggest_loguniform` /
    # `suggest_uniform` helpers and matches the API already used by `lgb_objective`.
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 200, 1500),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-5, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-5, 10.0, log=True),
        'random_state': 42,
        # NOTE(review): XGBoost 2.x deprecates 'gpu_hist' in favour of
        # tree_method='hist' with device='cuda' - confirm the installed version
        # before switching.
        'tree_method': 'gpu_hist'
    }

    xgb_model = xgb.XGBRegressor(**params, early_stopping_rounds=50, verbosity=0)

    # The hold-out set doubles as the early-stopping monitor.
    xgb_model.fit(x1, y1, eval_set=[(x2, y2)], verbose=False)

    y_pred = xgb_model.predict(x2)

    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
    # and removed in 1.6; taking the square root of the MSE works on every version.
    rmse = float(np.sqrt(mean_squared_error(y2, y_pred)))

    return rmse

# Seed the sampler so the sequence of suggested hyper-parameters is repeatable
# across kernel restarts (an unseeded sampler draws a different search each run).
xgb_study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
xgb_study.optimize(xgb_objective, n_trials=250)

In [None]:
# Summary of the XGBoost search.
print(f"Best Hyperparameters: {xgb_study.best_params}")
print(f"Best RMSE: {xgb_study.best_value}")

In [None]:
# Choosing the best models for each study
top = 5


def _top_trial_params(study, k):
    """Return the sampled hyper-parameters of the ``k`` lowest-valued completed trials.

    Failed or pruned trials carry no usable objective value and are skipped, so
    they can never be selected as a "best" configuration.
    """
    completed = [
        t for t in study.trials
        if t.state == optuna.trial.TrialState.COMPLETE and t.value is not None
    ]
    completed.sort(key=lambda t: t.value)
    return [dict(t.params) for t in completed[:k]]


catboost_top = _top_trial_params(cat_study, top)
lgb_top      = _top_trial_params(lgb_study, top)
xgb_top      = _top_trial_params(xgb_study, top)

# `trial.params` only contains the values sampled through `trial.suggest_*`.
# The constant settings used inside the objective functions (loss, seed, device,
# verbosity) must be added back, otherwise the final models are trained under a
# different configuration than the one that was tuned.
# Early-stopping options are deliberately left out: the stacking fit below
# provides no evaluation set for them to monitor.
_cat_fixed = {
    'random_seed': 42,
    'loss_function': 'RMSE',
    'task_type': 'GPU',
    'verbose': 0,
}
_lgb_fixed = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'device': 'gpu',
    'gpu_use_dp': False,
    'verbose': -1,
}
_xgb_fixed = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'random_state': 42,
    'tree_method': 'gpu_hist',
    'verbosity': 0,
}

catboost_models = [CatBoostRegressor(**{**_cat_fixed, **params}) for params in catboost_top]
lgb_models      = [lgb.LGBMRegressor(**{**_lgb_fixed, **params}) for params in lgb_top]
xgb_models      = [xgb.XGBRegressor(**{**_xgb_fixed, **params}) for params in xgb_top]

In [None]:
# Collect the (name, estimator) pairs in the order CatBoost -> LightGBM -> XGBoost;
# each name is the family prefix followed by the model's position in its list.
base_estimators = []
for family_prefix, family_models in (
    ('catboost_', catboost_models),
    ('lgb_', lgb_models),
    ('xgb_', xgb_models),
):
    base_estimators.extend(
        (family_prefix + str(position), estimator)
        for position, estimator in enumerate(family_models)
    )

# A Ridge regression combines the base models' predictions.
stacked_model = StackingRegressor(estimators=base_estimators, final_estimator=Ridge())

# Fit on the full training data, then predict the test set.
stacked_model.fit(X_train, y_train)
y_pred = stacked_model.predict(X_test)

In [None]:
# Pair every test id with its predicted price and write the submission file.
submission_columns = {'id': id_column, 'price': y_pred}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)