In [4]:
# Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Fitting
from sklearn.cross_decomposition import PLSRegression
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
import lightgbm as LGB
from catboost import CatBoostRegressor

# Validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold, RepeatedKFold

# Fine-Tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Evaluation
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Pipeline
from sklearn.pipeline import Pipeline, make_pipeline

# Tree-based Model
## Randomforest
[parameter](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor)

## GradientBoosting
[parameter](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)

## Xgboost
[parameter](https://xgboost.readthedocs.io/en/latest/parameter.html)

## LightGBM
[parameter](https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html)

## CatBoost
[parameter](https://catboost.ai/docs/concepts/parameter-tuning.html)

In [None]:
# hyperparameters
rf_pipe_params = {'n_estimators': [100, 500, 1000],
                  'max_depth': [3, 6, 9, 10, 20, 30],
                  'max_features': ['auto', 'sqrt'],
                  'min_samples_leaf': [1, 2, 4],
                  'min_samples_split': [2, 5, 10],
                 }

gb_pipe_params = {'n_estimators': [100, 500, 1000],
                  'learning_rate': [0.0001, 0.001, 0.01, 0.1],
                  'subsample': [0.7, 0.8, 0.9],
                  'max_depth': [3, 6, 9, 10, 20, 30],
                  'max_features': ['auto', 'sqrt'],
                  'min_samples_leaf': [1, 2, 4],
                  'min_samples_split': [2, 5, 10],
                  
                 }

xgb_pipe_params = {'n_estimators': [100, 500, 1000],
                   'max_depth': [3, 6, 9, 10, 20, 30],
                   'max_features': ['auto', 'sqrt'],
                   'min_samples_leaf': [1, 2, 4],
                   'min_samples_split': [2, 5, 10],
                   'colsample_bytree': [0.7, 0.8],
                  }

lgb_pipe_params = {'n_estimators': [100, 500, 1000],
                   'subsample': [0.7, 0.8, 0.9],
                   'num_leaves': [50, 100, 200],
                   'max_depth': [3, 6, 9, 10, 20, 30],
                   'max_features': ['auto', 'sqrt'],
                   'min_samples_leaf': [1, 2, 4],
                   'min_samples_split': [2, 5, 10],
                  }

In [None]:
seed = 2021
cv = KFold(n_splits=4)
scaler = StandardScaler()
model = RandomForestRegressor(random_state=seed)
pipeline = make_pipeline(scaler, model)

# Cross Validataion
## K-fold CV

In [None]:
scores = cross_val_score(pipeline, X, y, cv = cv)
# if pipeline is regressor, Kfold is used
# else, StratifiedKFold is used

## SearchCV

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=seed)

clf = GridSearchCV(estimator=pipeline, 
                   param_distributions=rf_pipe_params, 
                   cv=cv, n_iter=100, verbose=10, random_state=seed, n_jobs=-1)

# Test

In [10]:
clf.fit(X_train, y_train)
best_param = clf.best_params_
y_pred = clf.predict(X_test)

KFold(n_splits=4, random_state=None, shuffle=False)