<a href="https://colab.research.google.com/github/zhukuixi/Kaggle/blob/main/HyperParameterTuning_gridsearch_randomsearch_pipeline_optuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Grid Search


In [1]:

import pandas as pd
import numpy as np

from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection


if __name__=="__main__":
  # Exhaustive grid search over a small random-forest hyperparameter grid.
  # "price_range" is the target column; every other column is used as a feature.
  df = pd.read_csv("/content/sample_data/train.csv")
  X = df.drop("price_range",axis=1).values
  y = df.price_range.values

  # The forest trains its trees in parallel (n_jobs=-1); the search itself
  # evaluates candidates one at a time (n_jobs=1 below).
  classifier = ensemble.RandomForestClassifier(n_jobs=-1)
  # 4 * 4 * 2 = 32 parameter combinations, each scored with 5-fold CV.
  param_grid = {"n_estimators":[100,200,300,400],
          "max_depth":[1,3,5,7],
          "criterion":["gini","entropy"]
          }
  model = model_selection.GridSearchCV(
      estimator=classifier,
      param_grid=param_grid,
      scoring="accuracy",
      verbose=10,
      n_jobs=1,
      cv=5
  )
  model.fit(X,y)
  print(model.best_score_)
  # get_params is a method: it has to be called, otherwise only the
  # bound-method repr is printed instead of the parameter dict.
  print(model.best_estimator_.get_params())


FileNotFoundError: ignored

# Random Search

In [None]:
  # Randomized search: instead of trying every combination, sample n_iter
  # candidates from the value lists below.
  # Reuses `classifier`, `X` and `y` defined in the grid-search cell.
  param_grid = {"n_estimators":np.arange(100,1500,100),   # 100, 200, ..., 1400
          "max_depth":np.arange(1,20),                    # 1 .. 19
          "criterion":["gini","entropy"]
          }
  model = model_selection.RandomizedSearchCV(
      estimator=classifier,
      n_iter = 10,
      param_distributions=param_grid,
      scoring="accuracy",
      verbose=10,
      n_jobs=1,
      cv=5
  )
  model.fit(X,y)
  print(model.best_score_)
  # get_params is a method: it has to be called, otherwise only the
  # bound-method repr is printed instead of the parameter dict.
  print(model.best_estimator_.get_params())

# Random Search with pipeline
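Wrapping the preprocessing steps and the model in a single pipeline lets the search tune the preprocessing hyperparameters (for example, the number of PCA components) together with the model's hyperparameters.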

In [None]:
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import pipeline

# Pipeline: standardize -> PCA -> random forest. The search tunes the PCA
# and forest hyperparameters together. Reuses `X` and `y` from the
# grid-search cell.
scl = preprocessing.StandardScaler()
pca = decomposition.PCA()
rf = ensemble.RandomForestClassifier(n_jobs=-1)

classifier = pipeline.Pipeline(
    [("scaling",scl),
     ("pca",pca),
     ("rf",rf)
    ]
)
# Hyperparameter keys are "<step name>__<parameter name>": the step name is
# the one given in the Pipeline above, joined to the parameter name by TWO
# underscores (e.g. "pca" + "__" + "n_components").
param_grid = {"pca__n_components":np.arange(5,10),          # 5 .. 9
       "rf__n_estimators":np.arange(100,1500,100),          # 100, 200, ..., 1400
        "rf__max_depth":np.arange(1,20),                    # 1 .. 19
        "rf__criterion":["gini","entropy"]
        }
model = model_selection.RandomizedSearchCV(
    estimator=classifier,
    n_iter=10,
    param_distributions=param_grid,
    scoring="accuracy",
    verbose=10,
    n_jobs=1,
    cv=5
)
model.fit(X,y)
print(model.best_score_)
# get_params is a method: it has to be called, otherwise only the
# bound-method repr is printed instead of the parameter dict.
print(model.best_estimator_.get_params())

# Optuna

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import pipeline



import optuna

In [49]:
def objective(trail,X,y):
  """Optuna objective: mean 5-fold cross-validated accuracy of a random forest.

  Args:
    trail: the Optuna trial used to sample hyperparameters (the parameter
      name is kept as-is so existing callers keep working).
    X: feature array; rows are selected with integer index arrays.
    y: label array aligned with the rows of X.

  Returns:
    The mean accuracy over the 5 stratified folds.

  Note:
    To tune preprocessing as well (e.g. a scaler followed by PCA), fit the
    transformers on the training fold only and apply `transform` to the
    held-out fold before predicting.
  """
  rf = ensemble.RandomForestClassifier(
      # `step` is passed by keyword; passing it positionally is deprecated
      # in recent Optuna releases.
      n_estimators=trail.suggest_int('rf__n_estimators',100,1500,step=100),
      max_depth=trail.suggest_int('rf__max_depth',1,20),
      criterion=trail.suggest_categorical('rf__criterion',["gini","entropy"]),
      # Train trees in parallel, like the other forests in this notebook.
      n_jobs=-1,
  )

  kf = model_selection.StratifiedKFold(n_splits=5)
  acc_store = []
  for train_idx, test_idx in kf.split(X,y):
    # fit() retrains from scratch, so the same estimator object can be
    # reused for every fold.
    rf.fit(X[train_idx],y[train_idx])
    preds = rf.predict(X[test_idx])
    # accuracy_score expects (y_true, y_pred).
    acc_store.append(accuracy_score(y[test_idx],preds))
  return np.mean(acc_store)


In [45]:
from functools import partial

In [50]:
# Read the training table: "price_range" is the label column and all the
# remaining columns are used as features.
df = pd.read_csv("/content/sample_data/train.csv")
y = df["price_range"].values
X = df.drop(columns="price_range").values

# Bind the data to the objective so that Optuna only has to supply the trial.
objective_partial = partial(objective, X=X, y=y)

# Run a 10-trial study that maximizes the value returned by the objective.
study = optuna.create_study(direction='maximize')
study.optimize(objective_partial, n_trials=10)

[32m[I 2023-03-23 21:16:55,488][0m A new study created in memory with name: no-name-d298c318-312f-4b1c-9e2f-120bdbe3212b[0m
[32m[I 2023-03-23 21:16:58,775][0m Trial 0 finished with value: 0.873 and parameters: {'rf__n_estimators': 100, 'rf__max_depth': 17, 'rf__criterion': 'gini'}. Best is trial 0 with value: 0.873.[0m
[32m[I 2023-03-23 21:17:39,136][0m Trial 1 finished with value: 0.8845000000000001 and parameters: {'rf__n_estimators': 1300, 'rf__max_depth': 20, 'rf__criterion': 'entropy'}. Best is trial 1 with value: 0.8845000000000001.[0m
[32m[I 2023-03-23 21:18:12,067][0m Trial 2 finished with value: 0.8795 and parameters: {'rf__n_estimators': 1200, 'rf__max_depth': 9, 'rf__criterion': 'entropy'}. Best is trial 1 with value: 0.8845000000000001.[0m
[32m[I 2023-03-23 21:18:14,977][0m Trial 3 finished with value: 0.8310000000000001 and parameters: {'rf__n_estimators': 100, 'rf__max_depth': 5, 'rf__criterion': 'entropy'}. Best is trial 1 with value: 0.8845000000000001.[0