In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

 # Hyperparameter Search for Logistic Regression
 `class sklearn.linear_model.LogisticRegression(penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
 Details of each paramete are located in the documentation of the [class](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).

In [2]:
iris = load_iris()
X = iris.data
y = iris.target
iris_df = pd.DataFrame(X, columns=iris.feature_names)
iris_df['target'] = y
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


 ## Grid Search

In [3]:
# define model
model = LogisticRegression()
# define search space
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
space['penalty'] = ['l1', 'l2', 'elasticnet']
space['C'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
# define search
search = GridSearchCV(model, space)
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=-1, cv=cv)

In [4]:
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: 0.9800000000000001
Best Hyperparameters: {'C': 1, 'penalty': 'l2', 'solver': 'sag'}


 0.69777778 0.33333333 0.72222222 0.48888889        nan        nan
        nan        nan        nan        nan        nan 0.33333333
        nan 0.33333333 0.7        0.70444444 0.33333333 0.69777778
 0.69111111        nan        nan        nan        nan        nan
        nan        nan 0.33333333        nan 0.33333333 0.74222222
 0.74222222 0.51333333 0.74222222 0.74222222        nan        nan
        nan        nan        nan        nan        nan 0.33333333
        nan 0.64444444 0.87333333 0.87333333 0.66666667 0.87333333
 0.86              nan        nan        nan        nan        nan
        nan        nan 0.80444444        nan 0.95333333 0.95777778
 0.95777778 0.84222222 0.96222222 0.97777778        nan        nan
        nan        nan        nan        nan        nan 0.96
        nan 0.97777778 0.96444444 0.96444444 0.96222222 0.98
 0.97777778        nan        nan        nan        nan        nan
        nan        nan 0.95777778        nan 0.97333333 0.97333333
 0.9733

 ## Randomized Search

In [5]:
# define search space
from scipy.stats import loguniform

space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = loguniform(1e-5, 100)
# define search
search = RandomizedSearchCV(model, space, n_iter=500, scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)
# execute search
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: 0.9777777777777779
Best Hyperparameters: {'C': 0.00010648568665651115, 'penalty': 'none', 'solver': 'lbfgs'}


 0.71111111 0.68666667        nan        nan        nan        nan
 0.97111111        nan 0.96888889        nan        nan 0.97777778
 0.68666667        nan 0.96888889 0.33333333 0.97777778 0.69333333
 0.97777778        nan 0.97111111        nan 0.97111111 0.96666667
 0.7               nan 0.97111111        nan 0.97111111 0.96444444
        nan 0.97111111 0.33333333        nan 0.93111111 0.96
 0.33333333 0.82              nan 0.70444444        nan        nan
        nan        nan        nan 0.97111111        nan 0.97777778
        nan 0.97777778        nan        nan        nan        nan
 0.97777778        nan 0.96444444 0.87111111        nan 0.66666667
        nan 0.96888889 0.97111111 0.33333333        nan        nan
        nan        nan 0.97111111 0.96              nan 0.66666667
 0.33333333 0.97111111        nan 0.96666667 0.71111111        nan
 0.33333333        nan 0.97111111 0.78444444        nan        nan
 0.33333333        nan        nan 0.96666667        nan        nan
 

 # Hyperparameter Search for Random Forest Classifier

 ## Grid Search

In [6]:
# example of grid searching key hyperparameters for RandomForestClassifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# define models and parameters
model = RandomForestClassifier()
n_estimators = [10, 25, 50, 100,150, 250, 1000]
max_features = ['sqrt', 'log2']
max_depth = [None, 5, 10, 20, 50, 100]
# define grid search
grid = dict(n_estimators=n_estimators,max_features=max_features,max_depth=max_depth)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)

In [7]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.960000 using {'max_depth': 100, 'max_features': 'log2', 'n_estimators': 25}
0.944444 (0.048939) with: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 10}
0.953333 (0.046027) with: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 25}
0.953333 (0.039063) with: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 50}
0.955556 (0.039752) with: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 100}
0.957778 (0.040307) with: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 150}
0.960000 (0.040734) with: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 250}
0.957778 (0.040307) with: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 1000}
0.946667 (0.046825) with: {'max_depth': None, 'max_features': 'log2', 'n_estimators': 10}
0.948889 (0.053564) with: {'max_depth': None, 'max_features': 'log2', 'n_estimators': 25}
0.951111 (0.041929) with: {'max_depth': None, 'max_features': 'log2', 'n_estimators': 50}
0.955556 (0

 ## Randomized Search

In [8]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 200, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 45, num = 3)]
# Minimum number of samples required to split a node
min_samples_split = [5, 10]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split}

pprint(random_grid)
forest = RandomForestClassifier(n_jobs=-1)
rf_random = RandomizedSearchCV(estimator = forest, param_distributions = random_grid, n_iter = 10, cv = 10, verbose=2, random_state=42, n_jobs = -1, scoring='neg_mean_squared_error')
# Fit the random search model
rf_random.fit(X_train, y_train)

{'max_depth': [1, 23, 45],
 'max_features': ['auto', 'sqrt'],
 'min_samples_split': [5, 10],
 'n_estimators': [20, 65, 110, 155, 200]}
Fitting 10 folds for each of 10 candidates, totalling 100 fits


RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(n_jobs=-1),
                   n_jobs=-1,
                   param_distributions={'max_depth': [1, 23, 45],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_split': [5, 10],
                                        'n_estimators': [20, 65, 110, 155,
                                                         200]},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=2)

In [9]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.960000 using {'max_depth': 100, 'max_features': 'log2', 'n_estimators': 25}
0.944444 (0.048939) with: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 10}
0.953333 (0.046027) with: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 25}
0.953333 (0.039063) with: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 50}
0.955556 (0.039752) with: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 100}
0.957778 (0.040307) with: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 150}
0.960000 (0.040734) with: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 250}
0.957778 (0.040307) with: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 1000}
0.946667 (0.046825) with: {'max_depth': None, 'max_features': 'log2', 'n_estimators': 10}
0.948889 (0.053564) with: {'max_depth': None, 'max_features': 'log2', 'n_estimators': 25}
0.951111 (0.041929) with: {'max_depth': None, 'max_features': 'log2', 'n_estimators': 50}
0.955556 (0