In [1]:
from datetime import datetime
# Stamp the notebook with the time of its latest run
updated_at = datetime.now()
print(f'Päivitetty {updated_at}')

Päivitetty 2022-10-14 15:36:59.178318


<h1>Esimerkki sopivien hyperparametrien etsimisestä</h1>

Mallille annettavia parametreja on tapana kutsua hyperparametreiksi.

Testidatan käyttö sopivien hyperparametrien valitsemisessa ei ole suotavaa, koska tällöin testidata olisi osittain osallisena mallin opettamiseen. Testidatanhan on tarkoitus olla data, jota malli ei ole "nähnyt" opetuksen yhteydessä.

Hyperparametrien valitsemisessa voidaan käyttää **GridSearchCV**-toimintoa, joka jakaa opetusdatan opetusdataan ja validointidataan sekä kokeilee vaihtoehtoisia hyperparametrien arvoja. Lue lisää:

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [2]:
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [3]:
# Load the iris data set that ships with seaborn
df = sns.load_dataset('iris')

# The species label is the target; every other column is a feature
y = df['species']
X = df.drop(columns='species')

# Hold out a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=2)

In [4]:
# Define the model to use (fixed seed for repeatable results)
dtc = DecisionTreeClassifier(random_state=2)

# Show the model's hyperparameters (defaults apart from random_state)
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2,
 'splitter': 'best'}

In [5]:
# Candidate values for the maximum depth of the decision tree
parameters = {'max_depth': [1, 2, 3]}

# GridSearchCV cross-validates every candidate on the training data
# and picks the best one
dtc_grid = GridSearchCV(estimator=dtc, param_grid=parameters, verbose=3)
dtc_grid.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END .......................max_depth=1;, score=0.652 total time=   0.0s
[CV 2/5] END .......................max_depth=1;, score=0.652 total time=   0.0s
[CV 3/5] END .......................max_depth=1;, score=0.636 total time=   0.0s
[CV 4/5] END .......................max_depth=1;, score=0.636 total time=   0.0s
[CV 5/5] END .......................max_depth=1;, score=0.636 total time=   0.0s
[CV 1/5] END .......................max_depth=2;, score=0.957 total time=   0.0s
[CV 2/5] END .......................max_depth=2;, score=0.957 total time=   0.0s
[CV 3/5] END .......................max_depth=2;, score=1.000 total time=   0.0s
[CV 4/5] END .......................max_depth=2;, score=0.909 total time=   0.0s
[CV 5/5] END .......................max_depth=2;, score=1.000 total time=   0.0s
[CV 1/5] END .......................max_depth=3;, score=0.957 total time=   0.0s
[CV 2/5] END .......................max_depth=3;,

GridSearchCV(estimator=DecisionTreeClassifier(random_state=2),
             param_grid={'max_depth': [1, 2, 3]}, verbose=3)

In [6]:
# Show the best result: the mean cross-validated score and the winning parameters
print('best score: ', dtc_grid.best_score_)
print('best parameters: ', dtc_grid.best_params_)

# Continue with the estimator that uses the best hyperparameter values
dtc = dtc_grid.best_estimator_

best score:  0.9644268774703558
best parameters:  {'max_depth': 2}


In [7]:
# Accuracy on the training data versus the held-out test data
dtc_train_acc = dtc.score(X_train, y_train)
dtc_test_acc = dtc.score(X_test, y_test)
print(f'Opetusdata {dtc_train_acc:.3f}')
print(f'Testidata {dtc_test_acc:.3f}')

Opetusdata 0.964
Testidata 0.947


## Satunnaismetsä

In [8]:
# Define the model to use (fixed seed for repeatable results)
rfc = RandomForestClassifier(random_state=2)

# Show the model's hyperparameters (defaults apart from random_state)
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2,
 'verbose': 0,
 'warm_start': False}

In [9]:
# Key hyperparameters: the number of trees in the forest and the number
# of features considered when searching for the best split
parameters = {
    'n_estimators': [100, 200, 300],
    'max_features': [2, 3, 4],
}

# Evaluate every combination of the candidate values
rfc_grid = GridSearchCV(estimator=rfc, param_grid=parameters, verbose=3)
rfc_grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END ..max_features=2, n_estimators=100;, score=0.913 total time=   0.0s
[CV 2/5] END ..max_features=2, n_estimators=100;, score=0.957 total time=   0.0s
[CV 3/5] END ..max_features=2, n_estimators=100;, score=1.000 total time=   0.0s
[CV 4/5] END ..max_features=2, n_estimators=100;, score=0.909 total time=   0.0s
[CV 5/5] END ..max_features=2, n_estimators=100;, score=1.000 total time=   0.0s
[CV 1/5] END ..max_features=2, n_estimators=200;, score=0.913 total time=   0.1s
[CV 2/5] END ..max_features=2, n_estimators=200;, score=0.957 total time=   0.1s
[CV 3/5] END ..max_features=2, n_estimators=200;, score=1.000 total time=   0.1s
[CV 4/5] END ..max_features=2, n_estimators=200;, score=0.909 total time=   0.1s
[CV 5/5] END ..max_features=2, n_estimators=200;, score=1.000 total time=   0.1s
[CV 1/5] END ..max_features=2, n_estimators=300;, score=0.913 total time=   0.2s
[CV 2/5] END ..max_features=2, n_estimators=300;,

GridSearchCV(estimator=RandomForestClassifier(random_state=2),
             param_grid={'max_features': [2, 3, 4],
                         'n_estimators': [100, 200, 300]},
             verbose=3)

In [10]:
# Show the best result: the mean cross-validated score and the winning parameters
print('best score: ', rfc_grid.best_score_)
print('best parameters: ', rfc_grid.best_params_)

# Continue with the estimator that uses the best hyperparameter values
rfc = rfc_grid.best_estimator_

best score:  0.9644268774703558
best parameters:  {'max_features': 3, 'n_estimators': 100}


In [11]:
# Accuracy on the training data versus the held-out test data
rfc_train_acc = rfc.score(X_train, y_train)
rfc_test_acc = rfc.score(X_test, y_test)
print(f'Opetusdata {rfc_train_acc:.3f}')
print(f'Testidata {rfc_test_acc:.3f}')

Opetusdata 1.000
Testidata 0.974


## Gradienttitehostaminen

In [12]:
# Define the model to use (fixed seed for repeatable results)
# and show its hyperparameters (defaults apart from random_state)
gbc = GradientBoostingClassifier(random_state=2)
gbc.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 2,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [13]:
# For gradient boosting, learning_rate is a key hyperparameter as well:
# too small a value makes learning slow, while too large a value
# can prevent the optimum from being found
parameters = {
    'max_depth': [1, 2, 3],
    'n_estimators': [100, 200, 300, 400],
    'learning_rate': [0.001, 0.01, 0.1],
}

# Evaluate every combination of the candidate values
gbc_grid = GridSearchCV(estimator=gbc, param_grid=parameters, verbose=3)
gbc_grid.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END learning_rate=0.001, max_depth=1, n_estimators=100;, score=0.957 total time=   0.0s
[CV 2/5] END learning_rate=0.001, max_depth=1, n_estimators=100;, score=0.957 total time=   0.0s
[CV 3/5] END learning_rate=0.001, max_depth=1, n_estimators=100;, score=1.000 total time=   0.0s
[CV 4/5] END learning_rate=0.001, max_depth=1, n_estimators=100;, score=0.818 total time=   0.0s
[CV 5/5] END learning_rate=0.001, max_depth=1, n_estimators=100;, score=1.000 total time=   0.0s
[CV 1/5] END learning_rate=0.001, max_depth=1, n_estimators=200;, score=0.957 total time=   0.1s
[CV 2/5] END learning_rate=0.001, max_depth=1, n_estimators=200;, score=0.957 total time=   0.1s
[CV 3/5] END learning_rate=0.001, max_depth=1, n_estimators=200;, score=1.000 total time=   0.1s
[CV 4/5] END learning_rate=0.001, max_depth=1, n_estimators=200;, score=0.864 total time=   0.1s
[CV 5/5] END learning_rate=0.001, max_depth=1, n_estimators=200;,

[CV 1/5] END learning_rate=0.01, max_depth=2, n_estimators=200;, score=1.000 total time=   0.1s
[CV 2/5] END learning_rate=0.01, max_depth=2, n_estimators=200;, score=0.957 total time=   0.1s
[CV 3/5] END learning_rate=0.01, max_depth=2, n_estimators=200;, score=1.000 total time=   0.1s
[CV 4/5] END learning_rate=0.01, max_depth=2, n_estimators=200;, score=0.909 total time=   0.1s
[CV 5/5] END learning_rate=0.01, max_depth=2, n_estimators=200;, score=1.000 total time=   0.1s
[CV 1/5] END learning_rate=0.01, max_depth=2, n_estimators=300;, score=0.957 total time=   0.2s
[CV 2/5] END learning_rate=0.01, max_depth=2, n_estimators=300;, score=0.957 total time=   0.3s
[CV 3/5] END learning_rate=0.01, max_depth=2, n_estimators=300;, score=1.000 total time=   0.2s
[CV 4/5] END learning_rate=0.01, max_depth=2, n_estimators=300;, score=0.909 total time=   0.2s
[CV 5/5] END learning_rate=0.01, max_depth=2, n_estimators=300;, score=1.000 total time=   0.2s
[CV 1/5] END learning_rate=0.01, max_dep

[CV 2/5] END learning_rate=0.1, max_depth=3, n_estimators=300;, score=0.957 total time=   0.2s
[CV 3/5] END learning_rate=0.1, max_depth=3, n_estimators=300;, score=1.000 total time=   0.3s
[CV 4/5] END learning_rate=0.1, max_depth=3, n_estimators=300;, score=0.909 total time=   0.2s
[CV 5/5] END learning_rate=0.1, max_depth=3, n_estimators=300;, score=1.000 total time=   0.3s
[CV 1/5] END learning_rate=0.1, max_depth=3, n_estimators=400;, score=1.000 total time=   0.3s
[CV 2/5] END learning_rate=0.1, max_depth=3, n_estimators=400;, score=0.957 total time=   0.3s
[CV 3/5] END learning_rate=0.1, max_depth=3, n_estimators=400;, score=1.000 total time=   0.3s
[CV 4/5] END learning_rate=0.1, max_depth=3, n_estimators=400;, score=0.909 total time=   0.2s
[CV 5/5] END learning_rate=0.1, max_depth=3, n_estimators=400;, score=1.000 total time=   0.3s


GridSearchCV(estimator=GradientBoostingClassifier(random_state=2),
             param_grid={'learning_rate': [0.001, 0.01, 0.1],
                         'max_depth': [1, 2, 3],
                         'n_estimators': [100, 200, 300, 400]},
             verbose=3)

In [14]:
# Show the best result: the mean cross-validated score and the winning parameters
print('best score: ', gbc_grid.best_score_)
print('best parameters: ', gbc_grid.best_params_)

# Continue with the estimator that uses the best hyperparameter values
gbc = gbc_grid.best_estimator_

best score:  0.9818181818181818
best parameters:  {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 300}


In [15]:
# Accuracy on the training data versus the held-out test data
gbc_train_acc = gbc.score(X_train, y_train)
gbc_test_acc = gbc.score(X_test, y_test)
print(f'Opetusdata {gbc_train_acc:.3f}')
print(f'Testidata {gbc_test_acc:.3f}')

Opetusdata 0.982
Testidata 0.974
