# Hyperparameter Tuning

### 1. Membuat synthetic dataset

In [12]:
from sklearn.datasets import make_classification
x, y = make_classification(n_samples=200, n_classes=2, n_features=10, n_redundant=0, random_state=1)

In [13]:
x.shape

(200, 10)

In [14]:
y.shape

(200,)

In [15]:
import pandas as pd

In [16]:
pd.DataFrame(x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.511077,0.608749,-0.153236,0.507984,-0.324032,-2.432509,1.592056,-0.864830,-0.202901,-0.871422
1,1.445445,0.518969,0.645153,2.038777,-0.396293,1.282142,-2.170249,-1.043400,0.048547,-2.621012
2,0.371670,0.513505,-1.398813,-0.459943,0.644354,0.081768,-1.757065,0.142251,-1.132835,1.853009
3,2.565453,0.145652,1.177052,1.322694,0.194175,-0.641108,0.878631,-0.202694,-1.199798,-0.464115
4,-0.710656,1.050615,0.354602,-1.774596,-0.312230,-0.212373,0.826484,-0.621252,-1.187774,1.131129
...,...,...,...,...,...,...,...,...,...,...
195,-1.098083,-1.277636,0.419595,0.482176,-1.879287,-0.091079,-2.428480,0.032615,1.164204,0.758637
196,0.165211,1.937132,-1.307971,0.074876,-1.786935,1.472396,1.666002,-0.696028,-0.162525,0.976296
197,-0.950909,-0.218733,1.293550,0.590039,-0.679384,-0.438998,-0.188582,-0.045867,-0.972107,-0.704350
198,-0.446699,0.744885,-0.961264,0.494342,-1.494194,-1.458324,2.820244,0.612233,1.679779,0.204377


In [17]:
pd.DataFrame(y)

Unnamed: 0,0
0,1
1,0
2,0
3,1
4,1
...,...
195,0
196,1
197,0
198,1


### 2. Membagi data (80% untuk data latih dan 20% untuk data uji)

In [18]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [19]:
x_train.shape, y_train.shape

((160, 10), (160,))

In [20]:
x_test.shape, y_test.shape

((40, 10), (40,))

### 3. Membangun model Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(max_features=5, n_estimators=100)

In [22]:
rf.fit(x_train, y_train)

In [23]:
x

array([[-1.51107661,  0.60874908, -0.15323616, ..., -0.86482994,
        -0.20290111, -0.87142207],
       [ 1.44544531,  0.51896937,  0.64515265, ..., -1.04339961,
         0.04854689, -2.62101164],
       [ 0.37167029,  0.51350548, -1.39881282, ...,  0.14225137,
        -1.13283476,  1.85300949],
       ...,
       [-0.95090925, -0.21873346,  1.29354962, ..., -0.04586669,
        -0.97210712, -0.70435033],
       [-0.4466992 ,  0.74488454, -0.9612636 , ...,  0.61223252,
         1.67977906,  0.20437739],
       [ 1.00796648,  1.1253235 ,  0.43499832, ...,  0.44838065,
        -1.75951426,  0.39233491]])

In [24]:
pd.DataFrame(x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.511077,0.608749,-0.153236,0.507984,-0.324032,-2.432509,1.592056,-0.864830,-0.202901,-0.871422
1,1.445445,0.518969,0.645153,2.038777,-0.396293,1.282142,-2.170249,-1.043400,0.048547,-2.621012
2,0.371670,0.513505,-1.398813,-0.459943,0.644354,0.081768,-1.757065,0.142251,-1.132835,1.853009
3,2.565453,0.145652,1.177052,1.322694,0.194175,-0.641108,0.878631,-0.202694,-1.199798,-0.464115
4,-0.710656,1.050615,0.354602,-1.774596,-0.312230,-0.212373,0.826484,-0.621252,-1.187774,1.131129
...,...,...,...,...,...,...,...,...,...,...
195,-1.098083,-1.277636,0.419595,0.482176,-1.879287,-0.091079,-2.428480,0.032615,1.164204,0.758637
196,0.165211,1.937132,-1.307971,0.074876,-1.786935,1.472396,1.666002,-0.696028,-0.162525,0.976296
197,-0.950909,-0.218733,1.293550,0.590039,-0.679384,-0.438998,-0.188582,-0.045867,-0.972107,-0.704350
198,-0.446699,0.744885,-0.961264,0.494342,-1.494194,-1.458324,2.820244,0.612233,1.679779,0.204377


#### Ada 2 metode penilaian akurasi yaitu dengan:
1. `rf.score()`
2. `rf.predict()` dan `accuracy_score()`

In [25]:
# fungsi rf.score()
rf.score(x_test, y_test)

0.85

In [27]:
# fungsi ref.predict() dan accuracy_score()
y_pred = rf.predict(x_test)
accuracy_score(y_pred, y_test)

0.85

keunggulan dari fungsi `ref.predict()` dan `accuracy_score()` adalah kita bisa mendapatkan akses untuk melihat prediksi nilai data:

In [29]:
y_pred, y_test

(array([1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
        1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0]),
 array([1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
        1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0]))

### 4. Hyperparameter Tuning

In [30]:
from sklearn.model_selection import GridSearchCV
import numpy as np

max_features_range = np.arange(1,6,1)
n_estimators_range = np.arange(10,210,10)
param_grid = dict(max_features=max_features_range, n_estimators=n_estimators_range)

rf = RandomForestClassifier()
grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)

In [31]:
np.arange(1,6,1)

array([1, 2, 3, 4, 5])

In [32]:
np.arange(10,210,10)

array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
       140, 150, 160, 170, 180, 190, 200])

In [33]:
grid.fit(x_train, y_train)

In [34]:
print("Parameter terbaik adalah %s dengan skor %0.2f"
      % (grid.best_params_, grid.best_score_))

Parameter terbaik adalah {'max_features': 4, 'n_estimators': 120} dengan skor 0.92


### 5. Membuat DataFrame untuk parameter GridSearch dan skor akurasi

In [38]:
grid_results = pd.concat([pd.DataFrame(grid.cv_results_["params"]),pd.DataFrame(grid.cv_results_["mean_test_score"], columns=["Accuracy"])],axis=1)
grid_results.head()                            

Unnamed: 0,max_features,n_estimators,Accuracy
0,1,10,0.8125
1,1,20,0.80625
2,1,30,0.86875
3,1,40,0.86875
4,1,50,0.86875


### 6. Menyiapkan data untuk membuat countour plot

In [39]:
grid_contour = grid_results.groupby(['max_features','n_estimators']).mean()
grid_contour

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy
max_features,n_estimators,Unnamed: 2_level_1
1,10,0.81250
1,20,0.80625
1,30,0.86875
1,40,0.86875
1,50,0.86875
...,...,...
5,160,0.91250
5,170,0.90625
5,180,0.90625
5,190,0.91875


### 7. Data pivoting

In [40]:
grid_reset = grid_contour.reset_index()
grid_reset.columns = ['max_features', 'n_estimators', 'Accuracy']
grid_pivot = grid_reset.pivot('max_features', 'n_estimators')
grid_pivot

  grid_pivot = grid_reset.pivot('max_features', 'n_estimators')


Unnamed: 0_level_0,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy
n_estimators,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200
max_features,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
1,0.8125,0.80625,0.86875,0.86875,0.86875,0.8875,0.88125,0.89375,0.9,0.9,0.89375,0.875,0.9125,0.9125,0.90625,0.89375,0.90625,0.89375,0.90625,0.9125
2,0.8875,0.86875,0.86875,0.89375,0.88125,0.89375,0.89375,0.9,0.8875,0.9,0.9,0.9,0.8875,0.90625,0.9,0.90625,0.90625,0.89375,0.89375,0.89375
3,0.88125,0.9125,0.89375,0.9,0.90625,0.90625,0.90625,0.9,0.90625,0.89375,0.9,0.9,0.9125,0.9,0.90625,0.89375,0.9125,0.9,0.9,0.90625
4,0.8625,0.89375,0.9125,0.90625,0.9,0.9,0.90625,0.9125,0.9,0.90625,0.9125,0.91875,0.9,0.90625,0.89375,0.90625,0.90625,0.9125,0.9125,0.90625
5,0.9,0.88125,0.89375,0.88125,0.91875,0.89375,0.9,0.9125,0.9125,0.90625,0.90625,0.9,0.9125,0.9,0.90625,0.9125,0.90625,0.90625,0.91875,0.90625


In [41]:
### masukkan data yang sudah di-pivot ke dalam variabel
x = grid_pivot.columns.levels[1].values
y = grid_pivot.index.values
z = grid_pivot.values

### 8. Countour Plot (2D)

In [44]:
import plotly.graph_objects as go

layout = go.Layout(
              xaxis=go.layout.XAxis(
              title=go.layout.xaxis.Title(
              text='n_estimators')
            ),
              yaxis=go.layout.YAxis(
              title=go.layout.yaxis.Title(
              text='max_features') 
           ))

fig = go.Figure(data = [go.Contour(z=z, x=x, y=y)], layout=layout )
fig.update_layout(title='Hyperparameter Tuning', autosize=False,
                  width=500, height=500,
                  margin=dict(l=65, r=50, b=65, t=90))

fig.show()

### 9. Surface Plot (3D)

In [45]:
fig = go.Figure(data= [go.Surface(z=z, y=y, x=x)], layout=layout)
fig.update_layout(title='Hyperparameter Tuning',
                  scene = dict(   
                      xaxis_title='n_estimators',
                      yaxis_title='max_features',
                      zaxis_title='Accuracy'),
                  autosize=False,
                  width=800, height=800,
                  margin=dict(l=65, r=50, b=65, t=90))

fig.show()