#### 최댓값과 최솟값 탐색 알고리즘 

In [1]:
import numpy as np
def find_optimal_h(H, S):
    idx = np.argmax(S)
    return H[idx]

H = ["H1", "H2", "H3", "H4", "H5"]
S = [0.8, 0.7, 0.9, 0.6, 0.7]
print(find_optimal_h(H, S))

H3


In [2]:
def find_optimal_h_update(H, S):
    current_max_value = -np.inf
    for h, s in zip(H, S):
        if s > current_max_value:
            current_max_value = s
            h_star = h
    return h_star

H = ["H1", "H2", "H3", "H4", "H5"]
S = [0.8, 0.7, 0.9, 0.6, 0.7]
print(find_optimal_h_update(H, S))

H3


### 구현 실습

In [3]:
import pandas as pd
df1 = pd.read_csv("../../data/classification/optdigits.csv")
df2 = pd.read_csv("../../data/regression/baseball.csv")
X1 = df1.drop('y', axis = 1)
y1 = df1['y']
X2 = df2.drop('y', axis = 1)
y2 = df2['y']

#### GridSearchCV 클래스

In [4]:
grid = {"n_neighbors": [3, 5, 7],
        "metric":["euclidean", "manhattan"]}

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
clf = GridSearchCV(estimator = KNeighborsClassifier(),
                   cv = 5,
                   param_grid = grid,
                   scoring = "accuracy").fit(X1, y1)

In [6]:
result = pd.DataFrame(clf.cv_results_)
display(result[['params', 'mean_test_score', 'mean_fit_time']])

Unnamed: 0,params,mean_test_score,mean_fit_time
0,"{'metric': 'euclidean', 'n_neighbors': 3}",0.982918,0.012633
1,"{'metric': 'euclidean', 'n_neighbors': 5}",0.982562,0.009827
2,"{'metric': 'euclidean', 'n_neighbors': 7}",0.983452,0.011631
3,"{'metric': 'manhattan', 'n_neighbors': 3}",0.97847,0.013135
4,"{'metric': 'manhattan', 'n_neighbors': 5}",0.978648,0.014038
5,"{'metric': 'manhattan', 'n_neighbors': 7}",0.978292,0.009834


In [7]:
print(clf.best_estimator_)
print(clf.best_score_)
print(clf.best_params_)

KNeighborsClassifier(metric='euclidean', n_neighbors=7)
0.9834519572953736
{'metric': 'euclidean', 'n_neighbors': 7}


In [8]:
display(clf.best_estimator_.predict(X1))

array([0, 0, 7, ..., 8, 9, 8], dtype=int64)

In [9]:
from sklearn.neighbors import KNeighborsRegressor
clf = GridSearchCV(estimator = KNeighborsRegressor(),
                   cv = 5,
                   param_grid = grid,
                   scoring = "neg_mean_absolute_error").fit(X2, y2)

result = pd.DataFrame(clf.cv_results_)
display(result[['params', 'mean_test_score', 'mean_fit_time']])

Unnamed: 0,params,mean_test_score,mean_fit_time
0,"{'metric': 'euclidean', 'n_neighbors': 3}",-666.30158,0.006617
1,"{'metric': 'euclidean', 'n_neighbors': 5}",-651.092379,0.007016
2,"{'metric': 'euclidean', 'n_neighbors': 7}",-653.397034,0.00762
3,"{'metric': 'manhattan', 'n_neighbors': 3}",-693.410097,0.005615
4,"{'metric': 'manhattan', 'n_neighbors': 5}",-655.554548,0.007318
5,"{'metric': 'manhattan', 'n_neighbors': 7}",-644.461514,0.006417


#### ParameterGrid 함수

In [10]:
from sklearn.model_selection import ParameterGrid
for param in ParameterGrid(grid):
    print(param)

{'metric': 'euclidean', 'n_neighbors': 3}
{'metric': 'euclidean', 'n_neighbors': 5}
{'metric': 'euclidean', 'n_neighbors': 7}
{'metric': 'manhattan', 'n_neighbors': 3}
{'metric': 'manhattan', 'n_neighbors': 5}
{'metric': 'manhattan', 'n_neighbors': 7}


In [11]:
param = {"metric": "euclidean", "n_neighbors": 3}
KNeighborsClassifier(**param)

KNeighborsClassifier(metric='euclidean', n_neighbors=3)

In [12]:
from sklearn.metrics import *
from sklearn.model_selection import KFold
grid = ParameterGrid(grid)
kf = KFold(n_splits = 5)
best_score = -1
for param in grid:
    total_score = 0
    for train_index, test_index in kf.split(X1):
        X1_train = X1.loc[train_index] 
        X1_test = X1.loc[test_index]
        y1_train = y1.loc[train_index]
        y1_test = y1.loc[test_index]

        model = KNeighborsClassifier(**param).fit(X1_train, y1_train)
        y1_pred = model.predict(X1_test)
        score = accuracy_score(y1_test, y1_pred)
        total_score += score / 5
    if total_score > best_score:
        best_score = total_score
        best_parameter = param

In [13]:
print(best_parameter, best_score)

{'metric': 'euclidean', 'n_neighbors': 7} 0.9830960854092526


In [14]:
kf = KFold(n_splits = 5)
best_score = np.inf
for param in grid:
    total_score = 0
    for train_index, test_index in kf.split(X2):
        X2_train = X2.loc[train_index] 
        X2_test = X2.loc[test_index]
        y2_train = y2.loc[train_index]
        y2_test = y2.loc[test_index]

        model = KNeighborsClassifier(**param).fit(X2_train, y2_train)
        y2_pred = model.predict(X2_test)
        score = mean_absolute_error(y2_test, y2_pred)
        total_score += score / 5
    if total_score < best_score:
        best_score = total_score
        best_parameter = param

In [15]:
print(best_parameter, best_score)

{'metric': 'euclidean', 'n_neighbors': 3} 801.970237050044


#### 모델 선택과 하이퍼 파라미터 최적화 문제로 확장

In [16]:
rf_grid = {"n_estimators":[20, 50, 100, 200],
           "max_depth": [3, 4, 5, 6, 7]}

nn_grid = {"hidden_layer_sizes": [(10, 10), (20, 20), (30, 30), (20, 20, 20, 20)],
           "max_iter": [1000, 10000]}

In [17]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neural_network import MLPClassifier as MLP
model_parameter_dict = {RFC:ParameterGrid(rf_grid), MLP:ParameterGrid(nn_grid)}

In [18]:
kf = KFold(n_splits = 5)
best_score = -1
for model_class in model_parameter_dict.keys():
    parameter_grid = model_parameter_dict[model_class]
    for param in parameter_grid:
        total_score = 0
        for train_index, test_index in kf.split(X1):
            X1_train = X1.loc[train_index] 
            X1_test = X1.loc[test_index]
            y1_train = y1.loc[train_index]
            y1_test = y1.loc[test_index]
            model = model_class(**param).fit(X1_train, y1_train)
            y1_pred = model.predict(X1_test)
            score = accuracy_score(y1_test, y1_pred)
            total_score += score / 5
        if total_score > best_score:
            best_score = total_score
            best_parameter = param
            best_model = model_class

In [19]:
print(best_model, best_parameter, best_score)

<class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'> {'hidden_layer_sizes': (30, 30), 'max_iter': 1000} 0.9731316725978649
