# Hyperparameter Tuning (GridSearchCV)

## asgn16

In [11]:
from sklearn import svm, datasets
# Load the iris dataset bundled with scikit-learn and list the attributes of the returned object.
iris = datasets.load_iris()
dir(iris)

['DESCR',
 'data',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [12]:
import pandas as pd
# Build a DataFrame of the iris measurements, one column per feature name.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Attach the species name for every row in a single vectorized step:
# indexing the target_names array with the integer target codes yields one name per sample,
# which avoids a per-row Python lambda call.
df["flower"] = iris.target_names[iris.target]
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [13]:
# Species names; the integer codes in iris.target index into this array.
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [14]:
from sklearn.model_selection import train_test_split
# Separate the species label from the feature columns, then keep 75% of the rows for training.
y = df["flower"]
x = df.drop(columns=["flower"])
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.75,
    random_state=23,  # fixed seed so the split is repeatable
)
# Show how many rows were held out, followed by the held-out feature rows themselves.
print(len(x_test))
print(x_test)

38
     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
139                6.9               3.1                5.4               2.1
125                7.2               3.2                6.0               1.8
67                 5.8               2.7                4.1               1.0
3                  4.6               3.1                1.5               0.2
113                5.7               2.5                5.0               2.0
52                 6.9               3.1                4.9               1.5
18                 5.7               3.8                1.7               0.3
137                6.4               3.1                5.5               1.8
20                 5.4               3.4                1.7               0.2
64                 5.6               2.9                3.6               1.3
73                 6.1               2.8                4.7               1.2
30                 4.8               3.1                1.6  

In [15]:
# Train an RBF-kernel SVC on the training split (fit returns the estimator itself),
# then score it on the held-out rows.
model = svm.SVC(C=30, kernel="rbf", gamma="auto").fit(x_train, y_train)
model.score(x_test, y_test)

0.9736842105263158

In [16]:
import numpy as np
from sklearn.model_selection import cross_val_score
# Run 5-fold cross-validation on the full iris data for three SVC configurations:
# linear with C=10, RBF with C=10, and RBF with C=20 (evaluated in that order).
lm, rbf, rbf2 = (
    cross_val_score(svm.SVC(kernel=kernel_name, C=penalty, gamma="auto"), x, y, cv=5)
    for kernel_name, penalty in (("linear", 10), ("rbf", 10), ("rbf", 20))
)
# Report the mean fold score of each configuration, one per line.
print("lm:", lm.mean(), "\nrbf:", rbf.mean(), "\nrbf2:", rbf2.mean())

lm: 0.9733333333333334 
rbf: 0.9800000000000001 
rbf2: 0.9666666666666668


In [21]:
# Manual grid search: run 5-fold cross-validation for every (kernel, C) pair and
# record the mean fold score under the key "<kernel>_<C>".
kernels = ['rbf', 'linear', "poly"]
C = list(range(1, 3))  # candidate C values: 1 and 2
avg_scores = {}
for kval in kernels:
    for cval in C:
        cv_scores = cross_val_score(svm.SVC(kernel=kval, C=cval, gamma='auto'), iris.data, iris.target, cv=5)
        avg_scores[f"{kval}_{cval}"] = np.average(cv_scores)
avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_2': 0.9800000000000001,
 'linear_1': 0.9800000000000001,
 'linear_2': 0.9800000000000001,
 'poly_1': 0.9666666666666666,
 'poly_2': 0.9666666666666666}

In [23]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over C in {1, 2} and three kernels, evaluated with 5-fold cross-validation.
clf = GridSearchCV(
    estimator=svm.SVC(gamma="auto"),
    param_grid=dict(C=[1, 2], kernel=["rbf", "linear", "poly"]),
    cv=5,
    return_train_score=False,
)
clf.fit(iris.data, iris.target)
# Display the raw per-candidate results dictionary.
clf.cv_results_

{'mean_fit_time': array([0.00098472, 0.0007503 , 0.00306787, 0.00144377, 0.00080147,
        0.0035624 ]),
 'std_fit_time': array([0.00067278, 0.00037555, 0.00204205, 0.00040015, 0.00040074,
        0.00251332]),
 'mean_score_time': array([0.00071526, 0.00026383, 0.00119753, 0.00059795, 0.0003994 ,
        0.00049391]),
 'std_score_time': array([0.00041305, 0.00042253, 0.00014518, 0.00037264, 0.00048916,
        0.00044914]),
 'param_C': masked_array(data=[1, 1, 1, 2, 2, 2],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'poly', 'rbf', 'linear', 'poly'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 1, 'kernel': 'poly'},
  {'C': 2, 'kernel': 'rbf'},
  {'C': 2, 'kernel': 'linear'},
  {'C': 2, 'kernel': 'poly'}],
 'split0_t

In [24]:
# Put the grid-search results (from the cv=5 search) into a table for easier reading.
df = pd.DataFrame(data=clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000985,0.000673,0.000715,0.000413,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.00075,0.000376,0.000264,0.000423,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.003068,0.002042,0.001198,0.000145,1,poly,"{'C': 1, 'kernel': 'poly'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,5
3,0.001444,0.0004,0.000598,0.000373,2,rbf,"{'C': 2, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
4,0.000801,0.000401,0.000399,0.000489,2,linear,"{'C': 2, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
5,0.003562,0.002513,0.000494,0.000449,2,poly,"{'C': 2, 'kernel': 'poly'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,5


In [27]:
# Keep only the searched parameters and the mean test score for a compact comparison.
df[["param_C", "param_kernel", "mean_test_score"]]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,1,poly,0.966667
3,2,rbf,0.98
4,2,linear,0.98
5,2,poly,0.966667


In [28]:
# List the attributes of the fitted search object (used here to find the best_* result attributes).
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'inverse_transform',
 'multimetric_',
 'n_features_in_',
 'n_jobs',
 'n_splits

In [32]:
# Print, in order: the best estimator, its index, its parameters, and its score.
for attr in ("best_estimator_", "best_index_", "best_params_", "best_score_"):
    print(getattr(clf, attr))

SVC(C=1, gamma='auto')
0
{'C': 1, 'kernel': 'rbf'}
0.9800000000000001


In [44]:
from sklearn.model_selection import RandomizedSearchCV
# Randomized search over C in 1..4 and three kernels, evaluated with 5-fold cross-validation.
# NOTE: the grid holds 4 x 3 = 12 combinations and n_iter is also 12. Because every
# parameter is given as a list, candidates are sampled without replacement, so all 12
# combinations end up being evaluated. Lower n_iter to try only a random subset.
rs = RandomizedSearchCV(
    svm.SVC(gamma="auto"),
    {
        "C": list(range(1, 5)),
        "kernel": ['rbf', 'linear', "poly"],
    },
    cv=5,
    return_train_score=False,
    n_iter=12,
    random_state=23,  # fixed seed so the sampled candidates (and their order) are repeatable
)
rs.fit(iris.data, iris.target)
# Show the sampled parameters next to their mean test score.
pd.DataFrame(rs.cv_results_)[["param_C", "param_kernel", "mean_test_score"]]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,1,poly,0.966667
3,2,rbf,0.98
4,2,linear,0.98
5,2,poly,0.966667
6,3,rbf,0.973333
7,3,linear,0.973333
8,3,poly,0.966667
9,4,rbf,0.986667


In [39]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [43]:
# Candidate model families for the iris comparison. Each entry pairs an unfitted
# estimator ("model") with the parameter grid to search for it ("params").
model_params = {
    "svm": {
        "model": svm.SVC(gamma="auto"),
        "params": {
            "C": list(range(1, 5)),
            "kernel": ['rbf', 'linear', "poly"],
        },
    },
    "random_forest": {
        # Seeded so repeated runs select the same best n_estimators.
        "model": RandomForestClassifier(random_state=23),
        "params": {
            "n_estimators": list(range(1, 5)),
        },
    },
    "logistic_regression": {
        # multi_class is not passed: "auto" is already the default, and the
        # argument is deprecated in recent scikit-learn releases.
        "model": LogisticRegression(solver="liblinear"),
        "params": {
            "C": list(range(1, 5)),
        },
    },
}

dict

In [48]:
# Grid-search each candidate family on iris with 5-fold cross-validation and
# collect the best score and parameters found for each one.
scores = []
for model_name, mp in model_params.items():
    clf = GridSearchCV(
        estimator=mp["model"],
        param_grid=mp["params"],
        cv=5,
        return_train_score=False,
    ).fit(iris.data, iris.target)  # fit returns the search object itself
    scores.append(
        dict(
            model=model_name,
            best_score=clf.best_score_,
            best_params=clf.best_params_,
        )
    )

In [50]:
# Summarize the best score and parameters of each model family in one table.
df = pd.DataFrame(data=scores, columns=["model", "best_score", "best_params"])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.94,{'n_estimators': 1}
2,logistic_regression,0.96,{'C': 1}


## asgn16

In [59]:
from sklearn.datasets import load_digits
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [62]:
# Load the digits dataset bundled with scikit-learn and list the attributes of the returned object.
# NOTE(review): the saved output for this cell (1797) is not what dir() returns; it looks
# stale, presumably from an earlier version of the cell — re-run to refresh.
digits = load_digits()
dir(digits)

1797

In [63]:
# Candidate model families for the digits comparison. Each entry pairs an unfitted
# estimator ("model") with the parameter grid to search for it ("params").
# An empty grid means the estimator is evaluated once with its default settings.
model_params = {
    "svm": {
        "model": svm.SVC(gamma="auto"),
        "params": {
            "C": list(range(1, 101)),
            "kernel": ["rbf", "linear", "poly"],
        },
    },
    "random_forest": {
        # Seeded so repeated runs select the same best n_estimators.
        "model": rfc(random_state=23),
        "params": {
            "n_estimators": list(range(1, 101)),
        },
    },
    "logistic_regression": {
        # multi_class is not passed: "auto" is already the default, and the
        # argument is deprecated in recent scikit-learn releases.
        "model": LogisticRegression(solver="liblinear"),
        "params": {
            "C": list(range(1, 101)),
        },
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {},
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {},
    },
    'decision_tree': {
        # Seeded so repeated runs give the same tree and score.
        'model': DecisionTreeClassifier(random_state=23),
        'params': {
            'criterion': ['gini', 'entropy'],
        },
    },
}

In [None]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
# Grid-search each candidate family on the digits data and collect the best score
# and parameters found for each one.
# cv=5 keeps the run feasible: model_params defines roughly 500 candidate settings,
# and every candidate is refit once per fold, so a very large fold count multiplies
# into tens of thousands of fits. n_jobs=-1 spreads the fits over all CPU cores.
scores = []
for model_name, mp in model_params.items():
    clf = GridSearchCV(mp["model"], mp["params"], cv=5, n_jobs=-1, return_train_score=False)
    clf.fit(digits.data, digits.target)
    scores.append({
        "model": model_name,
        "best_score": clf.best_score_,
        "best_params": clf.best_params_
    })
# Summarize the per-family results in one table.
df = pd.DataFrame(scores, columns=["model", "best_score", "best_params"])
df