### GridSearch & Pipelines
GridSearch is an optimization tool that we use when tuning hyperparameters. We define the grid of parameters that we want to search through, and we select the best combination of parameters for our data.

# 1 - One way
Itera un algoritmo sobre un conjunto de hiperparametros

In [9]:
import warnings
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [10]:

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [11]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV, train_test_split

import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings(action="ignore",category=DeprecationWarning)
warnings.filterwarnings(action="ignore",category=FutureWarning)

iris = datasets.load_iris()

X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)
svc = svm.SVC()

parameters = {
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'C': [0.001, 0.01, 0.5, 1, 5, 10, 100],
    'gamma': ['scale', 'auto'],
    'coef0': [-10, -1, 0, 0.1, 0.5, 1, 10, 100]
}

grid = GridSearchCV(estimator = svc,
                   param_grid = parameters,
                    n_jobs = -1,
                    scoring = 'accuracy',
                    cv = 10)

grid.fit(X_train, y_train)





In [12]:
print("Best estimator:", grid.best_estimator_)
print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)

Best estimator: SVC(C=5, coef0=-10, kernel='linear')
Best params: {'C': 5, 'coef0': -10, 'gamma': 'scale', 'kernel': 'linear'}
Best score: 0.9583333333333334


In [13]:
best_estimator = grid.best_estimator_
best_estimator.score(X_test, y_test)

0.9666666666666667

# 2: Almost-Pro way

La forma pro es la que hace esto mismo y va recogiendo los errores de entrenamiento, de validación y tiene la capacidad de parar el proceso cuando se requiera además de guardar el modelo en local una vez terminado si es mejor que el que había anteriormente y de cargar el modelo anterior y seguir reentrenando.

In [17]:
pipe = Pipeline(steps=[("classifier", RandomForestClassifier())])

logistic_params = {
    'classifier': [LogisticRegression()],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': np.arange(0.1, 4, 0.5)
}

random_forest_params = {
    'classifier': [RandomForestClassifier()],
    'classifier__n_estimators': [10, 100, 500, 1000],
    'classifier__max_features': [1,2,3]
}

svc_params = {
    'classifier': [svm.SVC()],
    'classifier__kernel': ['linear', 'rbf', 'sigmoid']
}

search_space = [logistic_params, random_forest_params, svc_params]

grid = GridSearchCV(pipe,
                   search_space,
                   cv = 10,
                   n_jobs=-1)

grid.fit(X_train, y_train)

In [18]:
grid.score(X_test, y_test)

1.0

In [19]:
print(grid.predict(X_test))
print(y_test)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]


In [20]:
grid.best_estimator_['classifier']

In [25]:
grid.best_estimator_

In [26]:
grid.best_score_

0.9583333333333334

# 3 Another way

In [28]:
reg_log = Pipeline(steps = [
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler()),
    ("reglog", LogisticRegression())])

svc = Pipeline([
    ("scaler", StandardScaler()),
    ("selectkbest", SelectKBest()),
    ("svc", svm.SVC())])

rand_forest_param = {
    'n_estimators': [10,100,500, 1000],
    'max_features': [1,2,3]
}

rand_forest = RandomForestClassifier()

re_log_param = {
    "imputer__strategy": ['mean', 'median', 'most_frequent'],
    "reglog__penalty": ["l1", "l2"],
    "reglog__C": np.arange(0.1, 4, 0.5)
}

svc_param = {
    "selectkbest__k": [1,2,3],
    "svc__C": np.arange(0.1, 0.9, 0.1),
    "svc__kernel": ['linear', 'poly', 'rbf']
}

gs_reg_log = GridSearchCV(reg_log,
re_log_param,
cv=10,
scoring = 'accuracy',
n_jobs = -1,
verbose = 1)

gs_svm = GridSearchCV(svc,
svc_param,
cv=10,
scoring = 'accuracy',
n_jobs = -1,
verbose = 1)

gs_rand_forest = GridSearchCV(rand_forest,
rand_forest_param,
cv=10,
scoring = 'accuracy',
n_jobs = -1,
verbose = 1)

grids = {
    "gs_reg_log": gs_reg_log,
    "gs_svm": gs_svm,
    "gs_rand_forest": gs_rand_forest

}




In [29]:
%%time 
for nombre, grid_search in grids.items():
    grid_search.fit(X_train, y_train)


Fitting 10 folds for each of 48 candidates, totalling 480 fits
Fitting 10 folds for each of 72 candidates, totalling 720 fits
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Wall time: 1min 50s


In [30]:
grids.items()

dict_items([('gs_reg_log', GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('imputer', SimpleImputer()),
                                       ('scaler', StandardScaler()),
                                       ('reglog', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'imputer__strategy': ['mean', 'median',
                                               'most_frequent'],
                         'reglog__C': array([0.1, 0.6, 1.1, 1.6, 2.1, 2.6, 3.1, 3.6]),
                         'reglog__penalty': ['l1', 'l2']},
             scoring='accuracy', verbose=1)), ('gs_svm', GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('selectkbest', SelectKBest()),
                                       ('svc', SVC())]),
             n_jobs=-1,
             param_grid={'selectkbest__k': [1, 2, 3],
                         'svc__C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]),
    

In [32]:
best_grids = [(i, j.best_score_) for i, j in grids.items()]

best_grids = pd.DataFrame(best_grids, columns = ["Grid", "Best score"])
best_grids.sort_values(by = "Best score", ascending = False)

Unnamed: 0,Grid,Best score
1,gs_svm,0.95
0,gs_reg_log,0.941667
2,gs_rand_forest,0.933333


In [33]:
print("Best estimator:", gs_svm.best_estimator_)
print("Best params:", gs_svm.best_params_)
print("Best score:", gs_svm.best_score_)

Best estimator: Pipeline(steps=[('scaler', StandardScaler()), ('selectkbest', SelectKBest(k=2)),
                ('svc', SVC(C=0.1, kernel='linear'))])
Best params: {'selectkbest__k': 2, 'svc__C': 0.1, 'svc__kernel': 'linear'}
Best score: 0.9499999999999998


In [34]:
estimador = gs_svm.best_estimator_
estimador.score(X_test, y_test)

1.0

In [35]:
estimador.predict(X_test) 

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])

In [36]:
estimador.predict(X_test) - y_test

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [37]:
iris['feature_names']

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [38]:
estimador

In [39]:
estimador['selectkbest'].get_params()

{'k': 2,
 'score_func': <function sklearn.feature_selection._univariate_selection.f_classif(X, y)>}

In [40]:
estimador['selectkbest'].pvalues_

array([1.72477507e-23, 2.69962606e-14, 1.93619072e-72, 3.57639330e-65])

In [41]:
estimador['selectkbest'].scores_

array([ 84.80836804,  41.29284269, 925.55642345, 680.77560309])

In [42]:
import pickle

with open("finished_model.model", "wb") as archivo_salida:
    pickle.dump(estimador, archivo_salida)


"""
'r'	Open for reading (default)
'w'	Open for writing, truncating (overwriting) the file first
'rb' or 'wb'	Open in binary mode (read/write using byte data)

Text files
Buffered binary files
Raw binary files


"""



"\n'r'\tOpen for reading (default)\n'w'\tOpen for writing, truncating (overwriting) the file first\n'rb' or 'wb'\tOpen in binary mode (read/write using byte data)\n\nText files\nBuffered binary files\nRaw binary files\n\n\n"

In [43]:
# Leer
with open("finished_model.model", "rb") as archivo_entrada:
    pipeline_importado = pickle.load(archivo_entrada)


"""
8 |_ 2
 0   4  !_ 2
     0    2 |_ 2
        0     1

1 0 0 0 = ____ x 2*3  +   _____* 2**2 + ______*2**1 + ____2**0  = ___1__x8 + ___0___x4 + ___0___x2  + __0___ X1
3 2 1 0

8 (decimal) = 1000
__ __ __ __ __ __ __ __


7   6  5  4  3  2  1  0


99000000
00000099
00001000

"""

'\n8 |_ 2\n 0   4  !_ 2\n     0    2 |_ 2\n        0     1\n\n1 0 0 0 = ____ x 2*3  +   _____* 2**2 + ______*2**1 + ____2**0  = ___1__x8 + ___0___x4 + ___0___x2  + __0___ X1\n3 2 1 0\n\n8 (decimal) = 1000\n__ __ __ __ __ __ __ __\n\n\n7   6  5  4  3  2  1  0\n\n\n99000000\n00000099\n00001000\n\n'

In [None]:
pipeline_importado

In [44]:
new_flowers = np.array([[6.9, 3.1, 5.1, 2.3],
[5.8, 2.7, 3.9, 1.2]])

In [45]:
pipeline_importado.predict(new_flowers)

array([2, 1])

In [46]:
gs_svm.best_estimator_.predict(X_test)

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])

In [None]:
# joblib