In [1]:
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [9]:
# Load the breast-cancer dataset that ships with scikit-learn; later cells read cancer.data and cancer.target.
cancer=load_breast_cancer()


sklearn.utils.Bunch

In [10]:
# Split features and labels into train and test partitions; the fixed seed
# keeps the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=0,
)


In [11]:
# Learn the min/max scaling from the training rows only, then reuse the
# fitted scaler for the held-out rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Default-parameter SVM trained on the rescaled training data.
svm = SVC()
svm.fit(X_train_scaled, y_train)

print("Test score :{:.2f}".format(svm.score(X_test_scaled, y_test)))

Test score :0.97


In [12]:
# Grid search over C and gamma on the already-scaled training data.
# Caveat: the scaler was fitted on all of X_train before cross-validation,
# so every validation fold has already influenced the scaling.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5)
grid.fit(X_train_scaled, y_train)



GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10, 100]})

In [13]:
# Show the C/gamma combination that achieved the best mean cross-validation score.
grid.best_params_

{'C': 1, 'gamma': 1}

In [14]:
# Show the estimator configured with the winning parameters.
grid.best_estimator_

SVC(C=1, gamma=1)

In [15]:
# Mean cross-validated score of the best parameter combination (this is not a test-set score).
grid.best_score_

0.9812311901504789

In [21]:
# Build a pipeline (no grid search yet) that chains the scaler and the
# classifier, so both are always fitted together on the same data.
from sklearn.pipeline import Pipeline

pipe = Pipeline(steps=[("scaler", MinMaxScaler()), ("svm", SVC())])



In [19]:
# Bare-name inspection cell; the statements have no effect and no output is shown for this cell.
X_train; # feature matrix: multi-dimensional array
y_train ;# label vector: one-dimensional array

In [22]:
# Fit every pipeline step (scaler, then SVM) in one call on the raw, unscaled training data.
pipe.fit(X_train,y_train)

Pipeline(steps=[('scaler', MinMaxScaler()), ('svm', SVC())])

In [24]:
# Test-set accuracy of the fitted pipeline, rounded to two decimals; X_test is passed unscaled.
pipe.score(X_test,y_test).round(2)

0.97

# using pipelines in grid searches

In [27]:
# preparing parameter grid for pipeline
# Parameter grid for the pipeline: each key is "<step name>__<parameter>",
# so these entries target the "svm" step.
param_grid = {
    'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

In [28]:
# Cross-validate the whole pipeline on the unscaled training data, so the
# scaler is re-fitted inside every training fold.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('svm', SVC())]),
             param_grid={'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]})

In [30]:
# Show the pipeline configured with the best parameters found by the search.
grid.best_estimator_

Pipeline(steps=[('scaler', MinMaxScaler()), ('svm', SVC(C=1, gamma=1))])

In [31]:
# Score the selected pipeline on the held-out test set.
grid.score(X_test,y_test)

0.972027972027972

In [32]:
# Winning parameters, keyed as "<step name>__<parameter>".
grid.best_params_

{'svm__C': 1, 'svm__gamma': 1}

In [35]:
# List the (name, estimator) pairs of `pipe` itself; these still carry default
# parameters, whereas the tuned copy lives in grid.best_estimator_.
pipe.steps

[('scaler', MinMaxScaler()), ('svm', SVC())]

### make_pipeline

In [42]:
# make_pipeline derives the step names from the estimator classes, so no
# explicit names are supplied here.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pipe = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    StandardScaler(),
)

In [43]:
# Inspect the auto-generated step names; a class used more than once gets a numeric suffix.
pipe.steps

[('standardscaler-1', StandardScaler()),
 ('pca', PCA(n_components=2)),
 ('standardscaler-2', StandardScaler())]

In [49]:
# Look up the PCA step by its generated name and read one of its parameters.
pipe.named_steps["pca"].n_components

2

## Accessing attributes in a grid-searched pipeline


In [53]:
# grid search logistic regression on cancer dataset
from sklearn.linear_model import LogisticRegression
pipe=make_pipeline(StandardScaler(),LogisticRegression())


In [56]:
# Draw a fresh train/test partition (seed 4); this rebinds the split
# variables used by the earlier cells.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=4
)

# Tune only the C parameter of the logistic-regression step.
param_grid = {"logisticregression__C": [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression())]),
             param_grid={'logisticregression__C': [0.01, 0.1, 1, 10, 100]})

In [57]:
# Show the best pipeline found by the logistic-regression search.
grid.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(C=1))])

In [63]:
# Drill into the best pipeline, pick the logistic-regression step by name and read its fitted coefficients.
grid.best_estimator_.named_steps["logisticregression"].coef_

array([[-0.43570655, -0.34266946, -0.40809443, -0.5344574 , -0.14971847,
         0.61034122, -0.72634347, -0.78538827,  0.03886087,  0.27497198,
        -1.29780109,  0.04926005, -0.67336941, -0.93447426, -0.13939555,
         0.45032641, -0.13009864, -0.10144273,  0.43432027,  0.71596578,
        -1.09068862, -1.09463976, -0.85183755, -1.06406198, -0.74316099,
         0.07252425, -0.82323903, -0.65321239, -0.64379499, -0.42026013]])

## Grid-searching which model to use


In [68]:
# Pipeline with two named slots, "preprocessing" and "classifier", filled
# here with a StandardScaler and an SVC.
pipe = Pipeline(steps=[("preprocessing", StandardScaler()), ("classifier", SVC())])

In [69]:
from sklearn.ensemble import RandomForestClassifier

In [71]:
# Two sub-grids, one per candidate model.  The "classifier" and
# "preprocessing" keys swap out whole pipeline steps, while the
# "classifier__*" keys set parameters on whichever classifier is in place.
# A preprocessing value of None means that step is skipped.
param_grid = [
    {
        "classifier": [SVC()],
        "preprocessing": [StandardScaler(), None],
        "classifier__gamma": [0.001, 0.01, 0.1, 1, 10, 100],
        "classifier__C": [0.001, 0.01, 0.1, 1, 10, 100],
    },
    {
        # Seed the forest so its cross-validation scores, and therefore the
        # comparison between the two models, are repeatable between runs.
        "classifier": [RandomForestClassifier(n_estimators=100, random_state=0)],
        "preprocessing": [None],
        "classifier__max_features": [1, 2, 3],
    },
]

In [72]:
# Return to the seed-0 partition, then search across both candidate models
# described by param_grid.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0
)
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessing', StandardScaler()),
                                       ('classifier', SVC())]),
             param_grid=[{'classifier': [SVC(C=10, gamma=0.01)],
                          'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
                          'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
                          'preprocessing': [StandardScaler(), None]},
                         {'classifier': [RandomForestClassifier()],
                          'classifier__max_features': [1, 2, 3],
                          'preprocessing': [None]}])

In [74]:
# Show the winning combination: which classifier, which preprocessing step, and the classifier's parameters.
grid.best_params_

{'classifier': SVC(C=10, gamma=0.01),
 'classifier__C': 10,
 'classifier__gamma': 0.01,
 'preprocessing': StandardScaler()}