## Building a model

In [27]:
#import a dataset
from sklearn import datasets
iris=datasets.load_iris()
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [28]:
x=iris.data
y=iris.target

In [29]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.4)

In [30]:
from sklearn.neighbors import KNeighborsClassifier
my_classifier=KNeighborsClassifier(n_neighbors=3)

In [31]:
my_classifier.fit(x_train,y_train)
predict=my_classifier.predict(x_test)
print(predict)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,predict))

[1 0 2 2 1 1 1 0 2 2 2 1 1 2 0 0 0 2 2 1 1 1 2 0 2 1 2 0 0 2 0 2 1 1 2 1 1
 1 1 1 2 1 2 2 1 0 2 1 1 1 2 2 2 0 0 1 2 1 0 0]
0.9166666666666666


## Building a Pipeline

In [17]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

In [32]:
from sklearn.pipeline import Pipeline
pipe_knn = Pipeline([('minmax', MinMaxScaler()),
 ('knn', KNeighborsClassifier())])
pipe_knn.fit(x_train, y_train)
score = pipe_knn.score(x_test, y_test)
print('kNN pipeline test accuracy: %.3f' % score)

kNN pipeline test accuracy: 0.900


In [33]:
from sklearn.model_selection import cross_val_score,cross_val_predict
scores = cross_val_score(pipe_knn, x, y, cv=10)
print(scores)
import numpy as np
print(np.mean(scores))

[1.         0.93333333 1.         0.93333333 0.93333333 0.93333333
 0.8        1.         1.         1.        ]
0.9533333333333334


## Building Multiple Pipelines

In [34]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree

# Load and split the data
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

# Construct some pipelines
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('pca', PCA(n_components=2)),
                    ('clf', LogisticRegression())])

pipe_svm = Pipeline([('scl', StandardScaler()),
                     ('pca', PCA(n_components=2)),
                     ('clf', svm.SVC())])

pipe_dt = Pipeline([('scl', StandardScaler()),
                    ('pca', PCA(n_components=2)),
                    ('clf', tree.DecisionTreeClassifier())])

pipe_knn = Pipeline([('scl', StandardScaler()),
                     ('pca', PCA(n_components=2)),
                    ('clf', KNeighborsClassifier())])

# List of pipelines for ease of iteration
pipelines = [pipe_lr, pipe_svm, pipe_dt,pipe_knn]

# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression', 1: 'Support Vector Machine', 
             2: 'Decision Tree',3:'KNearest'}

# Fit the pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train)

# Compare accuracies
for idx, val in enumerate(pipelines):
    print('%s pipeline test accuracy: %.3f' % (pipe_dict[idx], 
                                               val.score(X_test, y_test)))

# Identify the most accurate model on test data
best_acc = 0.0
best_clf = 0
best_pipe = ''
for idx, val in enumerate(pipelines):
    if val.score(X_test, y_test) > best_acc:
        best_acc = val.score(X_test, y_test)
        best_pipe = val
        best_clf = idx
print('Classifier with best accuracy: %s' % pipe_dict[best_clf])

Logistic Regression pipeline test accuracy: 0.933
Support Vector Machine pipeline test accuracy: 0.900
Decision Tree pipeline test accuracy: 0.900
KNearest pipeline test accuracy: 0.933
Classifier with best accuracy: Logistic Regression




## Saving the best model for later use

In [35]:
# Save pipeline to file
from sklearn.externals import joblib
joblib.dump(best_pipe, 'best_pipeline.pkl', compress=1)
print('Saved %s pipeline to file' % pipe_dict[best_clf])

Saved Logistic Regression pipeline to file


## Load the model whenever you want

In [37]:
# load the model from disk
loaded_model = joblib.load("best_pipeline.pkl")
result = loaded_model.score(X_test, y_test)
print(result)

0.9333333333333333
