# Building a model

In [1]:
from sklearn import datasets
iris = datasets.load_iris()

In [2]:
x = iris.data
y = iris.target

In [3]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4)

In [4]:
from sklearn.neighbors import KNeighborsClassifier
my_classifier = KNeighborsClassifier(n_neighbors = 3)

In [5]:
my_classifier.fit(x_train,y_train)
predict = my_classifier.predict(x_test)
print(predict)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predict))

[2 2 0 2 2 0 2 2 1 0 0 0 1 1 0 2 1 0 0 0 1 2 0 1 1 2 1 0 0 0 2 0 0 2 2 2 0
 2 2 0 2 2 1 1 0 2 0 2 1 1 1 0 1 0 1 2 0 0 0 0]
0.95


# Building a Pipeline

In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

In [7]:
from sklearn.pipeline import Pipeline
pipe_knn = Pipeline([('minmax', MinMaxScaler()), ('knn', KNeighborsClassifier())])
pipe_knn.fit(x_train, y_train)
score = pipe_knn.score(x_test, y_test)
print('kNN pipeline test accuracy: %.3f' % score)

kNN pipeline test accuracy: 0.967


# Building multiple Pipelines

In [8]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree

In [9]:
iris = load_iris()
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size = 0.4)

# Construct some pipelines
pipe_lr = Pipeline([('scl', StandardScaler()), ('pca', PCA(n_components = 2)), ('clf', LogisticRegression())])
pipe_svm = Pipeline([('scl', StandardScaler()), ('pca', PCA(n_components = 2)), ('clf', svm.SVC())])
pipe_dt = Pipeline([('scl', StandardScaler()), ('pca', PCA(n_components = 2)), ('clf', tree.DecisionTreeClassifier())])
pipe_knn = Pipeline([('scl', StandardScaler()), ('pca', PCA(n_components = 2)), ('clf', KNeighborsClassifier())])

# List of pipelines for ease of iteration
pipelines = [pipe_lr, pipe_svm, pipe_dt, pipe_knn]

# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression', 1: 'Support Vector Machine', 2: 'Decision Tree', 3: 'KNearest'}

# Fit the pipelines
for pipe in pipelines:
    pipe.fit(x_train, y_train)
    
# Compare accuracies
for idx, val in enumerate(pipelines):
    print('%s pipeline test accuracy: %.3f' % (pipe_dict[idx], val.score(x_test, y_test)))
    
# Identify the most accurate model on test data
best_acc = 0.0
best_clf = 0.0
best_pipe = ''
for idx, val in enumerate(pipelines):
    if val.score(x_test, y_test) > best_acc:
        best_acc = val.score(x_test, y_test)
        best_pipe = val
        best_cld = idx
    print('Classifier with best accuracy: %s' % pipe_dict[best_clf])

Logistic Regression pipeline test accuracy: 0.800
Support Vector Machine pipeline test accuracy: 0.883
Decision Tree pipeline test accuracy: 0.900
KNearest pipeline test accuracy: 0.867
Classifier with best accuracy: Logistic Regression
Classifier with best accuracy: Logistic Regression
Classifier with best accuracy: Logistic Regression
Classifier with best accuracy: Logistic Regression




# Saving the best model for later use

In [10]:
# Save pipeline to file
from sklearn.externals import joblib # externals because we are gonna dump everything in disk.
joblib.dump(best_pipe, 'best_pipeline.pk1', compress = 1) # Pipe is available in best_pipe
print('Saved %s pipeline to file' % pipe_dict[best_clf])

Saved Logistic Regression pipeline to file


# Load the model whenever you want

In [13]:
# Load the model from disk
loaded_model = joblib.load("best_pipeline.pk1")
result = loaded_model.score(x_test, y_test)
print(result)

0.9
