### https://github.com/krishnaik06/Pipelines-Using-Sklearn

### https://analyticsindiamag.com/hands-on-tutorial-on-machine-learning-pipelines-with-scikit-learn/

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from joblib import dump

In [2]:
iris_df = load_iris()

In [3]:
iris_df

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [4]:
X_train,X_test,y_train,y_test = train_test_split(iris_df.data,iris_df.target,test_size=0.3,random_state=0)

### Pipelines Creation
#### 1. Data Preprocessing by using Standard Scaler
#### 2. Reduce Dimension using PCA
#### 3. Apply  Classifier

In [5]:
pipeline_lr=Pipeline([('scalar1',StandardScaler()),# fit and transform will be applied
                     ('pca1',PCA(n_components=2)),# fit and transform will be applied
                     ('lr_classifier',LogisticRegression(random_state=0))]) # Only fit will be applied

In [6]:
pipeline_dt=Pipeline([('scalar2', StandardScaler()),
                      ('pca2', PCA(n_components=2)),
                      ('dt_classifier', DecisionTreeClassifier())])

In [7]:
pipeline_randomforest=Pipeline([('scalar2', StandardScaler()),
                              ('pca2', PCA(n_components=2)),
                              ('rf_classifier', RandomForestClassifier())])

In [8]:
## Lets make the list of pipelines
pipelines = [pipeline_lr, pipeline_dt, pipeline_randomforest]

In [9]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [10]:
# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest'}

for pipe in pipelines:
    pipe.fit(X_train,y_train) # It will apply fit_transform in StandardScaler&PCA & fit alone in classifier(DecisionTreeClassifier)

In [11]:
accuracies = []
for index,model in enumerate(pipelines):
    score = model.score(X_test,y_test)
    print("{} Accuracy: {}".format(pipe_dict[index],score))
    accuracies.append(score)

Logistic Regression Accuracy: 0.8666666666666667
Decision Tree Accuracy: 0.9111111111111111
RandomForest Accuracy: 0.9111111111111111


In [12]:
print(accuracies)
import numpy as np
max_accuracy_index = np.argmax(accuracies)
print('max_accuracy_index : ', max_accuracy_index)
print('max_accuracy : ', np.amax(accuracies))
print(pipelines[max_accuracy_index])
print(pipe_dict[max_accuracy_index])

[0.8666666666666667, 0.9111111111111111, 0.9111111111111111]
max_accuracy_index :  1
max_accuracy :  0.9111111111111111
Pipeline(steps=[('scalar2', StandardScaler()), ('pca2', PCA(n_components=2)),
                ('dt_classifier', DecisionTreeClassifier())])
Decision Tree


### Saving Best pipeline in disk in joblib format

In [13]:
dump(pipelines[max_accuracy_index], filename="iris_pipeline_classification.joblib")

['iris_pipeline_classification.joblib']

In [15]:
prediction = pipelines[max_accuracy_index].predict([[5.1, 3.5, 1.4, 0.2]])
prediction

array([0])

In [16]:
prediction[0]

0