# MACHINE LEARNING PIPELINES

### FLOWER IRIS DATA

Machine learning pipelines can be handy to write cleaner code with less number of lines.


## *Importing Libraries*

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
iris_df = load_iris()

In [3]:
iris_df.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

## *Splitting The Data*

In [4]:
X_train, X_test, y_train, y_test = train_test_split(iris_df.data, iris_df.target, test_size = 0.2, random_state = 0)

## *Creating a pipeline*
1. Preprocessing the data using StandardScaler
2. Applying PCA to reduce the dimensions
3. Applying the classifiers

In [14]:
pipeline_lr = Pipeline([('scalar1',StandardScaler()),
                     ('pca1',PCA(n_components=2)),
                     ('lr_classifier',LogisticRegression(random_state=0))])


pipeline_dt = Pipeline([('scalar2',StandardScaler()),
                     ('pca2',PCA(n_components=2)),
                     ('dt_classifier',DecisionTreeClassifier())])

pipeline_rf = Pipeline([('scalar3',StandardScaler()),
                     ('pca3',PCA(n_components=2)),
                     ('rf_classifier',RandomForestClassifier())])

In [16]:
#List of pipelines
pipelines = [pipeline_lr, pipeline_lr, pipeline_rf]

best_accuracy = 0.0
best_classifier_index = 0
best_pipeline = ""

In [17]:
# Making a dictionary of classifier indexes and name
pipe_dict = {
    0: 'Logistic Regression',
    1: 'Decision Tree',
    2: 'Random Forest'
}

## *Fitting the models*

In [19]:
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [22]:
for i, model in enumerate(pipelines):
    if model.score(X_test, y_test) > best_accuracy:
        best_accuracy = model.score(X_test, y_test)
        best_pipeline = model
        best_classifier_index = i

print('Classifier with best accuracy: {}'.format(pipe_dict[best_classifier_index]))

Classifier with best accuracy: Random Forest


In [23]:
for i, model in enumerate(pipelines):
    print("{} has a test accuracy of {}.".format(pipe_dict[i], model.score(X_test, y_test)))

Logistic Regression has a test accuracy of 0.8666666666666667.
Decision Tree has a test accuracy of 0.8666666666666667.
Random Forest has a test accuracy of 0.9.
