In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.pipeline import Pipeline

### Load the data and split it for training

In [3]:
# Load and Split the Dataset

iris_DS = load_iris()

X_train, X_test, y_train, y_test = train_test_split(iris_DS.data, iris_DS.target, test_size=0.2)

### Simple Pipeline for Decision Tree classifier

In [4]:
# Create the pipeline

pipe_lines = Pipeline([('scl', StandardScaler()),
    ('pca',PCA(n_components=2)), ('clf', tree.DecisionTreeClassifier(random_state=42))])

pipe_lines.fit(X_train, y_train)

print('Test accuracy: %.3f' % pipe_lines.score(X_test,y_test))

Test accuracy: 0.867


#### Pipeline with GridSearch

In [5]:
import numpy as np
from sklearn.model_selection import GridSearchCV
param_range = [1,2,3,4,5]

# Grid search parameters for Decision Tree

grid_params = [{'clf__criterion': ['gini', 'entropy'],
		'clf__min_samples_leaf': param_range,
		'clf__max_depth': param_range,
		'clf__min_samples_split': param_range[1:],
		'clf__presort': [True, False]}]

GS_with_DS = GridSearchCV(pipe_lines, grid_params, scoring='accuracy',
            cv=10)

GS_with_DS.fit(X_train,y_train)

print('Best accuracy: %.3f' % GS_with_DS.best_score_)
print('\n Best params: \n', GS_with_DS.best_params_)




Best accuracy: 0.942

 Best params: 
 {'clf__criterion': 'gini', 'clf__max_depth': 5, 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 2, 'clf__presort': True}




In [6]:
dt_model = GS_with_DS.best_estimator_
print(dt_model)

Pipeline(memory=None,
         steps=[('scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=2,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('clf',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                                        max_depth=5, max_features=None,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=2, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort=True, random_state=42,
                                        splitter='best'))],
         verbose=False)
