## PCA with Pipeline

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

import pandas as pd

In [4]:
# extracting data from the csv file
headers = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

prima_df = pd.read_csv("../datasets/prima_indian_diabetes.csv", names=headers)

prima_df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# seperating data and the target
X = prima_df.values[:, 0:8]
y = prima_df.values[:, 8]

### PCA evaluation

Transform the data into PCA to find the important components of data

In [8]:
# Spliting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [21]:
# Creating a pipeling with sequence of steps
pipe = Pipeline([
        ('pca', PCA(n_components=4)),
        ('classifier', DecisionTreeClassifier())
    ])

# Fitting the transformed data into pipeline
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=4, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classifier', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [22]:
# Predicting the test data
predicted = pipe.predict(X_test)

### Cross validation on the model

In [28]:
from sklearn.cross_validation import cross_val_score

print(cross_val_score(pipe, X_test, predicted))

[ 0.70930233  0.77380952  0.72619048]


In [25]:
from sklearn.metrics import confusion_matrix, classification_report

print(classification_report(y_test, predicted))
print(confusion_matrix(y_test, predicted))

             precision    recall  f1-score   support

        0.0       0.80      0.76      0.78       168
        1.0       0.57      0.63      0.60        86

avg / total       0.72      0.72      0.72       254

[[128  40]
 [ 32  54]]
