###  Dataset
CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classe

### Goal 

We create a classification pipeline using first two dimension reduction methods (with number of dimentions 2,3) and then KNN method with number of neighborhoods 3 and 5.

In [17]:
import random
import os
import numpy as np
import matplotlib.pyplot as plt

from six.moves import cPickle as pickle

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

                               
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score


# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
import pickle

In [18]:
def load_CIFAR10(ROOT):
    """ load all of cifar """
    xs=[]
    ys=[]
    for b in range(1,6):
        filename = os.path.join(ROOT, 'data_batch_%d' % (b, ))
        with open(filename, 'rb') as f:
            datadict = pickle.load(f,encoding='latin1')
            X = datadict['data']
            Y = datadict['labels']
            #X is of shape (10000, 3072) and Y is a list
            """X = X.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype("float")
            Y = np.array(Y)"""
            xs.append(X)
            ys.append(Y)
    X_tr = np.concatenate(xs)
    Y_tr = np.concatenate(ys)
    del X, Y
    filename = os.path.join(ROOT, 'test_batch')
    #loading test data 
    with open(filename, 'rb') as f:
        datadict = pickle.load(f,encoding='latin1')
        X_te = datadict['data']
        Y_te = datadict['labels']
        """X_te = X.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype("int")
        Y_te = np.array(Y)"""
        
    
    return X_tr, Y_tr, X_te, Y_te

In [19]:
cifar10_dir = 'cifar-10-batches-py'
X_tr, y_tr, X_te, y_te = load_CIFAR10(cifar10_dir)

In [8]:
print("X_train shape is",X_tr.shape)
print("X_test shape is",X_te.shape)
print("y_train shape is",y_tr.shape)
print("ytest shape is",y_tr.shape)
print("number of calsses is",np.unique(y_tr))

X_train shape is (50000, 3072)
X_test shape is (10000, 3072)
y_train shape is (50000,)
ytest shape is (50000,)
number of calsses is [0 1 2 3 4 5 6 7 8 9]


In [9]:
sc=StandardScaler()
pca=PCA()
lda=LinearDiscriminantAnalysis()
svm=svm.SVC(max_iter=3000)
pipe=Pipeline(steps=[('sc',sc),('pca', pca),('svm',svm)])

In [10]:
params_grid = {
'svm__C':  [10],
'svm__kernel': ['linear'],
'svm__gamma': [0.001,.01],
'pca__n_components': [2,3]
}




In [11]:
grid_search = GridSearchCV(pipe, param_grid = params_grid, cv=5)
grid_search.fit(X_tr,y_tr)



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('sc', StandardScaler()), ('pca', PCA()),
                                       ('svm', SVC(max_iter=3000))]),
             param_grid={'pca__n_components': [2, 3], 'svm__C': [10],
                         'svm__gamma': [0.001, 0.01],
                         'svm__kernel': ['linear']})

In [12]:
print(grid_search.best_params_)
model=grid_search.best_estimator_
print(model)



{'pca__n_components': 2, 'svm__C': 10, 'svm__gamma': 0.01, 'svm__kernel': 'linear'}
Pipeline(steps=[('sc', StandardScaler()), ('pca', PCA(n_components=2)),
                ('svm', SVC(C=10, gamma=0.01, kernel='linear', max_iter=3000))])


In [13]:
Y_pred=model.predict(X_te)

In [14]:
final_mse = mean_squared_error(y_te,Y_pred )
final_rmse = np.sqrt(final_mse)
accuracy_score=accuracy_score(y_te,Y_pred )


In [15]:
print("final_rmse", final_rmse)
print("accuracy_score is", accuracy_score)

final_rmse 4.933244368567201
accuracy_score is 0.0931


In [20]:
# Use a nearest neighbor classifier to evaluate the methods

sc=StandardScaler()
pca=PCA()
lda=LinearDiscriminantAnalysis()
svm=svm.SVC(max_iter=2000)
pipe=Pipeline(steps=[('sc',sc),('pca', pca), ('lda', lda),('svm',svm)])



params_grid = [{
'svm__C': [1, 10],
'svm__kernel': ['linear', 'rbf'],
'svm__gamma': [0.001, 0.0001],

'pca__n_components': [2,3],
'lda':[None]
},
{
'svm__C': [1, 10],
'svm__kernel': ['linear', 'rbf'],
'svm__gamma': [0.001, 0.0001],
'pca':[None],
'lda__n_components': [2,3]
}]

grid_search = GridSearchCV(pipe, param_grid = params_grid, cv=5)
grid_search.fit(X_tr,y_tr)


print(grid_search.best_params_)
model=grid_search.best_estimator_
print(model)

Y_pred=model.predict(X_te)

final_mse = mean_squared_error(y_te,Y_pred )
final_rmse = np.sqrt(final_mse)
accuracy_score=accuracy_score(y_te,Y_pred )














{'lda__n_components': 3, 'pca': None, 'svm__C': 10, 'svm__gamma': 0.001, 'svm__kernel': 'rbf'}
Pipeline(steps=[('sc', StandardScaler()), ('pca', None),
                ('lda', LinearDiscriminantAnalysis(n_components=3)),
                ('svm', SVC(C=10, gamma=0.001, max_iter=2000))])


In [31]:
print(model)

Pipeline(steps=[('sc', StandardScaler()), ('pca', None),
                ('lda', LinearDiscriminantAnalysis(n_components=3)),
                ('svm', SVC(C=10, gamma=0.001, max_iter=2000))])


In [85]:
print("final_rmse", final_rmse)
print("accuracy_score is", accuracy_score)

final_rmse 4.120376196417022
accuracy_score is 0.2556


In [32]:
pickle.dump(model, open('svc', 'wb'))

In [34]:
model1 = pickle.load(open('./svc', 'rb'))

In [None]:
""""transform=make_pipeline(StandardScaler(),
                    LinearDiscriminantAnalysis(n_components=2))
X_te_pr=transform.fit_transform(X_te,y_te)""""