In [5]:
import numpy as np
import pandas as pd

In [6]:
from sklearn.svm import LinearSVC,SVC
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

In [7]:
with np.load('cifar4-train.npz',allow_pickle=False) as npz_file:
    data = dict(npz_file.items())
    print (data.keys())

dict_keys(['pixels', 'overfeat', 'labels', 'names', 'allow_pickle'])


In [8]:
#create X/y arrays
X = data['overfeat']
y = data['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1000, random_state=0)

In [9]:
from sklearn.dummy import DummyClassifier

# Evaluate baseline
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
accuracy = dummy.score(X_test, y_test)
print('Baseline accuracy ("most-frequent"): {:.3f}'.format(accuracy))

Baseline accuracy ("most-frequent"): 0.227


1) Chose LinearSVC as it scales better to a large number of samples.

2) Chose SVC as it's used for non linear classification, which is appropriate for our classification tasks.

### SVM classifier with linear kernel

In [10]:
pca = PCA(n_components=150)
pipe = Pipeline([('pca',pca),('linearsvc',LinearSVC())])

#create grid search with cross-validation object
grid_cv = GridSearchCV(pipe,{'linearsvc__C':[0.001,0.01,0.1,1,10]},cv=5)

#fit the estimator
grid_cv.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=150, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('linearsvc', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'linearsvc__C': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [11]:
grid_cv.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_linearsvc__C', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])

In [12]:
df = pd.DataFrame.from_items([('C',grid_cv.cv_results_['param_linearsvc__C']),
                             ('mean test',grid_cv.cv_results_['mean_test_score']),
                             ('std test',grid_cv.cv_results_['std_test_score']),
                              ('mean train',grid_cv.cv_results_['mean_train_score']),
                              ('std train',grid_cv.cv_results_['std_train_score']),])

In [13]:
df

Unnamed: 0,C,mean test,std test,mean train,std train
0,0.001,0.8335,0.008461,0.868439,0.003336
1,0.01,0.8325,0.012877,0.875752,0.004441
2,0.1,0.82775,0.010599,0.876939,0.005193
3,1.0,0.789,0.012334,0.821562,0.009081
4,10.0,0.7605,0.029278,0.793819,0.019945


In [14]:
df['mean test'].idxmax()

0

In [15]:
print ('Linear SVM - top accuracy across folds: {:.3f} (std: {:.3f}) with C: {}'.format(df.loc[0,'mean test'],
                                                                                        df.loc[0,'std test'],
                                                                                       df.loc[0,'C']))

Linear SVM - top accuracy across folds: 0.834 (std: 0.008) with C: 0.001


### SVM classifier with RBF kernel

In [17]:
svc_rbf = SVC(kernel='rbf',random_state=0)
pipe = Pipeline([('pca',pca),('svc_rbf',svc_rbf)])

grid_cv_rbf = GridSearchCV(pipe, {'svc_rbf__C':[0.1,1,10,20],'svc_rbf__gamma':[0.001,0.01,0.1,1]},cv=5)
grid_cv_rbf.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=150, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc_rbf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'svc_rbf__C': [0.1, 1, 10, 20], 'svc_rbf__gamma': [0.001, 0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [18]:
grid_cv_rbf.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_svc_rbf__C', 'param_svc_rbf__gamma', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])

In [19]:
df_rbf = pd.DataFrame.from_items([('C',grid_cv_rbf.cv_results_['param_svc_rbf__C']),
                                  ('gamma',grid_cv_rbf.cv_results_['param_svc_rbf__gamma']),
                             ('mean test',grid_cv_rbf.cv_results_['mean_test_score']),
                             ('std test',grid_cv_rbf.cv_results_['std_test_score']),
                              ])

In [20]:
df_rbf.sort_values('mean test',ascending=False).head()

Unnamed: 0,C,gamma,mean test,std test
12,20.0,0.001,0.80875,0.010669
8,10.0,0.001,0.8075,0.011109
4,1.0,0.001,0.803,0.007927
0,0.1,0.001,0.6275,0.014225
5,1.0,0.01,0.25625,0.000786


In [21]:
df_rbf['mean test'].idxmax()

12

In [22]:
print ('RBF SVM - top accuracy across folds: {:.3f} (std: {:.3f}) with C: {} and gamma {}'.format(df_rbf.loc[8,'mean test'],
                                                                                        df_rbf.loc[8,'std test'],
                                                                                       df_rbf.loc[8,'C'],
                                                                                                 df_rbf.loc[8,'gamma']))

RBF SVM - top accuracy across folds: 0.807 (std: 0.011) with C: 10 and gamma 0.001


### Tuned estimators on 1,000 test set 

In [23]:
accuracy = grid_cv.score(X_test,y_test)
accuracy_rbf = grid_cv_rbf.score(X_test,y_test)
print ('Linear SVM accuracy (test set): {:.3f}'.format(accuracy))
print ('RBF SVM accuracy (test set): {:.3f}'.format(accuracy_rbf))

Linear SVM accuracy (test set): 0.815
RBF SVM accuracy (test set): 0.809


#### Model Selection - SVM RBF Kernel

Looking at all the results, SVM Linear, SVM RBF, Logistics and FC NN seems to have the best test accuracy. 

    1) I eliminated SVM Linear as I consider image classification as not linear. 
    2) I equally elimiated Logistics because afer running the predict_proba() and printing out the images, it did not reflect correctly what was predicted against the real image.
    3) I finally chose SVM RBF based on Occam's razor, simpler one is usually better.  

In [24]:
#load test set
with np.load('cifar4-test.npz',allow_pickle=False) as npz_file:
    test_set = dict(npz_file.items())
    print (test_set.keys())

X_test_set = test_set['overfeat']

dict_keys(['pixels', 'overfeat', 'allow_pickle'])


In [28]:
results = grid_cv_rbf.predict(X_test_set)

In [29]:
#save to an .npy file
np.save('test-predictions.npy',results)

In [30]:
results


array([2, 3, 2, 3, 2, 3, 3, 3, 0, 3, 3, 0, 1, 1, 0, 2, 1, 1, 2, 1, 1, 0,
       1, 2, 3, 1, 2, 0, 2, 1, 0, 0, 2, 1, 3, 0, 1, 1, 1, 3, 2, 3, 0, 2,
       2, 2, 1, 1, 0, 2, 3, 1, 0, 1, 2, 0, 0, 2, 0, 1, 0, 0, 0, 0, 2, 1,
       2, 2, 0, 1, 2, 3, 3, 2, 3, 3, 3, 3, 3, 1, 2, 0, 2, 3, 3, 0, 2, 3,
       3, 2, 2, 1, 1, 0, 1, 2, 1, 1, 3, 1, 3, 2, 0, 3, 2, 2, 1, 2, 1, 3,
       0, 1, 1, 0, 3, 1, 2, 1, 1, 2, 0, 1, 2, 1, 2, 2, 3, 1, 1, 1, 0, 1,
       0, 3, 3, 2, 2, 1, 1, 1, 0, 3, 3, 2, 1, 2, 3, 2, 1, 1, 1, 2, 2, 2,
       3, 1, 3, 0, 3, 2, 0, 2, 0, 1, 2, 1, 0, 3, 3, 3, 0, 3, 0, 3, 0, 2,
       1, 2, 2, 0, 0, 2, 1, 0, 1, 2, 3, 1, 1, 0, 3, 0, 0, 0, 2, 3, 1, 2,
       3, 2, 3, 0, 3, 0, 1, 2, 1, 1, 2, 1, 0, 2, 1, 0, 3, 1, 0, 0, 0, 0,
       1, 3, 3, 2, 1, 0, 3, 3, 0, 1, 2, 1, 2, 3, 1, 3, 2, 2, 2, 2, 1, 3,
       1, 3, 3, 2, 1, 3, 2, 2, 2, 1, 2, 2, 3, 2, 2, 1, 1, 3, 3, 1, 2, 2,
       1, 1, 1, 3, 3, 0, 1, 3, 0, 2, 0, 2, 1, 2, 1, 1, 0, 2, 0, 3, 3, 2,
       2, 3, 3, 1, 1, 0, 2, 1, 2, 2, 3, 2, 0, 3, 1,