In [None]:
# from tpot import TPOTClassifier

# model selections
from sklearn.svm import LinearSVC
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.naive_bayes import GaussianNB

# preprocessing steps
import sklearn.model_selection
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score, recall_score

# classics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import pickle
from pprint import pprint 

rseed = 4444

%matplotlib inline

# Getting the Data

In [None]:
df_pxl = pd.read_pickle('df_pxl_features.pkl')

In [2]:
df_hog = pd.read_pickle('df_hog_features.pkl')

In [None]:
df_sift = pd.read_pickle('./pkls/df_sift_features_500.pkl')
df_sift = df_sift.drop('indx', axis=1)

In [3]:
targets= ['ak47', 'american-flag', 'backpack', 'baseball-bat',
          'baseball-glove', 'basketball-hoop', 'bat', 'bathtub', 'bear',
          'beer-mug', 'billiards', 'binoculars', 'birdbath', 'blimp',
          'bonsai', 'boom-box', 'bowling-ball', 'bowling-pin', 'boxing-glove']

In [None]:
df_pxl = df_pxl[df_pxl['category'].isin(targets)]

In [4]:
df_hog = df_hog[df_hog['category'].isin(targets)]

In [None]:
# Oddly, there are a few more images in the HOG frame than in the pixel frame.
# I don't understand why, but I don't think the data is off.
# `df` is not defined anywhere in this notebook; the pixel-feature frame is `df_pxl`.
# Both count series are sorted by category so they compare element-wise, and
# only the categories whose counts differ are shown.
pxl_counts = df_pxl.category.value_counts().sort_index()
hog_counts = df_hog.category.value_counts().sort_index()
hog_counts[pxl_counts != hog_counts]

In [None]:
print('PXL Categories')
# `df` is not defined in this notebook; the pixel-feature frame is `df_pxl`.
df_pxl.category.value_counts().sort_values(ascending=False).head(10)

In [None]:
print('HOG Categories')
df_hog.category.value_counts().sort_values(ascending=False).head(10)

In [None]:
# Preview the first image: the values after column 0 are reshaped to a 32x32 grayscale grid.
# `df` is not defined in this notebook (the pixel frame is `df_pxl`), and the
# row is converted with `.values` because a pandas Series cannot be reshaped directly.
plt.imshow(df_pxl.iloc[0, 1:].astype('uint8').values.reshape(32, 32), cmap='gray');

# Preparing and Splitting Data for Model

The data is stratified using the custom `Ntrain_test_split` function. I also considered using sklearn's [stratify option](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html#sklearn.model_selection.train_test_split) of the `train_test_split` function, or the [StratifiedShuffleSplit](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html) class.

In [5]:
def Ntrain_test_split(X, y, Ntrain=40, Ntest=10, dtype='float', random_seed=rseed, exclude=None):
    """Draw a fixed number of training and test samples from every category.

    For each category in ``y`` (in order of first appearance), ``Ntrain + Ntest``
    index labels are sampled without replacement. The first ``Ntrain`` of them
    go to the training set and the remaining ``Ntest`` to the test set.

    X - features to split (DataFrame sharing its index with ``y``)
    y - targets to split (Series; these must be categorical)
    Ntrain - number of training samples you want from each category
    Ntest - number of test samples you want from each category
    dtype - retained for backward compatibility only and ignored: sampled
            index labels now keep the native dtype of ``y.index`` instead of
            being stored in an array of this dtype (which failed for
            non-numeric index labels)
    random_seed - seed passed to ``np.random.seed`` before sampling; ``None``
                  leaves the global random state untouched (0 is a valid seed)
    exclude - category, or list of categories, to be excluded from the split

    Returns ``X_train, y_train, X_test, y_test`` (note the order, which differs
    from sklearn's ``train_test_split``).

    Raises ValueError if a category has fewer than ``Ntrain + Ntest`` samples.
    """
    # `is not None` rather than truthiness, so that a seed of 0 is honoured.
    # The global NumPy generator is seeded (not a private one) so that later
    # cells relying on np.random keep seeing the same random stream.
    if random_seed is not None:
        np.random.seed(random_seed)

    categories = list(y.unique())
    if exclude is not None:
        # Accept a single label as well as any iterable of labels.
        if isinstance(exclude, str) or not hasattr(exclude, '__iter__'):
            exclude = [exclude]
        excluded = set(exclude)
        categories = [cat for cat in categories if cat not in excluded]

    per_category = Ntrain + Ntest
    train_labels = []
    test_labels = []

    for cat in categories:
        candidates = y[y == cat].index
        if len(candidates) < per_category:
            raise ValueError(
                'Category %r has only %d samples; %d are needed (Ntrain + Ntest).'
                % (cat, len(candidates), per_category))

        # Randomly sample Ntrain + Ntest index labels for this category.
        sampled = np.random.choice(candidates, per_category, replace=False)

        # The first Ntrain labels train, the rest test; the two sets are
        # disjoint because the sample was drawn without replacement.
        train_labels.extend(sampled[:Ntrain])
        test_labels.extend(sampled[Ntrain:])

    X_train = X.loc[train_labels, :]
    y_train = y.loc[train_labels]
    X_test = X.loc[test_labels, :]
    y_test = y.loc[test_labels]

    return X_train, y_train, X_test, y_test

**Building training test split**

In [None]:
# # delete any data if needed
# del y_test
# del y_train
# del X_test
# del X_train

In [None]:
X_train, y_train, X_test, y_test = Ntrain_test_split(df_pxl.iloc[:,1:], df_pxl.category,
                                                     Ntrain = 40, Ntest = 10,
                                                     random_seed=rseed)

In [6]:
hX_train, hy_train, hX_test, hy_test = Ntrain_test_split(df_hog.iloc[:,1:], df_hog.category, 
                                                         Ntrain = 40, Ntest = 10,
                                                         random_seed=rseed)

In [None]:
sX_train, sy_train, sX_test, sy_test = Ntrain_test_split(df_sift.iloc[:,1:], df_sift.label, 
                                                         Ntrain = 40, Ntest = 10,
                                                         random_seed=rseed)

In [None]:
# another possibility ..
# Stratified split with sklearn's train_test_split (40% held out for testing)
# instead of the custom per-category sampler.
# NOTE(review): `X_df` and `y_df` are not defined in any visible cell of this
# notebook -- confirm where they are supposed to come from.
X_train_tts, X_test_tts, y_train_tts, y_test_tts = train_test_split(X_df, y_df, test_size=0.4, random_state=rseed, stratify=y_df)

In [None]:
X_train_mclut, y_train_mclut, X_test_mclut, y_test_mclut = Ntrain_test_split(X_df, y_df, 
                                                                             random_seed=rseed, 
                                                                             exclude=['clutter'])

In [None]:
X_train_sm, y_train_sm, X_test_sm, y_test_sm = Ntrain_test_split(X_df, y_df, 
                                                                 Ntrain=10, Ntest=2,
                                                                 random_seed=rseed)

**Testing their shapes**

In [None]:
print(X_train.shape, X_train_mclut.shape, X_train_sm.shape)

In [None]:
print(X_test.shape, X_test_mclut.shape, X_test_sm.shape)

In [None]:
print(y_test.shape, y_test_mclut.shape, y_test_sm.shape)

In [None]:
# do this if you need to ensure that the category variables line up from the test split
# y_test_copy = pd.DataFrame(y_df.loc[y_test.index])
# y_test_copy['cat_test'] = y_test
# y_test_copy

# Linear SVM Classifier

#### TODO:
1. Build visualizations
    - ROC curves
    - Confusion Matrix
2. Discover metrics for this
3. Do grid search on parameters

In [None]:
from sklearn.linear_model import SGDClassifier

`Accuracy: 0.40`

In [None]:
# # Standardize our data around meanCreate a SVC classifier using a linear kernel
pipe_lrSVC = Pipeline([('scaler', StandardScaler()),
                       ('clf', LinearSVC(random_state=0))])
%time pipe_lrSVC.fit(X_train, y_train)

`Accuracy: 0.05`

In [None]:
# # Standardize our data around mean. Create a SVC classifier using a linear kernel and SGD
n_iter = np.ceil(10**6 / hX_train.shape[0])
pipe_lrSVC = Pipeline([('scaler', StandardScaler()),
                       ('clf', SGDClassifier(loss='squared_hinge', 
                                             penalty='l2',
                                             alpha=0.001,
                                             random_state=rseed,
                                             n_iter = 20))])
%time pipe_lrSVC.fit(hX_train, hy_train)

In [None]:
# The pipeline in the previous cell was fitted on the HOG features (hX_train),
# so it has to be scored on the HOG test split rather than the pixel one.
pipe_lrSVC.score(hX_test, hy_test)

In [None]:
df_hog.info()

In [None]:
pipe_lrSVC = Pipeline([('scaler', StandardScaler()),
                       ('clf', SGDClassifier(loss='squared_hinge', 
                                             penalty='l2',
                                             alpha=0.001,
                                             random_state=rseed))])

In [None]:
# Visit the training rows in random order so that each mini-batch mixes categories
# (X_train comes out of Ntrain_test_split grouped by category).
r_train = np.random.choice(X_train.index, len(X_train), replace=False)

# The scaler is fitted once on the whole training set; batches are only transformed.
pipe_lrSVC.named_steps['scaler'].fit(X_train)

# NOTE(review): `partial_pipe_fit` is defined in a later cell of this notebook;
# that cell has to be run before this one.
batchsize = 100
for ind in range(0, len(r_train), batchsize):
    batch = r_train[ind:ind + batchsize]
    partial_pipe_fit(pipe_lrSVC, X_train.loc[batch, :], y_train.loc[batch])

In [None]:
def partial_pipe_fit(pipeline_obj, X, y, classes=None):
    """Train a scaler + classifier pipeline on one mini-batch.

    The batch is transformed with the pipeline's already-fitted ``'scaler'``
    step and then passed to the ``'clf'`` step's ``partial_fit``.

    pipeline_obj - object exposing ``named_steps['scaler']`` and ``named_steps['clf']``
    X - feature batch
    y - target batch
    classes - all class labels the classifier can encounter; when omitted,
              the unique values of the notebook-level ``y_train`` are used
              (the original behaviour)
    """
    if classes is None:
        # Backward-compatible fallback on the notebook-global training targets.
        classes = y_train.unique()
    X_scaled = pipeline_obj.named_steps['scaler'].transform(X)
    pipeline_obj.named_steps['clf'].partial_fit(X_scaled, y, classes=classes)

In [None]:
pipe_lrSVC.score(X_test,y_test)

In [None]:
# PIXEL FEATURES

# Saving of the fitted pipeline is left disabled.
# NOTE(review): the dump below targets 'pipe_lrSVC.pkl' while the load reads
# 'pxl_lrSVC.pkl' -- confirm which file name is intended.
# with open('pipe_lrSVC.pkl', 'wb') as f:
#     pickle.dump(pipe_lrSVC, f)

# Load a previously pickled pipeline from the working directory.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('pxl_lrSVC.pkl', 'rb') as f:
    pxl_lrSVC = pickle.load(f)


# HOG FEATURES

In [None]:
pxl_lrSVC.named_steps['clf'].coef_

In [None]:
pxl_lrSVC.score(X_test, y_test)

In [None]:
y_pred_actual = pipe_lrSVC.predict(X_test)
# y_pred_mclut = pipe_lrSVC.predict(X_test_mclut)

In [None]:
# pd.DataFrame({'actual' : y_test,
#               'prediction': y_pred})

In [None]:
# Confusion matrices for the regular split and for the split without the 'clutter' category.
cnf_matrix = confusion_matrix(y_test, y_pred_actual)
# `y_pred_mclut` is never assigned in this notebook (its line is commented out
# in an earlier cell), so the predictions are computed here.
y_pred_mclut = pipe_lrSVC.predict(X_test_mclut)
cnf_matrix_mclut = confusion_matrix(y_test_mclut, y_pred_mclut)

In [None]:
plt.figure(figsize=(10,10))
plt.imshow(cnf_matrix, interpolation='nearest', cmap=diverging_cmap)
plt.title('Linear SVM')
plt.colorbar()
plt.ylabel('True label (Category #)')
plt.xlabel('Predicted label (Category #)');

In [None]:
plt.figure(figsize=(10,10))
plt.imshow(cnf_matrix_mclut, interpolation='nearest', cmap=diverging_cmap)
plt.title('Linear SVM')
plt.colorbar()
plt.ylabel('True label (Category #)')
plt.xlabel('Predicted label (Category #)');

http://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html#sklearn.dummy.DummyClassifier

# PXL Features

KNN & SVM  
Misclassifications  
Showcasing  
HEATMAP The hog  
Neural net - less data to train on a convolutional neural net, smaller model, internet of things  
Business case - grocery store (something that runs on handscanners), getting new items from a grocery store  
 - price checks on grocery stores can improve as bar codes can fade from materials

Logo recognition on clothing (direct consumer marketing)  
Accuracy as a function of categories  

In [None]:
# # Standardize our data around meanCreate a SVC classifier using a linear kernel
pxl_pipe_lrSVC = Pipeline([('scaler', StandardScaler()),
                           ('clf', LinearSVC(random_state=0))])
%time pxl_pipe_lrSVC.fit(X_train, y_train)
y_pred_pxl = pxl_pipe_lrSVC.predict(X_test)

In [None]:
print('Accuracy:', accuracy_score(y_test, y_pred_pxl))

In [None]:
# classification_report returns a pre-formatted multi-line string: print() renders
# the table, whereas pprint() shows the string's repr with escaped newlines.
print(classification_report(y_test, y_pred_pxl))

In [None]:
# Compute confusion matrix
# confusion_matrix orders its rows/columns by sorted label, so the label column
# is built in that same order (and passed explicitly) instead of using the
# order of first appearance in y_train, which would mislabel the rows whenever
# the two orders differ.
pxl_labels = sorted(set(y_test) | set(y_pred_pxl))
pxl_cnf_matrix = confusion_matrix(y_test, y_pred_pxl, labels=pxl_labels)
df_pxl_cm = pd.merge(pd.DataFrame(pxl_labels, columns=['category']),
                     pd.DataFrame(pxl_cnf_matrix), left_index=True, right_index=True)

In [None]:
sns.set_context('poster')
plt.figure(figsize=(15,15))
plt.imshow(pxl_cnf_matrix, interpolation='nearest', cmap=diverging_cmap)
plt.title('LinearSVM Pixel Features', fontsize=25, y=1.02)
plt.grid(False)
plt.colorbar(drawedges=True,
            spacing='proportional',
            ticks=range(0,11),
            shrink=.82)
plt.ylabel('True label (Category #)')
plt.yticks(range(0,19), df_pxl_cm.category)

plt.xlabel('Predicted label (Category #)');
plt.xticks(range(0,19), range(1,20));

In [None]:
N = 20
# Rank categories by the largest count in their confusion-matrix row. Only the
# numeric columns are used: including the string `category` column makes
# max(axis=1) fail on recent pandas versions.
row_max = df_pxl_cm.iloc[:, 1:].max(axis=1)
worst_performers = df_pxl_cm.loc[row_max.sort_values()[:N].index,:].sort_index()

# sns.set_context('poster')
plt.figure(figsize=(12,9))
plt.imshow(worst_performers.iloc[:,1:], interpolation='nearest',aspect='auto', cmap=diverging_cmap)
plt.title('Worst Performers Using Pixel Features')
plt.grid(False)
plt.colorbar()
plt.tight_layout()
# plt.ylabel('True label (Category #)')
plt.xlabel('Predicted label (Category #)');
# One tick per selected row: there can be fewer than N categories, and the
# number of tick positions must match the number of labels.
plt.yticks(range(len(worst_performers)), worst_performers.category);
plt.tick_params(axis='y', which='both', labelsize=21)

In [None]:
N = 20
# Rank categories by the largest count in their confusion-matrix row. Only the
# numeric columns are used: including the string `category` column makes
# max(axis=1) fail on recent pandas versions.
row_max = df_pxl_cm.iloc[:, 1:].max(axis=1)
best_performers = df_pxl_cm.loc[row_max.sort_values()[-N:].index,:].sort_index()

# sns.set_context('poster')
plt.figure(figsize=(12,9))
plt.imshow(best_performers.iloc[:,1:], interpolation='nearest',aspect='auto', cmap=diverging_cmap)
plt.title('Best Performers Using Pixel Features')
plt.grid(False)
# plt.colorbar()
plt.tight_layout()
# plt.ylabel('True label (Category #)')
plt.xlabel('Predicted label (Category #)');
# One tick per selected row: there can be fewer than N categories, and the
# number of tick positions must match the number of labels.
plt.yticks(range(len(best_performers)), best_performers.category);
plt.tick_params(axis='y', which='both', labelsize=21)

# HOG Features

In [None]:
# HOG FEATURES

# with open('hog_pipe_lrSVC.pkl', 'wb') as f:
#     pickle.dump(hog_pipe_lrSVC, f)

# with open('hog_pxl_lrSVC.pkl', 'rb') as f:
#     hog_pipe_lrSVC = pickle.load(f)

### No clustering

In [7]:
# # Standardize our data around meanCreate a SVC classifier using a linear kernel
hog_pipe_lrSVC = Pipeline([('scaler', StandardScaler()),
                           ('clf', LinearSVC(random_state=0))])
%time hog_pipe_lrSVC.fit(hX_train, hy_train)
print('Accuracy:', hog_pipe_lrSVC.score(hX_test, hy_test))

CPU times: user 8.6 s, sys: 58.4 ms, total: 8.66 s
Wall time: 8.7 s
Accuracy: 0.236842105263


In [8]:
hog_pred = hog_pipe_lrSVC.predict(hX_test)

In [10]:
# classification_report returns a pre-formatted multi-line string: print() renders
# the table, whereas pprint() shows the string's repr with escaped newlines.
print(classification_report(hy_test, hog_pred))

('                 precision    recall  f1-score   support\n'
 '\n'
 '           ak47       0.22      0.20      0.21        10\n'
 '  american-flag       0.00      0.00      0.00        10\n'
 '       backpack       0.60      0.30      0.40        10\n'
 '   baseball-bat       0.40      0.40      0.40        10\n'
 ' baseball-glove       0.25      0.20      0.22        10\n'
 'basketball-hoop       0.17      0.10      0.12        10\n'
 '            bat       0.17      0.20      0.18        10\n'
 '        bathtub       0.17      0.20      0.18        10\n'
 '           bear       0.07      0.20      0.11        10\n'
 '       beer-mug       0.55      0.60      0.57        10\n'
 '      billiards       0.25      0.20      0.22        10\n'
 '     binoculars       0.75      0.30      0.43        10\n'
 '       birdbath       0.11      0.30      0.16        10\n'
 '          blimp       0.08      0.10      0.09        10\n'
 '         bonsai       0.23      0.30      0.26        10\n'
 '

### Clustering

In [None]:
from sklearn.cluster import MiniBatchKMeans
kmeans = MiniBatchKMeans(1024, random_state=rseed)
# kmeans_217 = MiniBatchKMeans(217, random_state=rseed)

In [None]:
# Visit every training row exactly once, in random order, in mini-batches.
# The sample size follows the training set: a hard-coded size larger than the
# number of rows cannot be drawn without replacement.
# NOTE(review): with the 19-category subset (40 training rows each) there may be
# fewer rows than the clusters requested for `kmeans` -- confirm.
rand_select = np.random.choice(hX_train.index, size=hX_train.shape[0], replace=False)
batch_size = 2000
for ind in range(0,hX_train.shape[0],batch_size):
    print(ind)
    kmeans.partial_fit(hX_train.loc[rand_select[ind:ind+batch_size], :])

In [None]:
# `kmeans_217` is only created in a commented-out line of an earlier cell, so this
# loop would raise NameError. It is disabled here, like the rest of the
# 217-cluster experiment; re-enable both together if needed.
# for ind in range(0,hX_train.shape[0],batch_size):
#     print(ind)
#     kmeans_217.partial_fit(hX_train.loc[rand_select[ind:ind+batch_size], :])

In [None]:
khX_train = kmeans.transform(hX_train)
# k217hX_train = kmeans_217.transform(hX_train)

### SVM

In [None]:
# # Standardize our data around meanCreate a SVC classifier using a linear kernel
hog_pipe_lrSVC = Pipeline([('scaler', StandardScaler()),
                           ('clf', LinearSVC(random_state=0))])
%time hog_pipe_lrSVC.fit(khX_train, hy_train)

In [None]:
# # # Standardize our data around meanCreate a SVC classifier using a linear kernel
# hog217_pipe_lrSVC = Pipeline([('scaler', StandardScaler()),
#                               ('clf', LinearSVC(random_state=0))])
# %time hog217_pipe_lrSVC.fit(k217hX_train, hy_train)

In [None]:
khX_test = kmeans.transform(hX_test)

In [None]:
khog_pred = hog_pipe_lrSVC.predict(khX_test)

In [None]:
%time hog_pipe_lrSVC.score(khX_test, hy_test)

### Sparse Encoding

In [None]:
from sklearn.decomposition import SparseCoder

In [None]:
std_scaler = StandardScaler()

In [None]:
scaledClusters = std_scaler.fit_transform(kmeans.cluster_centers_)

In [None]:
scdr = SparseCoder(scaledClusters)

In [None]:
sparse_matrix = scdr.transform(hX_train)

### Results

In [None]:
# diverging_cmap = matplotlib.colors.ListedColormap(sns.cubehelix_palette(8).as_hex())
# Altering the cubehelix: 
# http://seaborn.pydata.org/generated/seaborn.cubehelix_palette.html#seaborn.cubehelix_palette
# See here for where cmap comes from:
# http://stackoverflow.com/questions/37902459/how-do-i-use-seaborns-color-palette-as-a-colormap-in-matplotlib
diverging_cmap = sns.cubehelix_palette(start=2,
                                       rot=0.2,
                                       gamma=1.5,
                                       hue=1, 
                                       light=1, 
                                       dark=0, 
                                       as_cmap = True)

diverging_cube = sns.cubehelix_palette(n_colors=257,
                                       start=2,
                                       rot=0.2,
                                       gamma=1.5,
                                       hue=1, 
                                       light=0.9, 
                                       dark=0.3,
                                       reverse=False)

In [None]:
# Compute confusion matrix
# confusion_matrix orders its rows/columns by sorted label, so the label column
# is built in that same order (and passed explicitly) instead of using the
# order of first appearance in hy_test, which would mislabel the rows whenever
# the two orders differ.
hog_labels = sorted(set(hy_test) | set(hog_pred))
hog_cnf_matrix = confusion_matrix(hy_test, hog_pred, labels=hog_labels)
df_hog_cm = pd.merge(pd.DataFrame(hog_labels, columns=['category']),
                     pd.DataFrame(hog_cnf_matrix), left_index=True, right_index=True)

In [None]:
sns.set_context('poster')
plt.figure(figsize=(15,15))
plt.imshow(hog_cnf_matrix, interpolation='nearest', cmap=diverging_cmap)
plt.title('LinearSVM HOG Features', fontsize=25, y=1.02)
plt.grid(False)
plt.colorbar(drawedges=True,
            spacing='proportional',
            ticks=range(0,11),
            shrink=.82)
plt.ylabel('True label (Category #)')
plt.yticks(range(0,19), df_hog_cm.category)
plt.xlabel('Predicted label (Category #)');
plt.xticks(range(0,19), range(1,20));

In [None]:
N = 5
# Rank categories by the largest count in their confusion-matrix row. Only the
# numeric columns are used: including the string `category` column makes
# max(axis=1) fail on recent pandas versions.
row_max = df_hog_cm.iloc[:, 1:].max(axis=1)
worst_performers = df_hog_cm.loc[row_max.sort_values()[:N].index,:].sort_index()

# sns.set_context('poster')
plt.figure(figsize=(12,3))
plt.imshow(worst_performers.iloc[:,1:], interpolation='nearest',aspect='auto', cmap=diverging_cmap)
plt.title('Worst Performers using HOG Features', fontsize=25, y=1.02)
plt.grid(False)
plt.colorbar(drawedges=True,
            spacing='proportional',
            ticks=range(0,11),
            shrink=.82)
plt.tight_layout()
# plt.ylabel('True label (Category #)')
plt.xlabel('Predicted label (Category #)');
# One tick per selected row, so positions and labels always match in number.
plt.yticks(range(len(worst_performers)), worst_performers.category);
plt.tick_params(axis='y', which='both', labelsize=21)

In [None]:
N = 5
# Rank categories by the largest count in their confusion-matrix row. Only the
# numeric columns are used: including the string `category` column makes
# max(axis=1) fail on recent pandas versions.
row_max = df_hog_cm.iloc[:, 1:].max(axis=1)
best_performers = df_hog_cm.loc[row_max.sort_values()[-N:].index,:].sort_index()

# sns.set_context('poster')
plt.figure(figsize=(12,3))
plt.imshow(best_performers.iloc[:,1:], interpolation='nearest',aspect='auto', cmap=diverging_cmap)
plt.title('Best Performers Using HOG Features', fontsize=25, y=1.02)
plt.grid(False)
plt.colorbar(drawedges=True,
            spacing='proportional',
            ticks=range(0,11),
            shrink=.82)
plt.tight_layout()
# plt.ylabel('True label (Category #)')
plt.xlabel('Predicted label (Category #)');
# One tick per selected row, so positions and labels always match in number.
plt.yticks(range(len(best_performers)), best_performers.category);
plt.tick_params(axis='y', which='both', labelsize=21)

# SIFT Features

In [None]:
from sklearn.decomposition import sparse_encode
from sklearn.decomposition import MiniBatchDictionaryLearning

In [None]:
# Dictionary
dimensions = 37
mbdl = MiniBatchDictionaryLearning(dimensions, split_sign=True, random_state=rseed)
mbdl.fit(sX_train)

# Sparse Encoder
code = sparse_encode(sX_train, mbdl.components_)

# # Standardize our data around meanCreate a SVC classifier using a linear kernel
sift_pipe_lrSVC = Pipeline([('scaler', StandardScaler()),
                            ('clf', LinearSVC(random_state=0))])
%time sift_pipe_lrSVC.fit(code, sy_train)
sX_test_sp = sparse_encode(sX_test, mbdl.components_)
sift_pred = sift_pipe_lrSVC.predict(sX_test_sp)

In [None]:
# classification_report returns a pre-formatted multi-line string: print() renders
# the table, whereas pprint() shows the string's repr with escaped newlines.
print(classification_report(sy_test, sift_pred))

In [None]:
print('Accuracy:', sift_pipe_lrSVC.score(sX_test_sp, sy_test))
# print('F1-Score:', f1_score(sy_test, sX_test_sp))

### Results

In [None]:
sy_test.nunique()

In [None]:
# Compute confusion matrix
sift_cnf_matrix = confusion_matrix(sy_test, sift_pred)

In [None]:
# `sift_cnf_matrix` was computed with confusion_matrix's default label order
# (sorted labels seen in the true or predicted values), so the label column is
# built in that same order rather than in order of first appearance in sy_test.
sift_labels = sorted(set(sy_test) | set(sift_pred))
df_sift_cm = pd.merge(pd.DataFrame(sift_labels, columns=['category']),
                     pd.DataFrame(sift_cnf_matrix), left_index=True, right_index=True)

In [None]:
sns.set_context('poster')
plt.figure(figsize=(15,15))
plt.imshow(sift_cnf_matrix, interpolation='nearest', cmap=diverging_cmap)
plt.title('LinearSVM SIFT Features',  fontsize=25, y=1.02)
plt.grid(False)
plt.colorbar(drawedges=True,
            spacing='proportional',
            ticks=range(0,11),
            shrink=.82)
# 19 tick positions need exactly 19 labels (1..19), matching the pixel and HOG plots.
plt.xticks(range(0,19), range(1,20))
plt.yticks(range(0,19), df_sift_cm.category)

plt.ylabel('True label (Category #)')
plt.xlabel('Predicted label (Category #)');

In [None]:
N = 5
# Rank categories by the largest count in their confusion-matrix row. Only the
# numeric columns are used: including the string `category` column makes
# max(axis=1) fail on recent pandas versions.
row_max = df_sift_cm.iloc[:, 1:].max(axis=1)
worst_performers = df_sift_cm.loc[row_max.sort_values()[:N].index,:].sort_index()

# sns.set_context('poster')
plt.figure(figsize=(12,3))
plt.imshow(worst_performers.iloc[:,1:], interpolation='nearest',aspect='auto', cmap=diverging_cmap)
plt.title('Worst Performers using SIFT Features', fontsize=25, y=1.02)
plt.grid(False)
# plt.colorbar()
plt.tight_layout()
# plt.ylabel('True label (Category #)')
plt.xlabel('Predicted label (Category #)');
# One tick per selected row, so positions and labels always match in number.
plt.yticks(range(len(worst_performers)), worst_performers.category);
plt.tick_params(axis='y', which='both', labelsize=21)

In [None]:
N = 5
# Rank categories by the largest count in their confusion-matrix row. Only the
# numeric columns are used: including the string `category` column makes
# max(axis=1) fail on recent pandas versions.
row_max = df_sift_cm.iloc[:, 1:].max(axis=1)
best_performers = df_sift_cm.loc[row_max.sort_values()[-N:].index,:].sort_index()

# sns.set_context('poster')
plt.figure(figsize=(12,3))
plt.imshow(best_performers.iloc[:,1:], interpolation='nearest',aspect='auto', cmap=diverging_cmap)
plt.title('Best Performers Using SIFT Features', fontsize=25, y=1.02)
plt.grid(False)
plt.colorbar(drawedges=True,
            spacing='proportional',
            ticks=range(0,11),
            shrink=.82)
plt.tight_layout()
# plt.ylabel('True label (Category #)')
plt.xlabel('Predicted label (Category #)');
# One tick per selected row, so positions and labels always match in number.
plt.yticks(range(len(best_performers)), best_performers.category);
plt.tick_params(axis='y', which='both', labelsize=21)

In [None]:
# weights plot? 
# sns.heatmap(sift_pipe_lrSVC.steps[1][-1].coef_)

### Experiments on Dimensionality

In [None]:
acc = []

In [None]:
for dimensions in range(80,150):
    # Dictionary
    mbdl = MiniBatchDictionaryLearning(dimensions, split_sign=True, random_state=rseed)
    mbdl.fit(sX_train)

    # Sparse Encoder
    code = sparse_encode(sX_train, mbdl.components_)

    # Standardize our data around mean & Create a SVC classifier using a linear kernel
    sift_pipe_lrSVC = Pipeline([('scaler', StandardScaler()),
                                ('clf', LinearSVC(random_state=0))])
    sift_pipe_lrSVC.fit(code, sy_train)
    
    acc.append(sift_pipe_lrSVC.score(sparse_encode(sX_test, mbdl.components_), sy_test))

In [None]:
# Position of the highest accuracy within `acc`.
# NOTE(review): this is a list position, not a dictionary size. The loop in this
# section appends one entry per value of `dimensions`, so the mapping from
# position to `dimensions` depends on which ranges were run into `acc` -- confirm.
acc.index(max(acc))

In [None]:
# Dictionary
mbdl = MiniBatchDictionaryLearning(37, split_sign=True, random_state=rseed)
mbdl.fit(sX_train)

# Sparse Encoder
code = sparse_encode(sX_train, mbdl.components_)

# Standardize our data around mean & Create a SVC classifier using a linear kernel
sift_pipe_lrSVC = Pipeline([('scaler', StandardScaler()),
                            ('clf', LinearSVC(random_state=0))])
sift_pipe_lrSVC.fit(code, sy_train)

In [None]:
sX_test_sparse = sparse_encode(sX_test, mbdl.components_)

In [None]:
sns.set_style("darkgrid")

plt.figure(figsize=(15,6))
plt.plot(acc);
sns.despine()
plt.title('# of Visual Words vs. Accuracy', fontsize=25, y=1.02)
plt.ylabel('Accuracy')
plt.xlabel('# of Visual Words')
plt.xlim((-0.2,125))
plt.xticks(range(0,130,10));
# plt.xticks()[0]

In [None]:
with open('dict_acc', 'wb') as f:
    pickle.dump(acc, f)