In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn.decomposition
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import mixture

# load MNIST data

In [2]:
import os, struct
from array import array as pyarray
from numpy import append, array, int8, uint8, zeros

def load_mnist(dataset="training", digits=np.arange(10),
               path=r'C:\Users\David\Documents\ETHZ 2015-2017\'16 HERBST\THESIS\MNIST'):
    """
    Loads MNIST files into 3D numpy arrays

    Adapted from: http://abel.ee.ucla.edu/cvxopt/_downloads/mnist.py

    Parameters
    ----------
    dataset : str
        "training" reads the 'train-*' files, "testing" reads the 't10k-*' files.
    digits : iterable of int
        Only samples whose label is in this collection are returned.
    path : str
        Directory that contains the four IDX files.

    Returns
    -------
    images : ndarray of uint8, shape (N, rows, cols)
    labels : ndarray of int8, shape (N, 1)
        N is the number of samples whose label is in `digits`; rows/cols come
        from the image file header.

    Raises
    ------
    ValueError
        If `dataset` is not one of the two accepted names, or if a file holds
        fewer entries than the image header announces.
    """

    if dataset == "training":
        fname_img = os.path.join(path, 'train-images.idx3-ubyte')
        fname_lbl = os.path.join(path, 'train-labels.idx1-ubyte')
    elif dataset == "testing":
        fname_img = os.path.join(path, 't10k-images.idx3-ubyte')
        fname_lbl = os.path.join(path, 't10k-labels.idx1-ubyte')
    else:
        raise ValueError("dataset must be 'testing' or 'training'")

    # `with` guarantees the handles are closed even if a read/unpack fails
    with open(fname_lbl, 'rb') as flbl:
        magic_nr, size = struct.unpack(">II", flbl.read(8))
        lbl = np.frombuffer(flbl.read(), dtype=int8)

    with open(fname_img, 'rb') as fimg:
        # the item count from the image header is the one used below
        magic_nr, size, rows, cols = struct.unpack(">IIII", fimg.read(16))
        img = np.frombuffer(fimg.read(), dtype=uint8)

    if len(lbl) < size or len(img) < size * rows * cols:
        raise ValueError("MNIST files are truncated: header announces %d items" % size)

    # a set gives constant-time membership instead of scanning `digits` per label
    wanted = set(np.asarray(digits).ravel().tolist())
    ind = np.array([k for k, value in enumerate(lbl[:size].tolist()) if value in wanted],
                   dtype=np.intp)

    # fancy indexing copies, so the returned arrays are writable and own their data
    images = img[:size * rows * cols].reshape(size, rows, cols)[ind]
    labels = lbl[ind].reshape(len(ind), 1)

    return images, labels

In [3]:
# Read both MNIST splits with load_mnist's default digit set and default path.
images_train, labels_train = load_mnist(dataset="training")
images_test, labels_test = load_mnist(dataset="testing")

In [4]:
# format labels: flatten the label arrays returned by load_mnist and wrap
# them in pandas Series (default integer index)
use_labels_train = pd.Series(labels_train.flatten())
use_labels_test = pd.Series(labels_test.flatten())

# preprocessing

In [5]:
def divide_2704_into_16s():
    """Return the 168 split positions 16, 32, ..., 2688.

    Passed to np.split along the column axis, these positions cut a
    2704-column array into 169 consecutive blocks of 16 columns.
    """
    return [16 * block for block in range(1, 169)]

In [6]:
def robust_equalization(scattering_vecs):
    """Rescale every 16-column path block by its largest per-signal L2 norm.

    The input (one row per signal) is cut into 16-column blocks at the
    positions given by divide_2704_into_16s(). Within each block the L2 norm
    of every row is computed and the whole block is divided by the largest
    of those norms.

    Returns a list with one 1-D array per input row.
    """
    # one (n_signals, 16) block per path
    path_blocks = np.split(np.array(scattering_vecs), divide_2704_into_16s(), axis=1)

    scaled_blocks = []
    for block in path_blocks:
        # the signal with the biggest energy in this path sets the scale
        largest_norm = max(np.linalg.norm(block, axis=1))
        scaled_blocks.append(block / abs(largest_norm))

    # glue the blocks back together column-wise
    equalized = np.concatenate(np.array(scaled_blocks), axis=1)

    return [equalized[row, :] for row in range(len(equalized))]

In [7]:
# normalize so that all coeffs are in [-1,1]
def normalize_scattering_vecs(scattering_vecs, norm_type='hard', per_path=True):
    """Rescale scattering vectors, either per coefficient or per 16-column path.

    Parameters
    ----------
    scattering_vecs : array-like, one row per signal
    norm_type : {'hard', 'soft'}
        'hard' divides by the largest absolute value; 'soft' subtracts the
        mean and divides by twice the standard deviation. 'soft' is only
        available when per_path is False.
    per_path : bool
        True  -> every 16-column block (split at divide_2704_into_16s()) is
                 divided by its own largest absolute value.
        False -> rows of the transposed data (i.e. coefficients) are
                 processed one at a time.

    Returns
    -------
    list of 1-D arrays, one per input row.

    Raises
    ------
    ValueError
        If norm_type is not supported for the chosen mode. (The original
        code did `raise("...")`, which is itself a TypeError.)
    """
    # validate before doing any work
    if per_path:
        if norm_type != 'hard':
            raise ValueError("Error: norm_type must be 'hard'!")
    elif norm_type not in ('hard', 'soft'):
        raise ValueError("Error: norm_type must be either 'hard' or 'soft'!")

    data = np.array(scattering_vecs)
    if not np.issubdtype(data.dtype, np.inexact):
        # integer input would be truncated by the in-place division below
        data = data.astype(float)

    # per coeff: make every single coeff in [-1,1]
    if not per_path:

        X = data.T  # view: row i is coefficient i across all signals

        if norm_type == 'hard':
            # NOTE(review): only every 16th coefficient is rescaled here (the
            # original filtered on i % 16 == 0), which does not match the
            # "every single coeff" comment above -- confirm the intent.
            for i in range(0, len(X), 16):
                peak = np.abs(X[i, :]).max()
                if peak != 0:  # an all-zero coefficient is left as zeros, not NaN
                    X[i, :] = X[i, :] / peak
        else:
            for i in range(len(X)):
                mean = X[i, :].mean()
                std = X[i, :].std()
                if std != 0:
                    # subtract mean, divide by twice standard deviation
                    X[i, :] = (X[i, :] - mean) / (2 * std)
                else:
                    # constant coefficient: centring alone gives zeros
                    X[i, :] = X[i, :] - mean

        X = X.T

    # make every 16 predictors (one path) in [-1,1]
    else:
        blocks = np.split(data, divide_2704_into_16s(), axis=1)

        scaled = []
        for block in blocks:
            peak = np.abs(block).max()
            scaled.append(block / peak if peak != 0 else block)

        X = np.concatenate(scaled, axis=1)

    return [X[j, :] for j in range(len(X))]

# load and preprocess SCNet and PCANet coeffs

In [5]:
# first 6000 training images and their labels
imgs_6000 = images_train[:6000]
labels_6000 = use_labels_train[:6000]
# rebuild the Series on an explicit 0..5999 integer index
labels_6000 = pd.Series(labels_6000, index=np.arange(6000))
# first 5000 labels; NOTE(review): labels_5000 is reassigned from the random
# `training` indices in a later cell, so its meaning depends on run order
labels_5000 = labels_6000[:5000]

In [44]:
# train on SCNet output
# two comma-separated files are loaded, appended (np.append flattens) and
# reshaped into a single (6000, 2704) array
test_5000 = np.loadtxt('scattering_vecs_5000_sigma08.txt', delimiter=',')
test_5000_6000 = np.loadtxt('scattering_vecs_5000_6000_sigma08.txt', delimiter=',')
test_6000 = np.reshape(np.append(test_5000, test_5000_6000), (6000,2704))

In [26]:
# train on perturbed MNIST datasets
# comma-separated scattering vectors for the 'rand' and 'rot' variants
# (presumably 6000 rows each, going by the file names -- TODO confirm)
test_6000_rand = np.loadtxt('rand_scattering_vecs_6000.txt', delimiter=',')
test_6000_rot = np.loadtxt('rot_scattering_vecs_6000.txt', delimiter=',')

In [26]:
# train on PCANet
# the two files are appended (flattened) and reshaped into one (6000, 18432) array
test_5000_PCANet_temp = np.loadtxt('PCANet_train_new.txt', delimiter=',')
test_5000_6000_PCANet_temp = np.loadtxt('PCANet_test_new.txt', delimiter=',')
test_6000_PCANet = np.reshape(np.append(test_5000_PCANet_temp, test_5000_6000_PCANet_temp),
                                   (6000, 18432))

In [12]:
# RandNet features, read from a comma-separated text file
test_6000_RandNet = np.loadtxt('RandNet_v2.txt', delimiter=',')

In [37]:
# LDANet features: the two files are appended (flattened) and reshaped into
# one (6000, 18432) array
test_5000_LDANet_temp = np.loadtxt('LDANet_train.txt', delimiter=',')
test_5000_6000_LDANet_temp = np.loadtxt('LDANet_test.txt', delimiter=',')
test_6000_LDANet = np.reshape(np.append(test_5000_LDANet_temp, test_5000_6000_LDANet_temp),
                                   (6000, 18432))

In [45]:
# draw 1000 distinct indices out of 0..5999 and persist them to CSV
# NOTE(review): no random seed is set, so re-running this cell overwrites the
# CSV with a different split
random_6000_subset = np.random.choice(6000, 1000, replace=False)
pd.Series(random_6000_subset).to_csv('random_6000_subset.csv')

In [46]:
# reload the persisted indices; they form the validation split
# NOTE(review): pd.Series.from_csv is deprecated/removed in newer pandas --
# confirm the pandas version this notebook targets
random_6000_subset = pd.Series.from_csv('random_6000_subset.csv').values
validation = random_6000_subset
# training = every index in 0..5999 that is not in the validation split
training = np.setdiff1d(np.arange(6000), validation)
labels_6000 = use_labels_train[:6000]
# labels of the training split (replaces the earlier labels_5000)
labels_5000 = labels_6000[training]

In [25]:
# write the labels of the training and validation splits to CSV
labels_5000.to_csv('rand_labels_training_6000.csv')
labels_6000[validation].to_csv('rand_labels_validation_6000.csv')

In [16]:
# hard per-path normalization and robust equalization of the SCNet vectors;
# the *_5000 names take the first 5000 entries of each result
# NOTE(review): both *_5000 names are rebuilt from the `training` indices in
# a later cell, so their content depends on run order
norm_scat_vecs_6000 = normalize_scattering_vecs(test_6000, norm_type='hard',
                                                per_path=True)
norm_scat_vecs_5000 = norm_scat_vecs_6000[:5000]
robust_scat_vecs_6000 = robust_equalization(test_6000)
robust_scat_vecs_5000 = robust_scat_vecs_6000[:5000]

In [27]:
# same two preprocessing variants for the 'rand' dataset; the *_5000_rand
# names take the first 5000 entries (rebuilt from `training` in a later cell)
norm_scat_vecs_6000_rand = normalize_scattering_vecs(test_6000_rand, norm_type='hard',
                                                     per_path=True)
norm_scat_vecs_5000_rand = norm_scat_vecs_6000_rand[:5000]
robust_scat_vecs_6000_rand = robust_equalization(test_6000_rand)
robust_scat_vecs_5000_rand = robust_scat_vecs_6000_rand[:5000]

In [12]:
# same two preprocessing variants for the 'rot' dataset; the *_5000_rot
# names take the first 5000 entries (rebuilt from `training` in a later cell)
norm_scat_vecs_6000_rot = normalize_scattering_vecs(test_6000_rot, norm_type='hard',
                                                    per_path=True)
norm_scat_vecs_5000_rot = norm_scat_vecs_6000_rot[:5000]
robust_scat_vecs_6000_rot = robust_equalization(test_6000_rot)
robust_scat_vecs_5000_rot = robust_scat_vecs_6000_rot[:5000]

In [30]:
# pick the rows of the training split out of the preprocessed SCNet vectors
norm_scat_vecs_5000_temp = np.array(norm_scat_vecs_6000)[training]
robust_scat_vecs_5000_temp = np.array(robust_scat_vecs_6000)[training]
# repackage the first 5000 rows of each 2-D array as a list of 1-D arrays
norm_scat_vecs_5000 = [norm_scat_vecs_5000_temp[row] for row in range(5000)]
robust_scat_vecs_5000 = [robust_scat_vecs_5000_temp[row] for row in range(5000)]

In [73]:
# pick the rows of the training split out of the preprocessed 'rand' vectors
norm_scat_vecs_5000_rand_temp = np.array(norm_scat_vecs_6000_rand)[training]
robust_scat_vecs_5000_rand_temp = np.array(robust_scat_vecs_6000_rand)[training]
# repackage the first 5000 rows of each 2-D array as a list of 1-D arrays
norm_scat_vecs_5000_rand = [norm_scat_vecs_5000_rand_temp[row] for row in range(5000)]
robust_scat_vecs_5000_rand = [robust_scat_vecs_5000_rand_temp[row] for row in range(5000)]

In [16]:
# pick the rows of the training split out of the preprocessed 'rot' vectors
norm_scat_vecs_5000_rot_temp = np.array(norm_scat_vecs_6000_rot)[training]
robust_scat_vecs_5000_rot_temp = np.array(robust_scat_vecs_6000_rot)[training]
# repackage the first 5000 rows of each 2-D array as a list of 1-D arrays
norm_scat_vecs_5000_rot = [norm_scat_vecs_5000_rot_temp[row] for row in range(5000)]
robust_scat_vecs_5000_rot = [robust_scat_vecs_5000_rot_temp[row] for row in range(5000)]

In [47]:
# R data files
# images_train[:6000] is a stack of 2-D images (load_mnist returns shape
# (N, rows, cols)); pd.DataFrame only accepts 2-D input, so every image is
# flattened to one row before the split is exported.
test_6000_imgs = images_train[:6000]
test_6000_imgs = np.reshape(test_6000_imgs, (len(test_6000_imgs), -1))
test_5000_imgs = np.array(test_6000_imgs)[training]
test_5000_6000_imgs = np.array(test_6000_imgs)[validation]

pd.DataFrame(test_5000_imgs).to_csv('test_5000_temp.csv')
pd.DataFrame(test_5000_6000_imgs).to_csv('test_5000_6000_temp.csv')

In [65]:
# R data files for RAND
# load the 'rand' images from text, split rows by the training / validation
# indices and write each part to CSV
rand_6000_imgs = np.loadtxt('rand_imgs_6000.txt', delimiter=',')
rand_5000_imgs = np.array(rand_6000_imgs)[training]
rand_5000_6000_imgs = np.array(rand_6000_imgs)[validation]

pd.DataFrame(rand_5000_imgs).to_csv('rand_5000_temp.csv')
pd.DataFrame(rand_5000_6000_imgs).to_csv('rand_5000_6000_temp.csv')

In [66]:
# R data files for ROT
# load the 'rot' images from text, split rows by the training / validation
# indices and write each part to CSV
rot_6000_imgs = np.loadtxt('rot_imgs_6000.txt', delimiter=',')
rot_5000_imgs = np.array(rot_6000_imgs)[training]
rot_5000_6000_imgs = np.array(rot_6000_imgs)[validation]

pd.DataFrame(rot_5000_imgs).to_csv('rot_5000_temp.csv')
pd.DataFrame(rot_5000_6000_imgs).to_csv('rot_5000_6000_temp.csv')

In [19]:
# PCANet
# rows of the training split, repackaged as a list of 1-D arrays
test_5000_PCANet_temp = np.array(test_6000_PCANet)[training]
test_5000_PCANet = [test_5000_PCANet_temp[row] for row in range(5000)]

In [36]:
# RandNet
# rows of the training split, repackaged as a list of 1-D arrays
test_5000_RandNet_temp = np.array(test_6000_RandNet)[training]
test_5000_RandNet = [test_5000_RandNet_temp[row] for row in range(5000)]

In [10]:
# LDANet
# rows of the training split, repackaged as a list of 1-D arrays
test_5000_LDANet_temp = np.array(test_6000_LDANet)[training]
test_5000_LDANet = [test_5000_LDANet_temp[row] for row in range(5000)]

# PCA classifier

In [30]:
# probably need some sort of normalization here
# d is number of principal components to keep, found by CV
def build_PCA_classifier(labels, scattering_vecs, d=100):
    """Compute a mean vector and fit a d-component PCA for each class 0..9.

    labels          -- object with an `.index` that can be concatenated by
                       pandas (presumably a pandas Series of digit labels --
                       verify against callers)
    scattering_vecs -- sequence of per-observation vectors, aligned with
                       `labels`; it is wrapped in a Series on labels.index
    d               -- n_components passed to each PCA

    Returns (avg_class_vecs, all_pcas): the per-class sum divided by the
    per-class count, and a dict mapping class i to the fitted PCA object.
    """
    
    scattering_vecs = pd.Series(scattering_vecs, index=labels.index)
    
    # two-column frame ('labels', 'scattering_vecs'), grouped by label
    df = pd.concat({'labels':labels, 'scattering_vecs':scattering_vecs}, axis=1)
    groups = df.groupby('labels')
    counts = groups.count()
    
    # find avg scattering vector s_k for each class
    avg_class_vecs = groups.sum() / counts
    
    # vector length, read off the class-0 average
    num_scattering_coeffs = len(avg_class_vecs.T[0].values[0])
    
    all_pcas = {}
    
    # run PCA on each class, separate first d components
    for i in range(10):
        scattering_vecs_i = groups.get_group(i)
        
        # project onto orthog compl of first d components
        pca = PCA(n_components=d)
        
        # transform series of arrays into single 2-D array
        X = np.empty([counts.T[i], num_scattering_coeffs])
        j = 0
        for k, v in scattering_vecs_i.T.items():
            # NOTE(review): values[0] is the first entry of this observation's
            # row; this presumably expects the group frame to hold only the
            # 'scattering_vecs' column -- verify that get_group does not also
            # return 'labels' in the installed pandas version
            v = v.values[0]
            X[j,] = v
            j += 1
        
        # run PCA on the 2-D array of class obs
        all_pcas[i] = pca.fit(X)
        
    # output avg vectors s_k, transformed PCA spaces
    return avg_class_vecs, all_pcas

In [31]:
def run_PCA_classifier(s, avg_class_vecs, all_pcas):
    """Predict the class whose PCA space leaves the smallest residual for `s`.

    Parameters
    ----------
    s : np.ndarray
        One observation vector.
    avg_class_vecs : pandas DataFrame
        Indexed by class; the first column holds one average vector per
        class (as returned by build_PCA_classifier).
    all_pcas : dict
        Maps each class to an object offering transform / inverse_transform.

    Returns
    -------
    The class (an index label of avg_class_vecs) with the smallest norm of
    the approximation error.

    Raises
    ------
    AssertionError
        If `s` is not a numpy array.
    """
    assert isinstance(s, np.ndarray)

    deviation = {}
    # first column of the frame: class -> avg class scattering vector s_k
    for class_k, s_k in avg_class_vecs.iloc[:, 0].items():

        # deviation from avg class scattering vector, shaped as a single-row
        # 2-D array because estimator transform() methods expect 2-D input
        s_minus_s_k = np.reshape(s - s_k, (1, -1))

        # project deviation onto affine approximation space
        # NOTE(review): if the PCA object centres its input on its own
        # training mean, the class mean is removed a second time here --
        # confirm this is intended; the numerics are left unchanged.
        pca = all_pcas[class_k]
        proj_s_minus_s_k = pca.inverse_transform(pca.transform(s_minus_s_k))

        # norm of approximation error vector
        deviation[class_k] = np.linalg.norm(s_minus_s_k - proj_s_minus_s_k)

    # predicted class has avg vec that is closest to s, ignoring first d principal components
    return pd.Series(deviation).idxmin()

### testing PCA

In [32]:
def test_PCA(filename='PCA_6000_preds_rob_', labels_train=labels_5000,
             vecs_train=robust_scat_vecs_5000, vecs_all=robust_scat_vecs_6000, size=6000, d=50):
    """Fit the PCA classifier and write predictions for vecs_all[0:size] to CSV.

    The output file is named filename + str(d) + '.csv'. Nothing is returned.
    """
    class_means, class_pcas = build_PCA_classifier(labels_train, vecs_train, d=d)

    # index -> predicted class for every observation
    predictions = pd.Series({
        idx: run_PCA_classifier(vecs_all[idx], class_means, class_pcas)
        for idx in range(size)
    })
    predictions.to_csv(filename + str(d) + '.csv')

    return

# SVM classifier

In [33]:
def SVM_classifier_OVR(labels, scattering_vecs):
    """Fit a LinearSVC (C=0.5) on the given vectors and return the fitted model.

    `scattering_vecs` is wrapped in a Series on labels.index and copied row
    by row into a dense 2-D array; `labels.values` is the target.
    """
    vec_series = pd.Series(scattering_vecs, index=labels.index)
    n_obs = len(vec_series)
    n_coeffs = len(vec_series.iloc[0])

    # stack the series of arrays into a single 2-D array
    design = np.empty([n_obs, n_coeffs])
    for row, (_, vec) in enumerate(vec_series.items()):
        design[row,] = vec

    targets = labels.values

    # hyper-parameters used for the other feature sets:
    #   svm.LinearSVC(C=1)                                 -- PCANet, RandNet
    #   OneVsRestClassifier(svm.SVC(C=13, gamma=0.02))     -- 6000
    #   OneVsRestClassifier(svm.SVC(C=20, gamma=0.05))     -- rand, 6000
    #   OneVsRestClassifier(svm.SVC(C=50, gamma=0.01))     -- rot, 6000
    model = svm.LinearSVC(C=0.5)  # LDANet

    return model.fit(design, targets)

### testing SVM

In [34]:
def test_SVM(filename='SVM_OVR_preds_6000.csv', labels_train=labels_5000, vecs_train=test_5000_LDANet,
             vecs_all=test_6000_LDANet, size=6000):
    """Fit the SVM classifier, predict vecs_all[0:size] and write the result to CSV.

    Parameters
    ----------
    filename : str
        Path of the CSV file the predictions are written to.
    labels_train, vecs_train
        Training labels / vectors handed to SVM_classifier_OVR.
        NOTE: the defaults are bound when this cell is executed, not when
        the function is called.
    vecs_all : indexable of 1-D vectors
    size : int
        Number of leading entries of vecs_all to predict.

    Returns
    -------
    pandas Series mapping observation index to predicted label.
    """
    clf_SVM_OVR = SVM_classifier_OVR(labels_train, vecs_train)
    SVM_OVR_preds = {}

    for i in range(size):
        # predict() expects a 2-D (n_samples, n_features) array
        s = np.asarray(vecs_all[i]).reshape(1, -1)
        SVM_OVR_preds[i] = clf_SVM_OVR.predict(s)[0]

    SVM_OVR_preds = pd.Series(SVM_OVR_preds)
    # fixed: the original wrote to the undefined name `OVR_filename`
    SVM_OVR_preds.to_csv(filename)

    return SVM_OVR_preds

In [22]:
def CV_SVM_classifier(classifier, C_grid): #, gamma_grid):
    """Return an unfitted 10-fold GridSearchCV over the given C values.

    C_grid must be a plain list. A gamma grid was part of an earlier
    variant and is currently disabled.
    """
    assert type(C_grid) is list

    return GridSearchCV(classifier, {'C': C_grid}, cv=10)

In [23]:
# Grid-search C for a linear SVM on the LDANet training features (10-fold CV).
#clf = svm.SVC()
clf = svm.LinearSVC()
# NOTE(review): this only sets an attribute on the instance after
# construction -- confirm that LinearSVC actually reads it
clf.decision_function_shape='ovr'
grid_searcher_SVM = CV_SVM_classifier(clf, C_grid=[0.1,0.5,1,2,5,10]) #, gamma_grid=[10,100,1000])
grid_searcher_SVM.fit(test_5000_LDANet, labels_5000)
print(grid_searcher_SVM.best_params_)
print(grid_searcher_SVM.best_score_)

{'C': 0.5}
0.9742


# LASSO classifier

In [11]:
def LASSO_classifier(labels, scattering_vecs, C=1):
    """Fit an L1-penalised logistic regression and return the fitted model.

    Parameters
    ----------
    labels : array-like of class labels
    scattering_vecs : array-like, one feature vector per label
    C : float
        Inverse regularisation strength passed to LogisticRegression.
    """
    # The solver is named explicitly: the L1 penalty is only supported by
    # some solvers, and relying on the library default breaks on versions
    # whose default solver cannot handle penalty='l1'.
    clf_lasso = LogisticRegression(C=C, penalty='l1', tol=0.001, solver='liblinear')

    return clf_lasso.fit(scattering_vecs, labels)

### testing LASSO

In [13]:
def test_LASSO(filename='LASSO_preds_6000_C=', labels_train=labels_5000,
               vecs_train=test_5000_LDANet, vecs_all=test_6000_LDANet, size=6000, C=11):
    """Fit the LASSO classifier and write predictions for vecs_all[0:size] to CSV.

    The output file is named filename + str(C) + '.csv'. Nothing is
    returned. NOTE: the default training data is bound when this cell is
    executed, not when the function is called.
    """
    clf_LASSO = LASSO_classifier(labels_train, vecs_train, C=C)
    LASSO_preds = {}

    for i in range(size):
        # predict() expects a 2-D (n_samples, n_features) array
        s = np.asarray(vecs_all[i]).reshape(1, -1)
        LASSO_preds[i] = clf_LASSO.predict(s)[0]

    LASSO_preds = pd.Series(LASSO_preds)
    LASSO_preds.to_csv(filename + str(C) + '.csv')

    return

In [14]:
def CV_LASSO_classifier(classifier, C_grid):
    """Return an unfitted 10-fold GridSearchCV over the given C values.

    C_grid must be a plain list.
    """
    assert type(C_grid) is list

    return GridSearchCV(classifier, {'C': C_grid}, cv=10)

In [15]:
# Grid-search C for the L1-penalised logistic regression on the LDANet
# training features (10-fold CV).
# NOTE(review): no solver is given; penalty='l1' needs a solver that supports
# it -- confirm the default of the installed scikit-learn does
clf = LogisticRegression(penalty='l1', tol=0.001)
grid_searcher_LASSO = CV_LASSO_classifier(clf, C_grid=[10,20,40,80])
grid_searcher_LASSO.fit(test_5000_LDANet, labels_5000)
print(grid_searcher_LASSO.best_params_)
print(grid_searcher_LASSO.best_score_)

{'C': 20}
0.9752


# Random Forest classifier

In [20]:
def RF_classifier(labels, scattering_vecs, n_estimators=100, max_features=50, max_depth=70,
                  random_state=0):
    """Fit a random forest on the given vectors and return the fitted model.

    Parameters
    ----------
    labels : array-like of class labels
    scattering_vecs : array-like, one feature vector per label
    n_estimators, max_features, max_depth, random_state
        Passed through to RandomForestClassifier. `max_depth` was accepted
        but silently ignored before; it is now forwarded.
    """
    clf_RF = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features,
                                    max_depth=max_depth, random_state=random_state,
                                    min_samples_leaf=1)

    return clf_RF.fit(scattering_vecs, labels)

### testing RF

In [21]:
def test_RF(filename='RF_preds_6000_n', labels_train=labels_5000, vecs_train=robust_scat_vecs_5000,
            vecs_all=robust_scat_vecs_6000, size=6000, n_estimators=400, max_features=100,
            max_depth=70):
    """Fit the random forest and write predictions for vecs_all[0:size] to CSV.

    The output file is named filename + str(n_estimators) + '.csv'. Nothing
    is returned. The forest is always built with random_state=0.
    NOTE: the default training data is bound when this cell is executed,
    not when the function is called.
    """
    clf_RF = RF_classifier(labels_train, vecs_train, n_estimators=n_estimators,
                           max_features=max_features, max_depth=max_depth, random_state=0)
    RF_preds = {}

    for i in range(size):
        # predict() expects a 2-D (n_samples, n_features) array
        s = np.asarray(vecs_all[i]).reshape(1, -1)
        RF_preds[i] = clf_RF.predict(s)[0]

    RF_preds = pd.Series(RF_preds)
    RF_preds.to_csv(filename + str(n_estimators) + '.csv')

    return

In [37]:
# Train on test_RF's default vectors with 500 trees; predictions are written
# to 'RF_preds_6000_f80_d70_n500.csv'.
test_RF(filename='RF_preds_6000_f80_d70_n', n_estimators=500, max_features=80, max_depth=70)

In [25]:
def CV_RF_classifier(classifier, n_grid, max_features_grid):
    """Return an unfitted 10-fold GridSearchCV over forest size and max_features.

    n_grid must be a plain list; max_features_grid is passed on unchecked.
    """
    assert type(n_grid) is list

    search_space = {'n_estimators': n_grid, 'max_features': max_features_grid}
    return GridSearchCV(classifier, search_space, cv=10)

In [2]:
# Grid-search max_features for a 600-tree random forest (10-fold CV).
clf_rf = RandomForestClassifier()
grid_searcher_RF = CV_RF_classifier(clf_rf, n_grid=[600], max_features_grid=[50,100,200,300,400])
# fixed typo: the original called `rid_searcher_RF.fit`, a NameError
# NOTE(review): test_5000_PCANet_v2 is not defined in the visible part of the
# notebook -- confirm where it comes from
grid_searcher_RF.fit(test_5000_PCANet_v2, labels_5000)
print(grid_searcher_RF.best_params_)
print(grid_searcher_RF.best_score_)

In [36]:
# Show the per-combination CV scores of the fitted search.
# NOTE(review): the saved output below lists n_estimators=500 and a max_depth
# entry, which does not match the grid built in the previous cell -- it
# looks like output from an earlier run
grid_searcher_RF.grid_scores_

[mean: 0.91660, std: 0.01306, params: {'n_estimators': 500, 'max_features': 40, 'max_depth': 70},
 mean: 0.91740, std: 0.01384, params: {'n_estimators': 500, 'max_features': 50, 'max_depth': 70},
 mean: 0.91780, std: 0.01527, params: {'n_estimators': 500, 'max_features': 60, 'max_depth': 70},
 mean: 0.91780, std: 0.01452, params: {'n_estimators': 500, 'max_features': 70, 'max_depth': 70},
 mean: 0.91800, std: 0.01515, params: {'n_estimators': 500, 'max_features': 80, 'max_depth': 70},
 mean: 0.91740, std: 0.01481, params: {'n_estimators': 500, 'max_features': 90, 'max_depth': 70},
 mean: 0.91780, std: 0.01291, params: {'n_estimators': 500, 'max_features': 100, 'max_depth': 70}]