In [1]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.utils.np_utils import to_categorical
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
%matplotlib inline
# Print arrays without summarisation. sys.maxsize is the documented value for
# this; a NaN threshold is rejected with a ValueError by current NumPy.
np.set_printoptions(threshold=sys.maxsize)
# Fixed seed so NumPy-based randomness is repeatable between runs.
np.random.seed(117)
modelSaveDir = 'model/'
# exist_ok=True replaces the separate existence check and does not fail when
# the directory is already there.
os.makedirs(modelSaveDir, exist_ok=True)

Using TensorFlow backend.


In [2]:
from load_affnist import loadmat
# Read batches 1..32 of the training/validation set, keeping the transposed
# image matrix and the integer labels of each batch.
Xtrain, ytrain = [], []
for batch_no in range(1, 33):
    batch = loadmat('data/training_and_validation_batches/' + str(batch_no) + '.mat')['affNISTdata']
    Xtrain.append(batch['image'].T)
    ytrain.append(batch['label_int'])
# Stack the per-batch arrays into one training matrix / label array.
X_train = np.vstack(Xtrain)
# vstack yields a 2-D label array; collapse it into a flat 1-D vector.
y_train = np.vstack(ytrain).reshape(-1)
# The test set lives in a single file with the same layout.
testDICT = loadmat('data/test.mat')
test_set = testDICT['affNISTdata']
X_test = test_set['image'].T
y_test = test_set['label_int']

In [3]:
# label = np.unique(y_test)
# Flatten every sample into a 1600-element row, convert to float32 and
# divide by 255.
X_train = np.reshape(X_train, (len(X_train), 1600)).astype('float32') / 255.
X_test = np.reshape(X_test, (len(X_test), 1600)).astype('float32') / 255.
# Display the shapes of all four arrays as a quick sanity check.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1920000, 1600), (1920000,), (320000, 1600), (320000,))

In [4]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.linear_model import LogisticRegression

def classify(X_train, X_test, y_train, y_test, classifier):
    """Fit the requested classifier and print its evaluation on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to ``fit`` and ``predict`` respectively.
    y_train, y_test : array-like
        Class labels of the two splits.
    classifier : str
        One of ``"lda"``, ``"qda"`` or ``"logreg"``.

    Returns
    -------
    None
        The first ten predictions, the confusion matrix and the
        macro-averaged precision / recall / F-score are printed.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names.
    """
    # Validate up front so a typo fails with a clear message instead of an
    # UnboundLocalError on ``clf`` further down.
    if classifier not in ("lda", "qda", "logreg"):
        raise ValueError(
            "unknown classifier %r; expected 'lda', 'qda' or 'logreg'" % (classifier,)
        )
    if classifier == "lda":
        clf = LDA()
    elif classifier == "qda":
        clf = QDA()
    else:
        clf = LogisticRegression(multi_class='multinomial', solver='newton-cg')
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    print(pred[:10])
    # Derive the class labels from the data instead of relying on a global
    # ``label`` variable, and pass them explicitly so the matrix rows/columns
    # are guaranteed to line up with the DataFrame axes.
    labels = np.unique(np.concatenate([np.ravel(y_test), np.ravel(pred)]))
    # confusion_matrix puts the true classes on rows and predictions on columns.
    cfmtx = confusion_matrix(y_test, pred, labels=labels)
    df = pd.DataFrame(cfmtx, columns=labels, index=labels)
    df.columns.name = 'pred\\true'
    print(df)
    prfs = precision_recall_fscore_support(y_test, pred, average='macro')
    print('precision:', prfs[0], '\nrecall:', prfs[1], '\nfscore:', prfs[2])

In [5]:
# LDA on the full 1600-feature pixel vectors (no dimensionality reduction).
# The recorded run of this cell ended with a MemoryError.
classify(X_train, X_test, y_train, y_test, 'lda')



MemoryError: 

In [None]:
# QDA on the full 1600-feature pixel vectors (no dimensionality reduction).
classify(X_train, X_test, y_train, y_test, 'qda')

In [None]:
# Multinomial logistic regression on the full 1600-feature pixel vectors
# (no dimensionality reduction).
classify(X_train, X_test, y_train, y_test, 'logreg')

In [None]:
# Explained-variance targets used to size the PCA projections:
# 15 values starting at 0.2 and increasing in steps of 0.05.
var_ladder = [0.2 + step * 0.05 for step in range(15)]
# Two-decimal string form of the same values, for display only.
var_ladder_output = [format(level, '.2f') for level in var_ladder]
print(var_ladder_output)

In [None]:
from sklearn.decomposition import PCA
def pca_data(X_train,X_test,var_explained):
    """Project both splits onto the leading principal components of X_train.

    The number of components is the smallest one whose cumulative explained
    variance ratio is strictly greater than ``var_explained``. If no prefix
    exceeds the target, all components are kept.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices; the PCA is fitted on ``X_train`` only.
    var_explained : float
        Target fraction of total variance.

    Returns
    -------
    tuple
        ``(X_pca_train, X_pca_test)``, the whitened projections of both splits.
    """
    # First pass: full decomposition, used only to read the variance spectrum.
    pca = PCA(svd_solver='randomized', whiten=True)
    pca.fit(X_train)
    cum_var = np.cumsum(pca.explained_variance_ratio_)
    # cum_var[k] is the variance captured by the first k + 1 components, so the
    # component count is the first index exceeding the target *plus one*.
    # Using the bare index would fall short of the target and could request
    # zero components.
    first_above = int(np.searchsorted(cum_var, var_explained, side='right'))
    # Clamp so an unreachable target keeps every component instead of failing.
    n_components = min(first_above + 1, len(cum_var))
    # Second pass: refit with the selected number of components.
    pca = PCA(n_components=n_components, svd_solver='randomized', whiten=True)
    pca.fit(X_train)
    X_pca_train = pca.transform(X_train)
    X_pca_test = pca.transform(X_test)
    return X_pca_train, X_pca_test

# One (train, test) projection per variance target, in var_ladder order.
reduced_pairs = [pca_data(X_train, X_test, target) for target in var_ladder]
X_train_list = [train_part for train_part, _ in reduced_pairs]
X_test_list = [test_part for _, test_part in reduced_pairs]

In [None]:
# Cache the PCA-reduced datasets on disk, because the cell above is very slow to run.
import pickle
# Write the train and test projection lists to their own pickle files.
for pkl_path, reduced_sets in (("pca_training.pkl", X_train_list),
                               ("pca_testing.pkl", X_test_list)):
    with open(pkl_path, "wb") as out_file:
        pickle.dump(reduced_sets, out_file)

In [None]:
# helper function for calculating accuracy
from sklearn.metrics import accuracy_score
def score(X_train, X_test, y_train, y_test, classifier):
    """Fit the requested classifier and return its accuracy on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to ``fit`` and ``predict`` respectively.
    y_train, y_test : array-like
        Class labels of the two splits.
    classifier : str
        One of ``"lda"``, ``"qda"`` or ``"logreg"``.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)``.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names.
    """
    # Validate up front so a typo fails with a clear message instead of an
    # UnboundLocalError on ``clf`` further down.
    if classifier not in ("lda", "qda", "logreg"):
        raise ValueError(
            "unknown classifier %r; expected 'lda', 'qda' or 'logreg'" % (classifier,)
        )
    if classifier == "lda":
        clf = LDA()
    elif classifier == "qda":
        clf = QDA()
    else:
        clf = LogisticRegression(multi_class='multinomial', solver='newton-cg')
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    return accuracy_score(y_test, pred)

In [None]:
def printlist(list):
    """Print all items on a single line, each followed by one space."""
    # Build the whole line first; print() then adds the closing newline.
    print("".join(str(item) + " " for item in list))

In [None]:
# LDA accuracy for each of the 15 PCA projections (one per variance target).
lda_accuracies = [
    score(X_train_list[k], X_test_list[k], y_train, y_test, 'lda')
    for k in range(15)
]
printlist(lda_accuracies)

In [None]:
# QDA accuracy for each of the 15 PCA projections (one per variance target).
qda_accuracies = [
    score(X_train_list[k], X_test_list[k], y_train, y_test, 'qda')
    for k in range(15)
]
printlist(qda_accuracies)

In [None]:
# Logistic-regression accuracy for each of the 15 PCA projections
# (one per variance target).
logreg_accuracies = [
    score(X_train_list[k], X_test_list[k], y_train, y_test, 'logreg')
    for k in range(15)
]
printlist(logreg_accuracies)

In [None]:
# Accuracy of the three classifiers against the PCA variance target,
# drawn on a single set of axes.
ax = plt.subplot(111)
for accuracies, curve_name in ((lda_accuracies, "lda"),
                               (qda_accuracies, "qda"),
                               (logreg_accuracies, "logreg")):
    ax.plot(var_ladder, accuracies, label=curve_name)
# loc=4 places the legend in the lower-right corner; make its frame half transparent.
leg = ax.legend(loc=4, ncol=1, shadow=False, fancybox=False)
leg.get_frame().set_alpha(0.5)
ax.set_title("classification accuracy on PCA dimensionality reduced data")
ax.set_xlabel('total variance explained')
ax.set_ylabel('accuracy')
plt.show()

In [None]:
# Full decomposition, used only to read the cumulative explained variance.
pca=PCA(svd_solver='randomized',whiten=True)
pca.fit(X_train)
var=np.cumsum(pca.explained_variance_ratio_)
# var[k] covers the first k + 1 components, so the number of dimensions that
# first pushes the cumulative variance above 0.75 is the index plus one.
print("best number of dimension: " + str(np.argwhere(var>0.75)[0][0] + 1))

In [None]:
# Dimension reduction using PCA, code ref: https://www.kaggle.com/ddmngml/pca-and-svm-on-mnist-dataset

n_components = 33
# Fit a whitened PCA on the training split only, then project both splits
# with that same fitted model.
pca = PCA(n_components=n_components, svd_solver='randomized', whiten=True)
pca.fit(X_train)
X_train_pca, X_test_pca = (pca.transform(split) for split in (X_train, X_test))

In [None]:
# Histogram (log-scaled counts) of the per-component explained-variance
# ratios, followed by their total as the cell's displayed value.
explained_ratio = pca.explained_variance_ratio_
plt.hist(explained_ratio, bins=n_components, log=True)
explained_ratio.sum()

In [None]:
# PCA-LDA: LDA trained and evaluated on the PCA-reduced features
# (X_train_pca / X_test_pca).
classify(X_train_pca, X_test_pca, y_train, y_test, 'lda')

In [None]:
# PCA-QDA: QDA trained and evaluated on the PCA-reduced features
# (X_train_pca / X_test_pca).
classify(X_train_pca, X_test_pca, y_train, y_test, 'qda')

In [None]:
# PCA-logistic regression: multinomial logistic regression trained and
# evaluated on the PCA-reduced features (X_train_pca / X_test_pca).
classify(X_train_pca, X_test_pca, y_train, y_test, 'logreg')