# Homework #3 - Kaggle Competition

In [1]:
from time import time
import logging
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.neural_network import MLPClassifier

In [2]:
# Dataset notes:
#   * Every image is single-channel; a pixel is a float intensity in [0, 255],
#     computed as the mean of the original R, G and B channel intensities.
#   * X_train.npy - training data array. One row per image, with the image
#     unrolled into a vector (50 x 37 = 1850 dimensions).
#   * y_train.npy - labels (0-6); row i is the label of the image in row i
#     of X_train.npy.
X_train, y_train = (np.load(path) for path in ('X_train.npy', 'y_train.npy'))

In [3]:
# Generate testResults.csv
# Run this with y_pred

def saveTestResults(y_pred, filename='testResults.csv'):
    """Write predictions to a submission CSV.

    The file starts with the header line ``ImageId,PredictedClass`` followed by
    one ``<index>,<prediction>`` row per element of ``y_pred``, where the index
    is the element's 0-based position. Rows are separated by a newline and the
    last row is NOT followed by a trailing newline.

    Parameters
    ----------
    y_pred : iterable
        Predicted class for each image, in image order. Each element is
        rendered with ``str()``.
    filename : str, optional
        Path of the CSV file to create (overwritten if it already exists).
    """
    # Build rows lazily; enumerate() avoids needing len()/indexing on y_pred.
    rows = (str(i) + "," + str(label) for i, label in enumerate(y_pred))
    # The context manager guarantees the handle is closed even if formatting
    # or writing a row raises part-way through.
    with open(filename, 'w') as fo:
        fo.write("ImageId,PredictedClass\n")
        # join() puts "\n" only *between* rows, so no trailing newline is written.
        fo.write("\n".join(rows))
    


In [4]:
def calculateClass(label):
    """Count the distinct values in ``label``.

    The distinct values are collected in order of first appearance, printed
    as a list, and their count is returned.
    """
    distinct = []
    for value in label:
        # Already recorded: nothing to do for this element.
        if value in distinct:
            continue
        distinct.append(value)
    print(distinct)
    return len(distinct)
 
# Hold out 25% of the labelled data for evaluation; the remaining 75% is used
# for fitting. random_state fixes the shuffle so the split is repeatable.
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=42,
)


def _print_split_summary(title, features, labels):
    """Print the sample, feature and class counts of one data split."""
    print(title)
    print("n_samples: %d" % len(features))
    print("n_features: %d" % len(features[0]))
    # calculateClass prints the distinct labels itself before returning the count.
    print("n_classes: %d" % calculateClass(labels))


_print_split_summary("Total train data size:", X_train_split, y_train_split)

_print_split_summary("Total test data size:", X_test_split, y_test_split)

Total train data size:
n_samples: 724
n_features: 1850
[1, 6, 3, 2, 5, 0, 4]
n_classes: 7
Total test data size:
n_samples: 242
n_features: 1850
[4, 0, 1, 3, 2, 6, 5]
n_classes: 7


In [5]:
# Eigenface extraction: reduce each 1850-pixel image to its coordinates on the
# top principal components of the training split.
n_components = 150
h = 50  # image dimensions: h * w = 50 * 37 = 1850 pixels per unrolled image
w = 37
print("Extracting the top %d eigenfaces from %d faces"
      % (n_components, X_train_split.shape[0]))
t0 = time()
# The randomized SVD solver is stochastic. Pinning random_state makes the
# fitted components, and therefore every downstream score, repeatable.
# The PCA is fitted on the training split only; the held-out split is merely
# transformed below.
pca = PCA(n_components=n_components, svd_solver='randomized',
          whiten=True, random_state=42).fit(X_train_split)
print("done in %0.3fs" % (time() - t0))

# Each principal component is a flat pixel vector; reshape so that every
# component can be viewed as an h x w image.
eigenfaces = pca.components_.reshape((n_components, h, w))
print("Projecting the input data on the eigenfaces orthonormal basis")

t0 = time()
X_train_pca = pca.transform(X_train_split)
X_test_pca = pca.transform(X_test_split)

print("done in %0.3fs" % (time() - t0))

Extracting the top 150 eigenfaces from 724 faces
done in 0.186s
Projecting the input data on the eigenfaces orthonormal basis
done in 0.014s


In [6]:
# Train a multi-layer perceptron on the PCA-projected training split.
# No hyper-parameter search is run in this cell: the settings are fixed by hand.
print("Fitting the classifier to the training set")
t0 = time()

# One hidden layer of 1000 units, trained with the L-BFGS solver;
# alpha is the regularisation strength and random_state pins the initial weights.
clf = MLPClassifier(solver='lbfgs', alpha=2e-4,
                     hidden_layer_sizes=(1000,), random_state=1)
clf = clf.fit(X_train_pca, y_train_split)
print("done in %0.3fs" % (time() - t0))
# The banner used to read "Best estimator found by grid search:", which was
# wrong because this cell fits a single, manually configured model.
print("Fitted classifier:")
print(clf)



Fitting the classifier to the training set
done in 0.579s
Best estimator found by grid search:
MLPClassifier(activation='relu', alpha=0.0002, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)


In [7]:
# Ein: in-sample report - the classifier is scored on the same PCA-projected
# training split it was fitted on.
print("Ein")
internal_pred = clf.predict(X_train_pca)
print(classification_report(y_train_split, internal_pred))

# Eout: out-of-sample report - the classifier is scored on the held-out split.
# The timer started below covers both the prediction and building/printing the
# classification report.
print("Eout")
t0 = time()
y_pred = clf.predict(X_test_pca)
print(classification_report(y_test_split, y_pred))
print("done in %0.3fs" % (time() - t0))


Ein
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        38
          1       1.00      1.00      1.00       143
          2       1.00      1.00      1.00        59
          3       1.00      1.00      1.00       303
          4       1.00      1.00      1.00        53
          5       1.00      1.00      1.00        49
          6       1.00      1.00      1.00        79

avg / total       1.00      1.00      1.00       724

Eout
             precision    recall  f1-score   support

          0       1.00      0.62      0.76        13
          1       0.97      0.88      0.92        40
          2       0.90      0.73      0.81        26
          3       0.85      0.96      0.90        92
          4       0.83      0.76      0.79        33
          5       0.89      0.73      0.80        11
          6       0.71      0.89      0.79        27

avg / total       0.87      0.86      0.85       242

done in 0.008s
