In [None]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from PIL import Image
import os.path

# Keras
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Conv2D, MaxPooling2D, Flatten

# Image preview
import matplotlib.pyplot as plt
% matplotlib inline

# Numpy print full array
np.set_printoptions(threshold=np.inf)

# Constants
DIRECTORY = "ml-2018spring-hw3/"

# Parameters

In [None]:
# Functions
# def extract_feature(x):


def get_training_data(valid_size=0):
    """Load ``train.csv`` from DIRECTORY and split off a validation set.

    The CSV is expected to have a ``feature`` column holding
    whitespace-separated pixel values (48 * 48 per row) and a ``label``
    column holding the class index.

    Args:
        valid_size: Number of trailing rows to hold out for validation.
            Values <= 0 keep every row in the training set.

    Returns:
        ``(x_train, y_train), (x_valid, y_valid)``. The ``x`` arrays have
        shape ``(n, 48, 48, 1)`` and integer dtype; the ``y`` arrays are the
        one-hot encoded labels. When nothing is held out, the validation
        arrays are zero-row NumPy arrays (not lists), so array arithmetic
        such as ``x_valid / 255`` still works.

    Raises:
        FileNotFoundError: If the training CSV does not exist.
    """
    filepath = DIRECTORY + "train.csv"

    # Fail loudly with a meaningful error instead of printing and then
    # crashing on unbound local variables at the return statement.
    if not os.path.exists(filepath):
        raise FileNotFoundError("No such file at %s" % filepath)

    data = pd.read_csv(filepath)

    # Split each feature string into pixels and reshape to (48, 48, 1).
    x = data["feature"].str.split(expand=True).values.reshape(-1, 48, 48, 1).astype('int')
    # One hot encoding
    y = np_utils.to_categorical(data["label"])

    if valid_size > 0:
        # The last `valid_size` rows become the validation set.
        return (x[:-valid_size], y[:-valid_size]), (x[-valid_size:], y[-valid_size:])

    # No hold-out: empty slices keep the dtype and trailing dimensions.
    return (x, y), (x[:0], y[:0])
        
def get_accuracy(y_hypo, y):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_hypo: Predicted labels (NumPy array or any array-like, e.g. a list).
        y: True labels, with the same shape as ``y_hypo``.

    Returns:
        Number of element-wise matches divided by ``len(y_hypo)``, as a float.

    Raises:
        ZeroDivisionError: If ``y_hypo`` is empty.
    """
    # Coerce to arrays so plain Python lists are compared element-wise;
    # comparing two lists with == would yield a single bool.
    predicted = np.asarray(y_hypo)
    actual = np.asarray(y)
    return int(np.count_nonzero(predicted == actual)) / len(predicted)
    
def get_testing_data(original):
    """Load the test-set file from DIRECTORY as a float matrix.

    Args:
        original: If truthy, read ``test.csv``; otherwise read ``test_X``.

    Returns:
        A NumPy array with the file's contents parsed as floats, or ``None``
        (after printing an error message) when the file does not exist.
    """
    if original:
        filename = "test.csv"
        # TODO: handling specific to test.csv is not implemented yet; it is
        # currently read the same way as test_X.
    else:
        filename = "test_X"
    filepath = DIRECTORY + filename

    if not os.path.exists(filepath):
        # Report the problem instead of silently returning None.
        print("Error: No such file at %s" % filepath)
        return None

    # `.values` replaces DataFrame.as_matrix(), which was deprecated and then
    # removed from pandas.
    return pd.read_csv(filepath, dtype=float).values
        
def output_prediction(y_test, filename="output.csv"):
    """Write predictions to a CSV file with ``id`` and ``label`` columns.

    Args:
        y_test: Indexable sequence of predicted labels; each entry is
            converted with ``int()``.
        filename: Destination CSV path.

    The ``id`` column is the 1-based position of each prediction. No index
    column is written.
    """
    records = []
    for position in range(len(y_test)):
        records.append([position + 1, int(y_test[position])])

    submission = pd.DataFrame(records, columns=["id", "label"])
    submission.to_csv(filename, index=False)

def normalize(x_set, norm_column=()):
    """Standardize selected columns of several arrays using shared statistics.

    The mean and standard deviation are computed per column over all arrays
    in ``x_set`` concatenated along axis 0, then each listed column of each
    array is replaced by ``(value - mean) / std``.

    Args:
        x_set: Sequence of NumPy arrays with matching trailing dimensions.
            The arrays are modified IN PLACE.
        norm_column: Iterable of column indices to standardize. Empty by
            default, in which case the arrays are left untouched.

    Returns:
        The same ``x_set`` object that was passed in.

    Notes:
        * Columns whose standard deviation is 0 are only centered (they
          become all zeros) instead of being divided by zero.
        * Results are written back into the input arrays, so integer arrays
          truncate the standardized values; pass float arrays.
    """
    x_all = np.concatenate(x_set, axis=0)
    mean = np.mean(x_all, axis=0)
    std = np.std(x_all, axis=0)
    # Divide constant columns by 1 rather than 0 to avoid NaN/inf output.
    safe_std = np.where(std == 0, 1.0, std)

    for x in x_set:
        for column in norm_column:
            # Kept as two separate in-place assignments, as in the original.
            x[:, column] = np.subtract(x[:, column], mean[column])
            x[:, column] = np.true_divide(x[:, column], safe_std[column])

    return x_set

In [None]:
# Load the training data. valid_size is left at its default of 0, so no rows
# are held out and x_valid / y_valid come back empty.
(x_train, y_train), (x_valid, y_valid) = get_training_data()

In [None]:
# Normalization: divide the pixel values by 255.
# Note: the results are assigned back to the same names, so re-running this
# cell divides the data again.
x_train = x_train / 255
# With no hold-out the validation set can be a plain empty list, which does
# not support division; np.asarray makes the operation valid in that case.
x_valid = np.asarray(x_valid) / 255

In [None]:
# Preview one training sample. Each sample is shaped (48, 48, 1); the
# trailing channel axis is dropped because imshow expects a 2-D array for
# single-channel data (older Matplotlib releases reject an (H, W, 1) input).
plt.imshow(x_train[17][:, :, 0], cmap="gray")
plt.show()

In [None]:
# Convolutional front end followed by a small dense classifier.
# Inputs are channels-last images of shape (48, 48, 1).
model = Sequential([
    # CNN
    Conv2D(25, (5, 5), input_shape=(48, 48, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(36, (3, 3)),
    MaxPooling2D((2, 2)),
    Flatten(),

    # DNN
    Dense(units=50, activation="relu"),
    # Dropout(0.6),
    # Dense(units=50, activation="relu"),
    # Dropout(0.6),

    # Output layer: one softmax unit per class (7 classes).
    Dense(units=7, activation="softmax"),
])

model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=['accuracy'],
)
model.fit(
    x_train,
    y_train,
    batch_size=1000,
    epochs=3,
)

# Evaluation and submission are disabled for now.
# NOTE(review): x_test is not defined in any visible cell, so the prediction
# lines below need a test-data loading step before they can be enabled.
# score = model.evaluate(x_valid, y_valid)
# print('Total loss on Testing Set:', score[0])
# print('Accuracy of Testing Set:', score[1])

# y_test = model.predict(x_test)
# prob = np.argmax(y_test, axis=1)
# output_prediction(prob , "best.csv")