In [None]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from PIL import Image
import os.path

# Keras
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Conv2D, MaxPooling2D, Flatten
from keras import regularizers
from keras.preprocessing.image import ImageDataGenerator

# Image preview
import matplotlib.pyplot as plt
% matplotlib inline

# Numpy print full array
np.set_printoptions(threshold=np.inf)

# Constants
DIRECTORY = "ml-2018spring-hw3/"

# Parameters

In [None]:
# Functions
# def extract_feature(x):

def get_training_data(horizontal_flip=False, shuffle_data=False, validation_split=0.0):
    """Load ``train.csv`` from ``DIRECTORY`` and build training/validation sets.

    Parameters
    ----------
    horizontal_flip : bool
        If True, append left-right mirrored copies of every image whose raw
        label is 1 (via ``add_fliplr_image``).
    shuffle_data : bool
        If True, shuffle images and labels together (via ``shuffle``) before
        the validation split is taken.
    validation_split : float
        Fraction of the samples, taken from the end of the (possibly
        augmented and shuffled) data, to hold out for validation. Values
        outside ``(0.0, 1.0]``, or fractions that round down to zero samples,
        disable the split.

    Returns
    -------
    tuple
        ``((x_train, y_train), (x_valid, y_valid), (x_raw, y_raw))``.
        ``x_train``/``x_valid`` are integer arrays reshaped to
        ``(n, 48, 48, 1)`` and ``y_train``/``y_valid`` are one-hot labels.
        ``x_valid`` and ``y_valid`` are empty lists when no validation set is
        produced. ``x_raw``/``y_raw`` are the untouched "feature" and "label"
        columns of the CSV.

    Raises
    ------
    FileNotFoundError
        If the CSV file does not exist.
    """
    filename = "train.csv"
    filepath = DIRECTORY + filename

    # Fail loudly: a missing file used to print a message and then crash with
    # an UnboundLocalError on the return statement.
    if not os.path.exists(filepath):
        raise FileNotFoundError("Error: No such file at %s" % filepath)

    data = pd.read_csv(filepath)
    x_raw = data["feature"]
    y_raw = data["label"]

    # Split features into array & reshape to (48, 48, 1)
    x = x_raw.str.split(expand=True).values.reshape(-1, 48, 48, 1).astype('int')
    # One hot encoding
    y = np_utils.to_categorical(y_raw)
    # Add fliplr image to label 1
    if horizontal_flip:
        (x, y) = add_fliplr_image(x, y, y_raw, 1)
    if shuffle_data:
        (x, y) = shuffle(x, y)

    # Split validation set
    valid_size = 0
    if 0.0 < validation_split <= 1.0:
        valid_size = int(validation_split * len(x))

    if valid_size > 0:
        x_train, x_valid = x[:-valid_size], x[-valid_size:]
        y_train, y_valid = y[:-valid_size], y[-valid_size:]
    else:
        # Also reached when the fraction rounds down to 0 samples: slicing
        # with -0 would otherwise leave the training set empty and put every
        # sample into the validation set.
        x_train, y_train = x, y
        x_valid, y_valid = [], []

    return (x_train, y_train), (x_valid, y_valid), (x_raw, y_raw)
        
def get_accuracy(y_hypo, y):
    """Return the share of entries in ``y_hypo`` that equal ``y``.

    ``y_hypo`` is indexed with the boolean mask ``y_hypo == y``, so both
    arguments are expected to be NumPy arrays (or objects that support that
    kind of indexing). The count of matching entries is divided by
    ``len(y_hypo)``.
    """
    hits = y_hypo[y_hypo == y]
    total = len(y_hypo)
    return len(hits) / total
    
def get_testing_data(original):
    """Load the test features from ``DIRECTORY`` as a float array.

    Parameters
    ----------
    original : bool
        If True read ``test.csv``, otherwise read ``test_X``.

    Returns
    -------
    numpy.ndarray or None
        The file contents parsed as floats, or ``None`` (after printing an
        error message) when the file does not exist.
    """
    if original:
        filename = "test.csv"
        # TODO (from the original author): handling specific to test.csv is
        # not implemented; the file is read exactly like test_X below.
    else:
        filename = "test_X"
    filepath = DIRECTORY + filename

    if not os.path.exists(filepath):
        # Same message the training loader uses, so a missing file is no
        # longer a silent None.
        print("Error: No such file at %s" % filepath)
        return None

    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and is available in every pandas version.
    return pd.read_csv(filepath, dtype=float).values
        
def output_prediction(y_test, filename="output.csv"):
    """Write predictions to a CSV file with the columns ``id`` and ``label``.

    Ids are 1-based positions in ``y_test``; each prediction is converted with
    ``int()`` before it is written. The file is written without the DataFrame
    index.
    """
    count = len(y_test)
    ids = list(range(1, count + 1))
    labels = [int(y_test[pos]) for pos in range(count)]
    table = pd.DataFrame({"id": ids, "label": labels}, columns=["id", "label"])
    table.to_csv(filename, index=False)

def normalize(x_set, norm_column=None):
    """Standardise selected columns of several arrays with shared statistics.

    The mean and standard deviation are computed per column over the
    concatenation (along axis 0) of every array in ``x_set``. Each listed
    column is then centred and scaled **in place** in every array.

    Parameters
    ----------
    x_set : sequence of numpy.ndarray
        Arrays with matching trailing dimensions. They should have a floating
        point dtype, because results are written back into the same arrays.
    norm_column : iterable of int, optional
        Column indices to standardise. ``None`` (the default) or an empty
        iterable leaves the data untouched.

    Returns
    -------
    The same ``x_set`` object, with its arrays modified in place.
    """
    # Avoid a mutable default argument.
    if norm_column is None:
        norm_column = []

    x_all = np.concatenate(x_set, axis=0)
    mean = np.mean(x_all, axis=0)
    std = np.std(x_all, axis=0)

    for column in norm_column:
        center = mean[column]
        spread = std[column]
        # A constant column has std == 0; dividing by it would fill the data
        # with NaN. Scale by 1 instead, leaving the centred values at 0.
        scale = np.where(spread == 0, 1, spread)
        for x in x_set:
            x[:, column] = np.true_divide(x[:, column] - center, scale)

    return x_set

def add_fliplr_image(x_train, y_train, y_raw, label):
    """Augment the data with left-right mirrored copies of one class.

    Parameters
    ----------
    x_train : numpy.ndarray
        Images with shape ``(n, height, width, channels)``.
    y_train : numpy.ndarray
        One-hot labels with shape ``(n, num_classes)``.
    y_raw : pandas.Series
        Integer labels. The index values of the matching entries are used as
        row positions into ``x_train``, so the series is expected to carry the
        default 0..n-1 index.
    label : int
        Class whose images are mirrored and appended.

    Returns
    -------
    (x_train, y_train) : tuple of numpy.ndarray
        New arrays consisting of the inputs followed by the mirrored images
        and their one-hot labels.
    """
    positions = np.asarray(y_raw[y_raw == label].index, dtype=np.intp)

    # Mirror every selected image in one vectorised step (reverse the width
    # axis) instead of growing an array with np.append inside a loop, which
    # copied the whole buffer on each iteration.
    mirrored = x_train[positions][:, :, ::-1]

    # One-hot rows for the new samples; the number of classes follows y_train
    # rather than being fixed.
    one_hot = np.zeros((len(positions), y_train.shape[1]), dtype=y_train.dtype)
    one_hot[:, label] = 1

    x_train = np.concatenate((x_train, mirrored), axis=0)
    y_train = np.concatenate((y_train, one_hot), axis=0)
    return (x_train, y_train)

def shuffle(x_train, y_train):
    """Reorder samples and labels with one shared random permutation.

    The permutation is drawn from NumPy's global random state. The inputs are
    not modified; reordered copies are returned as ``(x_train, y_train)``.
    """
    order = np.arange(x_train.shape[0])
    np.random.shuffle(order)  # permutes the row indices in place
    return (x_train[order], y_train[order])

In [None]:
# Plot
def show_train_history(train_history, train, validation):
    """Plot a training metric and its validation counterpart over the epochs.

    Parameters
    ----------
    train_history : object
        Object with a ``history`` mapping from metric name to a sequence of
        per-epoch values (here, the value returned by ``fit_generator``).
    train : str
        Key of the training metric in ``train_history.history``.
    validation : str
        Key of the validation metric in ``train_history.history``.
    """
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title("Train History")
    # Label the axis with the plotted metric; the fixed string "train" used
    # before mislabelled every plot (e.g. the loss curve).
    plt.ylabel(train)
    plt.xlabel("Epoch")
    plt.legend(["train", "validation"], loc="center right")
    plt.show()

In [None]:
# Load the data set with label-1 mirroring and shuffling enabled, holding out
# the last 10% of the samples for validation.
(x_train, y_train), (x_valid, y_valid), (x_raw, y_raw) = get_training_data(
    horizontal_flip=True, shuffle_data=True, validation_split=0.1
)

In [None]:
# Normalization: divide every pixel value by 255.
# NOTE(review): x_train / x_valid are overwritten, so re-running this cell
# without re-running the loading cell scales the data a second time.
x_train = np.true_divide(x_train, 255)
if len(x_valid) > 0:
    x_valid = np.true_divide(x_valid, 255)

In [None]:
# Augmentation generator for the training images: rotation, width/height
# shift, shear, zoom and horizontal flip are enabled with the ranges below.
train_gen = ImageDataGenerator(
    zca_whitening=False,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode="nearest")

# NOTE(review): per the Keras docs, fit() is only required when
# featurewise_center, featurewise_std_normalization or zca_whitening is
# enabled; none of them is set here, so this call looks redundant — confirm.
train_gen.fit(x_train)

In [None]:
# Preview one training sample together with its one-hot label.
index = 0
print(y_train[index])
# The image has a single channel, so draw it in grayscale instead of
# matplotlib's default false-colour map.
plt.imshow(x_train[index].reshape(48, 48), cmap="gray")
plt.show()

In [None]:
# Two convolution/pooling stages followed by a dense classifier with a
# 7-way softmax output. Input shape is (48, 48, 1), i.e. channels last.
# NOTE(review): the second Conv2D has no activation (it is linear) — confirm
# that this is intentional.
model = Sequential([
    # CNN
    Conv2D(25, (5, 5), input_shape=(48, 48, 1), activation="relu"),
    MaxPooling2D((2, 2)),
    Conv2D(16, (3, 3)),
    MaxPooling2D((2, 2)),
    # Dropout(0.5),
    Flatten(),

    # DNN
    Dense(units=128, activation="relu"),
    # Dense(units=128, activation="relu",
    #       kernel_regularizer=regularizers.l2(0.01)),
    # Dropout(0.5),
    # Dense(units=128, activation="relu"),
    # Dropout(0.5),

    # Output layer
    Dense(units=7, activation="softmax"),
])
model.summary()

In [None]:
epochs = 200
batch_size = 128
# Ten times the number of batches needed to cover the training set once;
# never less than one step so that tiny data sets do not produce 0 steps.
steps_per_epoch = max(1, (x_train.shape[0]*10)//batch_size)
model_name = "cnn.h5"

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])

# get_training_data returns empty lists when no validation split was made;
# only hand validation data to Keras when there actually is some.
validation_data = (x_valid, y_valid) if len(x_valid) > 0 else None

# train_history = model.fit(x_train, y_train, validation_split=0.1, batch_size=100, epochs=10)
train_history = model.fit_generator(
    train_gen.flow(x_train, y_train, batch_size=batch_size),
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    validation_data=validation_data)

# score = model.evaluate(x_valid, y_valid)
# print('Total loss on Testing Set:', score[0])
# print('Accuracy of Testing Set:', score[1])

# y_test = model.predict(x_test)
# prob = np.argmax(y_test, axis=1)
# output_prediction(prob , "best.csv")

# Persist the trained model.
model.save(model_name)

In [None]:
# Plot accuracy and loss curves, each together with its validation curve.
for train_key, valid_key in (('acc', 'val_acc'), ('loss', 'val_loss')):
    show_train_history(train_history, train_key, valid_key)

In [None]:
# Confusion matrix (on the training data)
# Sequential.predict_classes() is not available in newer Keras releases;
# taking the argmax of predict() gives the same class indices for a softmax
# output and works with every version.
prediction = np.argmax(model.predict(x_train), axis=1)
print(y_train.shape)
# Convert the one-hot labels back to class indices.
y_train_categories = np.argmax(y_train, axis=1)
pd.crosstab(y_train_categories, prediction, rownames=["label"], colnames=["predict"])