# CNN Architecture Notebook
## Facial Keypoint Detection
## W207 Final Project - Summer 2019
### Joanna Yu
### July 2019

This workbook is used to explore different CNN architectures, namely GoogleNet, ResNet, and MobileNet, for facial keypoint detection. The notebook runs in the Google Colaboratory environment using a GPU.

In [1]:
# Import needed packages
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import pandas as pd
import numpy as np
import time
from tensorflow.keras import models, layers, callbacks, optimizers, metrics
from matplotlib import pyplot as plt
import keras
from keras.layers.core import Layer
import keras.backend as K

from keras.models import Model, Sequential
from keras.layers import Conv2D, MaxPool2D,  \
    Dropout, Dense, Input, concatenate,      \
    GlobalAveragePooling2D, AveragePooling2D,\
    Flatten, Activation, add, BatchNormalization
from keras.applications.mobilenet import MobileNet

import cv2 
from keras.utils import np_utils, plot_model

import math 
from keras.optimizers import SGD 
from keras.callbacks import LearningRateScheduler
from sklearn.metrics import mean_squared_error
%matplotlib inline

Using TensorFlow backend.


In [2]:
# The data files live on Google Drive, so the drive is attached to the Colab
# filesystem before anything is read.
from google.colab import drive

drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Show the TensorFlow version. A bare expression is only echoed by the notebook
# when it is the last statement of a cell, so it is printed explicitly here.
print(tf.__version__)
# Reset Keras/TensorFlow state left over from earlier runs.
# NOTE(review): this clears the tf.keras backend, while the models below are built
# with the standalone `keras` package - confirm this resets the intended session.
tf.keras.backend.clear_session()

## Data Loading

- Load in the pickle file that was created as part of the EDA in DataExploration.ipynb. 
- This dataset has the NaNs removed and a few mislabeled images removed as well. 
- As such there is only limited training and development data to use. 
- The image data (X) has already been normalized to [0,1].
- The keypoint coordinates (y) are normalized to [-1, 1].
- There is also a dataset that has been augmented with flipped images to increase training size, but results showed that the models do not improve much from the additional flipped images so we reverted back to just training on the original dataset without the flipped images.



In [0]:
# Seed NumPy's global RNG for reproducibility.
# NOTE(review): only NumPy is seeded in this cell; the TensorFlow/Keras RNG is not.
np.random.seed(13)


def _images_and_keypoints(frame):
  """Split a keypoint dataframe into (image tensor, keypoint matrix).

  The last column holds one flat image array per row; every preceding
  column is a keypoint coordinate.
  """
  # Each image is reshaped to 96x96x1 and the series is stacked into one NDarray.
  images = np.array([img.reshape(96,96,1) for img in frame.iloc[:, -1]])
  keypoints = np.array(frame.iloc[:,:-1])
  return images, keypoints


# Dataframes produced by the EDA notebook: the original images, and the
# original images plus their flipped copies.
df_nostache_nonan = pd.read_pickle("/content/drive/My Drive/School/UC Berkeley MIDS/W207 Applied Machine Learning/Final Project/facial-keypoints-detection/df_nostache_nonan.pkl")
df_nostache_nonan_w_flip = pd.read_pickle("/content/drive/My Drive/School/UC Berkeley MIDS/W207 Applied Machine Learning/Final Project/facial-keypoints-detection/df_nostache_nonan_w_flip.pkl")

# X / y: original data.  Xf / yf: data that includes the flipped images.
X, y = _images_and_keypoints(df_nostache_nonan)
Xf, yf = _images_and_keypoints(df_nostache_nonan_w_flip)

# Rescale pixel coordinates so that 0 maps to -1 and 96 maps to +1.
yn = y/48 - 1
yfn = yf/48 - 1

### Define a function to make a csv file in the Kaggle submission format to submit for scoring.
Kaggle provides a test set but no groundtruth data. To compare model performance, we submit the predictions to Kaggle for scoring. The metric used by Kaggle is root mean squared error. This function generates predictions in Kaggle's format for scoring. 

In [0]:
def make_subm (model, file_name):
  """Predict keypoints for the Kaggle test set and write a submission csv.

  Args:
    model: object with a `predict` method that takes an array of shape
      (n, 96, 96, 1) and returns at least 30 values per image, scaled so that
      `(value + 1) * 48` is a pixel coordinate.
    file_name: suffix of the output file; the file is written as
      `submission_<file_name>.csv` next to the input data on Google Drive.

  Raises:
    KeyError: if IdLookupTable.csv references an ImageId or FeatureName for
      which no prediction exists.
  """
  data_dir = '/content/drive/My Drive/School/UC Berkeley MIDS/W207 Applied Machine Learning/Final Project/facial-keypoints-detection/'
  # Order matters: column i of the model output is keypoints[i].
  keypoints = ['left_eye_center_x', 'left_eye_center_y', 'right_eye_center_x', 'right_eye_center_y', 'left_eye_inner_corner_x', 'left_eye_inner_corner_y',
             'left_eye_outer_corner_x', 'left_eye_outer_corner_y', 'right_eye_inner_corner_x', 'right_eye_inner_corner_y', 'right_eye_outer_corner_x',
             'right_eye_outer_corner_y', 'left_eyebrow_inner_end_x', 'left_eyebrow_inner_end_y', 'left_eyebrow_outer_end_x', 'left_eyebrow_outer_end_y',
             'right_eyebrow_inner_end_x', 'right_eyebrow_inner_end_y', 'right_eyebrow_outer_end_x', 'right_eyebrow_outer_end_y', 'nose_tip_x', 'nose_tip_y',
             'mouth_left_corner_x', 'mouth_left_corner_y', 'mouth_right_corner_x', 'mouth_right_corner_y', 'mouth_center_top_lip_x', 'mouth_center_top_lip_y',
             'mouth_center_bottom_lip_x', 'mouth_center_bottom_lip_y']

  # Each test image is a space-separated pixel string; scale it to [0, 1] and
  # shape it the same way as the training images.
  test_df = pd.read_csv(data_dir + 'test.csv')
  test_image = np.array([(np.array(pixels.split()).astype(int) / 255.0).reshape(96,96,1)
                         for pixels in test_df['Image']])

  # Undo the [-1, 1] target normalization and clamp to the image bounds.
  predictions = np.clip((model.predict(test_image) + 1) * 48, 0, 96)

  result_df = pd.read_csv(data_dir + 'IdLookupTable.csv')

  # Resolve every requested (ImageId, FeatureName) pair to a (row, column)
  # position in `predictions`. Rows are matched on the actual ImageId values
  # rather than assuming ImageId - 1 equals the row position.
  # NOTE(review): get_indexer needs unique ImageIds in test.csv - confirm.
  row_pos = pd.Index(test_df['ImageId']).get_indexer(result_df['ImageId'])
  col_pos = pd.Index(keypoints).get_indexer(result_df['FeatureName'])
  if (row_pos < 0).any() or (col_pos < 0).any():
    # get_indexer marks misses with -1, which would otherwise silently select
    # the last row/column.
    raise KeyError('IdLookupTable.csv references an ImageId or FeatureName that has no prediction')
  result_df['Location'] = predictions[row_pos, col_pos].astype(np.float64)

  # Only RowId and Location go into the submission file saved to Google Drive.
  result_df[['RowId', 'Location']].to_csv(data_dir + 'submission_' + file_name + '.csv', index=False)

### Define a function to plot model predictions so results can be visualized.

In [0]:
def plot_predictions(model, num_plot):
  """Plot the last `num_plot` test images with their predicted keypoints.

  Args:
    model: object with a `predict` method that takes an array of shape
      (n, 96, 96, 1) and returns 30 values per image (x, y interleaved),
      scaled so that `(value + 1) * 48` is a pixel coordinate.
    num_plot: number of images to draw, taken from the end of the test set.
      Values larger than the test set are clamped; values <= 0 draw nothing.
  """
  test_df = pd.read_csv('/content/drive/My Drive/School/UC Berkeley MIDS/W207 Applied Machine Learning/Final Project/facial-keypoints-detection/test.csv')
  test_df['Image'] = test_df['Image'].map(lambda x: np.array(x.split()).astype(int) / 255.0)
  test_image = np.array([x.reshape(96,96,1) for x in test_df['Image']])
  # Undo the [-1, 1] target normalization to get pixel coordinates.
  predictions = (model.predict(test_image)+1)*48

  print(predictions[0], '\n', predictions[1])

  num_plot = min(num_plot, len(predictions))
  if num_plot <= 0:
    return

  # squeeze=False keeps `axes` a 2-D array even when only one image is plotted.
  fig, axes = plt.subplots(num_plot, 1, figsize=(8, 8*num_plot), squeeze=False)
  first = len(predictions) - num_plot
  for offset, ax in enumerate(axes.flatten()):
      # One index selects both the image and its keypoints, so the overlay
      # always belongs to the face being shown.
      sample = first + offset
      ax.imshow(np.reshape(test_image[sample],(96,96)), origin='upper', cmap='gray')
      # Even columns are x coordinates, odd columns are y coordinates.
      ax.scatter(x=predictions[sample, 0:30:2], y=predictions[sample, 1:30:2])

## Define the input layer for the models

In [5]:
# Keras input tensor for 96x96 single-channel images; the model cells in this
# notebook build on this same tensor.
input_layer = Input(shape=(96, 96, 1))

W0805 20:45:46.628570 140294979774336 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0805 20:45:46.682502 140294979774336 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.



# GoogleNet Study


### Background

As the winner of the ImageNet Large Scale Visual Recognition Competition in 2014, the GoogleNet, or Inception v1, achieved a top-5 error rate of 6.67%, which is close to human level performance. Inception quickly became a defining model architecture. The more recent versions even combine with ResNet to produce an Inception-ResNet hybrid. ResNet will be discussed in a subsequent section.

![alt text](https://)

The diagram above shows the layers in an inception module. It is a concatenated layer that combines 1x1, 3x3, and 5x5 convolutions with max pooling. The additional 1x1 convolution layers serve as dimensionality reduction, since the concatenated layers drastically increase computational cost.

In the following section, we first defined a function for the inception module and then used that to experiment with various combinations of convolution, normalization, pooling and dropouts, along with different number of inception modules. 

We present a few representative models below with model statistics for a quick comparison.

### Base Model
![alt text](https://)

We started out with a single inception layer to make sure that the model is performing as expected. The model plot is shown above. We noticed that a single inception module without any additional layers produces a glaring parameter count of over 35 million! This is expected because of the concatenated block. We compared different filter depths and found that the higher depth did not increase performance much so we kept the depth at 8 to manage the parameter size. 

This model has a Kaggle score of 3.53 with 35.4 million parameters at a run time of 10 seconds per epoch.


### Full GoogleNet (Inception v1)
![alt text](https://)

The 22-layer architecture is based on several very small convolutions in order to drastically reduce the number of parameters, which is ~6 million. The model uses a large filter size (7x7) with strides of (2,2) to help downsample the input; however, we believe that this heavy downsampling could be one of the reasons the model struggles to accurately predict keypoint coordinates. 

This model has a Kaggle score of 4.2 with 6 million parameters at a run time of 7 seconds per epoch.

### Best Model 
![alt text](https://)

Our best performing model is much simpler than the 22-layer model. We start out with 2 layers of convolution, 1 layer of pooling, 2 inception blocks, and a dense layer with a linear activation function. Interestingly, while we had success applying normalization and dropout in the AlexNet models, applying dropout and normalization did not improve this particular Inception model. 

This model has a Kaggle score of 3.17 with 2.3 million parameters at a run time of 9 seconds per epoch. 

## Define the inception module


In [0]:
def inception_module(in_layer,
                     filter_1x1,
                     filter_3x3_reduce,
                     filter_3x3,
                     filter_5x5_reduce,
                     filter_5x5,
                     filter_pool,
                     name=None):
    """Build one inception block on top of `in_layer` and return its output tensor.

    Four parallel branches read `in_layer` and are concatenated along axis 3:
      * a 1x1 convolution with `filter_1x1` filters,
      * a 1x1 convolution (`filter_3x3_reduce`) followed by a 3x3 convolution (`filter_3x3`),
      * a 1x1 convolution (`filter_5x5_reduce`) followed by a 5x5 convolution (`filter_5x5`),
      * a 3x3 stride-1 max pool followed by a 1x1 convolution (`filter_pool`).
    Every convolution uses 'same' padding and ReLU activation.
    `name` is given to the concatenation layer only.
    """
    def relu_conv(filters, kernel, tensor):
        # All convolutions in the block share the same padding and activation.
        return Conv2D(filters, kernel, padding='same', activation='relu')(tensor)

    # Branches are created in a fixed order so auto-generated layer names stay stable.
    branch_1x1 = relu_conv(filter_1x1, (1, 1), in_layer)
    branch_3x3 = relu_conv(filter_3x3, (3, 3),
                           relu_conv(filter_3x3_reduce, (1, 1), in_layer))
    branch_5x5 = relu_conv(filter_5x5, (5, 5),
                           relu_conv(filter_5x5_reduce, (1, 1), in_layer))
    pooled = MaxPool2D((3, 3), strides=(1, 1), padding='same')(in_layer)
    branch_pool = relu_conv(filter_pool, (1, 1), pooled)

    return concatenate([branch_1x1, branch_3x3, branch_5x5, branch_pool],
                       axis=3, name=name)
  

##Base Model with a single inception block and various filter depth. 
This model has a parameter count of over 35 million and its performance is similar to the next model with all depth of 8. To manage the parameter count, we use depth of 8 for all subsequent models. 

In [0]:
# Baseline: one inception block applied directly to the input image with mixed
# filter depths, flattened into a 30-unit linear output layer.
output = inception_module(input_layer,
                          filter_1x1=32,
                          filter_3x3_reduce=48,
                          filter_3x3=64,
                          filter_5x5_reduce=8,
                          filter_5x5=16,
                          filter_pool=16,)
output = Flatten()(output)
out = Dense(30, activation='linear')(output)

single_inception_model2 = Model(inputs = input_layer, outputs = out)
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line.
single_inception_model2.summary()

# SGD with momentum; the decay term is scaled to the number of epochs.
epochs = 200
lrate = 0.01
decay = lrate/epochs

sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
single_inception_model2.compile(loss='mean_squared_error', optimizer=sgd, metrics=['mean_squared_error'])
# Train on the un-flipped data with normalized targets; validation_split=0.15
# sets the fraction of samples used for validation.
single_inception_model2.fit(X.astype(np.float32), yn.astype(np.float32), validation_split=0.15, epochs=epochs, batch_size=32)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 96, 96, 1)    0                                            
__________________________________________________________________________________________________
conv2d_16 (Conv2D)              (None, 96, 96, 48)   96          input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_18 (Conv2D)              (None, 96, 96, 8)    16          input_1[0][0]                    
__________________________________________________________________________________________________
max_pooling2d_4 (MaxPooling2D)  (None, 96, 96, 1)    0           input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_15 

<keras.callbacks.History at 0x7f277c869f60>

##Base Model with a single inception block and depth of 8. 

In [0]:
# Baseline with one inception block where every branch uses 8 filters,
# flattened into a 30-unit linear output layer.
output = inception_module(input_layer,
                          filter_1x1=8,
                          filter_3x3_reduce=8,
                          filter_3x3=8,
                          filter_5x5_reduce=8,
                          filter_5x5=8,
                          filter_pool=8,)
output = Flatten()(output)
out = Dense(30, activation='linear')(output)

single_inception_model3 = Model(inputs = input_layer, outputs = out)
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line.
single_inception_model3.summary()

# SGD with momentum; the decay term is scaled to the number of epochs.
epochs = 200
lrate = 0.01
decay = lrate/epochs

sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
single_inception_model3.compile(loss='mean_squared_error', optimizer=sgd, metrics=['mean_squared_error'])
single_inception_model3.fit(X.astype(np.float32), yn.astype(np.float32), validation_split=0.15, epochs=epochs, batch_size=32)
# Write the Kaggle submission file and save a diagram of the architecture.
make_subm(single_inception_model3, '82_no_flip_1inception_all8_sgd_200ep_linear')
plot_model(single_inception_model3, show_shapes=True, to_file='/content/drive/My Drive/School/UC Berkeley MIDS/'
           'W207 Applied Machine Learning/Final Project/facial-keypoints-detection/single_inception_all8_linear_module.png')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 96, 96, 1)    0                                            
__________________________________________________________________________________________________
conv2d_28 (Conv2D)              (None, 96, 96, 8)    16          input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_30 (Conv2D)              (None, 96, 96, 8)    16          input_1[0][0]                    
__________________________________________________________________________________________________
max_pooling2d_6 (MaxPooling2D)  (None, 96, 96, 1)    0           input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_27 

## Full GoogleNet model
This 22-layer model did not perform well (Kaggle RMSE 4.2) so we started exploring simpler models with fewer layers. 

In [0]:
def _inception_stack(tensor, specs):
    """Chain inception modules; each spec is (name, 1x1, 3x3_reduce, 3x3, 5x5_reduce, 5x5, pool)."""
    for block_name, f1, f3_reduce, f3, f5_reduce, f5, f_pool in specs:
        tensor = inception_module(tensor,
                                  filter_1x1=f1,
                                  filter_3x3_reduce=f3_reduce,
                                  filter_3x3=f3,
                                  filter_5x5_reduce=f5_reduce,
                                  filter_5x5=f5,
                                  filter_pool=f_pool,
                                  name=block_name)
    return tensor


def _auxiliary_head(tensor, dropout_rate, name):
    """Side branch: average pool -> 1x1 conv -> dense(1024) -> dropout -> dense(30)."""
    head = AveragePooling2D((5, 5), strides=3)(tensor)
    head = Conv2D(128, (1, 1), padding='same', activation='relu')(head)
    head = Flatten()(head)
    head = Dense(1024, activation='linear')(head)
    head = Dropout(dropout_rate)(head)
    return Dense(30, activation='linear', name=name)(head)


# Stem: strided 7x7 convolution and pooling, then a 1x1 and a 3x3 convolution.
layer = Conv2D(64, (7, 7), padding='same', strides=(2, 2), activation='relu',
               name='conv_1_7x7/2')(input_layer)
layer = MaxPool2D((3, 3), padding='same', strides=(2, 2), name='max_pool_1_3x3/2')(layer)
layer = Conv2D(64, (1, 1), padding='same', strides=(1, 1), activation='relu',
               name='conv_2a_3x3/1')(layer)
layer = Conv2D(192, (3, 3), padding='same', strides=(1, 1), activation='relu',
               name='conv_2b_3x3/1')(layer)
layer = MaxPool2D((3, 3), padding='same', strides=(2, 2), name='max_pool_2_3x3/2')(layer)

# Stage 3.
layer = _inception_stack(layer, [
    ('inception_3a', 64, 96, 128, 16, 32, 32),
    ('inception_3b', 128, 128, 192, 32, 96, 64),
])

layer = MaxPool2D((3, 3), padding='same', strides=(2, 2), name='max_pool_3_3x3/2')(layer)

# Stage 4, with the two auxiliary heads branching off after 4a and after 4d.
layer = _inception_stack(layer, [
    ('inception_4a', 192, 96, 208, 16, 48, 64),
])

# NOTE(review): layer2 and layer3 are built but are not passed to Model(...)
# below, so they take no part in training or prediction.
layer2 = _auxiliary_head(layer, 0.1, 'auxilliary_output_1')

layer = _inception_stack(layer, [
    ('inception_4b', 160, 112, 224, 24, 64, 64),
    ('inception_4c', 128, 128, 256, 24, 64, 64),
    ('inception_4d', 112, 144, 288, 32, 64, 64),
])

layer3 = _auxiliary_head(layer, 0.2, 'auxilliary_output_2')

layer = _inception_stack(layer, [
    ('inception_4e', 256, 160, 320, 32, 128, 128),
])

layer = MaxPool2D((3, 3), padding='same', strides=(2, 2), name='max_pool_4_3x3/2')(layer)

# Stage 5.
layer = _inception_stack(layer, [
    ('inception_5a', 256, 160, 320, 32, 128, 128),
    ('inception_5b', 384, 192, 384, 48, 128, 128),
])

# Head: global average pooling, dropout, and a 30-unit linear output layer.
layer = GlobalAveragePooling2D(name='avg_pool_5_3x3/1')(layer)
layer = Dropout(0.3)(layer)
out = Dense(30, activation='linear', name='output')(layer)

googlenet_model = Model(input_layer, out, name='inception_v1')
googlenet_model.summary()

# Same optimizer settings as the other models: SGD with momentum and an
# epoch-scaled decay term.
epochs = 200
lrate = 0.01
decay = lrate/epochs

sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
googlenet_model.compile(loss='mean_squared_error', optimizer=sgd, metrics=['mean_squared_error'])
googlenet_model.fit(X.astype(np.float32), yn.astype(np.float32), validation_split=0.15, epochs=epochs, batch_size=32)
# Write the Kaggle submission file and save a diagram of the architecture.
make_subm(googlenet_model, '82_no_flip_full_googlenet_linear')
plot_model(googlenet_model, show_shapes=True, to_file='/content/drive/My Drive/School/UC Berkeley MIDS/'
           'W207 Applied Machine Learning/Final Project/facial-keypoints-detection/full_googlenet_linear_module.png')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 96, 96, 1)    0                                            
__________________________________________________________________________________________________
conv_1_7x7/2 (Conv2D)           (None, 48, 48, 64)   3200        input_1[0][0]                    
__________________________________________________________________________________________________
max_pool_1_3x3/2 (MaxPooling2D) (None, 24, 24, 64)   0           conv_1_7x7/2[0][0]               
__________________________________________________________________________________________________
conv_2a_3x3/1 (Conv2D)          (None, 24, 24, 64)   4160        max_pool_1_3x3/2[0][0]           
__________________________________________________________________________________________________
conv_2b_3x

##Inception Model with 2 layers of convolution and 1 layer of pooling before the 1 inception module.
The added convolution layers improved model performance. The Kaggle RMSE decreased from 3.4 to 3.2. The parameter count also decreased from 8.8 million to 2.3 million. 

In [0]:
# Stem: a 1x1 and a 3x3 convolution followed by a stride-2 max pool, placed in
# front of a single all-8 inception block.
layer = Conv2D(64, (1, 1), padding='same', strides=(1, 1), activation='relu')(input_layer)
layer = Conv2D(192, (3, 3), padding='same', strides=(1, 1), activation='relu')(layer)
layer = MaxPool2D((3, 3), padding='same', strides=(2, 2))(layer)

output = inception_module(layer,
                          filter_1x1=8,
                          filter_3x3_reduce=8,
                          filter_3x3=8,
                          filter_5x5_reduce=8,
                          filter_5x5=8,
                          filter_pool=8,)
output = Flatten()(output)
out = Dense(30, activation='linear')(output)

# NOTE(review): this rebinds single_inception_model3, the same name the earlier
# depth-8 base-model cell uses, so that earlier model is no longer reachable.
single_inception_model3 = Model(inputs = input_layer, outputs = out)
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line.
single_inception_model3.summary()

# SGD with momentum; the decay term is scaled to the number of epochs.
epochs = 200
lrate = 0.01
decay = lrate/epochs

sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
single_inception_model3.compile(loss='mean_squared_error', optimizer=sgd, metrics=['mean_squared_error'])
single_inception_model3.fit(X.astype(np.float32), yn.astype(np.float32), validation_split=0.15, epochs=epochs, batch_size=32)
# Write the Kaggle submission file and save a diagram of the architecture.
make_subm(single_inception_model3, '82_no_flip_1inception_3conv_all8_sgd_200ep_linear')
plot_model(single_inception_model3, show_shapes=True, to_file='/content/drive/My Drive/School/UC Berkeley MIDS/'
           'W207 Applied Machine Learning/Final Project/facial-keypoints-detection/single_inception_all8_3conv_linear_module.png')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 96, 96, 1)    0                                            
__________________________________________________________________________________________________
conv2d_110 (Conv2D)             (None, 96, 96, 64)   128         input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_111 (Conv2D)             (None, 96, 96, 192)  110784      conv2d_110[0][0]                 
__________________________________________________________________________________________________
max_pooling2d_20 (MaxPooling2D) (None, 48, 48, 192)  0           conv2d_111[0][0]                 
__________________________________________________________________________________________________
conv2d_113

##Best Inception Model with 2 layers of convolution and 1 layer of pooling before the 2 inception modules.
The Kaggle RMSE decreased further to 3.17. 

In [0]:
# Stem: a 1x1 and a 3x3 convolution followed by a stride-2 max pool.
layer = Conv2D(64, (1, 1), padding='same', strides=(1, 1), activation='relu')(input_layer)
layer = Conv2D(192, (3, 3), padding='same', strides=(1, 1), activation='relu')(layer)
layer = MaxPool2D((3, 3), padding='same', strides=(2, 2))(layer)

# Both inception blocks use 8 filters in every branch.
all8 = dict(filter_1x1=8,
            filter_3x3_reduce=8,
            filter_3x3=8,
            filter_5x5_reduce=8,
            filter_5x5=8,
            filter_pool=8)
layer = inception_module(layer, **all8)
output = inception_module(layer, **all8)
output = Flatten()(output)
out = Dense(30, activation='linear')(output)

single_inception_model4 = Model(inputs = input_layer, outputs = out)
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line.
single_inception_model4.summary()

# SGD with momentum; the decay term is scaled to the number of epochs.
epochs = 200
lrate = 0.01
decay = lrate/epochs

sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
single_inception_model4.compile(loss='mean_squared_error', optimizer=sgd, metrics=['mean_squared_error'])
single_inception_model4.fit(X.astype(np.float32), yn.astype(np.float32), validation_split=0.15, epochs=epochs, batch_size=32)
# Write the Kaggle submission file and save a diagram of the architecture.
make_subm(single_inception_model4, '82_no_flip_2inception_3conv_all8_sgd_200ep_linear')
plot_model(single_inception_model4, show_shapes=True, to_file='/content/drive/My Drive/School/UC Berkeley MIDS/'
           'W207 Applied Machine Learning/Final Project/facial-keypoints-detection/2_inception_all8_3conv_linear_module.png')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 96, 96, 1)    0                                            
__________________________________________________________________________________________________
conv2d_118 (Conv2D)             (None, 96, 96, 64)   128         input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_119 (Conv2D)             (None, 96, 96, 192)  110784      conv2d_118[0][0]                 
__________________________________________________________________________________________________
max_pooling2d_22 (MaxPooling2D) (None, 48, 48, 192)  0           conv2d_119[0][0]                 
__________________________________________________________________________________________________
conv2d_121

##Inception Model with 3 convolution and pooling layers + 3 inception modules. 
The Kaggle RMSE is 3.27, which did not improve from the 2-layer inception model so we stopped adding additional inception modules and focused on the 2-layer setup. 

In [0]:
# Stem: a 1x1 and a 3x3 convolution followed by a stride-2 max pool.
layer = Conv2D(64, (1, 1), padding='same', strides=(1, 1), activation='relu')(input_layer)
layer = Conv2D(192, (3, 3), padding='same', strides=(1, 1), activation='relu')(layer)
layer = MaxPool2D((3, 3), padding='same', strides=(2, 2))(layer)

# Three stacked inception blocks, each with 8 filters in every branch.
all8 = dict(filter_1x1=8,
            filter_3x3_reduce=8,
            filter_3x3=8,
            filter_5x5_reduce=8,
            filter_5x5=8,
            filter_pool=8)
layer = inception_module(layer, **all8)
layer = inception_module(layer, **all8)
output = inception_module(layer, **all8)
output = Flatten()(output)
out = Dense(30, activation='linear')(output)

single_inception_model5 = Model(inputs = input_layer, outputs = out)
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line.
single_inception_model5.summary()

# SGD with momentum; the decay term is scaled to the number of epochs.
epochs = 200
lrate = 0.01
decay = lrate/epochs

sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
single_inception_model5.compile(loss='mean_squared_error', optimizer=sgd, metrics=['mean_squared_error'])
single_inception_model5.fit(X.astype(np.float32), yn.astype(np.float32), validation_split=0.15, epochs=epochs, batch_size=32)
# Write the Kaggle submission file and save a diagram of the architecture.
make_subm(single_inception_model5, '82_no_flip_3inception_3conv_all8_sgd_200ep_linear')
plot_model(single_inception_model5, show_shapes=True, to_file='/content/drive/My Drive/School/UC Berkeley MIDS/'
           'W207 Applied Machine Learning/Final Project/facial-keypoints-detection/3_inception_all8_3conv_linear_module.png')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 96, 96, 1)    0                                            
__________________________________________________________________________________________________
conv2d_132 (Conv2D)             (None, 96, 96, 64)   128         input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_133 (Conv2D)             (None, 96, 96, 192)  110784      conv2d_132[0][0]                 
__________________________________________________________________________________________________
max_pooling2d_25 (MaxPooling2D) (None, 48, 48, 192)  0           conv2d_133[0][0]                 
__________________________________________________________________________________________________
conv2d_135

##Other Models
We explored other models but the performance did not improve. We experimented with different optimizers, different numbers of inception modules, additional convolution and pooling layers, and normalization layers. The following are a few representative models. 

In [0]:
# Every inception block in this variant uses 8 filters in every branch.
all8 = dict(filter_1x1=8,
            filter_3x3_reduce=8,
            filter_3x3=8,
            filter_5x5_reduce=8,
            filter_5x5=8,
            filter_pool=8)

# Two inception blocks directly on the input, a stride-2 max pool, then a third block.
layer = inception_module(input_layer, **all8)
layer = inception_module(layer, **all8)

layer = MaxPool2D((3, 3), padding='same', strides=(2, 2), name='max_pool_3_3x3/2')(layer)

layer = inception_module(layer, **all8)

# Average pooling, a 1x1 convolution and heavy dropout before the output layer.
layer = AveragePooling2D((5, 5), strides=2)(layer)
layer = Conv2D(128, (1, 1), padding='same', activation='relu')(layer)
layer = Dropout(0.7)(layer)

output = Flatten()(layer)
out = Dense(30)(output)

# NOTE(review): this rebinds single_inception_model4, the same name the
# two-inception "best model" cell uses, so that earlier model is no longer reachable.
single_inception_model4 = Model(inputs = input_layer, outputs = out)
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line.
single_inception_model4.summary()

# SGD with momentum; the decay term is scaled to the number of epochs.
epochs = 200
lrate = 0.01
decay = lrate/epochs

sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
single_inception_model4.compile(loss='mean_squared_error', optimizer=sgd, metrics=['mean_squared_error'])
single_inception_model4.fit(X.astype(np.float32), yn.astype(np.float32), validation_split=0.15, epochs=epochs, batch_size=32)
# Write the Kaggle submission file and save a diagram of the architecture.
make_subm(single_inception_model4, '82_no_flip_3inception_all8_sgd_200ep')
plot_model(single_inception_model4, show_shapes=True, to_file='/content/drive/My Drive/School/UC Berkeley MIDS/'
           'W207 Applied Machine Learning/Final Project/facial-keypoints-detection/3_inception_all8_module.png')

W0802 20:47:22.479570 139810382698368 nn_ops.py:4224] Large dropout rate: 0.7 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 96, 96, 1)    0                                            
__________________________________________________________________________________________________
conv2d_84 (Conv2D)              (None, 96, 96, 8)    16          input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_86 (Conv2D)              (None, 96, 96, 8)    16          input_1[0][0]                    
__________________________________________________________________________________________________
max_pooling2d_15 (MaxPooling2D) (None, 96, 96, 1)    0           input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_83 

In [0]:
# Stem (1x1 conv -> 3x3 conv -> max-pool) followed by two inception modules
# with 8 filters on every branch, then a 200-unit linear Dense layer with
# dropout and a 30-unit linear output.
all8_filters = dict(
    filter_1x1=8,
    filter_3x3_reduce=8,
    filter_3x3=8,
    filter_5x5_reduce=8,
    filter_5x5=8,
    filter_pool=8,
)

layer = Conv2D(64, (1, 1), strides=(1, 1), padding='same', activation='relu')(input_layer)
layer = Conv2D(192, (3, 3), strides=(1, 1), padding='same', activation='relu')(layer)
layer = MaxPool2D((3, 3), strides=(2, 2), padding='same')(layer)

layer = inception_module(layer, **all8_filters)
output = inception_module(layer, **all8_filters)

output = Flatten()(output)
output = Dense(200, activation='linear')(output)
output = Dropout(0.25)(output)
out = Dense(30, activation='linear')(output)

two_inception_model = Model(inputs=input_layer, outputs=out)
print(two_inception_model.summary())

# SGD with momentum; the decay handed to the optimizer is lrate / epochs.
epochs = 200
lrate = 0.01
decay = lrate / epochs

sgd = SGD(lr=lrate, decay=decay, momentum=0.9, nesterov=False)
two_inception_model.compile(
    optimizer=sgd,
    loss='mean_squared_error',
    metrics=['mean_squared_error'],
)
two_inception_model.fit(
    X.astype(np.float32),
    yn.astype(np.float32),
    batch_size=32,
    epochs=epochs,
    validation_split=0.15,
)
# make_subm is defined elsewhere in the notebook; the string is the run tag.
make_subm(two_inception_model, '83_no_flip_2inception_3conv_all8_sgd_200ep_linear_dropout')
plot_model(
    two_inception_model,
    show_shapes=True,
    to_file=('/content/drive/My Drive/School/UC Berkeley MIDS/'
             'W207 Applied Machine Learning/Final Project/facial-keypoints-detection/2_inception_all8_3conv_linear_dropout_module_.png'),
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 96, 96, 1)    0                                            
__________________________________________________________________________________________________
conv2d_113 (Conv2D)             (None, 96, 96, 64)   128         input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_114 (Conv2D)             (None, 96, 96, 192)  110784      conv2d_113[0][0]                 
__________________________________________________________________________________________________
max_pooling2d_19 (MaxPooling2D) (None, 48, 48, 192)  0           conv2d_114[0][0]                 
__________________________________________________________________________________________________
conv2d_116

In [0]:
# Stem (1x1 conv -> 3x3 conv -> max-pool), one inception module with 8 filters
# on every branch, then BatchNormalization and Dropout before the 30-unit
# linear output.
all8_filters = dict(
    filter_1x1=8,
    filter_3x3_reduce=8,
    filter_3x3=8,
    filter_5x5_reduce=8,
    filter_5x5=8,
    filter_pool=8,
)

layer = Conv2D(64, (1, 1), strides=(1, 1), padding='same', activation='relu')(input_layer)
layer = Conv2D(192, (3, 3), strides=(1, 1), padding='same', activation='relu')(layer)
layer = MaxPool2D((3, 3), strides=(2, 2), padding='same')(layer)

layer = inception_module(layer, **all8_filters)

layer = BatchNormalization()(layer)
layer = Dropout(0.5)(layer)

output = Flatten()(layer)
out = Dense(30, activation='linear')(output)

single_inception_model5 = Model(inputs=input_layer, outputs=out)
print(single_inception_model5.summary())

# SGD with momentum; the decay handed to the optimizer is lrate / epochs.
epochs = 200
lrate = 0.01
decay = lrate / epochs

sgd = SGD(lr=lrate, decay=decay, momentum=0.9, nesterov=False)
single_inception_model5.compile(
    optimizer=sgd,
    loss='mean_squared_error',
    metrics=['mean_squared_error'],
)
single_inception_model5.fit(
    X.astype(np.float32),
    yn.astype(np.float32),
    batch_size=32,
    epochs=epochs,
    validation_split=0.15,
)
# make_subm is defined elsewhere in the notebook; the string is the run tag.
make_subm(single_inception_model5, '84_no_flip_2inception_3conv_all8_BN_Global_dropout_sgd_200ep')
plot_model(
    single_inception_model5,
    show_shapes=True,
    to_file=('/content/drive/My Drive/School/UC Berkeley MIDS/'
             'W207 Applied Machine Learning/Final Project/facial-keypoints-detection/2_inception_all8_3conv_BN_Global_dropout_module.png'),
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 96, 96, 1)    0                                            
__________________________________________________________________________________________________
conv2d_259 (Conv2D)             (None, 96, 96, 64)   128         input_2[0][0]                    
__________________________________________________________________________________________________
conv2d_260 (Conv2D)             (None, 96, 96, 192)  110784      conv2d_259[0][0]                 
__________________________________________________________________________________________________
max_pooling2d_55 (MaxPooling2D) (None, 48, 48, 192)  0           conv2d_260[0][0]                 
__________________________________________________________________________________________________
conv2d_262

In [0]:
# Two identical stages, each a stem (1x1 conv -> 3x3 conv -> max-pool)
# followed by three inception modules with 8 filters on every branch, with a
# Dropout between the stages and a 30-unit linear output at the end.
# This cell rebinds the name single_inception_model5.
all8_filters = dict(
    filter_1x1=8,
    filter_3x3_reduce=8,
    filter_3x3=8,
    filter_5x5_reduce=8,
    filter_5x5=8,
    filter_pool=8,
)

# Stage 1
layer = Conv2D(64, (1, 1), strides=(1, 1), padding='same', activation='relu')(input_layer)
layer = Conv2D(192, (3, 3), strides=(1, 1), padding='same', activation='relu')(layer)
layer = MaxPool2D((3, 3), strides=(2, 2), padding='same')(layer)

layer = inception_module(layer, **all8_filters)
layer = inception_module(layer, **all8_filters)
layer = inception_module(layer, **all8_filters)

layer = Dropout(0.5)(layer)

# Stage 2
layer = Conv2D(64, (1, 1), strides=(1, 1), padding='same', activation='relu')(layer)
layer = Conv2D(192, (3, 3), strides=(1, 1), padding='same', activation='relu')(layer)
layer = MaxPool2D((3, 3), strides=(2, 2), padding='same')(layer)

layer = inception_module(layer, **all8_filters)
layer = inception_module(layer, **all8_filters)
output = inception_module(layer, **all8_filters)

output = Flatten()(output)
out = Dense(30, activation='linear')(output)

single_inception_model5 = Model(inputs=input_layer, outputs=out)
print(single_inception_model5.summary())

# SGD with momentum; the decay handed to the optimizer is lrate / epochs.
epochs = 200
lrate = 0.01
decay = lrate / epochs

sgd = SGD(lr=lrate, decay=decay, momentum=0.9, nesterov=False)
single_inception_model5.compile(
    optimizer=sgd,
    loss='mean_squared_error',
    metrics=['mean_squared_error'],
)
single_inception_model5.fit(
    X.astype(np.float32),
    yn.astype(np.float32),
    batch_size=32,
    epochs=epochs,
    validation_split=0.15,
)
# make_subm is defined elsewhere in the notebook; the string is the run tag.
make_subm(single_inception_model5, '84_no_flip_2inception_3conv_all8_sgd_200ep_linear')
plot_model(
    single_inception_model5,
    show_shapes=True,
    to_file=('/content/drive/My Drive/School/UC Berkeley MIDS/'
             'W207 Applied Machine Learning/Final Project/facial-keypoints-detection/2_inception_all8_3conv_module.png'),
)

W0804 22:43:12.992282 139723501893504 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 96, 96, 1)    0                                            
__________________________________________________________________________________________________
conv2d_42 (Conv2D)              (None, 96, 96, 64)   128         input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_43 (Conv2D)              (None, 96, 96, 192)  110784      conv2d_42[0][0]                  
__________________________________________________________________________________________________
max_pooling2d_10 (MaxPooling2D) (None, 48, 48, 192)  0           conv2d_43[0][0]                  
__________________________________________________________________________________________________
conv2d_45 

# ResNet Study
## Background
ResNet (Residual Neural Network) won the ImageNet competition in 2015. The winning model has 152 layers while still having lower complexity than VGGNet, and it achieved a top-5 accuracy that is better than human-level performance on that dataset.

![alt text](https://)

The main idea behind ResNet is residual learning, which helps address the issue of vanishing gradients. Deeper nets almost always degrade in performance. The authors of ResNet hypothesized that direct mappings are hard to learn; therefore, instead of trying to learn the underlying mapping directly, each block learns the difference, or residual, between the desired mapping and the block's input.

## Base Model
[alt text](https://)

We started out with a single residual module to make sure that the model performs as expected. The model plot is shown above. Based on the results of our Inception Net study, we built the base ResNet model with the same two convolution layers and one pooling layer, followed by one residual module. The parameter count of the base model is large, so we worked on reducing it as we tuned the model further.

This model has a Kaggle score of 3.5 with 17.7 million parameters at a run time of 6 seconds per epoch.


## Best Model
[alt text](https://)

Our best ResNet model has a similar structure to the GoogleNet model, and its performance is also similar. The model plot is shown above. Interestingly, while we had success applying normalization and dropout in the AlexNet models, applying dropout and normalization made this particular ResNet model significantly worse (Kaggle RMSE increased from 3.25 to 4.25).

This model has a Kaggle score of 3.25 with 18 million parameters at a run time of 17 seconds per epoch.


## Define the Residual Module
We define the module and run a simple model with a single residual module to make sure that it performs as expected. The model plot below confirms this.

In [0]:
# function for creating an identity or projection residual module
def residual_module(layer_in, n_filters):
	merge_input = layer_in
	# check if the number of filters needs to be increase, assumes channels last format
	if layer_in.shape[-1] != n_filters:
		merge_input = Conv2D(n_filters, (1,1), padding='same', activation='relu')(layer_in)

	conv1 = Conv2D(n_filters, (3,3), padding='same', activation='relu')(layer_in)
	
	conv2 = Conv2D(n_filters, (3,3), padding='same', activation='linear')(conv1)
	# add filters, assumes filters/channels last
	layer_out = add([conv2, merge_input])
	# activation function
	layer_out = Activation('relu')(layer_out)
	return layer_out

In [0]:
# Sanity-check model: a single residual module on the raw 96x96x1 input,
# flattened into a 30-unit linear output.
# input_layer is reused by the model cells further down in the notebook.
input_layer = Input(shape=(96, 96, 1))

layer = residual_module(input_layer, 64)
output = Flatten()(layer)
out = Dense(30, activation='linear')(output)
# create model
res_model = Model(inputs=input_layer, outputs=out)
# summarize model
res_model.summary()
# plot model architecture
# Fix: plot the model built in this cell. The original passed `model`, a name
# this cell never defines, so it either failed or plotted an unrelated model.
plot_model(res_model, show_shapes=True, to_file='residual_module.png')

# SGD with momentum; the decay handed to the optimizer is lrate / epochs.
epochs = 100
lrate = 0.01
decay = lrate/epochs

sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
res_model.compile(loss='mean_squared_error', optimizer=sgd, metrics=['mean_squared_error'])
res_model.fit(X.astype(np.float32), yn.astype(np.float32), validation_split=0.15, epochs=epochs, batch_size=64)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           (None, 96, 96, 1)    0                                            
__________________________________________________________________________________________________
conv2d_40 (Conv2D)              (None, 96, 96, 64)   640         input_13[0][0]                   
__________________________________________________________________________________________________
conv2d_41 (Conv2D)              (None, 96, 96, 64)   36928       conv2d_40[0][0]                  
__________________________________________________________________________________________________
add_16 (Add)                    (None, 96, 96, 64)   0           conv2d_41[0][0]                  
                                                                 input_13[0][0]                   
__________

<keras.callbacks.History at 0x7f8d985a99e8>

## ResNet Model with 2 layers of convolution and 1 layer of pooling before the 1 residual module
The Kaggle RMSE is 3.5. The parameter count is 17.7 million. The run time is 6 seconds per epoch.

In [0]:
# Base ResNet experiment: one residual module feeding a 30-unit linear output.
#
# NOTE(review): the Conv2D/Conv2D/MaxPool2D stem built in the next three lines
# is never connected to the model. `residual_module` below is called with
# `input_layer`, and `layer` is then rebound to its result, so the stem is not
# on the path from `input_layer` to `out`. If the stem was meant to precede
# the residual module, the call should take `layer` instead. That would change
# the architecture and the recorded results, so confirm before changing.
layer = Conv2D(64, (1, 1), padding='same', strides=(1, 1), activation='relu')(input_layer)
layer = Conv2D(192, (3, 3), padding='same', strides=(1, 1), activation='relu')(layer)
layer = MaxPool2D((3, 3), padding='same', strides=(2, 2))(layer)

# Residual module applied directly to the network input (see note above).
layer = residual_module(input_layer, 64)
output = Flatten()(layer)
out = Dense(30, activation='linear')(output)

single_res_model = Model(inputs = input_layer, outputs = out)
# summary() prints the table itself; the surrounding print() emits its return value.
print(single_res_model.summary())

# SGD with momentum; the decay handed to the optimizer is lrate / epochs.
epochs = 200
lrate = 0.01
decay = lrate/epochs

sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
single_res_model.compile(loss='mean_squared_error', optimizer=sgd, metrics=['mean_squared_error'])
# 15% of the samples are held out for validation.
single_res_model.fit(X.astype(np.float32), yn.astype(np.float32), validation_split=0.15, epochs=epochs, batch_size=32)
# make_subm is defined elsewhere in the notebook; presumably it writes the
# Kaggle submission for this model under the given tag (verify there).
make_subm(single_res_model, '83_no_flip_1res_3conv_64_sgd_200ep_linear')
plot_model(single_res_model, show_shapes=True, to_file='/content/drive/My Drive/School/UC Berkeley MIDS/'\
           'W207 Applied Machine Learning/Final Project/facial-keypoints-detection/1_res_64_3conv_linear_module.png')

W0804 04:48:21.042495 139727688898432 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0804 04:48:21.102471 139727688898432 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.

W0804 04:48:21.205422 139727688898432 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 96, 96, 1)    0                                            
__________________________________________________________________________________________________
conv2d_4 (Conv2D)               (None, 96, 96, 64)   640         input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_5 (Conv2D)               (None, 96, 96, 64)   36928       conv2d_4[0][0]                   
__________________________________________________________________________________________________
conv2d_3 (Conv2D)               (None, 96, 96, 64)   128         input_1[0][0]                    
__________________________________________________________________________________________________
add_1 (Add

W0804 04:48:21.439621 139727688898432 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.

W0804 04:48:21.486155 139727688898432 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:973: The name tf.assign is deprecated. Please use tf.compat.v1.assign instead.



Train on 1819 samples, validate on 321 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200


## Best ResNet Model with 2 layers of convolution and 1 layer of pooling before the 2 residual modules
While the Kaggle score of 3.25 is comparable to that of the Inception net, this model is computationally intensive, with 18 million parameters and a run time of 17 seconds per epoch.

In [12]:
# Best ResNet experiment: two stacked residual modules (128 then 64 filters)
# feeding a 30-unit linear output.
#
# NOTE(review): neither convolution/pooling stack in this cell is connected
# to the model. `layer` (the first stem) is never used afterwards, and
# `layer2a` (the second stack) is never used either: the second residual
# module takes `layer2b`, the output of the first residual module, which in
# turn takes `input_layer`. If the stacks were meant to be part of the
# network, the residual modules should take `layer` and `layer2a`. That would
# change the architecture and the recorded results, so confirm before changing.
layer = Conv2D(64, (1, 1), padding='same', strides=(1, 1), activation='relu')(input_layer)
layer = Conv2D(192, (3, 3), padding='same', strides=(1, 1), activation='relu')(layer)
layer = MaxPool2D((3, 3), padding='same', strides=(2, 2))(layer)

# First residual module, applied directly to the network input.
layer2b = residual_module(input_layer, 128)

# Second conv/pool stack; its result is not consumed below (see note above).
layer2a = Conv2D(64, (1, 1), padding='same', strides=(1, 1), activation='relu')(layer2b)
layer2a = Conv2D(192, (3, 3), padding='same', strides=(1, 1), activation='relu')(layer2a)
layer2a = MaxPool2D((3, 3), padding='same', strides=(2, 2))(layer2a)

# Second residual module, applied to the first module's output.
output = residual_module(layer2b, 64)

output = Flatten()(output)
out = Dense(30, activation='linear')(output)

two_res_model = Model(inputs = input_layer, outputs = out)
# summary() prints the table itself; the surrounding print() emits its return value.
print(two_res_model.summary())

# SGD with momentum; the decay handed to the optimizer is lrate / epochs.
epochs = 200
lrate = 0.01
decay = lrate/epochs

sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
two_res_model.compile(loss='mean_squared_error', optimizer=sgd, metrics=['mean_squared_error'])
# 15% of the samples are held out for validation.
two_res_model.fit(X.astype(np.float32), yn.astype(np.float32), validation_split=0.15, epochs=epochs, batch_size=32)
# make_subm is defined elsewhere in the notebook; presumably it writes the
# Kaggle submission for this model under the given tag (verify there).
make_subm(two_res_model, '84_no_flip_2res_3conv_64_sgd_200ep_')
plot_model(two_res_model, show_shapes=True, to_file='/content/drive/My Drive/School/UC Berkeley MIDS/'\
           'W207 Applied Machine Learning/Final Project/facial-keypoints-detection/2_res_64_3conv_module.png')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 96, 96, 1)    0                                            
__________________________________________________________________________________________________
conv2d_32 (Conv2D)              (None, 96, 96, 128)  1280        input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_33 (Conv2D)              (None, 96, 96, 128)  147584      conv2d_32[0][0]                  
__________________________________________________________________________________________________
conv2d_31 (Conv2D)              (None, 96, 96, 128)  256         input_1[0][0]                    
__________________________________________________________________________________________________
add_7 (Add

## Other Models
Interestingly, adding BatchNormalization and Dropout significantly decreased model performance. This model has a Kaggle score of 4.25 with 18 million parameters and a run time of 7 seconds per epoch.

In [11]:
# ResNet variant with BatchNormalization and Dropout after the first residual
# module, followed by a second residual module and a 30-unit linear output.
# This cell rebinds the name two_res_model.
#
# NOTE(review): neither convolution/pooling stack in this cell is connected
# to the model. `layer` (the first stem) is never used afterwards, and
# `layer2a` is never used either: the second residual module takes `layer2b`.
# If the stacks were meant to be part of the network, the residual modules
# should take `layer` and `layer2a`. That would change the architecture and
# the recorded results, so confirm before changing.
layer = Conv2D(64, (1, 1), padding='same', strides=(1, 1), activation='relu')(input_layer)
layer = Conv2D(192, (3, 3), padding='same', strides=(1, 1), activation='relu')(layer)
layer = MaxPool2D((3, 3), padding='same', strides=(2, 2))(layer)

# First residual module on the network input, then BatchNorm and Dropout.
layer2b = residual_module(input_layer, 64)
layer2b = BatchNormalization()(layer2b)
layer2b = Dropout(0.5)(layer2b)

# Second conv/pool stack; its result is not consumed below (see note above).
layer2a = Conv2D(64, (1, 1), padding='same', strides=(1, 1), activation='relu')(layer2b)
layer2a = Conv2D(192, (3, 3), padding='same', strides=(1, 1), activation='relu')(layer2a)
layer2a = MaxPool2D((3, 3), padding='same', strides=(2, 2))(layer2a)

# Second residual module, applied to the normalized/dropped-out first module.
output = residual_module(layer2b, 64)

output = Flatten()(output)
out = Dense(30, activation='linear')(output)

two_res_model = Model(inputs = input_layer, outputs = out)
# summary() prints the table itself; the surrounding print() emits its return value.
print(two_res_model.summary())

# SGD with momentum; the decay handed to the optimizer is lrate / epochs.
epochs = 200
lrate = 0.01
decay = lrate/epochs

sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
two_res_model.compile(loss='mean_squared_error', optimizer=sgd, metrics=['mean_squared_error'])
# 15% of the samples are held out for validation.
two_res_model.fit(X.astype(np.float32), yn.astype(np.float32), validation_split=0.15, epochs=epochs, batch_size=32)
# make_subm is defined elsewhere in the notebook; presumably it writes the
# Kaggle submission for this model under the given tag (verify there).
make_subm(two_res_model, '84_no_flip_2res_3conv_64_sgd_200ep_BN_Dropout_linear')
plot_model(two_res_model, show_shapes=True, to_file='/content/drive/My Drive/School/UC Berkeley MIDS/'\
           'W207 Applied Machine Learning/Final Project/facial-keypoints-detection/2_res_64_3conv_BN_Dropout_module.png')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 96, 96, 1)    0                                            
__________________________________________________________________________________________________
conv2d_23 (Conv2D)              (None, 96, 96, 64)   640         input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_24 (Conv2D)              (None, 96, 96, 64)   36928       conv2d_23[0][0]                  
__________________________________________________________________________________________________
conv2d_22 (Conv2D)              (None, 96, 96, 64)   128         input_1[0][0]                    
__________________________________________________________________________________________________
add_5 (Add

# MobileNet Study
MobileNet is a class of efficient models targeted at mobile and embedded vision applications. MobileNets are based on a streamlined architecture that uses depthwise separable convolutions to build lightweight deep neural networks.

## Background
![alt text](https://)

A regular convolution combines the values of all the input channels into one output channel per pixel, as shown above.

![alt text](https://)

MobileNets perform depthwise convolution followed by pointwise convolution. For a three-channel input, depthwise convolution creates an output image that also has three channels, because each channel gets its own set of weights. The purpose of the depthwise convolution is to filter the input channels (for example, edge detection or color filtering).

![alt text](https://)

Pointwise convolution is the same as a regular convolution but with a 1×1 kernel. Compared to regular convolution, depthwise separable convolution achieves similar end results while saving a lot of computation, because a regular convolution needs to do more work and learn more weights.

## Modeling
Since MobileNet performs depthwise separable convolution, it requires the data to have 3 channels. Our data is grayscale and has only 1 channel. To fix this, we filled the other two channels with the values from the single channel.

We started out with a very simple model using BatchNormalization, Pooling, and Dropout, which does surprisingly well. We experimented with different Dropout rates as well as optimizers and found that the following simple model performs the best.

## Results
A simple model with BatchNormalization, Pooling, and Dropout, as shown below, produces an impressive Kaggle score of 2.6 RMSE. 


In [0]:
# MobileNet backbone without its classification top, wrapped in a small
# regression head: BatchNorm -> MobileNet -> BatchNorm -> global average
# pooling -> Dropout -> 30-unit linear output.
base_mobile_model = MobileNet(input_shape=(96, 96, 3), include_top=False)
mobilenet_model = Sequential()
mobilenet_model.add(BatchNormalization(input_shape=(96, 96, 3)))
mobilenet_model.add(base_mobile_model)
mobilenet_model.add(BatchNormalization())
mobilenet_model.add(GlobalAveragePooling2D())
mobilenet_model.add(Dropout(0.5))
mobilenet_model.add(Dense(30, activation='linear'))

# The model takes 3-channel input, so copy the single grayscale channel into
# all three channels. One broadcast assignment replaces the per-image loop.
new_X = np.zeros((len(X), 96, 96, 3))
print(new_X.shape)
new_X[...] = np.asarray(X).reshape(len(X), 96, 96, 1)

# The model is trained with Adam for 200 epochs. The original cell also built
# an SGD optimizer that was never passed to compile() and set epochs = 300
# while fit() hard-coded 200; both are removed so the cell states what runs.
epochs = 200
mobilenet_model.compile(optimizer='adam', loss='mse', metrics=['mean_squared_error'])

mobilenet_model.summary()
# 15% of the samples are held out for validation.
mobilenet_model.fit(new_X.astype(np.float32), yn.astype(np.float32), validation_split=0.15, epochs=epochs)


W0804 07:18:00.509545 139727688898432 nn_ops.py:4224] Large dropout rate: 0.7 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


(2140, 96, 96, 3)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_62 (Batc (None, 96, 96, 3)         12        
_________________________________________________________________
mobilenet_1.00_224 (Model)   (None, 3, 3, 1024)        3228864   
_________________________________________________________________
batch_normalization_63 (Batc (None, 3, 3, 1024)        4096      
_________________________________________________________________
global_average_pooling2d_30  (None, 1024)              0         
_________________________________________________________________
dropout_30 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_31 (Dense)             (None, 30)                30750     
Total params: 3,263,722
Trainable params: 3,239,780
Non-trainable params: 23,942
___________________________________________

<keras.callbacks.History at 0x7f13cd8ee828>

In [0]:
# This section processes the input channel data so it can be passed to the
# MobileNet. It also plots the model and saves the predictions in Kaggle
# format so they can be submitted for scoring.

# All inputs and outputs of this cell live in the same Drive folder.
data_dir = ('/content/drive/My Drive/School/UC Berkeley MIDS/'
            'W207 Applied Machine Learning/Final Project/facial-keypoints-detection/')

test_df = pd.read_csv(data_dir + 'test.csv')
# 'Image' holds space-separated pixel values; parse them and divide by 255.0.
test_df['Image'] = test_df['Image'].map(lambda x: np.array(x.split()).astype(int) / 255.0)
# Output columns, in the order of the model's 30 outputs.
keypoints = ['left_eye_center_x', 'left_eye_center_y', 'right_eye_center_x', 'right_eye_center_y', 'left_eye_inner_corner_x', 'left_eye_inner_corner_y',
             'left_eye_outer_corner_x', 'left_eye_outer_corner_y', 'right_eye_inner_corner_x', 'right_eye_inner_corner_y', 'right_eye_outer_corner_x',
             'right_eye_outer_corner_y', 'left_eyebrow_inner_end_x', 'left_eyebrow_inner_end_y', 'left_eyebrow_outer_end_x', 'left_eyebrow_outer_end_y',
             'right_eyebrow_inner_end_x', 'right_eyebrow_inner_end_y', 'right_eyebrow_outer_end_x', 'right_eyebrow_outer_end_y', 'nose_tip_x', 'nose_tip_y',
             'mouth_left_corner_x', 'mouth_left_corner_y', 'mouth_right_corner_x', 'mouth_right_corner_y', 'mouth_center_top_lip_x', 'mouth_center_top_lip_y',
             'mouth_center_bottom_lip_x', 'mouth_center_bottom_lip_y']

# Reshape every image to 96x96x1, then copy the single channel into 3 channels.
test_image = np.array([x.reshape(96, 96, 1) for x in test_df['Image']])
new_test_image = np.repeat(test_image, 3, axis=-1)

predictions = mobilenet_model.predict(new_test_image)
# Map the model output back to pixel coordinates and clamp to the image.
# NOTE(review): presumably the inverse of the target scaling used to build
# `yn`, i.e. yn = (y - 48) / 48. Confirm against the data-preparation cell.
predictions = (predictions + 1) * 48
np.clip(predictions, 0, 96, out=predictions)

# One column per keypoint, next to the test rows. The default positional
# index is kept on purpose: row k belongs to ImageId k + 1. The original
# called set_index('ImageId') here and discarded the result, which had no
# effect, so that call is dropped.
predictions_df = pd.concat([test_df, pd.DataFrame(predictions, columns=keypoints)], axis=1)

result_df = pd.read_csv(data_dir + 'IdLookupTable.csv')

# Fill 'Location' for every (ImageId, FeatureName) pair in one vectorized
# lookup instead of a Python loop over the rows.
column_of = {name: col for col, name in enumerate(keypoints)}
row_idx = result_df['ImageId'].values - 1
col_idx = result_df['FeatureName'].map(column_of).values
# Cast to float64 so the CSV is formatted exactly as with element-wise
# assignment into the float64 'Location' column.
result_df['Location'] = predictions[row_idx, col_idx].astype(np.float64)

# Extract relevant columns to produce final submission file.
submission_df = result_df[['RowId', 'Location']]
# Save submission file to Google Drive.
submission_df.to_csv(data_dir + 'submission_83_no_flip_mobile_adam_200ep_dropoutp7.csv', index=False)
plot_model(mobilenet_model, show_shapes=True, to_file=data_dir + 'mobile_linear_module.png')