# Neural Net Analysis Notebook
## W207 Final Project
### T. P. Goter
### July 6, 2019

This workbook is used to assess various models created as part of the Facial Keypoint Detection project for W207.

In [19]:
# Import the packages we need
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import ipywidgets as widgets
from ipywidgets import interact, interact_manual, fixed
import os

In [20]:
# Load the pkled dataframe for the baseline single layer neural net
bl_sl_df = pd.read_pickle("OutputData/single_layer_df.pkl")
bl_sl_df['cum_times'] = bl_sl_df.groupby(['hunits', 'activation', 'optimizer', 'lrate']).times.cumsum()

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,hunits,activation,optimizer,lrate
320,2.036483,2.036483,32.099414,32.099411,320,1.427054,5.665634,0.480469,200,relu,adam,0.001
31,11.933042,11.933043,11.3236,11.323601,31,3.454424,3.365056,0.794,200,sigmoid,adagrad,0.01
242,10.228678,10.228679,9.917707,9.917706,242,3.198231,3.149239,0.372951,150,tanh,adam,0.001
269,9.929816,9.929816,9.6317,9.631701,269,3.151161,3.103498,0.543546,150,sigmoid,adagrad,0.01
251,10.225569,10.22557,9.91482,9.91482,251,3.197744,3.148781,1.192174,200,sigmoid,nadam,0.002
179,14.142279,14.142282,44.739003,44.739006,179,3.760622,6.688722,0.196416,50,relu,adam,0.001
115,10.271638,10.271636,9.937835,9.937836,115,3.204939,3.152433,0.268572,100,sigmoid,sgd,0.01
70,10.194059,10.19406,9.886653,9.886653,70,3.192814,3.144305,0.190728,50,tanh,adam,0.001
95,16.051545,16.051546,42.511925,42.511925,95,4.006438,6.520117,0.385449,150,relu,adam,0.001
191,10.229621,10.229621,9.928338,9.928338,191,3.198378,3.150927,0.477043,200,tanh,adam,0.001


In [86]:
bl_sl_df[bl_sl_df.optimizer == 'sgd']

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,hunits,activation,optimizer,lrate,cum_times
0,28010.575914,28010.572266,2291.178069,2291.178223,0,167.363593,47.866253,0.265849,50,relu,sgd,0.01,0.265849
1,1619.893235,1619.893311,1050.184416,1050.184448,1,40.247898,32.406549,0.166498,50,relu,sgd,0.01,0.432347
2,731.095874,731.095886,472.246410,472.246399,2,27.038785,21.731231,0.166131,50,relu,sgd,0.01,0.598478
3,330.953334,330.953400,215.345170,215.345169,3,18.192125,14.674644,0.165214,50,relu,sgd,0.01,0.763692
4,152.905530,152.905533,101.042233,101.042221,4,12.365498,10.051976,0.192475,50,relu,sgd,0.01,0.956167
5,73.695376,73.695374,50.227416,50.227413,5,8.584601,7.087130,0.173594,50,relu,sgd,0.01,1.129761
6,38.447532,38.447533,27.734740,27.734743,6,6.200607,5.266379,0.179937,50,relu,sgd,0.01,1.309698
7,22.766007,22.766003,17.744032,17.744032,7,4.771373,4.212367,0.197338,50,relu,sgd,0.01,1.507036
8,15.785918,15.785920,13.324637,13.324637,8,3.973150,3.650293,0.196654,50,relu,sgd,0.01,1.703690
9,12.676719,12.676719,11.381038,11.381038,9,3.560438,3.373579,0.197424,50,relu,sgd,0.01,1.901114


In [80]:
# Create a plotting function to pass to the interact widget function
def plot_validation_loss(df=bl_sl_df, optimizer = bl_sl_df.optimizer.unique(), 
                    activation = bl_sl_df.activation.unique(), 
                         ymax=[y for y in range(0,100)],
                        timemax=250):
    
    # Subset the baseline df by the specified optimizer and activation
    sub_df = df[df.optimizer.str.match(optimizer)]
    sub_df = sub_df[sub_df.activation.str.match(activation)]
     
    # Group the neural net data by optimizer and activation
    groups = sub_df.groupby(['hunits'])
    fig, axes = plt.subplots(2, 2, figsize=(20, 20))
    axes = axes.flatten()
    
    # Loop over the grouped data and plot out epoch timing and validation loss data
    for name, group in groups:
        
        # Plot training and validation losses by epoch
        axes[0].plot(group.epoch, group.val_RMSE, label=str(name)+' Validation Loss')
        axes[0].scatter(group.epoch, group.RMSE, label=str(name)+' Training Loss')
        axes[0].set_ylim([0,ymax])
        axes[0].set_xlabel('Epoch')
        axes[0].set_ylabel('Root Mean Square Error')
        axes[0].set_title("{} Optimizer and {} Activation".format(group.optimizer.unique(), group.activation.unique()))
        
        # Plot train time by epoch
        axes[1].scatter(group.epoch, group.times*1000, label=str(name)+' Fit Time')
        axes[1].set_xlabel('Epoch')
        axes[1].set_ylabel('Fit Time (milliseconds)')
        axes[1].set_ylim([0,1000])
        axes[1].legend()
        
        # Plot cumulative training time
        axes[2].plot(group.epoch, group.cum_times, label=str(name)+' Fit Time (s)', lw=4)
        axes[2].set_xlabel('Epoch')
        axes[2].set_ylabel('Cumulative Fit Time (seconds)')
        axes[2].grid(b=True)
        axes[2].set_ylim([0,timemax])
        axes[2].legend()
        
        # Plot cumulative validation loss by cumulative time
        axes[3].plot(group.cum_times, group.val_RMSE, label=str(name)+' Validation Loss', lw=3)
        axes[3].set_xlabel('Cumulative Fit Time (seconds)')
        axes[3].set_ylabel('Validation RMSE')
        axes[3].set_xlim([0,timemax])
        axes[3].set_ylim([0,ymax])
        axes[3].legend()
    
    # Add line for knr score
    axes[0].axhline(2.49, label='kNR Score', lw=5, c='k')
    axes[0].legend()
    
    # Adjust the spacing of the subplots
    fig.subplots_adjust(left=0.03, right=0.97, hspace=0.1, wspace=0.15)

    # Add an overarching title for these plots
    fig.suptitle("Performance Comparison for Single Layer, Fully Connected Neural Nets",
                 fontsize=18, y=0.93)



In [82]:
interact_manual(plot_validation_loss, df=fixed(bl_sl_df), 
                optimizer = bl_sl_df.optimizer.unique(), 
                    activation = bl_sl_df.activation.unique(), ymax=10, timemax=250)
print()

interactive(children=(Dropdown(description='optimizer', options=('adam', 'sgd', 'nadam', 'adagrad'), value='ad…




### Assessment of Baseline Results

Now that we have some real results, we can make some real assessments of what is working and what is not.

1. Overall the sgd optimizer seems to be working the best. There is not a large difference between training and validation accuracies. In other words we aren't overfitting to the data.
2. SGD is also faster than the other optimizers. It is relatively close to Adam. NAdam on the other hand is twice as slow.
3. No neural net does better than our kNR model

In the evaluation above, both the output layer used a linear activation function while the hidden layer used a user specified optimizer. 

### Single Layer Assessment
Based on the three different experiments run, we will use a linear final layer activation function. We will continue to assess sgd and adam optimizers. We likely do not need to train for more than 200 epochs or so to get reasonably converged nets.
## Need to redo learning rate study

In [5]:
# Load the pkled dataframe for the baseline single layer neural net
relu_sl_lr_df = pd.read_pickle("OutputData/single_layer_relu_lr_df.pkl")
relu_sl_lr_df.sample(10)

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,hunits,activation,optimizer,lrate
166,585.102651,585.1026,602.217882,602.217896,166,24.188894,24.540128,0.777923,200,relu,adagrad_02,0.02
197,101.179164,101.179169,121.990278,121.990288,197,10.058786,11.044921,0.195769,50,relu,adam_005,0.005
190,1382.132048,1382.13208,1389.458887,1389.458862,190,37.177037,37.275446,0.561554,150,relu,adagrad_005,0.05
184,84.662256,84.662247,116.007755,116.007736,184,9.201209,10.770689,0.490967,200,relu,adam_0005,0.0005
154,154.702597,154.702591,174.748838,174.748825,154,12.43795,13.21926,0.468345,200,relu,adam_005,0.005
121,293.815909,293.815887,292.408084,292.408112,121,17.141059,17.099945,0.259666,50,tanh,adagrad_02,0.02
62,360.313932,360.313965,370.613009,370.613007,62,18.981938,19.251312,0.271495,50,relu,adagrad_005,0.05
60,244.381297,244.381256,244.194078,244.194077,60,15.632698,15.62671,0.247615,50,tanh,adagrad_005,0.05
115,87.490673,87.490669,87.13756,87.137558,115,9.353645,9.33475,0.756261,200,tanh,adagrad_005,0.05
60,1393.413407,1393.413452,1396.197367,1396.197388,60,37.328454,37.365725,0.543673,150,relu,adagrad_005,0.05


In [6]:
# Plot the relu learning rate data
interact_manual(plot_validation_loss, df=fixed(relu_sl_lr_df), 
                optimizer = relu_sl_lr_df.optimizer.unique(), 
                    activation = relu_sl_lr_df.activation.unique())
print()

interactive(children=(Dropdown(description='optimizer', options=('adam_005', 'adam_0005', 'adagrad_02', 'adagr…




## Two Layer Results


In [109]:
# Load the pkled dataframe for the baseline single layer neural net
tl_df = pd.read_pickle("OutputData/two_layer_df.pkl")
tl_df['cum_times'] = tl_df.groupby(['hunits', 'activation', 'optimizer', 'lrate']).times.cumsum()
tl_df.sample(10)

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,hunits,activation,optimizer,lrate,cum_times
145,,,,,145,,,0.398503,150,elu,sgd,0.01,61.407666
38,,,,,38,,,0.395313,150,relu,sgd,0.01,16.632931
95,,,,,95,,,0.416388,100,relu,sgd,0.01,41.205356
253,3.393705,3.393706,32.006817,32.006821,253,1.842201,5.657457,0.471379,50,relu,adam,0.001,129.203526
198,1.144119,1.144119,3.683345,3.683345,198,1.069635,1.919204,0.501398,100,elu,adam,0.001,99.629387
225,2.872493,2.872493,29.48744,29.487438,225,1.694843,5.430234,0.474248,100,relu,adam,0.001,112.383048
159,5.280693,5.280693,34.801313,34.801315,159,2.297976,5.899264,0.470124,150,relu,adam,0.001,76.343897
158,5.132459,5.132459,33.683566,33.683567,158,2.265493,5.803755,0.533842,100,relu,adam,0.001,78.727865
158,5.469104,5.469104,35.034632,35.034634,158,2.338612,5.919006,0.518651,50,relu,adam,0.001,81.698643
2,112.388315,112.388298,108.367533,108.367531,2,10.601335,10.409973,0.504838,50,relu,adam,0.001,1.834678


In [111]:
# Plot the relu learning rate data
interact_manual(plot_validation_loss, df=fixed(tl_df), 
                optimizer = tl_df.optimizer.unique(), 
                    activation = tl_df.activation.unique(),
               ymax=10,
               timemax=250)
print()

interactive(children=(Dropdown(description='optimizer', options=('adam', 'sgd'), value='adam'), Dropdown(descr…




# CNN Data below

## Baseline CNN

The baseline CNN was structured following Nuori blog on this Facial Keypoint Detection topic. It uses a CNN architecture of the following:

1. Input Layer
2. Convolution Layer with valid padding and add N filters
3. Max Pooling with 2,2 kernel
4. Repeat Steps 2 and 3, doubling the number of filters each time you convolve
5. After three full layers - flatten and condense to 500 units
6. 500 --> 500 fully connected layer
7. Linear activation to 30 output nodes

For our Baseline CNN we have chosen to use the relu activation and not use dropout layers. We train only on the non-nan data. And we have identified many follow on studies including:


1. *Model for each keypoint* - Some keypoints have a lot more training data than others. We can maximize the use of our training data by building separate specialized models for each keypoint.
2. *Transform images* - We can also expand our training data by applying transformations to our training images. We can do things like flipping on the x and y axes or rotating our images. We now will have a different feature matrix. The trick will be to also transform our keypoint locations appropriately.
3. *Filters* - Investigate different filter strategies - the 32, 64, 128 strategy employed is expensive. Can we reduce the number of filters and still get good results
4. *Drop Out* - Drop out layers can act as a means of regularization - If we notice overfitting we can try this as a correction mechanism
5. *Stride v Pool* - Evaluate time and accuracy differences between pooling and using a stride
6. *TPU Usage* - Identify when and how to best utilize TPU resources

In [88]:
# Load the pkled dataframe for the baseline cnn
cnn_df = pd.read_pickle("OutputData/cnn_base_df.pkl")
cnn_df['cum_times'] = cnn_df.groupby(['optimizer']).times.cumsum()

800

In [105]:
cnn_df.sample(20)

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,layers,pooling,fc_layer,activation,optimizer,lrate,cum_times
127,2.437129,2.437129,3.216901,3.216901,127,1.561131,1.793572,9.558447,3,yes,500,relu,sgd,0.01,1202.983716
133,1.150528,1.150528,3.033182,3.033182,133,1.072627,1.741603,9.299843,3,yes,500,relu,adam,0.001,1275.841142
0,1451.824655,1451.824585,16.243768,16.243769,0,38.102816,4.030356,10.350786,3,yes,500,relu,nadam,0.002,10.350786
112,1.604778,1.604778,3.185822,3.185822,112,1.266798,1.784887,9.531875,3,yes,500,relu,adam,0.001,1076.675367
60,9.132258,9.132257,9.277314,9.277315,60,3.021962,3.045869,9.517048,3,yes,500,relu,adam,0.001,582.320275
38,10.204178,10.204181,9.677074,9.677074,38,3.194398,3.1108,9.506696,3,yes,500,relu,adagrad,0.01,372.32042
194,1.25724,1.25724,2.992142,2.992142,194,1.121267,1.729781,9.458734,3,yes,500,relu,sgd,0.01,1838.008554
93,4.646559,4.646559,4.644593,4.644594,93,2.155588,2.155132,9.434961,3,yes,500,relu,adagrad,0.01,890.737207
39,9.261003,9.261003,8.998551,8.998552,39,3.04319,2.999759,9.269853,3,yes,500,relu,sgd,0.01,377.735415
99,3.966604,3.966604,4.257668,4.257668,99,1.991633,2.063412,9.37502,3,yes,500,relu,adagrad,0.01,947.403977


# create a plotting function for our cnn data

In [102]:
# Create a plotting function to pass to the interact widget function
def plot_cnn_data(df=cnn_df, optimizer = cnn_df.optimizer.unique(), 
                    activation = cnn_df.activation.unique(), 
                         ymax=100,
                        timemax=1000):
        
    # Group the neural net data by optimizer
    groups = df.groupby(['optimizer'])
    fig, axes = plt.subplots(2, 2, figsize=(20, 20))
    axes = axes.flatten()
    
    # Loop over the grouped data and plot out epoch timing and validation loss data
    for name, group in groups:
        
        # Plot training and validation losses by epoch
        axes[0].plot(group.epoch, group.val_RMSE, label=str(name)+' Validation Loss')
        axes[0].scatter(group.epoch, group.RMSE, label=str(name)+' Training Loss')
        axes[0].set_ylim([0,ymax])
        axes[0].set_xlabel('Epoch')
        axes[0].set_ylabel('Root Mean Square Error')
        axes[0].set_title("{} Optimizer and {} Activation".format(group.optimizer.unique(), group.activation.unique()))
        
        # Plot train time by epoch
        axes[1].scatter(group.epoch, group.times, label=str(name)+' Fit Time')
        axes[1].set_xlabel('Epoch')
        axes[1].set_ylabel('Fit Time (seconds)')
        axes[1].set_ylim([0,timemax/group.epoch.max()])
        axes[1].legend()
        
        # Plot cumulative training time
        axes[2].plot(group.epoch, group.cum_times, label=str(name)+' Fit Time (s)', lw=4)
        axes[2].set_xlabel('Epoch')
        axes[2].set_ylabel('Cumulative Fit Time (seconds)')
        axes[2].grid(b=True)
        axes[2].set_ylim([0,timemax])
        axes[2].legend()
        
        # Plot cumulative validation loss by cumulative time
        axes[3].plot(group.cum_times, group.val_RMSE, label=str(name)+' Validation Loss', lw=3)
        axes[3].set_xlabel('Cumulative Fit Time (seconds)')
        axes[3].set_ylabel('Validation RMSE')
        axes[3].set_xlim([0,timemax])
        axes[3].set_ylim([0,ymax])
        axes[3].legend()
    
    # Add line for knr score
    axes[0].axhline(2.49, label='kNR Score', lw=5, c='k')
    axes[0].legend()
    
    # Adjust the spacing of the subplots
    fig.subplots_adjust(left=0.03, right=0.97, hspace=0.1, wspace=0.15)

    # Add an overarching title for these plots
    fig.suptitle("Performance Comparison for Baseline Convolutional Neural Nets",
                 fontsize=18, y=0.93)

In [103]:
# Plot the relu learning rate data
interact_manual(plot_cnn_data, df=fixed(cnn_df), 
                optimizer = cnn_df.optimizer.unique(), 
                    activation = cnn_df.activation.unique(),
               ymax=10,
               timemax=1000)
print()

interactive(children=(Dropdown(description='optimizer', options=('adam', 'sgd', 'nadam', 'adagrad'), value='ad…




In [104]:
cnn_df.val_loss

0      16.774411
1      10.803520
2      10.233881
3      10.233680
4       9.985762
5      10.038510
6      10.134028
7      10.044329
8       9.996206
9      10.064538
10     10.101108
11     10.020350
12     10.060766
13     10.003001
14     10.061866
15     10.032402
16     10.131830
17      9.986675
18      9.988453
19     10.018970
20     10.146545
21     10.287867
22      9.998142
23     10.046421
24     10.002417
25      9.982548
26      9.980573
27     10.262111
28     10.083271
29     10.200400
         ...    
170     2.851715
171     2.914028
172     2.778235
173     2.907122
174     2.875822
175     2.778594
176     2.955646
177     2.977480
178     2.783652
179     2.916816
180     2.749964
181     2.981440
182     2.985107
183     2.804005
184     2.708213
185     3.157891
186     2.834607
187     2.745469
188     2.774565
189     2.847229
190     2.791091
191     2.746600
192     2.729275
193     2.836102
194     2.726668
195     2.776544
196     2.863327
197     2.7399

## Assessment of Initial CNN Results
