# Neural Net Analysis Notebook
## W207 Final Project
### T. P. Goter
### July 6, 2019

This workbook is used to assess various models created as part of the Facial Keypoint Detection project for W207.

In [43]:
# Import the packages we need
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import ipywidgets as widgets
from ipywidgets import interact, interact_manual, fixed
import os
from bokeh.transform import linear_cmap
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, push_notebook
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool

In [44]:
# Load the pickled results of the baseline single-layer neural net runs
bl_sl_df = pd.read_pickle("OutputData/single_layer_df.pkl")
# Accumulate the per-epoch fit times within each hyper-parameter combination
bl_sl_df['cum_times'] = (
    bl_sl_df
    .groupby(['hunits', 'activation', 'optimizer', 'lrate'])['times']
    .cumsum()
)

In [45]:
# Show only the rows produced with the SGD optimizer
bl_sl_df.loc[bl_sl_df['optimizer'] == 'sgd']

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,hunits,activation,optimizer,lrate,cum_times
0,28010.575914,28010.572266,2291.178069,2291.178223,0,167.363593,47.866253,0.265849,50,relu,sgd,0.01,0.265849
1,1619.893235,1619.893311,1050.184416,1050.184448,1,40.247898,32.406549,0.166498,50,relu,sgd,0.01,0.432347
2,731.095874,731.095886,472.246410,472.246399,2,27.038785,21.731231,0.166131,50,relu,sgd,0.01,0.598478
3,330.953334,330.953400,215.345170,215.345169,3,18.192125,14.674644,0.165214,50,relu,sgd,0.01,0.763692
4,152.905530,152.905533,101.042233,101.042221,4,12.365498,10.051976,0.192475,50,relu,sgd,0.01,0.956167
5,73.695376,73.695374,50.227416,50.227413,5,8.584601,7.087130,0.173594,50,relu,sgd,0.01,1.129761
6,38.447532,38.447533,27.734740,27.734743,6,6.200607,5.266379,0.179937,50,relu,sgd,0.01,1.309698
7,22.766007,22.766003,17.744032,17.744032,7,4.771373,4.212367,0.197338,50,relu,sgd,0.01,1.507036
8,15.785918,15.785920,13.324637,13.324637,8,3.973150,3.650293,0.196654,50,relu,sgd,0.01,1.703690
9,12.676719,12.676719,11.381038,11.381038,9,3.560438,3.373579,0.197424,50,relu,sgd,0.01,1.901114


In [46]:
# Create a plotting function to pass to the interact widget function
def plot_validation_loss(df=bl_sl_df,
                         optimizer=bl_sl_df.optimizer.unique(),
                         activation=bl_sl_df.activation.unique(),
                         ymax=[y for y in range(0,100)],
                         timemax=250):
    '''
    Draw a 2x2 grid of error and fit-time curves, one curve per hidden-unit count.

    The defaults for `optimizer`, `activation` and `ymax` are collections rather
    than single values: when this function is handed to `interact_manual` without
    those keywords, ipywidgets turns each default into a selection widget. When
    calling the function directly, pass a single value for each of them.

    Args:
        df: results frame with columns optimizer, activation, hunits, epoch,
            RMSE, val_RMSE, times and cum_times.
        optimizer: optimizer name to display.
        activation: activation name to display.
        ymax: upper y-limit of the two RMSE panels.
        timemax: upper limit (seconds) of the cumulative-time axes.
    '''
    # Keep only the rows of the selected optimizer and activation. Exact
    # comparison is used because a regex prefix match would also select any
    # other name that merely starts with the chosen one.
    sub_df = df[(df.optimizer == optimizer) & (df.activation == activation)]

    fig, axes = plt.subplots(2, 2, figsize=(20, 20))
    axes = axes.flatten()

    # One set of curves per hidden-layer size. Grouping by a scalar key (not a
    # one-element list) keeps the group name a plain value, so the legend
    # reads "50 ..." rather than "(50,) ...".
    for hunits, group in sub_df.groupby('hunits'):
        label = str(hunits)

        # Training and validation error by epoch
        axes[0].plot(group.epoch, group.val_RMSE, label=label+' Validation Loss')
        axes[0].scatter(group.epoch, group.RMSE, label=label+' Training Loss')

        # Fit time of each epoch, converted from seconds to milliseconds
        axes[1].scatter(group.epoch, group.times*1000, label=label+' Fit Time')

        # Cumulative fit time by epoch
        axes[2].plot(group.epoch, group.cum_times, label=label+' Fit Time (s)', lw=4)

        # Validation error against cumulative fit time
        axes[3].plot(group.cum_times, group.val_RMSE, label=label+' Validation Loss', lw=3)

    # Axis decoration does not depend on the group, so it is done once
    axes[0].set_ylim([0,ymax])
    axes[0].set_xlabel('Epoch')
    axes[0].set_ylabel('Root Mean Square Error')
    axes[0].set_title("{} Optimizer and {} Activation".format(optimizer, activation))

    axes[1].set_xlabel('Epoch')
    axes[1].set_ylabel('Fit Time (milliseconds)')
    axes[1].set_ylim([0,1000])

    axes[2].set_xlabel('Epoch')
    axes[2].set_ylabel('Cumulative Fit Time (seconds)')
    # Positional form: the `b=` keyword is no longer accepted by newer Matplotlib
    axes[2].grid(True)
    axes[2].set_ylim([0,timemax])

    axes[3].set_xlabel('Cumulative Fit Time (seconds)')
    axes[3].set_ylabel('Validation RMSE')
    axes[3].set_xlim([0,timemax])
    axes[3].set_ylim([0,ymax])

    # Only these three panels can be empty; skip their legends when nothing was drawn
    if not sub_df.empty:
        for ax in axes[1:]:
            ax.legend()

    # Add line for knr score
    axes[0].axhline(2.49, label='kNR Score', lw=5, c='k')
    axes[0].legend()

    # Adjust the spacing of the subplots
    fig.subplots_adjust(left=0.03, right=0.97, hspace=0.1, wspace=0.15)

    # Add an overarching title for these plots
    fig.suptitle("Performance Comparison for Single Layer, Fully Connected Neural Nets",
                 fontsize=18, y=0.93)



In [47]:
# Interactive view of the baseline single-layer results
interact_manual(
    plot_validation_loss,
    df=fixed(bl_sl_df),
    optimizer=bl_sl_df.optimizer.unique(),
    activation=bl_sl_df.activation.unique(),
    ymax=10,
    timemax=250
)
# Final statement is a bare print(), presumably so the call's return value is not echoed
print()

interactive(children=(Dropdown(description='optimizer', options=('adam', 'sgd', 'nadam', 'adagrad'), value='ad…




### Assessment of Baseline Results

Now that we have some real results, we can make some real assessments of what is working and what is not.

1. Overall the sgd optimizer seems to be working the best. There is not a large difference between training and validation accuracies. In other words we aren't overfitting to the data.
2. SGD is also faster than the other optimizers. It is relatively close to Adam. NAdam on the other hand is twice as slow.
3. No neural net does better than our kNR model

In the evaluation above, the output layer used a linear activation function while the hidden layer used a user-specified activation function.

### Single Layer Assessment
Based on the three different experiments run, we will use a linear final layer activation function. We will continue to assess sgd and adam optimizers. We likely do not need to train for more than 200 epochs or so to get reasonably converged nets.

The study below was conducted to determine the effects of learning rate on convergence and accuracy


In [48]:
# Load the pickled results of the single-layer learning-rate study
relu_sl_lr_df = pd.read_pickle("OutputData/single_layer_lr_df.pkl")
# Accumulate the per-epoch fit times within each hyper-parameter combination
relu_sl_lr_df['cum_times'] = (
    relu_sl_lr_df
    .groupby(['hunits', 'activation', 'optimizer', 'lrate'])['times']
    .cumsum()
)
relu_sl_lr_df.sample(n=10)

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,hunits,activation,optimizer,lrate,cum_times
391,4.217018,4.217018,33.002135,33.00214,391,2.053538,5.744749,0.954722,150,relu,nadam,0.002,367.614386
79,34.569408,34.569401,54.616973,54.61697,79,5.879575,7.390329,0.226374,50,relu,adam,0.001,17.775728
6,86.400589,86.400597,99.930309,99.930313,6,9.295192,9.996515,0.530176,200,relu,adam,0.001,3.962211
44,33.866867,33.866867,52.360566,52.360569,44,5.819525,7.23606,0.384534,150,relu,adam,0.001,17.735224
13,55.886526,55.88652,72.86989,72.869888,13,7.475729,8.536386,0.707683,100,relu,nadam,0.002,10.232548
282,10.189049,10.189051,9.883392,9.883392,282,3.192029,3.143786,0.32225,150,relu,sgd,0.01,98.417383
353,2.924278,2.924278,31.046858,31.046856,353,1.710052,5.571971,0.398027,150,relu,adam,0.001,135.740402
329,34.19132,34.191319,55.489284,55.489285,329,5.847334,7.449113,0.285192,50,relu,adagrad,0.01,89.823591
186,4.435796,4.435796,36.28173,36.281727,186,2.106133,6.023431,0.434601,200,relu,adam,0.001,89.114765
328,4.846084,4.846084,39.477733,39.47773,328,2.201382,6.283131,0.408713,50,relu,nadam,0.002,130.862197


In [49]:
# Interactive view of the single-layer learning-rate results; ymax and timemax
# are not passed here, so their widgets come from the plotting function's defaults
interact_manual(
    plot_validation_loss,
    df=fixed(relu_sl_lr_df),
    optimizer=relu_sl_lr_df.optimizer.unique(),
    activation=relu_sl_lr_df.activation.unique()
)
print()

interactive(children=(Dropdown(description='optimizer', options=('adam', 'sgd', 'nadam', 'adagrad'), value='ad…




## Two Layer Results


In [50]:
# Load the pickled results of the two-layer neural net runs
tl_df = pd.read_pickle("OutputData/two_layer_df.pkl")
# Accumulate the per-epoch fit times within each hyper-parameter combination
tl_df['cum_times'] = (
    tl_df
    .groupby(['hunits', 'activation', 'optimizer', 'lrate'])['times']
    .cumsum()
)
tl_df.sample(n=10)

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,hunits,activation,optimizer,lrate,cum_times
59,,,,,59,,,0.410201,50,elu,sgd,0.01,26.104618
56,,,,,56,,,0.417943,100,elu,sgd,0.01,25.27202
204,,,,,204,,,0.410311,50,relu,sgd,0.01,85.895397
79,,,,,79,,,0.401661,50,elu,sgd,0.01,34.426956
280,1.210054,1.210054,4.361899,4.361898,280,1.100025,2.088516,0.453382,50,elu,adam,0.001,141.840752
291,2.045783,2.045783,27.729059,27.729061,291,1.430309,5.265839,0.483608,100,relu,adam,0.001,144.433593
289,,,,,289,,,0.485357,100,elu,sgd,0.01,126.208189
64,,,,,64,,,0.410233,150,elu,sgd,0.01,26.59216
270,3.388154,3.388154,29.360584,29.360586,270,1.840694,5.418541,0.485379,100,relu,adam,0.001,134.093936
20,,,,,20,,,0.427797,150,relu,sgd,0.01,9.299155


In [51]:
# Interactive view of the two-layer results
interact_manual(
    plot_validation_loss,
    df=fixed(tl_df),
    optimizer=tl_df.optimizer.unique(),
    activation=tl_df.activation.unique(),
    ymax=10,
    timemax=250
)
print()

interactive(children=(Dropdown(description='optimizer', options=('adam', 'sgd'), value='adam'), Dropdown(descr…




# CNN Data below

## Baseline CNN

The baseline CNN was structured following Daniel Nouri's blog post on this Facial Keypoint Detection topic. It uses the following CNN architecture:

1. Input Layer
2. Convolution Layer with valid padding and add N filters
3. Max Pooling with 2,2 kernel
4. Repeat Steps 2 and 3, doubling the number of filters each time you convolve
5. After three full layers - flatten and condense to 500 units
6. 500 --> 500 fully connected layer
7. Linear activation to 30 output nodes

For our Baseline CNN we have chosen to use the relu activation and not use dropout layers. We train only on the non-nan data. And we have identified many follow on studies including:


1. *Model for each keypoint* - Some keypoints have a lot more training data than others. We can maximize the use of our training data by building separate specialized models for each keypoint.
2. *Transform images* - We can also expand our training data by applying transformations to our training images. We can do things like flipping on the x and y axes or rotating our images. We now will have a different feature matrix. The trick will be to also transform our keypoint locations appropriately.
3. *Filters* - Investigate different filter strategies - the 32, 64, 128 strategy employed is expensive. Can we reduce the number of filters and still get good results
4. *Drop Out* - Drop out layers can act as a means of regularization - If we notice overfitting we can try this as a correction mechanism
5. *Stride v Pool* - Evaluate time and accuracy differences between pooling and using a stride
6. *TPU Usage* - Identify when and how to best utilize TPU resources

In [52]:
# Load the pickled results stored in cnn_vgg_df.pkl for a quick look at their layout
test_df = pd.read_pickle('OutputData/cnn_vgg_df.pkl')

In [53]:
# Preview the first five rows to see which columns this results file provides
test_df.head(5)

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,starting_filter,layers,pooling,fc_layer,activation,optimizer,lrate,dropout_initial,dropout_step,batch_norm,bias,arch
0,2014.237548,2014.237671,1966.87327,1966.873413,0,44.880259,44.349447,4.343745,16,4,yes,500,relu,adam,0.001,0.0,0.0,1,0,vgg
1,944.676497,944.676331,1121.239891,1121.239868,1,30.735587,33.48492,1.400704,16,4,yes,500,relu,adam,0.001,0.0,0.0,1,0,vgg
2,330.445365,330.445312,331.134516,331.134521,2,18.178155,18.197102,1.425586,16,4,yes,500,relu,adam,0.001,0.0,0.0,1,0,vgg
3,105.024883,105.024895,288.332089,288.332062,3,10.248165,16.980343,1.410569,16,4,yes,500,relu,adam,0.001,0.0,0.0,1,0,vgg
4,43.974663,43.974663,37.349535,37.349537,4,6.631339,6.111427,1.403307,16,4,yes,500,relu,adam,0.001,0.0,0.0,1,0,vgg


In [69]:
def _load_cnn_results(path, group_cols, constants=None,
                      dropout_to_initial=False, split_fc_layer=False):
    '''
    Load one pickled CNN study and align its columns with the other studies.

    Args:
        path: location of the pickled dataframe.
        group_cols: columns used to separate the runs of this study; the
            per-epoch `times` are cumulatively summed within each combination
            and stored in a new `cum_times` column.
        constants: optional mapping of column name -> value written to every
            row (added if missing, overwritten if present).
        dropout_to_initial: when True, the `dropout` column is copied to
            `dropout_initial` and then removed.
        split_fc_layer: when True, `fc_layer` is copied into both `fc_layer1`
            and `fc_layer2`.

    Returns:
        The loaded dataframe with the adjustments applied.
    '''
    df = pd.read_pickle(path)

    # Cumulative time must be computed first: it may group on columns
    # (e.g. `dropout`) that are renamed just below.
    df['cum_times'] = df.groupby(group_cols).times.cumsum()

    if dropout_to_initial:
        df['dropout_initial'] = df.dropout
        df = df.drop(['dropout'], axis=1)

    if split_fc_layer:
        df['fc_layer1'], df['fc_layer2'] = df.fc_layer, df.fc_layer

    for column, value in (constants or {}).items():
        df[column] = value

    return df


def agglomerate_data():
    '''
    Function used to concatenate results from various sensitivities together

    Every study listed below is read from OutputData, given a `cum_times`
    column and whatever constant columns it needs to line up with the others,
    and the frames are then concatenated in the listed order.

    Side effects: writes the concatenated frame to
    OutputData/integrated_cnn_df.pkl, prints the aggregated frame and writes it
    to OutputData/aggregated_cnn_df.pkl.

    Returns:
        (cnn_total_df, mins) where `mins` holds, per configuration, the minimum
        `val_RMSE` and the median per-epoch time (`median_epoch_time`).
    '''
    # Grouping used for cumulative time in most studies
    run_keys = ['starting_filter', 'dropout_initial', 'dropout_step', 'lrate']

    # Column values shared by several studies
    alex_all = {'arch': 'alex', 'keypoint': 'All'}
    vgg_common = {'fc_layer1': 500, 'fc_layer2': 500, 'stride': 1, 'keypoint': 'All'}
    early = {'bias': 1, 'batch_norm': 0, 'stride': 1, 'flipped': 0}

    # Listed in the order in which the frames are concatenated.
    # NOTE(review): OutputData/cnn_flipped_df.pkl used to be loaded here but was
    # never part of the concatenation; it is left out to keep the output
    # unchanged. Add an entry for it if that omission was unintentional.
    # NOTE(review): the stride study accumulates time without grouping on
    # `stride`; confirm that each optimizer/dropout combination holds one run.
    studies = [
        # Specialist (per-keypoint) models
        dict(path='OutputData/spec_01.pkl', group_cols=['keypoint'],
             constants={'arch': 'alex'}),
        # VGG results (arch comes from the file itself)
        dict(path='OutputData/cnn_vgg_df.pkl', group_cols=run_keys,
             constants=dict(vgg_common, flipped=0)),
        dict(path='OutputData/cnn_vgg_flipped_df.pkl', group_cols=run_keys,
             constants=dict(vgg_common, flipped=1)),
        dict(path='OutputData/cnn_vgg_flipped2_df.pkl', group_cols=run_keys,
             constants=dict(vgg_common, flipped=1)),
        dict(path='OutputData/cnn_vgg_flipped3_df.pkl', group_cols=run_keys,
             constants=dict(vgg_common, flipped=1)),
        dict(path='OutputData/cnn_vgg_flipped4_df.pkl', group_cols=run_keys,
             constants=dict(vgg_common, flipped=1)),
        # Flipped-image results (the flipped flag comes from the files)
        dict(path='OutputData/cnn_flipped_df6.pkl', group_cols=run_keys,
             constants=alex_all),
        dict(path='OutputData/cnn_flipped_df5.pkl', group_cols=run_keys,
             constants=dict(alex_all, fc_layer2=500)),
        dict(path='OutputData/cnn_flipped_df4.pkl', group_cols=run_keys,
             constants=alex_all),
        dict(path='OutputData/cnn_flipped_df3.pkl', group_cols=run_keys,
             constants=alex_all),
        dict(path='OutputData/cnn_flipped_df2.pkl', group_cols=run_keys,
             constants=alex_all),
        # Learning-rate results
        dict(path='OutputData/cnn_lr_df.pkl', group_cols=run_keys,
             constants=dict(alex_all, flipped=0)),
        # Stride results
        dict(path='OutputData/cnn_stride_df.pkl',
             group_cols=['optimizer', 'dropout_initial', 'dropout_step'],
             constants=dict(alex_all, flipped=0),
             split_fc_layer=True),
        # Batch-normalization results
        dict(path='OutputData/cnn_dropout_bn_df.pkl',
             group_cols=['bias', 'starting_filter', 'dropout_initial', 'dropout_step'],
             constants=dict(alex_all, stride=1, flipped=0),
             split_fc_layer=True),
        # Flat-dropout results (two files)
        dict(path='OutputData/cnn_dropout_df.pkl',
             group_cols=['dropout', 'optimizer'],
             constants=dict(alex_all, dropout_step=0, **early),
             dropout_to_initial=True, split_fc_layer=True),
        dict(path='OutputData/cnn_dropout_df2.pkl',
             group_cols=['dropout', 'optimizer'],
             constants=dict(alex_all, dropout_step=0, **early),
             dropout_to_initial=True, split_fc_layer=True),
        # Dropout with changing step
        dict(path='OutputData/cnn_dropout_step_df.pkl',
             group_cols=['dropout_initial', 'dropout_step', 'optimizer'],
             constants=dict(alex_all, **early),
             split_fc_layer=True),
        # Filter results
        dict(path='OutputData/cnn_filter_df.pkl',
             group_cols=['starting_filter', 'optimizer'],
             constants=dict(alex_all, dropout_initial=0, dropout_step=0, **early),
             split_fc_layer=True),
        # Base results
        dict(path='OutputData/cnn_base_df.pkl',
             group_cols=['optimizer'],
             constants=dict(alex_all, starting_filter=32,
                            dropout_initial=0, dropout_step=0, **early),
             split_fc_layer=True),
    ]

    cnn_total_df = pd.concat([_load_cnn_results(**study) for study in studies],
                             sort=True)
    cnn_total_df.to_pickle('OutputData/integrated_cnn_df.pkl')

    groups = cnn_total_df.groupby(
        ['arch','optimizer', 'dropout_initial', 'dropout_step', 'starting_filter',
         'bias', 'batch_norm', 'stride', 'lrate', 'flipped', 'fc_layer1', 'fc_layer2', 'keypoint'])

    # Grab min errors
    mins = groups.val_RMSE.min().reset_index()

    # Grab median epoch training time; both frames come from the same groups,
    # so their reset indexes line up row for row
    meds = groups.times.median().reset_index()
    mins['median_epoch_time'] = meds.times
    print(mins)

    # Write the aggregated dataframe to Output
    mins.to_pickle('OutputData/aggregated_cnn_df.pkl')

    return cnn_total_df, mins
    
# Build the combined (cnn_df) and aggregated (min_df) CNN result frames used by the cells below
cnn_df, min_df = agglomerate_data()

     arch optimizer  dropout_initial  dropout_step  starting_filter  bias  \
0    alex   adagrad             0.00          0.00               32     1   
1    alex      adam             0.00          0.00                3     1   
2    alex      adam             0.00          0.00                5     1   
3    alex      adam             0.00          0.00               12     0   
4    alex      adam             0.00          0.00               12     0   
5    alex      adam             0.00          0.00               12     0   
6    alex      adam             0.00          0.00               12     0   
7    alex      adam             0.00          0.00               12     0   
8    alex      adam             0.00          0.00               12     0   
9    alex      adam             0.00          0.00               12     0   
10   alex      adam             0.00          0.00               12     0   
11   alex      adam             0.00          0.00               12     0   

# create a plotting function for our cnn data

In [123]:
# Create a plotting function to pass to the interact widget function
def plot_cnn_data(df=cnn_df,
                  optimizer = cnn_df.optimizer.unique(), 
                  activation = cnn_df.activation.unique(),
                  starting_filter = cnn_df.starting_filter.unique(),
                  dropout_initial = cnn_df.dropout_initial.unique(),
                  dropout_step = cnn_df.dropout_step.unique(),
                  bias = cnn_df.bias.unique(),
                  batch_norm = cnn_df.batch_norm.unique(),
                  stride= cnn_df.stride.unique(),
                  lrate = cnn_df.lrate.unique(),
                  fc1 = cnn_df.fc_layer1.unique(),
                  fc2 = cnn_df.fc_layer2.unique(),
                  flipped = [0,1],
                  keypoint = cnn_df.keypoint.unique(),
                  ymax=100,
                  timemax=1000):
    '''
    Draw a 2x2 grid of validation-error and fit-time curves for one CNN configuration.

    The rows of `df` matching every selected setting are plotted with one curve
    per architecture (`arch`). Most defaults are collections of the values found
    in `cnn_df`; they are meant to be replaced by a single selected value
    (for example by the widgets that `interact_manual` creates).

    Args:
        df: combined CNN results frame.
        optimizer, activation, starting_filter, dropout_initial, dropout_step,
        bias, batch_norm, stride, lrate, flipped, keypoint: value that the
            column of the same name must equal.
        fc1, fc2: values that `fc_layer1` and `fc_layer2` must equal.
        ymax: upper y-limit of the two RMSE panels.
        timemax: upper limit (seconds) of the cumulative-time axes; the
            per-epoch panel is scaled to timemax divided by the last epoch.
    '''
    # Column -> selected value. Every filter is applied to the running subset;
    # previously the activation filter was computed and then thrown away
    # because the optimizer filter started again from the full frame.
    selections = {
        'activation': activation,
        'optimizer': optimizer,
        'starting_filter': starting_filter,
        'dropout_initial': dropout_initial,
        'dropout_step': dropout_step,
        'bias': bias,
        'batch_norm': batch_norm,
        'stride': stride,
        'lrate': lrate,
        'fc_layer1': fc1,
        'fc_layer2': fc2,
        'flipped': flipped,
        'keypoint': keypoint,
    }
    sub_df = df
    for column, value in selections.items():
        sub_df = sub_df[sub_df[column] == value]

    fig, axes = plt.subplots(2, 2, figsize=(20, 20))
    axes = axes.flatten()

    # One set of curves per architecture. Grouping by a scalar key (not a
    # one-element list) keeps the group name a plain string for the legend.
    for arch, group in sub_df.groupby('arch'):
        label = str(arch)

        # Validation error by epoch
        axes[0].plot(group.epoch, group.val_RMSE, label=label+' Validation Loss')

        # Fit time of each epoch
        axes[1].scatter(group.epoch, group.times, label=label+' Fit Time')
        # Scale the panel to the time budget per epoch; a group that only holds
        # epoch 0 is skipped so the limit is never computed by dividing by zero
        last_epoch = group.epoch.max()
        if last_epoch > 0:
            axes[1].set_ylim([0,timemax/last_epoch])

        # Cumulative fit time by epoch
        axes[2].plot(group.epoch, group.cum_times, label=label+' Fit Time (s)', lw=4)

        # Validation error against cumulative fit time
        axes[3].plot(group.cum_times, group.val_RMSE, label=label+' Validation Loss', lw=3)

    # Axis decoration does not depend on the group, so it is done once
    axes[0].set_ylim([0,ymax])
    axes[0].set_xlabel('Epoch')
    axes[0].set_ylabel('Root Mean Square Error')

    axes[1].set_xlabel('Epoch')
    axes[1].set_ylabel('Fit Time (seconds)')

    axes[2].set_xlabel('Epoch')
    axes[2].set_ylabel('Cumulative Fit Time (seconds)')
    # Positional form: the `b=` keyword is no longer accepted by newer Matplotlib
    axes[2].grid(True)
    axes[2].set_ylim([0,timemax])

    axes[3].set_xlabel('Cumulative Fit Time (seconds)')
    axes[3].set_ylabel('Validation RMSE')
    axes[3].set_xlim([0,timemax])
    axes[3].set_ylim([0,ymax])

    # These two panels can be empty; skip their legends when nothing was drawn
    if not sub_df.empty:
        axes[1].legend()
        axes[2].legend()

    # Add line for knr score
    axes[0].axhline(2.49, label='kNR Score', lw=5, c='k')
    axes[3].axhline(2.49, label='kNR Score', lw=5, c='k')

    axes[0].legend()
    # The legend of the last panel was only drawn inside the loop before, so it
    # did not list the kNR line; keep that by drawing it only when data exists
    if not sub_df.empty:
        handles, labels = axes[3].get_legend_handles_labels()
        keep = [i for i, text in enumerate(labels) if text != 'kNR Score']
        axes[3].legend([handles[i] for i in keep], [labels[i] for i in keep])

    # Adjust the spacing of the subplots
    fig.subplots_adjust(left=0.03, right=0.97, hspace=0.1, wspace=0.15)

    # Add an overarching title for these plots
    fig.suptitle("Performance Comparison for Baseline Convolutional Neural Nets",
                 fontsize=18, y=0.93)

In [124]:
# Interactive view of the combined CNN results: one selector per setting
interact_manual(
    plot_cnn_data,
    df=fixed(cnn_df),
    optimizer=cnn_df.optimizer.unique(),
    activation=cnn_df.activation.unique(),
    starting_filter=cnn_df.starting_filter.unique(),
    dropout_initial=cnn_df.dropout_initial.unique(),
    dropout_step=cnn_df.dropout_step.unique(),
    bias=cnn_df.bias.unique(),
    batch_norm=cnn_df.batch_norm.unique(),
    stride=cnn_df.stride.unique(),
    lrate=cnn_df.lrate.unique(),
    fc1=cnn_df.fc_layer1.unique(),
    fc2=cnn_df.fc_layer2.unique(),
    keypoint=cnn_df.keypoint.unique(),
    flipped=[0,1],
    ymax=5,
    timemax=1000
)
print()

interactive(children=(Dropdown(description='optimizer', options=('adam', 'sgd', 'nadam', 'adagrad'), value='ad…




## Assessment of Initial CNN Results

- Base results for our Neural Net achieved a final RMSE of 1.80 using no dropout, an initial filter depth of 32, and alternating convolution and pooling layers. The runtime for this model was about 1900 seconds for training.
- Results of the filter study indicate that similar or better accuracies can be achieved with a lower initial filter depth while also reducing training time significantly over the base case.  For example the following results were observed:
    - Filter Depth of  3, Validation RMSE: 1.74, Training Time: 450 seconds, a little more noise in validation
    - Filter Depth of  5, Validation RMSE: 1.71, Training Time: 700 seconds, better behaved validation
    - Filter Depth of 12, Validation RMSE: 1.64, Training Time: 950 seconds, more overfit
    - Filter Depth of 16, Validation RMSE: 1.57, Training Time: 1100 seconds, most overfit
    - Filter Depth of 32, Validation RMSE: 1.80, Training Time: 1900 seconds, Baseline.
    We should use a filter depth of 16 for further studies, but to optimize for time the filter depth of 3 or 5 may also be viable
- Initial assessments of flat dropout rates of 0.1 and 0.15 do not appear to show favorable behavior. Validation error is very noisy and begins to grow after reaching a minimum. We should look at a more gradual dropout process.

In [107]:
# Have Bokeh render its plots inline in this notebook
output_notebook()

In [31]:
from bokeh.transform import linear_cmap
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, push_notebook, curdoc
from bokeh.models import ColumnDataSource, Select, HoverTool
from bokeh.models.widgets import Slider, Select, TextInput
from bokeh.layouts import row, column

In [20]:
# Hand the combined CNN results to Bokeh; reset_index exposes the old index as an 'index' column
source = ColumnDataSource(cnn_df.reset_index())

p = figure(tools="pan, zoom_in, zoom_out, box_zoom, crosshair, lasso_select, box_select")
p.grid.visible = True

# Values shown when hovering over a point
hover = HoverTool(tooltips=[
    ("Validation RMSE", "@val_RMSE"),
    ("Learning Rate", "@lrate"),
    ("Dropout Step", "@dropout_step"),
    ("Starting Filter Depth", "@starting_filter"),
])

# Marker size follows the starting filter depth; fill colour maps the learning
# rate onto Viridis between 0 and the largest learning rate in min_df
p.circle(
    x='index',
    y='val_RMSE',
    source=source,
    size='starting_filter',
    fill_color=linear_cmap('lrate', 'Viridis256', 0, min_df.lrate.max())
)
p.add_tools(hover)

show(p)

In [22]:
# Preview the first five rows of the combined CNN results
cnn_df.head(5)

Unnamed: 0,RMSE,activation,arch,batch_norm,bias,cum_times,dropout_initial,dropout_step,epoch,fc_layer,...,lrate,mean_squared_error,optimizer,pooling,starting_filter,stride,times,val_RMSE,val_loss,val_mean_squared_error
0,44.880259,relu,vgg,1,0,4.343745,0.0,0.0,0,500.0,...,0.001,2014.237671,adam,yes,16,1,4.343745,44.349447,1966.87327,1966.873413
1,30.735587,relu,vgg,1,0,5.744449,0.0,0.0,1,500.0,...,0.001,944.676331,adam,yes,16,1,1.400704,33.48492,1121.239891,1121.239868
2,18.178155,relu,vgg,1,0,7.170035,0.0,0.0,2,500.0,...,0.001,330.445312,adam,yes,16,1,1.425586,18.197102,331.134516,331.134521
3,10.248165,relu,vgg,1,0,8.580605,0.0,0.0,3,500.0,...,0.001,105.024895,adam,yes,16,1,1.410569,16.980343,288.332089,288.332062
4,6.631339,relu,vgg,1,0,9.983911,0.0,0.0,4,500.0,...,0.001,43.974663,adam,yes,16,1,1.403307,6.111427,37.349535,37.349537
