# Neural Net Analysis Notebook
## W207 Final Project
### T. P. Goter
### July 6, 2019

This workbook is used to assess various models created as part of the Facial Keypoint Detection project for W207.

In [10]:
# Import the packages we need
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import ipywidgets as widgets
from ipywidgets import interact, interact_manual, fixed
import os

In [11]:
# Load the pickled dataframe for the baseline single layer neural net.
# NOTE(review): read_pickle should only be used on files produced by this project;
# unpickling data from an untrusted source is unsafe.
bl_sl_df = pd.read_pickle("OutputData/single_layer_df.pkl")
# Preview 10 randomly chosen rows (the selection changes on every run)
bl_sl_df.sample(10)

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,hunits,activation,optimizer,lrate
393,2525.411546,2525.411377,2520.734829,2520.735107,393,50.253471,50.206923,1.332076,200,sigmoid,nadam,0.002
212,2427.317821,2427.318115,2424.645473,2424.645752,212,49.26782,49.240692,0.44344,200,relu,sgd,0.01
41,2427.317869,2427.318359,2424.645338,2424.64502,41,49.267823,49.240685,0.431231,200,relu,sgd,0.01
59,2525.412949,2525.412598,2520.736259,2520.736328,59,50.253483,50.206935,0.763112,200,tanh,adagrad,0.01
68,1025.098842,1025.098755,1037.942122,1037.942139,68,32.017163,32.217109,1.363042,200,relu,nadam,0.002
340,272.703951,272.703979,299.593716,299.593689,340,16.513751,17.308775,0.305263,100,relu,adam,0.001
246,611.92644,611.926331,633.599868,633.599854,246,24.737145,25.171409,1.064145,150,relu,nadam,0.002
164,2525.411581,2525.411865,2520.734858,2520.735107,164,50.253476,50.206923,0.34594,100,sigmoid,adam,0.001
369,2525.411545,2525.411865,2520.734847,2520.734863,369,50.253476,50.20692,0.31835,100,tanh,sgd,0.01
278,2525.421268,2525.421631,2520.74461,2520.744629,278,50.253573,50.207018,0.305015,50,sigmoid,adagrad,0.01


In [12]:
# Create a plotting function to pass to the interact widget function
def plot_validation_loss(df=bl_sl_df, optimizer = bl_sl_df.optimizer.unique(), 
                    activation = bl_sl_df.activation.unique()):
    """Plot validation RMSE and per-epoch fit time versus epoch.

    The frame is filtered to the requested optimizer(s) and activation(s) and
    one curve is drawn per hidden-unit count. The left panel shows ``val_RMSE``
    and the right panel shows ``times`` scaled by 1000.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``optimizer``, ``activation``, ``hunits``,
        ``epoch``, ``val_RMSE`` and ``times``. ``times`` is assumed to be in
        seconds because it is multiplied by 1000 for the millisecond axis.
    optimizer : str or array-like of str
        Optimizer name(s) to keep, compared by exact equality.
    activation : str or array-like of str
        Activation name(s) to keep, compared by exact equality.

    Returns
    -------
    None
        The figure is drawn as a side effect; nothing is returned. If no row
        matches the selection a message is printed and no figure is created.
    """
    # A dropdown passes a single name while the defaults are arrays of names;
    # np.atleast_1d lets both go through the same filter. isin compares whole
    # strings, unlike str.match which treats the name as a regex prefix
    # (so 'adam' would also select 'adam_005').
    optimizers = np.atleast_1d(optimizer)
    activations = np.atleast_1d(activation)
    sub_df = df[df.optimizer.isin(optimizers) & df.activation.isin(activations)]

    # Nothing to draw: bail out before the axis limits are computed from NaN
    if sub_df.empty:
        print("No rows match optimizer={} and activation={}".format(
            list(optimizers), list(activations)))
        return

    # Curves are always split by hidden units. Optimizer/activation are only
    # added as grouping keys when several were requested, so that runs from
    # different configurations are not merged into a single line.
    group_keys = [col for col, wanted in (('optimizer', optimizers),
                                          ('activation', activations))
                  if len(wanted) > 1]
    group_keys.append('hunits')

    fig, axes = plt.subplots(1, 2, figsize=(15, 10))
    loss_ax, time_ax = axes.flatten()

    # Loop over the grouped data and plot out epoch timing and validation loss data
    for name, group in sub_df.groupby(group_keys):
        # Depending on the pandas version a one-key list groupby yields either
        # a scalar or a 1-tuple; normalise so the legend never shows "(200,)".
        parts = name if isinstance(name, tuple) else (name,)
        label = ' '.join(str(part) for part in parts)
        loss_ax.plot(group.epoch, group.val_RMSE, label=label+' Validation Loss')
        time_ax.scatter(group.epoch, group.times*1000, label=label+' Fit Time')

    # Axis decoration does not depend on the group, so do it once
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Root Mean Square Error')
    time_ax.set_xlabel('Epoch')
    time_ax.set_ylabel('Fit Time (milliseconds)')

    # set_ylim rejects NaN/inf and warns on identical limits, so only apply
    # the data-driven upper bound when it is usable
    max_rmse = sub_df.val_RMSE.max()
    if np.isfinite(max_rmse) and max_rmse > 0:
        loss_ax.set_ylim([0, max_rmse])

    # Keep 1000 ms as the minimum range so plots stay comparable, but grow the
    # axis when epochs took longer than one second instead of clipping them
    max_ms = (sub_df.times*1000).max()
    time_ax.set_ylim([0, max(1000, max_ms) if np.isfinite(max_ms) else 1000])

    loss_ax.legend()
    time_ax.legend()
    loss_ax.set_title("{} Optimizer and {} Activation".format(
        ', '.join(map(str, sub_df.optimizer.unique())),
        ', '.join(map(str, sub_df.activation.unique()))))

    # Adjust the spacing of the subplots
    fig.subplots_adjust(left=0.03, right=0.97, hspace=0.1, wspace=0.15)

    # Add an overarching title for these plots
    fig.suptitle("Performance Comparison for Single Layer, Fully Connected Neural Nets",
                 fontsize=18, y=0.93)



In [13]:
# Build optimizer/activation dropdowns for the baseline data; the dataframe is
# held fixed so no widget is created for it.
# The trailing semicolon suppresses the echoed return value of interact_manual.
interact_manual(plot_validation_loss, df=fixed(bl_sl_df),
                optimizer=bl_sl_df.optimizer.unique(),
                activation=bl_sl_df.activation.unique());

interactive(children=(Dropdown(description='optimizer', options=('adam', 'sgd', 'nadam', 'adagrad'), value='ad…




### Assessment of Baseline Results
1. Adam and Adagrad Optimizers are working well. 
2. Adam is faster and works well with 200 hidden units
3. Adagrad is slower but works best with 100 hidden units.

In the evaluation above, both the hidden layer and the output layer used the activation function specified by the user. For the study below, the activation function of the output layer was set to softmax, which is the multinomial (multi-class) classifier version of the sigmoid function. The plots below help to assess whether the choice of activation function for the output layer significantly alters our perception of which activation functions and optimizers work well for our neural network.

In [14]:
# Load the pickled dataframe for the single layer neural nets with a softmax output layer
sm_sl_df = pd.read_pickle("OutputData/single_layer_softmax_df.pkl")
# Preview 10 randomly chosen rows (the selection changes on every run)
sm_sl_df.sample(10)

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,hunits,activation,optimizer,lrate
258,2614.113111,2614.113037,2609.399139,2609.39917,258,51.128398,51.082278,0.367262,100,sigmoid,adam,0.001
338,2614.506091,2614.506104,2609.812108,2609.812012,338,51.132241,51.086319,0.559323,100,relu,adagrad,0.01
87,2614.113154,2614.112793,2609.399169,2609.39917,87,51.128395,51.082278,0.329807,100,tanh,adam,0.001
33,2614.11312,2614.113281,2609.399132,2609.39917,33,51.1284,51.082278,0.377566,50,tanh,nadam,0.002
52,2614.113173,2614.113037,2609.399151,2609.399414,52,51.128398,51.082281,0.811855,100,sigmoid,nadam,0.002
81,2614.315106,2614.314941,2609.594195,2609.594238,81,51.130372,51.084188,0.28495,100,sigmoid,sgd,0.01
41,2614.113223,2614.113525,2609.399267,2609.399414,41,51.128402,51.082281,0.776023,200,tanh,adagrad,0.01
208,2614.314997,2614.314453,2609.594201,2609.594238,208,51.130367,51.084188,0.429409,150,relu,adam,0.001
244,2614.5442,2614.544434,2609.843605,2609.84375,244,51.132616,51.08663,0.390927,200,relu,sgd,0.01
337,2614.113134,2614.112305,2609.39916,2609.399414,337,51.12839,51.082281,0.379356,50,tanh,nadam,0.002


In [15]:
# Plot the softmax data
# The trailing semicolon suppresses the echoed return value of interact_manual.
interact_manual(plot_validation_loss, df=fixed(sm_sl_df),
                optimizer=sm_sl_df.optimizer.unique(),
                activation=sm_sl_df.activation.unique());

interactive(children=(Dropdown(description='optimizer', options=('adam', 'sgd', 'nadam', 'adagrad'), value='ad…




In [16]:
# Load the pickled dataframe for the single layer relu study
# (presumably the runs with a relu output layer, going by the file name -- TODO confirm)
relu_sl_df = pd.read_pickle("OutputData/single_layer_relu_df.pkl")
# Preview 10 randomly chosen rows (the selection changes on every run)
relu_sl_df.sample(10)

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,hunits,activation,optimizer,lrate
115,502.155229,502.155182,500.816102,500.816101,115,22.408819,22.378921,0.699443,150,tanh,adagrad,0.01
157,180.9496,180.949585,180.257479,180.257477,157,13.45175,13.426,1.237069,200,tanh,nadam,0.002
178,623.835086,623.834961,622.083832,622.083801,178,24.976688,24.941608,0.206128,50,tanh,sgd,0.01
9,509.132992,509.132904,504.411733,504.411713,9,22.563974,22.459112,0.780349,200,tanh,adagrad,0.01
175,225.888963,225.888962,225.608633,225.608627,175,15.029603,15.020274,0.961487,150,sigmoid,nadam,0.002
37,879.276197,879.276184,876.833828,876.833801,37,29.652592,29.61138,0.294891,100,sigmoid,sgd,0.01
309,1196.653272,1196.653076,1192.762382,1192.762451,309,34.592674,34.536393,0.761255,200,sigmoid,adagrad,0.01
31,1108.908832,1108.908813,1104.147715,1104.147705,31,33.300282,33.228718,0.414289,50,sigmoid,nadam,0.002
225,1108.918343,1108.918213,1104.151216,1104.151245,225,33.300424,33.228771,0.418424,50,sigmoid,nadam,0.002
23,779.769239,779.769165,774.285601,774.285583,23,27.924347,27.825988,0.592959,150,sigmoid,adagrad,0.01


In [17]:
# Plot the relu data
# The trailing semicolon suppresses the echoed return value of interact_manual.
interact_manual(plot_validation_loss, df=fixed(relu_sl_df),
                optimizer=relu_sl_df.optimizer.unique(),
                activation=relu_sl_df.activation.unique());

interactive(children=(Dropdown(description='optimizer', options=('adam', 'sgd', 'nadam', 'adagrad'), value='ad…




In [18]:
# Lowest validation RMSE reached by each optimizer across all relu-study rows
relu_sl_df.groupby('optimizer')['val_RMSE'].min()

optimizer
adagrad    22.046067
adam        3.142767
nadam      13.423687
sgd         7.542013
Name: val_RMSE, dtype: float64

### Single Layer Assessment
Based on the three different experiments run, we will use a RELU final layer activation function. We will continue to assess adagrad and adam optimizers. We likely do not need to train for more than 200 epochs or so to get reasonably converged nets.

In [5]:
# Load the pickled dataframe for the single layer relu learning rate study
relu_sl_lr_df = pd.read_pickle("OutputData/single_layer_relu_lr_df.pkl")
# Preview 10 randomly chosen rows (the selection changes on every run)
relu_sl_lr_df.sample(10)

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,hunits,activation,optimizer,lrate
166,585.102651,585.1026,602.217882,602.217896,166,24.188894,24.540128,0.777923,200,relu,adagrad_02,0.02
197,101.179164,101.179169,121.990278,121.990288,197,10.058786,11.044921,0.195769,50,relu,adam_005,0.005
190,1382.132048,1382.13208,1389.458887,1389.458862,190,37.177037,37.275446,0.561554,150,relu,adagrad_005,0.05
184,84.662256,84.662247,116.007755,116.007736,184,9.201209,10.770689,0.490967,200,relu,adam_0005,0.0005
154,154.702597,154.702591,174.748838,174.748825,154,12.43795,13.21926,0.468345,200,relu,adam_005,0.005
121,293.815909,293.815887,292.408084,292.408112,121,17.141059,17.099945,0.259666,50,tanh,adagrad_02,0.02
62,360.313932,360.313965,370.613009,370.613007,62,18.981938,19.251312,0.271495,50,relu,adagrad_005,0.05
60,244.381297,244.381256,244.194078,244.194077,60,15.632698,15.62671,0.247615,50,tanh,adagrad_005,0.05
115,87.490673,87.490669,87.13756,87.137558,115,9.353645,9.33475,0.756261,200,tanh,adagrad_005,0.05
60,1393.413407,1393.413452,1396.197367,1396.197388,60,37.328454,37.365725,0.543673,150,relu,adagrad_005,0.05


In [6]:
# Plot the relu learning rate data
# The trailing semicolon suppresses the echoed return value of interact_manual.
interact_manual(plot_validation_loss, df=fixed(relu_sl_lr_df),
                optimizer=relu_sl_lr_df.optimizer.unique(),
                activation=relu_sl_lr_df.activation.unique());

interactive(children=(Dropdown(description='optimizer', options=('adam_005', 'adam_0005', 'adagrad_02', 'adagr…




## Combine the dataframes to get a comprehensive look at the data


In [26]:
# Empty placeholder frame; presumably meant to hold the combined results.
# NOTE(review): nothing is added to it in the cells shown here -- TODO fill in or remove.
comb_bl_sl_df = pd.DataFrame()

In [7]:
# Load the pickled dataframe for the two layer relu neural net study
tl_df = pd.read_pickle("OutputData/two_layer_relu_df.pkl")
# Preview 10 randomly chosen rows (the selection changes on every run)
tl_df.sample(10)

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,hunits,activation,optimizer,lrate
229,123.751275,123.751266,123.281732,123.281738,229,11.124355,11.103231,0.439289,150,tanh,adam,0.001
209,569.614534,569.614502,593.652979,593.652954,209,23.866598,24.364994,0.699659,100,relu,adagrad,0.01
245,423.108962,423.108948,451.090325,451.090332,245,20.569612,21.238887,0.692946,50,relu,adagrad,0.01
59,444.13596,444.135986,462.668193,462.668213,59,21.074534,21.509724,0.691606,50,relu,adagrad,0.01
79,413.230153,413.230103,412.724919,412.724884,79,20.328062,20.315632,0.709746,100,tanh,adagrad,0.01
289,514.039798,514.039734,537.720856,537.720764,289,22.672444,23.188807,0.689042,150,relu,adagrad,0.01
233,412.497806,412.497833,412.097011,412.097015,233,20.310043,20.300173,0.68842,100,tanh,adagrad,0.01
32,410.932827,410.93277,428.343918,428.343933,32,20.271477,20.696472,0.45814,100,relu,adam,0.001
11,1315.643724,1315.643921,1294.415446,1294.415405,11,36.271806,35.977985,0.789763,50,tanh,adagrad,0.01
165,123.745365,123.745346,123.306409,123.306412,165,11.124089,11.104342,0.439983,150,tanh,adam,0.001


In [8]:
# Plot the two layer data
# The trailing semicolon suppresses the echoed return value of interact_manual.
interact_manual(plot_validation_loss, df=fixed(tl_df),
                optimizer=tl_df.optimizer.unique(),
                activation=tl_df.activation.unique());

interactive(children=(Dropdown(description='optimizer', options=('adam', 'adagrad'), value='adam'), Dropdown(d…


