# GitAnalysis using Theano

In [1]:
%matplotlib inline

In [2]:
from pprint import pprint
from collections import defaultdict

import numpy as np
import math
import matplotlib.pyplot as plt
import random
import warnings
import copy
import pickle

from sklearn.grid_search import GridSearchCV
from sklearn import metrics

import sys
sys.path.append('../dev')

from ml_plot import plot_validation_curve
from ml_plot import my_plot_learning_curve
from ml_plot import plot_prediction_curve
from ml_plot import get_dataset, eval_predictions
from ml_plot import PredictCV, PredictCV_TrainTest
from ml_plot import PredictCV_TrainTestValidate

In [3]:
from Theano_NN import test_nn, modern_nn_model, modern_nn_model_h1
from Theano_NN import convert_binary_to_onehot, compute_stats, print_stats

In [4]:
# Notebook-wide configuration. Commented-out alternatives are kept so a run can be
# switched by moving the comment marker.

# Project and importance level passed to get_dataset() when the data is loaded.
PROJECT = 'nova'
# PROJECT = 'swift'
# PROJECT = 'cinder'
# PROJECT = 'heat'
# PROJECT = 'glance'

# IMPORTANCE = 'crit'
# IMPORTANCE = 'high+'
IMPORTANCE = 'med+'
# IMPORTANCE = 'low+'

# NOTE(review): SIZE, SCORING, JOBS and VERBOSE are not referenced by any cell visible
# in this notebook; presumably left over from a scikit-learn based variant -- confirm.
# SIZE = 100
#SIZE = 250
# SIZE = 0.1
SIZE = 0.5

SCORING = 'f1'         # 2 * (precision * recall) / (precision + recall)
# SCORING = 'accuracy'   # (TP + TN) / all values
# SCORING = 'precision'  # TP / (TP + FP)
# SCORING = 'recall'     # TP / (TP + FN)
# SCORING = 'average_precision'
# SCORING = 'roc_auc'

JOBS = 4
VERBOSE = True

# Silence warnings whose message starts with the given text.
warnings.filterwarnings('ignore', 'F-score is ill-defined')
warnings.filterwarnings('ignore', 'overflow encountered in exp')

# Function definitions

In [5]:
def print_stats2(metric_name, metric_value, param_name, param_value, stats):
    """Print a headline result followed by its confusion matrix and scores.

    Args:
        metric_name: Label of the reported quantity (e.g. 'F1').
        metric_value: Value printed next to ``metric_name``.
        param_name: Label of the setting that produced it (e.g. 'Bias').
        param_value: Value printed next to ``param_name``.
        stats: Mapping with the keys 'TP', 'FP', 'FN', 'TN', 'precision',
            'recall' and 'f1'.

    Output layout::

        Best Results for <metric_name> : <metric_value>  <param_name> : <param_value>
              TP       FP
              FN       TN
            Precision: 0.00 Recall: 0.00, F1: 0.00

    Returns:
        None; the report is written to stdout.
    """
    # Each print receives exactly one pre-formatted string, so the call is valid
    # (and prints the same text) both as a Python 2 print statement and as the
    # Python 3 print function.
    print('Best Results for {0} : {1}  {2} : {3}'.format(
        metric_name, metric_value, param_name, param_value))
    # Confusion matrix, each count in an 8-character-wide field.
    print('    {0:8} {1:8}'.format(stats['TP'], stats['FP']))
    print('    {0:8} {1:8}'.format(stats['FN'], stats['TN']))
    print('    Precision: {0:0.2f} Recall: {1:0.2f}, F1: {2:0.2f}'.format(
        stats['precision'], stats['recall'], stats['f1']))

In [6]:
def analyze_bias(predict, X_test, Y_test, min_bias=-0.2, max_bias=0.05, test_points=10, detail=False):
    """Sweep a prediction bias and report the settings with best F1 and best precision.

    ``test_points`` evenly spaced bias values between ``min_bias`` and
    ``max_bias`` (both included) are tried. For each value ``b`` the predictor
    is called as ``predict(X_test, bias=np.array([-b, b]))`` and its output is
    scored with ``compute_stats`` against the class indices of ``Y_test``.

    Args:
        predict: Callable accepting ``(X, bias=<length-2 array>)`` and
            returning predictions in the form ``compute_stats`` expects.
        X_test: Feature rows to predict on.
        Y_test: One-hot targets; the class index is taken with
            ``np.argmax(Y_test, axis=1)``.
        min_bias: First bias value of the sweep.
        max_bias: Last bias value of the sweep.
        test_points: Number of bias values tried.
        detail: When true, print the stats of every tried bias as well.

    Returns:
        None; results are printed. If no bias yields an F1 (or precision)
        strictly greater than 0.0, a short message is printed for that metric
        instead of a stats table.
    """
    # The targets do not depend on the bias, so decode them once up front.
    y_target = np.argmax(Y_test, axis=1)

    # None marks "nothing better than 0.0 seen yet"; the original used an empty
    # list here, which made print_stats2 fail with a TypeError in that case.
    best_f1 = 0.0
    best_f1_bias = 0.0
    best_f1_stats = None
    best_precision = 0.0
    best_precision_bias = 0.0
    best_precision_stats = None

    for bias in np.linspace(min_bias, max_bias, test_points):
        biasV = np.array([-bias, bias])
        y_predict = predict(X_test, bias=biasV)

        (cost, TP, FP, FN, TN, precision, recall, f1) = compute_stats(y_predict, y_target)
        if detail:
            print('Bias: {0}'.format(bias))
            print_stats(0, cost, TP, FP, FN, TN, precision, recall, f1)
            print('')

        stats = {'cost': cost, 'TP': TP, 'FP': FP, 'FN': FN, 'TN': TN,
                 'precision': precision, 'recall': recall, 'f1': f1}
        # Strict comparisons: on ties the earliest (lowest) bias is kept.
        if f1 > best_f1:
            best_f1 = f1
            best_f1_bias = bias
            best_f1_stats = stats
        if precision > best_precision:
            best_precision = precision
            best_precision_bias = bias
            best_precision_stats = stats

    if best_f1_stats is None:
        print('No bias in the sweep produced an F1 above 0.0')
    else:
        print_stats2('F1', best_f1, 'Bias', best_f1_bias, best_f1_stats)
    print('')
    if best_precision_stats is None:
        print('No bias in the sweep produced a Precision above 0.0')
    else:
        print_stats2('Precision', best_precision, 'Bias', best_precision_bias, best_precision_stats)

In [7]:
def analyze_distance(predict, X_test, Y_test, bias=0.0, sizes=(100, 200, 300, 400, 500)):
    """Score the predictor on growing prefixes of the test set.

    For every ``size`` in ``sizes`` the first ``size`` rows of ``X_test`` and
    ``Y_test`` are evaluated and a stats table is printed via ``print_stats2``
    (labelled 'Size' / 'Cost').

    Args:
        predict: Callable accepting ``(X, bias=<length-2 array>)``.
        X_test: Feature rows; sliced as ``X_test[:size]``.
        Y_test: One-hot targets; sliced the same way and decoded with
            ``np.argmax(..., axis=1)``.
        bias: Bias value ``b``; ``np.array([-b, b])`` is passed to ``predict``.
        sizes: Prefix lengths to evaluate. Defaults to the values that were
            previously hard-coded. A size larger than the number of available
            rows is silently truncated by the slice, so the printed label can
            then exceed the number of rows actually scored.

    Returns:
        None; results are printed.
    """
    biasV = np.array([-bias, bias])
    for size in sizes:
        y_predict = predict(X_test[:size], bias=biasV)
        y_target = np.argmax(Y_test[:size], axis=1)

        (cost, TP, FP, FN, TN, precision, recall, f1) = compute_stats(y_predict, y_target)
        stats = {'cost': cost, 'TP': TP, 'FP': FP, 'FN': FN, 'TN': TN,
                 'precision': precision, 'recall': recall, 'f1': f1}

        print_stats2('Size', size, 'Cost', stats['cost'], stats)
        # Blank separator line; print('') emits one newline on Python 2 and 3.
        print('')

In [8]:
def show_best(predict, X_test, Y_test, bias=0.0):
    """Print the stats of ``predict`` on the full test set for a single bias.

    The predictor is called with ``bias=np.array([-bias, bias])``; its output
    is scored by ``compute_stats`` against ``np.argmax(Y_test, axis=1)`` and
    the result is printed by ``print_stats`` with -1 as its first argument.
    """
    offsets = np.array([-bias, bias])
    predicted = predict(X_test, bias=offsets)
    expected = np.argmax(Y_test, axis=1)

    scores = compute_stats(predicted, expected)
    (cost, true_pos, false_pos, false_neg, true_neg, prec, rec, f1_score) = scores
    print_stats(-1, cost, true_pos, false_pos, false_neg, true_neg, prec, rec, f1_score)

# Load Data

In [9]:
%%capture
# Load targets (Y) and features (X) for the configured project / importance level.
# Anything get_dataset prints is swallowed by the %%capture magic of this cell.
Y, X = get_dataset(PROJECT, IMPORTANCE)

In [10]:
# Dataset dimensions; `rows` and `feats` are reused by the split and the network setup.
rows = X.shape[0]
feats = X.shape[1]
# Single pre-formatted argument so the line works as a Python 2 print statement
# and as the Python 3 print function, with identical output.
print('Rows: {0} Features: {1}'.format(rows, feats))

Rows: 8806 Features: 16233


In [11]:
# Sizes of the train/test split.
n_test = 500   # number of rows held out for testing
ignore = 0.1   # fraction of rows at the end of the dataset that is never used

# Rows [0, max_rows) are eligible; the last n_test of them become the test set.
max_rows = int(rows * (1.0 - ignore))
n_train = max_rows - n_test
# Fail early instead of silently producing an empty or wrapped-around training slice.
assert n_train > 0, 'dataset too small: {0} usable rows for {1} test rows'.format(max_rows, n_test)

# One pre-formatted argument: same output on Python 2 and Python 3.
print('{0} {1} {2}'.format(n_train, n_test, max_rows))

7425 500 7925


In [12]:
# One-hot encode the targets, then cut both arrays at the same row indices:
# the first n_train rows form the training set, the following n_test rows the test set.
Y_onehot = convert_binary_to_onehot(Y)

X_train, X_test = X[:n_train], X[n_train:n_train + n_test]
Y_train, Y_test = Y_onehot[:n_train], Y_onehot[n_train:n_train + n_test]

In [13]:
# Sanity report of array shapes/types before training. Every print gets a single
# pre-formatted string, so the text is the same whether print is the Python 2
# statement or the Python 3 function.
print('Initial Shapes:')
print('  X {0} {1}'.format(X.shape, type(X)))
print('  Y {0} {1}'.format(Y.shape, type(Y)))
print('  Y(one hot): {0} {1}'.format(Y_onehot.shape, type(Y_onehot)))
print('')
print('Train and Test Data:')
print('  train -- X: {0} Y: {1}'.format(X_train.shape, Y_train.shape))
print('  test  -- X: {0} Y: {1}'.format(X_test.shape, Y_test.shape))

Initial Shapes:
  X (8806, 16233) <type 'numpy.ndarray'>
  Y (8806,) <type 'numpy.ndarray'>
  Y(one hot): (8806, 2) <type 'numpy.ndarray'>

Train and Test Data:
  train -- X: (7425, 16233) Y: (7425, 2)
  test  -- X: (500, 16233) Y: (500, 2)


# Start of Analysis

## Test Instance

In [14]:
# Disabled smoke test (the `if False:` guard means it never runs): a 2-iteration
# training run with a 100-unit hidden layer. Flip the guard to True to try it.
if False:
    (predict, best_weights, _) = test_nn(modern_nn_model_h1, X_train, X_test, Y_train, Y_test, 
        iterations=2, dimensions=[feats, 100, 2], #  lr=0.003,
        update='rms_prop', uses_dropout=True,
        max_distance=50, lr=0.003, min_lr=0.000001)

In [15]:
# Disabled (never runs): needs `predict`, which only exists if the disabled
# smoke-test training cell has been enabled and run.
if False:
    show_best(predict, X_test, Y_test, bias=0.0)

In [16]:
# Disabled (never runs): bias sweep for the smoke-test predictor `predict`.
if False:
    analyze_bias(predict, X_test, Y_test, min_bias=-0.2, max_bias=0.05)

In [17]:
# Disabled (never runs): prefix-size analysis for the smoke-test predictor `predict`.
if False:
    analyze_distance(predict, X_test, Y_test)

## Determine impact of hidden-layer size

### 50 Hidden nodes

In [18]:
# Train with a 50-unit hidden layer; presumably dimensions = [inputs, hidden, outputs].
# NOTE(review): the per-iteration tables in the recorded output sum to 500 rows, i.e.
# they are computed on X_test/Y_test, and a "Best" iteration is tracked from them. If
# test_nn keeps the weights of that best iteration, the test metrics reported below
# are selected on the test set and therefore optimistic -- confirm, and consider a
# separate validation split.
# NOTE(review): no random seed is set in this notebook, so runs with dropout may not
# be exactly reproducible -- confirm whether test_nn seeds internally.
(predict1, best_weights1, _) = test_nn(modern_nn_model_h1, X_train, X_test, Y_train, Y_test, 
        iterations=2001, dimensions=[feats, 50, 2],
        update='rms_prop', uses_dropout=True,
        max_distance=50, lr=0.003, min_lr=0.00003)

Iteration: 0  Cost: 0.11
          55      445
           0        0
    Precision: 0.11 Recall: 1.00, F1: 0.20
    Best Prec: 0.11 Recall: 1.00, F1: 0.20  i: 0

Iteration: 1  Cost: 0.186
          55      407
           0       38
    Precision: 0.12 Recall: 1.00, F1: 0.21
    Best Prec: 0.12 Recall: 1.00, F1: 0.21  i: 1

Iteration: 2  Cost: 0.27
          54      364
           1       81
    Precision: 0.13 Recall: 0.98, F1: 0.23
    Best Prec: 0.13 Recall: 0.98, F1: 0.23  i: 2

Iteration: 5  Cost: 0.72
          39      124
          16      321
    Precision: 0.24 Recall: 0.71, F1: 0.36
    Best Prec: 0.24 Recall: 0.71, F1: 0.36  i: 5

Iteration: 6  Cost: 0.814
          31       69
          24      376
    Precision: 0.31 Recall: 0.56, F1: 0.40
    Best Prec: 0.31 Recall: 0.56, F1: 0.40  i: 6

Iteration: 10  Cost: 0.772
          34       93
          21      352
    Precision: 0.27 Recall: 0.62, F1: 0.37
    Best Prec: 0.31 Recall: 0.56, F1: 0.40  i: 6

Iteration: 19  Cost: 0.7

In [30]:
# Test-set confusion matrix and precision/recall/F1 of the 50-unit predictor, no bias.
show_best(predict1, X_test, Y_test, bias=0.0)

Iteration: -1  Cost: 0.794
          37       85
          18      360
    Precision: 0.30 Recall: 0.67, F1: 0.42


In [31]:
# Sweep the bias over [-0.2, 0.0] (10 points by default) for the 50-unit predictor.
# NOTE(review): the recorded best F1 is at bias 0.0, the upper edge of the sweep;
# a positive max_bias would show whether F1 keeps improving beyond it.
analyze_bias(predict1, X_test, Y_test, min_bias=-0.2, max_bias=0.0)

Best Results for F1 : 0.418079096045  Bias : 0.0
          37       85
          18      360
    Precision: 0.30 Recall: 0.67, F1: 0.42

Best Results for Precision : 0.349206349206  Bias : -0.155555555556
          22       41
          33      404
    Precision: 0.35 Recall: 0.40, F1: 0.37


In [32]:
# Score the 50-unit predictor on the first 100, 200, ... 500 test rows.
analyze_distance(predict1, X_test, Y_test)

Best Results for Size : 100  Cost : 0.83
          12       13
           4       71
    Precision: 0.48 Recall: 0.75, F1: 0.59

Best Results for Size : 200  Cost : 0.76
          26       41
           7      126
    Precision: 0.39 Recall: 0.79, F1: 0.52

Best Results for Size : 300  Cost : 0.776666666667
          29       55
          12      204
    Precision: 0.35 Recall: 0.71, F1: 0.46

Best Results for Size : 400  Cost : 0.78
          34       73
          15      278
    Precision: 0.32 Recall: 0.69, F1: 0.44

Best Results for Size : 500  Cost : 0.794
          37       85
          18      360
    Precision: 0.30 Recall: 0.67, F1: 0.42



### 100 Hidden nodes

In [22]:
# Same training setup as the 50-unit run; only the hidden-layer size (100) differs.
(predict2, best_weights2, _) = test_nn(modern_nn_model_h1, X_train, X_test, Y_train, Y_test, 
        iterations=2001, dimensions=[feats, 100, 2],
        update='rms_prop', uses_dropout=True,
        max_distance=50, lr=0.003, min_lr=0.00003)

Iteration: 0  Cost: 0.11
          55      445
           0        0
    Precision: 0.11 Recall: 1.00, F1: 0.20
    Best Prec: 0.11 Recall: 1.00, F1: 0.20  i: 0

Iteration: 1  Cost: 0.584
          41      194
          14      251
    Precision: 0.17 Recall: 0.75, F1: 0.28
    Best Prec: 0.17 Recall: 0.75, F1: 0.28  i: 1

Iteration: 2  Cost: 0.674
          38      146
          17      299
    Precision: 0.21 Recall: 0.69, F1: 0.32
    Best Prec: 0.21 Recall: 0.69, F1: 0.32  i: 2

Iteration: 3  Cost: 0.692
          39      138
          16      307
    Precision: 0.22 Recall: 0.71, F1: 0.34
    Best Prec: 0.22 Recall: 0.71, F1: 0.34  i: 3

Iteration: 5  Cost: 0.716
          35      122
          20      323
    Precision: 0.22 Recall: 0.64, F1: 0.33
    Best Prec: 0.22 Recall: 0.71, F1: 0.34  i: 3

Iteration: 7  Cost: 0.716
          38      125
          17      320
    Precision: 0.23 Recall: 0.69, F1: 0.35
    Best Prec: 0.23 Recall: 0.69, F1: 0.35  i: 7

Iteration: 8  Cost: 0.8

In [33]:
# Test-set confusion matrix and precision/recall/F1 of the 100-unit predictor, no bias.
show_best(predict2, X_test, Y_test, bias=0.0)

Iteration: -1  Cost: 0.832
          35       64
          20      381
    Precision: 0.35 Recall: 0.64, F1: 0.45


In [34]:
# Sweep the bias over [-0.2, 0.0] for the 100-unit predictor.
# NOTE(review): best F1 again lands on the sweep's upper edge (bias 0.0).
analyze_bias(predict2, X_test, Y_test, min_bias=-0.2, max_bias=0.0)

Best Results for F1 : 0.454545454545  Bias : 0.0
          35       64
          20      381
    Precision: 0.35 Recall: 0.64, F1: 0.45

Best Results for Precision : 0.422222222222  Bias : -0.177777777778
          19       26
          36      419
    Precision: 0.42 Recall: 0.35, F1: 0.38


In [35]:
# Score the 100-unit predictor on the first 100, 200, ... 500 test rows.
analyze_distance(predict2, X_test, Y_test)

Best Results for Size : 100  Cost : 0.85
          10        9
           6       75
    Precision: 0.53 Recall: 0.62, F1: 0.57

Best Results for Size : 200  Cost : 0.805
          24       30
           9      137
    Precision: 0.44 Recall: 0.73, F1: 0.55

Best Results for Size : 300  Cost : 0.826666666667
          27       38
          14      221
    Precision: 0.42 Recall: 0.66, F1: 0.51

Best Results for Size : 400  Cost : 0.8225
          31       53
          18      298
    Precision: 0.37 Recall: 0.63, F1: 0.47

Best Results for Size : 500  Cost : 0.832
          35       64
          20      381
    Precision: 0.35 Recall: 0.64, F1: 0.45



### 200 Hidden nodes

In [24]:
# Same training setup as the smaller runs; only the hidden-layer size (200) differs.
(predict3, best_weights3, _) = test_nn(modern_nn_model_h1, X_train, X_test, Y_train, Y_test, 
        iterations=2001, dimensions=[feats, 200, 2],
        update='rms_prop', uses_dropout=True,
        max_distance=50, lr=0.003, min_lr=0.00003)

Iteration: 0  Cost: 0.38
          51      306
           4      139
    Precision: 0.14 Recall: 0.93, F1: 0.25
    Best Prec: 0.14 Recall: 0.93, F1: 0.25  i: 0

Iteration: 1  Cost: 0.682
          36      140
          19      305
    Precision: 0.20 Recall: 0.65, F1: 0.31
    Best Prec: 0.20 Recall: 0.65, F1: 0.31  i: 1

Iteration: 2  Cost: 0.752
          34      103
          21      342
    Precision: 0.25 Recall: 0.62, F1: 0.35
    Best Prec: 0.25 Recall: 0.62, F1: 0.35  i: 2

Iteration: 3  Cost: 0.77
          33       93
          22      352
    Precision: 0.26 Recall: 0.60, F1: 0.36
    Best Prec: 0.26 Recall: 0.60, F1: 0.36  i: 3

Iteration: 5  Cost: 0.644
          46      169
           9      276
    Precision: 0.21 Recall: 0.84, F1: 0.34
    Best Prec: 0.26 Recall: 0.60, F1: 0.36  i: 3

Iteration: 7  Cost: 0.778
          34       90
          21      355
    Precision: 0.27 Recall: 0.62, F1: 0.38
    Best Prec: 0.27 Recall: 0.62, F1: 0.38  i: 7

Iteration: 10  Cost: 0.7

In [36]:
# Test-set confusion matrix and precision/recall/F1 of the 200-unit predictor, no bias.
show_best(predict3, X_test, Y_test, bias=0.0)

Iteration: -1  Cost: 0.85
          25       45
          30      400
    Precision: 0.36 Recall: 0.45, F1: 0.40


In [37]:
# Sweep the bias over [-0.2, 0.0] for the 200-unit predictor.
# NOTE(review): best F1 again lands on the sweep's upper edge (bias 0.0).
analyze_bias(predict3, X_test, Y_test, min_bias=-0.2, max_bias=0.0)

Best Results for F1 : 0.4  Bias : 0.0
          25       45
          30      400
    Precision: 0.36 Recall: 0.45, F1: 0.40

Best Results for Precision : 0.405405405405  Bias : -0.177777777778
          15       22
          40      423
    Precision: 0.41 Recall: 0.27, F1: 0.33


In [38]:
# Score the 200-unit predictor on the first 100, 200, ... 500 test rows.
analyze_distance(predict3, X_test, Y_test)

Best Results for Size : 100  Cost : 0.87
           8        5
           8       79
    Precision: 0.62 Recall: 0.50, F1: 0.55

Best Results for Size : 200  Cost : 0.83
          18       19
          15      148
    Precision: 0.49 Recall: 0.55, F1: 0.51

Best Results for Size : 300  Cost : 0.84
          20       27
          21      232
    Precision: 0.43 Recall: 0.49, F1: 0.45

Best Results for Size : 400  Cost : 0.85
          22       33
          27      318
    Precision: 0.40 Recall: 0.45, F1: 0.42

Best Results for Size : 500  Cost : 0.85
          25       45
          30      400
    Precision: 0.36 Recall: 0.45, F1: 0.40



### 400 Hidden nodes

In [26]:
# Same training setup as the smaller runs; only the hidden-layer size (400) differs.
(predict4, best_weights4, _) = test_nn(modern_nn_model_h1, X_train, X_test, Y_train, Y_test, 
        iterations=2001, dimensions=[feats, 400, 2],
        update='rms_prop', uses_dropout=True,
        max_distance=50, lr=0.003, min_lr=0.00003)

Iteration: 0  Cost: 0.128
          55      436
           0        9
    Precision: 0.11 Recall: 1.00, F1: 0.20
    Best Prec: 0.11 Recall: 1.00, F1: 0.20  i: 0

Iteration: 1  Cost: 0.55
          44      214
          11      231
    Precision: 0.17 Recall: 0.80, F1: 0.28
    Best Prec: 0.17 Recall: 0.80, F1: 0.28  i: 1

Iteration: 2  Cost: 0.628
          41      172
          14      273
    Precision: 0.19 Recall: 0.75, F1: 0.31
    Best Prec: 0.19 Recall: 0.75, F1: 0.31  i: 2

Iteration: 3  Cost: 0.59
          46      196
           9      249
    Precision: 0.19 Recall: 0.84, F1: 0.31
    Best Prec: 0.19 Recall: 0.84, F1: 0.31  i: 3

Iteration: 4  Cost: 0.668
          39      150
          16      295
    Precision: 0.21 Recall: 0.71, F1: 0.32
    Best Prec: 0.21 Recall: 0.71, F1: 0.32  i: 4

Iteration: 5  Cost: 0.686
          40      142
          15      303
    Precision: 0.22 Recall: 0.73, F1: 0.34
    Best Prec: 0.22 Recall: 0.73, F1: 0.34  i: 5

Iteration: 8  Cost: 0.74

In [39]:
# Test-set confusion matrix and precision/recall/F1 of the 400-unit predictor, no bias.
show_best(predict4, X_test, Y_test, bias=0.0)

Iteration: -1  Cost: 0.784
          37       90
          18      355
    Precision: 0.29 Recall: 0.67, F1: 0.41


In [40]:
# Sweep the bias over [-0.2, 0.0] for the 400-unit predictor.
# NOTE(review): best F1 is at the upper edge (0.0) and best precision at the lower
# edge (-0.2) of the sweep, so both optima may lie outside the tested range.
analyze_bias(predict4, X_test, Y_test, min_bias=-0.2, max_bias=0.0)

Best Results for F1 : 0.406593406593  Bias : 0.0
          37       90
          18      355
    Precision: 0.29 Recall: 0.67, F1: 0.41

Best Results for Precision : 0.411764705882  Bias : -0.2
          21       30
          34      415
    Precision: 0.41 Recall: 0.38, F1: 0.40


In [41]:
# Score the 400-unit predictor on the first 100, 200, ... 500 test rows.
analyze_distance(predict4, X_test, Y_test)

Best Results for Size : 100  Cost : 0.82
          11       13
           5       71
    Precision: 0.46 Recall: 0.69, F1: 0.55

Best Results for Size : 200  Cost : 0.75
          24       41
           9      126
    Precision: 0.37 Recall: 0.73, F1: 0.49

Best Results for Size : 300  Cost : 0.763333333333
          28       58
          13      201
    Precision: 0.33 Recall: 0.68, F1: 0.44

Best Results for Size : 400  Cost : 0.7675
          33       77
          16      274
    Precision: 0.30 Recall: 0.67, F1: 0.42

Best Results for Size : 500  Cost : 0.784
          37       90
          18      355
    Precision: 0.29 Recall: 0.67, F1: 0.41



### 800 Hidden nodes

In [47]:
# Hidden-layer size 800.
# NOTE(review): iterations=20001 here versus 2001 in every other run of this
# comparison -- confirm this is intentional and not a typo, since it makes the
# 800-unit result not directly comparable with the others.
(predict5, best_weights5, _) = test_nn(modern_nn_model_h1, X_train, X_test, Y_train, Y_test, 
        iterations=20001, dimensions=[feats, 800, 2],
        update='rms_prop', uses_dropout=True,
        max_distance=50, lr=0.003, min_lr=0.00003)

Iteration: 0  Cost: 0.11
          55      445
           0        0
    Precision: 0.11 Recall: 1.00, F1: 0.20
    Best Prec: 0.11 Recall: 1.00, F1: 0.20  i: 0

Iteration: 1  Cost: 0.168
          55      416
           0       29
    Precision: 0.12 Recall: 1.00, F1: 0.21
    Best Prec: 0.12 Recall: 1.00, F1: 0.21  i: 1

Iteration: 2  Cost: 0.306
          54      346
           1       99
    Precision: 0.14 Recall: 0.98, F1: 0.24
    Best Prec: 0.14 Recall: 0.98, F1: 0.24  i: 2

Iteration: 3  Cost: 0.544
          48      221
           7      224
    Precision: 0.18 Recall: 0.87, F1: 0.30
    Best Prec: 0.18 Recall: 0.87, F1: 0.30  i: 3

Iteration: 4  Cost: 0.546
          49      221
           6      224
    Precision: 0.18 Recall: 0.89, F1: 0.30
    Best Prec: 0.18 Recall: 0.89, F1: 0.30  i: 4

Iteration: 5  Cost: 0.476
          50      257
           5      188
    Precision: 0.16 Recall: 0.91, F1: 0.28
    Best Prec: 0.18 Recall: 0.89, F1: 0.30  i: 4

Iteration: 6  Cost: 0.6

In [48]:
# Test-set confusion matrix and precision/recall/F1 of the 800-unit predictor, no bias.
show_best(predict5, X_test, Y_test, bias=0.0)

Iteration: -1  Cost: 0.838
          27       53
          28      392
    Precision: 0.34 Recall: 0.49, F1: 0.40


In [49]:
# Sweep the bias over [-0.1, 0.08] for the 800-unit predictor.
# NOTE(review): the other network sizes were swept over [-0.2, 0.0]; the different
# range means the "best precision" figures are not directly comparable.
analyze_bias(predict5, X_test, Y_test, min_bias=-0.1, max_bias=0.08)

Best Results for F1 : 0.4  Bias : 0.0
          27       53
          28      392
    Precision: 0.34 Recall: 0.49, F1: 0.40

Best Results for Precision : 0.351851851852  Bias : -0.1
          19       35
          36      410
    Precision: 0.35 Recall: 0.35, F1: 0.35


In [50]:
# Score the 800-unit predictor on the first 100, 200, ... 500 test rows.
analyze_distance(predict5, X_test, Y_test)

Best Results for Size : 100  Cost : 0.82
           7        9
           9       75
    Precision: 0.44 Recall: 0.44, F1: 0.44

Best Results for Size : 200  Cost : 0.795
          20       28
          13      139
    Precision: 0.42 Recall: 0.61, F1: 0.49

Best Results for Size : 300  Cost : 0.813333333333
          23       38
          18      221
    Precision: 0.38 Recall: 0.56, F1: 0.45

Best Results for Size : 400  Cost : 0.83
          25       44
          24      307
    Precision: 0.36 Recall: 0.51, F1: 0.42

Best Results for Size : 500  Cost : 0.838
          27       53
          28      392
    Precision: 0.34 Recall: 0.49, F1: 0.40



In [53]:
# Gather the five trained predictors, keyed by the variable name used in this notebook
# (predict1..predict5 correspond to 50, 100, 200, 400 and 800 hidden units).
solvers = dict(
    predict1=predict1,
    predict2=predict2,
    predict3=predict3,
    predict4=predict4,
    predict5=predict5,
)

In [56]:
# Persist the predictors with pickle's default protocol. The path is relative to the
# notebook's working directory.
# NOTE: only load this file back from a trusted location -- unpickling runs arbitrary code.
with open('TheanoSolvers_NetworkSize.pkl', 'wb') as output:
    pickle.dump(solvers, output)

# Report success only after the with-block has flushed and closed the file.
print('Done')

Done
