# GitAnalysis using Theano

In [1]:
%matplotlib inline

In [3]:
from pprint import pprint
from collections import defaultdict

import numpy as np
import math
import matplotlib.pyplot as plt
import random
import warnings
import copy
import pickle

from sklearn.grid_search import GridSearchCV
from sklearn import metrics

import sys
sys.path.append('../dev')

from ml_plot import plot_validation_curve
from ml_plot import my_plot_learning_curve
from ml_plot import plot_prediction_curve
from ml_plot import get_dataset, eval_predictions
from ml_plot import PredictCV, PredictCV_TrainTest
from ml_plot import PredictCV_TrainTestValidate

In [4]:
from Theano_NN import test_nn, modern_nn_model, modern_nn_model_h1
from Theano_NN import convert_binary_to_onehot, compute_stats, print_stats

In [5]:
# Project whose data is loaded by get_dataset().  Exactly one assignment is
# active; the alternatives stay commented out so that no value is silently
# overwritten by a later line.
# PROJECT = 'nova'
# PROJECT = 'swift'
# PROJECT = 'cinder'
# PROJECT = 'heat'
PROJECT = 'glance'

# Importance threshold handed to get_dataset() together with PROJECT.
# IMPORTANCE = 'crit'
# IMPORTANCE = 'high+'
IMPORTANCE = 'med+'
# IMPORTANCE = 'low+'

# NOTE(review): SIZE, SCORING, JOBS and VERBOSE are not referenced by any cell
# visible in this notebook -- presumably shared boilerplate; confirm before removing.
# SIZE = 100
# SIZE = 250
# SIZE = 0.1
SIZE = 0.5

SCORING = 'f1'         # 2 * (precision * recall) / (precision + recall)
# SCORING = 'accuracy'   # (TP + TN) / all values
# SCORING = 'precision'  # TP / (TP + FP)
# SCORING = 'recall'     # TP / (TP + FN)
# SCORING = 'average_precision'
# SCORING = 'roc_auc'

JOBS = 4
VERBOSE = True

# Suppress warnings whose message starts with these texts.
warnings.filterwarnings('ignore', 'F-score is ill-defined')
warnings.filterwarnings('ignore', 'overflow encountered in exp')

# Function definitions

In [6]:
def print_stats2(metric_name, metric_value, param_name, param_value, stats):
    """Print a headline, a 2x2 confusion matrix and the summary scores.

    Parameters
    ----------
    metric_name, metric_value
        Label and value of the quantity shown first in the headline.
    param_name, param_value
        Label and value of the quantity shown second in the headline.
    stats : mapping
        Must provide the keys 'TP', 'FP', 'FN', 'TN', 'precision',
        'recall' and 'f1'.

    The matrix is printed as::

        TP FP
        FN TN
    """
    # Every print() receives one pre-formatted string, so the text is the same
    # under the Python 2 print statement and the Python 3 print function.
    print('Best Results for {0} : {1}  {2} : {3}'.format(
        metric_name, metric_value, param_name, param_value))
    print('    {0:8} {1:8}'.format(stats['TP'], stats['FP']))
    print('    {0:8} {1:8}'.format(stats['FN'], stats['TN']))
    print('    Precision: {0:0.2f} Recall: {1:0.2f}, F1: {2:0.2f}'.format(
        stats['precision'], stats['recall'], stats['f1']))

In [7]:
def analyze_bias(predict, X_test, Y_test, min_bias=-0.2, max_bias=0.05, test_points=10, detail=False):
    """Sweep the prediction bias and report the best F1 and best precision.

    For each of `test_points` evenly spaced values in [min_bias, max_bias] the
    predictor is called as ``predict(X_test, bias=np.array([-b, b]))`` and the
    result is scored with ``compute_stats`` against the arg-max of `Y_test`.

    Parameters
    ----------
    predict : callable
        Called as ``predict(X, bias=vector)``; returns the predicted labels.
    X_test : array-like
        Test inputs, passed to `predict` unchanged.
    Y_test : array-like
        One-hot targets; decoded with ``np.argmax(..., axis=1)``.
    min_bias, max_bias : float
        Inclusive range of the sweep.
    test_points : int
        Number of bias values tried.
    detail : bool
        When true, also print the statistics of every individual bias.

    Prints two summaries via ``print_stats2`` (best F1, then best precision).
    On ties the first bias reaching the maximum wins.  If no bias scores above
    zero, the first evaluated bias is reported instead of failing.
    """
    # The targets do not depend on the bias, so decode them once.
    y_target = np.argmax(Y_test, axis=1)

    best_f1, best_f1_bias, best_f1_stats = 0.0, 0.0, None
    best_precision, best_precision_bias, best_precision_stats = 0.0, 0.0, None
    first_result = None  # (bias, stats) of the first evaluated point

    for bias in np.linspace(min_bias, max_bias, test_points):
        biasV = np.array([-bias, bias])
        y_predict = predict(X_test, bias=biasV)

        (cost, TP, FP, FN, TN, precision, recall, f1) = compute_stats(y_predict, y_target)
        if detail:
            print('Bias: {0}'.format(bias))
            print_stats(0, cost, TP, FP, FN, TN, precision, recall, f1)
            print('')

        # Built once per bias and shared by both trackers; it is never mutated.
        stats = {'cost': cost, 'TP': TP, 'FP': FP, 'FN': FN, 'TN': TN,
                 'precision': precision, 'recall': recall, 'f1': f1}
        if first_result is None:
            first_result = (bias, stats)
        if f1 > best_f1:
            best_f1, best_f1_bias, best_f1_stats = f1, bias, stats
        if precision > best_precision:
            best_precision, best_precision_bias, best_precision_stats = precision, bias, stats

    if first_result is None:
        # An empty sweep (test_points == 0) leaves nothing to report.
        print('No bias values were evaluated (test_points={0})'.format(test_points))
        return

    # Nothing beat the 0.0 baseline: report the first point rather than handing
    # print_stats2 an empty placeholder it cannot index.
    if best_f1_stats is None:
        best_f1_bias, best_f1_stats = first_result
    if best_precision_stats is None:
        best_precision_bias, best_precision_stats = first_result

    print_stats2('F1', best_f1, 'Bias', best_f1_bias, best_f1_stats)
    print('')
    print_stats2('Precision', best_precision, 'Bias', best_precision_bias, best_precision_stats)

In [8]:
def analyze_distance(predict, X_test, Y_test, bias=0.0, sizes=(100, 200, 300, 400, 500)):
    """Print statistics for growing prefixes of the test set.

    For every entry of `sizes` the predictor is evaluated on the first `size`
    rows of `X_test` / `Y_test` and the result is printed via ``print_stats2``
    with the prefix size and the cost in the headline.

    Parameters
    ----------
    predict : callable
        Called as ``predict(X, bias=vector)``; returns the predicted labels.
    X_test, Y_test : array-like
        Test inputs and one-hot targets; both are sliced with ``[:size]``.
    bias : float
        Turned into the vector ``[-bias, bias]`` that is passed to `predict`.
    sizes : iterable of int
        Prefix lengths to evaluate.  The default reproduces the previously
        hard-coded 100..500 steps; a size larger than the test set simply
        evaluates all of it.

    NOTE(review): the name suggests the rows are ordered so that later rows are
    "further" from the training data -- confirm against how the dataset is built.
    """
    biasV = np.array([-bias, bias])
    for size in sizes:
        y_predict = predict(X_test[:size], bias=biasV)
        y_target = np.argmax(Y_test[:size], axis=1)

        (cost, TP, FP, FN, TN, precision, recall, f1) = compute_stats(y_predict, y_target)
        stats = {'cost': cost, 'TP': TP, 'FP': FP, 'FN': FN, 'TN': TN,
                 'precision': precision, 'recall': recall, 'f1': f1}

        print_stats2('Size', size, 'Cost', stats['cost'], stats)
        print('')

In [9]:
def show_best(predict, X_test, Y_test, bias=0.0):
    """Score `predict` on the whole test set at one bias and print the result.

    The scalar `bias` is expanded to the vector ``[-bias, bias]`` and passed to
    ``predict(X_test, bias=...)``.  The predictions are compared with the
    arg-max of the one-hot `Y_test` by ``compute_stats`` and printed by
    ``print_stats``.
    """
    offsets = np.array([-bias, bias])
    predicted = predict(X_test, bias=offsets)
    expected = np.argmax(Y_test, axis=1)

    cost, tp, fp, fn, tn, precision, recall, f1 = compute_stats(predicted, expected)
    # -1 fills print_stats' leading slot (shown as "Iteration: -1" in the output).
    print_stats(-1, cost, tp, fp, fn, tn, precision, recall, f1)

# Load Data

In [10]:
%%capture
# Load the targets (Y) and the feature matrix (X) for the configured
# PROJECT / IMPORTANCE; the %%capture magic above hides whatever the loader prints.
Y, X = get_dataset(PROJECT, IMPORTANCE)

In [11]:
# Dimensions of the feature matrix: `rows` drives the train/test split sizes,
# `feats` is the input width handed to the network (dimensions=[feats, ...]).
rows = X.shape[0]
feats = X.shape[1]
# One pre-formatted string: prints the same text under Python 2 and Python 3.
print('Rows: {0} Features: {1}'.format(rows, feats))

Rows: 1140 Features: 3696


In [11]:
n_test = 500   # number of rows used for testing
ignore = 0.1   # fraction of all rows excluded from both train and test

max_rows = int(rows * (1.0 - ignore))
n_train = max_rows - n_test

# n_test is a fixed count, so a small dataset would make n_train zero or
# negative and the slices built from it silently wrong.  Fail loudly instead.
assert n_train > 0, (
    'dataset too small for the split: {0} usable rows, {1} reserved for testing'
    .format(max_rows, n_test))

# One pre-formatted string: prints the same text under Python 2 and Python 3.
print('{0} {1} {2}'.format(n_train, n_test, max_rows))

526 500 1026


In [12]:
# Expand the target vector to the two-column one-hot form used for training;
# the helper functions decode it again with np.argmax(..., axis=1).
Y_onehot = convert_binary_to_onehot(Y)

# Sequential (unshuffled) split: the first n_train rows are used for training
# and the following n_test rows for testing.  Rows from n_train + n_test
# onward are not used by this notebook.
X_train = X[:n_train,]
Y_train = Y_onehot[:n_train,]
X_test = X[n_train:n_train+n_test,]
Y_test = Y_onehot[n_train:n_train+n_test,]

In [13]:
# Sanity report of the array shapes before training.  Every print() receives
# one pre-formatted string, so the text is the same under Python 2 and 3.
print('Initial Shapes:')
print('  X {0} {1}'.format(X.shape, type(X)))
print('  Y {0} {1}'.format(Y.shape, type(Y)))
print('  Y(one hot): {0} {1}'.format(Y_onehot.shape, type(Y_onehot)))
print('')
print('Train and Test Data:')
print('  train -- X: {0} Y: {1}'.format(X_train.shape, Y_train.shape))
print('  test  -- X: {0} Y: {1}'.format(X_test.shape, Y_test.shape))

Initial Shapes:
  X (1140, 3696) <type 'numpy.ndarray'>
  Y (1140,) <type 'numpy.ndarray'>
  Y(one hot): (1140, 2) <type 'numpy.ndarray'>

Train and Test Data:
  train -- X: (526, 3696) Y: (526, 2)
  test  -- X: (500, 3696) Y: (500, 2)


# Start of Analysis

## Test Instance

In [14]:
# Smoke test: only 2 iterations, to check that training and the helper
# functions run end to end before the long learning-rate experiments.
# test_nn returns a 3-tuple; `predict` is the callable evaluated by the cells
# below, and the third element is discarded.
# NOTE(review): dimensions=[feats, 100, 2] is presumably input / hidden /
# output layer width -- confirm against Theano_NN.test_nn.
(predict, best_weights, _) = test_nn(modern_nn_model_h1, X_train, X_test, Y_train, Y_test, 
        iterations=2, dimensions=[feats, 100, 2], #  lr=0.003,
        update='rms_prop', uses_dropout=True,
        max_distance=50, lr=0.003, min_lr=0.00003)

Iteration: 0  Cost: 0.218
         109      391
           0        0
    Precision: 0.22 Recall: 1.00, F1: 0.36
    Best Prec: 0.22 Recall: 1.00, F1: 0.36  i: 0

Iteration: 1  Cost: 0.218
         109      391
           0        0
    Precision: 0.22 Recall: 1.00, F1: 0.36
    Best Prec: 0.22 Recall: 1.00, F1: 0.36  i: 0

Validating restored results
Iteration: 1  Cost: 0.218
         109      391
           0        0
    Precision: 0.22 Recall: 1.00, F1: 0.36



In [15]:
# Statistics of the smoke-test model on the full test set, without bias shift.
show_best(predict, X_test, Y_test, bias=0.0)

Iteration: -1  Cost: 0.218
         109      391
           0        0
    Precision: 0.22 Recall: 1.00, F1: 0.36


In [16]:
# Sweep the bias over [-0.2, 0.0] (default 10 points) and report the settings
# with the best F1 and the best precision.
analyze_bias(predict, X_test, Y_test, min_bias=-0.2, max_bias=0.00)

Best Results for F1 : 0.357963875205  Bias : -0.0222222222222
         109      391
           0        0
    Precision: 0.22 Recall: 1.00, F1: 0.36

Best Results for Precision : 0.599999400001  Bias : -0.0444444444444
           6        4
         103      387
    Precision: 0.60 Recall: 0.06, F1: 0.10


In [17]:
# Statistics on growing prefixes of the test set (first 100, 200, ... 500 rows).
analyze_distance(predict, X_test, Y_test)

Best Results for Size : 100  Cost : 0.21
          21       79
           0        0
    Precision: 0.21 Recall: 1.00, F1: 0.35

Best Results for Size : 200  Cost : 0.25
          50      150
           0        0
    Precision: 0.25 Recall: 1.00, F1: 0.40

Best Results for Size : 300  Cost : 0.233333333333
          70      230
           0        0
    Precision: 0.23 Recall: 1.00, F1: 0.38

Best Results for Size : 400  Cost : 0.225
          90      310
           0        0
    Precision: 0.22 Recall: 1.00, F1: 0.37

Best Results for Size : 500  Cost : 0.218
         109      391
           0        0
    Precision: 0.22 Recall: 1.00, F1: 0.36



## Determine impact of starting LR

### Starting LR=0.003

In [62]:
# Run 1: starting learning rate 0.003 with min_lr=0.0003, up to 20001 iterations.
# This is the only learning-rate run that passes batch=1000, so its results
# are not strictly comparable with the runs below.
# NOTE(review): max_distance presumably bounds how long training continues
# without improvement -- confirm against Theano_NN.test_nn.
(predict1, best_weights1, _) = test_nn(modern_nn_model_h1, X_train, X_test, Y_train, Y_test, 
        iterations=20001, dimensions=[feats, 100, 2],
        update='rms_prop', uses_dropout=True, batch=1000,
        max_distance=2000, lr=0.003, min_lr=0.0003)

Iteration: 0  Cost: 0.724
          13       42
          96      349
    Precision: 0.24 Recall: 0.12, F1: 0.16
    Best Prec: 0.24 Recall: 0.12, F1: 0.16  i: 0

Iteration: 1  Cost: 0.724
          13       42
          96      349
    Precision: 0.24 Recall: 0.12, F1: 0.16
    Best Prec: 0.24 Recall: 0.12, F1: 0.16  i: 0

Iteration: 2  Cost: 0.724
          13       42
          96      349
    Precision: 0.24 Recall: 0.12, F1: 0.16
    Best Prec: 0.24 Recall: 0.12, F1: 0.16  i: 0

Iteration: 5  Cost: 0.724
          13       42
          96      349
    Precision: 0.24 Recall: 0.12, F1: 0.16
    Best Prec: 0.24 Recall: 0.12, F1: 0.16  i: 0

Iteration: 10  Cost: 0.724
          13       42
          96      349
    Precision: 0.24 Recall: 0.12, F1: 0.16
    Best Prec: 0.24 Recall: 0.12, F1: 0.16  i: 0

Iteration: 20  Cost: 0.724
          13       42
          96      349
    Precision: 0.24 Recall: 0.12, F1: 0.16
    Best Prec: 0.24 Recall: 0.12, F1: 0.16  i: 0

Iteration: 30  Cost:

In [63]:
# Statistics of the LR=0.003 model on the full test set, without bias shift.
show_best(predict1, X_test, Y_test, bias=0.0)

Iteration: -1  Cost: 0.724
          13       42
          96      349
    Precision: 0.24 Recall: 0.12, F1: 0.16


In [64]:
# Bias sweep over [-0.2, 0.0] for the LR=0.003 model.
analyze_bias(predict1, X_test, Y_test, min_bias=-0.2, max_bias=0.0)

Best Results for F1 : 0.158536585366  Bias : 0.0
          13       42
          96      349
    Precision: 0.24 Recall: 0.12, F1: 0.16

Best Results for Precision : 0.236363593388  Bias : 0.0
          13       42
          96      349
    Precision: 0.24 Recall: 0.12, F1: 0.16


In [65]:
# Prefix-by-prefix statistics (first 100 ... 500 test rows) for the LR=0.003 model.
analyze_distance(predict1, X_test, Y_test)

Best Results for Size : 100  Cost : 0.75
           4        8
          17       71
    Precision: 0.33 Recall: 0.19, F1: 0.24

Best Results for Size : 200  Cost : 0.695
           6       17
          44      133
    Precision: 0.26 Recall: 0.12, F1: 0.16

Best Results for Size : 300  Cost : 0.71
           7       24
          63      206
    Precision: 0.23 Recall: 0.10, F1: 0.14

Best Results for Size : 400  Cost : 0.715
          10       34
          80      276
    Precision: 0.23 Recall: 0.11, F1: 0.15

Best Results for Size : 500  Cost : 0.724
          13       42
          96      349
    Precision: 0.24 Recall: 0.12, F1: 0.16



### Starting LR=0.001

In [22]:
# Run 2: starting learning rate 0.001 with min_lr=0.00001, up to 2001
# iterations and max_distance=100 (no explicit batch argument).
(predict2, best_weights2, _) = test_nn(modern_nn_model_h1, X_train, X_test, Y_train, Y_test, 
        iterations=2001, dimensions=[feats, 100, 2],
        update='rms_prop', uses_dropout=True,
        max_distance=100, lr=0.001, min_lr=0.00001)

Iteration: 0  Cost: 0.218
         109      391
           0        0
    Precision: 0.22 Recall: 1.00, F1: 0.36
    Best Prec: 0.22 Recall: 1.00, F1: 0.36  i: 0

Iteration: 1  Cost: 0.218
         109      391
           0        0
    Precision: 0.22 Recall: 1.00, F1: 0.36
    Best Prec: 0.22 Recall: 1.00, F1: 0.36  i: 0

Iteration: 2  Cost: 0.218
         109      391
           0        0
    Precision: 0.22 Recall: 1.00, F1: 0.36
    Best Prec: 0.22 Recall: 1.00, F1: 0.36  i: 0

Iteration: 5  Cost: 0.218
         109      391
           0        0
    Precision: 0.22 Recall: 1.00, F1: 0.36
    Best Prec: 0.22 Recall: 1.00, F1: 0.36  i: 0

Iteration: 10  Cost: 0.218
         109      391
           0        0
    Precision: 0.22 Recall: 1.00, F1: 0.36
    Best Prec: 0.22 Recall: 1.00, F1: 0.36  i: 0

Iteration: 20  Cost: 0.218
         109      391
           0        0
    Precision: 0.22 Recall: 1.00, F1: 0.36
    Best Prec: 0.22 Recall: 1.00, F1: 0.36  i: 0

Iteration: 30  Cost:

In [23]:
# Statistics of the LR=0.001 model on the full test set, without bias shift.
show_best(predict2, X_test, Y_test, bias=0.0)

Iteration: -1  Cost: 0.61
          90      176
          19      215
    Precision: 0.34 Recall: 0.83, F1: 0.48


In [24]:
# Bias sweep over [-0.2, 0.0] for the LR=0.001 model.
analyze_bias(predict2, X_test, Y_test, min_bias=-0.2, max_bias=0.0)

Best Results for F1 : 0.48  Bias : 0.0
          90      176
          19      215
    Precision: 0.34 Recall: 0.83, F1: 0.48

Best Results for Precision : 0.410256375192  Bias : -0.2
          48       69
          61      322
    Precision: 0.41 Recall: 0.44, F1: 0.42


In [25]:
# Prefix-by-prefix statistics (first 100 ... 500 test rows) for the LR=0.001 model.
analyze_distance(predict2, X_test, Y_test)

Best Results for Size : 100  Cost : 0.58
          17       38
           4       41
    Precision: 0.31 Recall: 0.81, F1: 0.45

Best Results for Size : 200  Cost : 0.645
          43       64
           7       86
    Precision: 0.40 Recall: 0.86, F1: 0.55

Best Results for Size : 300  Cost : 0.616666666667
          56      101
          14      129
    Precision: 0.36 Recall: 0.80, F1: 0.49

Best Results for Size : 400  Cost : 0.61
          73      139
          17      171
    Precision: 0.34 Recall: 0.81, F1: 0.48

Best Results for Size : 500  Cost : 0.61
          90      176
          19      215
    Precision: 0.34 Recall: 0.83, F1: 0.48



### Starting LR=0.0001

In [55]:
# Run 4: starting learning rate 0.0001, up to 10001 iterations, max_distance=5000.
# NOTE(review): lr and min_lr are both 0.0001 here, so presumably the learning
# rate cannot decay in this run -- confirm that this is intended.
(predict4, best_weights4, _) = test_nn(modern_nn_model_h1, X_train, X_test, Y_train, Y_test, 
        iterations=10001, dimensions=[feats, 100, 2],
        update='rms_prop', uses_dropout=True,
        max_distance=5000, lr=0.0001, min_lr=0.0001)

Iteration: 0  Cost: 0.466
          66      224
          43      167
    Precision: 0.23 Recall: 0.61, F1: 0.33
    Best Prec: 0.23 Recall: 0.61, F1: 0.33  i: 0

Iteration: 1  Cost: 0.274
          91      345
          18       46
    Precision: 0.21 Recall: 0.83, F1: 0.33
    Best Prec: 0.21 Recall: 0.83, F1: 0.33  i: 1

Iteration: 2  Cost: 0.216
         102      385
           7        6
    Precision: 0.21 Recall: 0.94, F1: 0.34
    Best Prec: 0.21 Recall: 0.94, F1: 0.34  i: 2

Iteration: 3  Cost: 0.216
         107      390
           2        1
    Precision: 0.22 Recall: 0.98, F1: 0.35
    Best Prec: 0.22 Recall: 0.98, F1: 0.35  i: 3

Iteration: 5  Cost: 0.216
         108      391
           1        0
    Precision: 0.22 Recall: 0.99, F1: 0.36
    Best Prec: 0.22 Recall: 0.99, F1: 0.36  i: 5

Iteration: 6  Cost: 0.218
         109      391
           0        0
    Precision: 0.22 Recall: 1.00, F1: 0.36
    Best Prec: 0.22 Recall: 1.00, F1: 0.36  i: 6

Iteration: 10  Cost: 0

In [56]:
# Statistics of the LR=0.0001 model on the full test set, without bias shift.
show_best(predict4, X_test, Y_test, bias=0.0)

Iteration: -1  Cost: 0.566
          91      199
          18      192
    Precision: 0.31 Recall: 0.83, F1: 0.46


In [57]:
# Bias sweep over [-0.2, 0.0] for the LR=0.0001 model.
analyze_bias(predict4, X_test, Y_test, min_bias=-0.2, max_bias=0.00)

Best Results for F1 : 0.471698113208  Bias : -0.111111111111
          75      134
          34      257
    Precision: 0.36 Recall: 0.69, F1: 0.47

Best Results for Precision : 0.388059672533  Bias : -0.2
          52       82
          57      309
    Precision: 0.39 Recall: 0.48, F1: 0.43


In [58]:
# Prefix-by-prefix statistics (first 100 ... 500 test rows) for the LR=0.0001 model.
analyze_distance(predict4, X_test, Y_test)

Best Results for Size : 100  Cost : 0.56
          17       40
           4       39
    Precision: 0.30 Recall: 0.81, F1: 0.44

Best Results for Size : 200  Cost : 0.62
          43       69
           7       81
    Precision: 0.38 Recall: 0.86, F1: 0.53

Best Results for Size : 300  Cost : 0.576666666667
          56      113
          14      117
    Precision: 0.33 Recall: 0.80, F1: 0.47

Best Results for Size : 400  Cost : 0.565
          73      157
          17      153
    Precision: 0.32 Recall: 0.81, F1: 0.46

Best Results for Size : 500  Cost : 0.566
          91      199
          18      192
    Precision: 0.31 Recall: 0.83, F1: 0.46



### Starting LR=0.00001

In [30]:
# Run 6: starting learning rate 0.00001 with min_lr=0.000001, up to 20001
# iterations and max_distance=1000.
# NOTE: the recorded execution of this cell ended in a KeyboardInterrupt, so
# predict6 / best_weights6 were never assigned by it.  Cells that use predict6
# only work after this cell has run to completion.
(predict6, best_weights6, _) = test_nn(modern_nn_model_h1, X_train, X_test, Y_train, Y_test, 
        iterations=20001, dimensions=[feats, 100, 2],
        update='rms_prop', uses_dropout=True,
        max_distance=1000, lr=0.00001, min_lr=0.000001)

Iteration: 0  Cost: 0.452
          67      232
          42      159
    Precision: 0.22 Recall: 0.61, F1: 0.33
    Best Prec: 0.22 Recall: 0.61, F1: 0.33  i: 0

Iteration: 1  Cost: 0.438
          69      241
          40      150
    Precision: 0.22 Recall: 0.63, F1: 0.33
    Best Prec: 0.22 Recall: 0.63, F1: 0.33  i: 1

Iteration: 2  Cost: 0.428
          73      250
          36      141
    Precision: 0.23 Recall: 0.67, F1: 0.34
    Best Prec: 0.23 Recall: 0.67, F1: 0.34  i: 2

Iteration: 3  Cost: 0.418
          77      259
          32      132
    Precision: 0.23 Recall: 0.71, F1: 0.35
    Best Prec: 0.23 Recall: 0.71, F1: 0.35  i: 3

Iteration: 4  Cost: 0.414
          79      263
          30      128
    Precision: 0.23 Recall: 0.72, F1: 0.35
    Best Prec: 0.23 Recall: 0.72, F1: 0.35  i: 4

Iteration: 5  Cost: 0.406
          82      270
          27      121
    Precision: 0.23 Recall: 0.75, F1: 0.36
    Best Prec: 0.23 Recall: 0.75, F1: 0.36  i: 5

Iteration: 10  Cost: 0

KeyboardInterrupt: 

In [None]:
# Not executed in the recorded session.  Needs predict6, i.e. the LR=0.00001
# training cell must have finished without being interrupted.
show_best(predict6, X_test, Y_test, bias=0.0)

### Starting LR=0.000001

In [None]:
# Run 8: starting learning rate 0.000001 with min_lr=0.0000001, up to 20001
# iterations and max_distance=1000.  Not executed in the recorded session.
(predict8, best_weights8, _) = test_nn(modern_nn_model_h1, X_train, X_test, Y_train, Y_test, 
        iterations=20001, dimensions=[feats, 100, 2],
        update='rms_prop', uses_dropout=True,
        max_distance=1000, lr=0.000001, min_lr=0.0000001)

In [None]:
# Not executed in the recorded session.  Needs predict8 from the LR=0.000001
# training cell.
show_best(predict8, X_test, Y_test, bias=0.0)

### Now try with single output

Results are always 1, regardless of whether binary or categorical cross-entropy is used.

In [None]:
# Collect the trained predictors under stable names so they can be pickled.
# Only predictors that exist in the session are included: a training cell that
# was skipped or interrupted (the recorded LR=0.00001 run, which would bind
# predict6, ended in a KeyboardInterrupt) leaves its name undefined, and
# referencing it directly would abort this cell with a NameError.
_solver_names = [
    'predict1',
    'predict2',
    # 'predict3',
    'predict4',
    # 'predict5',
    'predict6',
    # 'predict7',
    # 'predict8',
]
solvers = dict((name, globals()[name])
               for name in _solver_names if name in globals())

In [None]:
# Set to True to write the collected predictors to disk.  Off by default,
# matching the original `if False:` guard, so running the notebook never
# touches an existing pickle unless asked to.
SAVE_SOLVERS = False

if SAVE_SOLVERS:
    with open('TheanoSolvers_NN_LR_Sizes.pkl', 'wb') as output:
        # Default pickle protocol; pass -1 as a third argument to use the
        # highest protocol available instead.
        pickle.dump(solvers, output)
    # Report success only after the with-block has closed the file.
    print('Done')