# GitAnalysis using Theano

In [1]:
%matplotlib inline

In [1]:
from pprint import pprint
from collections import defaultdict

import numpy as np
import math
import matplotlib.pyplot as plt
import random
import warnings
import copy

from sklearn.grid_search import GridSearchCV
from sklearn import metrics

import sys
sys.path.append('./dev')

from ml_plot import plot_validation_curve
from ml_plot import my_plot_learning_curve
from ml_plot import plot_prediction_curve
from ml_plot import get_dataset, eval_predictions
from ml_plot import PredictCV, PredictCV_TrainTest
from ml_plot import PredictCV_TrainTestValidate

In [15]:
from Theano_NN import test_nn, modern_nn_model, modern_nn_model_h1
from Theano_NN import convert_binary_to_onehot, compute_stats, print_stats

In [3]:
# Dataset selection.  PROJECT and IMPORTANCE are the two arguments handed to
# get_dataset() when the data is loaded; the alternatives are kept commented
# out so they can be switched by hand.
PROJECT = 'nova'
# PROJECT = 'swift'
# PROJECT = 'cinder'
# PROJECT = 'heat'
# PROJECT = 'glance'

# IMPORTANCE = 'crit'
# IMPORTANCE = 'high+'
IMPORTANCE = 'med+'
# IMPORTANCE = 'low+'

# NOTE(review): SIZE, SCORING, JOBS and VERBOSE are not referenced by any cell
# visible in this part of the notebook -- confirm whether they are still used.
# SIZE = 100
# SIZE = 250
# SIZE = 0.1
SIZE = 0.5

SCORING = 'f1'         # 2 * (precision * recall) / (precision + recall)
# SCORING = 'accuracy'   # (TP + TN) / all values
# SCORING = 'precision'  # TP / (TP + FP)
# SCORING = 'recall'     # TP / (TP + FN)
# SCORING = 'average_precision'
# SCORING = 'roc_auc'

JOBS = 4
VERBOSE = True

# Suppress warnings whose message begins with the given text (the second
# argument of filterwarnings is a regex matched against the start of the
# warning message).
warnings.filterwarnings('ignore', 'F-score is ill-defined')
warnings.filterwarnings('ignore', 'overflow encountered in exp')

# Function definitions

In [22]:
def analyze_bias(predict, X_test, Y_test, min_bias=-0.2, max_bias=0.05, steps=10):
    """Print test-set statistics for a sweep of output-bias values.

    For each of ``steps`` evenly spaced values ``b`` between ``min_bias`` and
    ``max_bias`` (both inclusive), the vector ``[-b, b]`` is passed to
    ``predict`` as its ``bias`` keyword and the predictions are scored against
    the targets with ``compute_stats`` and reported with ``print_stats``.

    Parameters
    ----------
    predict : callable
        Invoked as ``predict(X_test, bias=<2-element ndarray>)``; the return
        value is forwarded unchanged to ``compute_stats`` as the predictions.
    X_test : array-like
        Test inputs, passed through to ``predict`` untouched.
    Y_test : 2-D array-like
        Test targets; the class index is recovered with ``argmax`` along
        axis 1 (i.e. one row per sample, one column per class).
    min_bias, max_bias : float
        End points of the sweep.
    steps : int, optional
        Number of bias values to evaluate.  Defaults to 10, the value that
        was previously hard-coded.

    Returns
    -------
    None.  Results are printed.
    """
    # The targets do not depend on the bias, so decode them once up front
    # instead of on every iteration.
    y_target = np.argmax(Y_test, axis=1)

    for bias in np.linspace(min_bias, max_bias, steps):
        bias_vector = np.array([-bias, bias])
        y_predict = predict(X_test, bias=bias_vector)

        (cost, TP, FP, FN, TN, precision, recall, f1) = compute_stats(y_predict, y_target)

        # Single-argument print(...) calls produce the same text under
        # Python 2 and Python 3.
        print('Bias: %s' % (bias,))
        # The leading 0 fills print_stats' first positional slot, which shows
        # up as "Iteration: 0" in the recorded output.
        print_stats(0, cost, TP, FP, FN, TN, precision, recall, f1)
        print('')

In [23]:
def analyze_distance(predict, X_test, Y_test, bias=0.0,
                     sizes=(100, 200, 300, 400, 500)):
    """Print statistics for growing leading slices of the test set.

    For every ``size`` in ``sizes`` the first ``size`` rows of ``X_test`` are
    passed to ``predict`` (with the fixed bias vector ``[-bias, bias]``) and
    scored against the first ``size`` rows of ``Y_test`` using
    ``compute_stats`` / ``print_stats``.

    Parameters
    ----------
    predict : callable
        Invoked as ``predict(X_slice, bias=<2-element ndarray>)``; the return
        value is forwarded unchanged to ``compute_stats`` as the predictions.
    X_test : array-like
        Test inputs; only leading slices are used.
    Y_test : 2-D array-like
        Test targets; the class index is recovered with ``argmax`` along
        axis 1.
    bias : float, optional
        Bias applied as ``[-bias, bias]`` for every slice.
    sizes : iterable of int, optional
        Prefix lengths to evaluate.  Defaults to the values that were
        previously hard-coded.  A size larger than the number of available
        rows is not an error: ordinary slice semantics apply and the whole
        test set is used for that entry.

    Returns
    -------
    None.  Results are printed.
    """
    bias_vector = np.array([-bias, bias])

    for size in sizes:
        y_predict = predict(X_test[:size], bias=bias_vector)
        y_target = np.argmax(Y_test[:size], axis=1)

        (cost, TP, FP, FN, TN, precision, recall, f1) = compute_stats(y_predict, y_target)

        # Single-argument print(...) calls produce the same text under
        # Python 2 and Python 3.
        print('Size: %s' % (size,))
        # The leading 0 fills print_stats' first positional slot, which shows
        # up as "Iteration: 0" in the recorded output.
        print_stats(0, cost, TP, FP, FN, TN, precision, recall, f1)
        print('')

# Load Data

In [4]:
%%capture
Y, X = get_dataset(PROJECT, IMPORTANCE)

In [5]:
# X is indexed as (rows, features); keep both counts for later cells.
rows, feats = X.shape[0], X.shape[1]
print('Rows: %s Features: %s' % (rows, feats))

Rows: 8806 Features: 16233


In [6]:
# Split sizing.  Only the first (1 - ignore) fraction of the rows is eligible;
# of those, the final n_test rows are reserved for testing and everything
# before them is used for training.
ignore = 0.1
n_test = 500

max_rows = int(rows * (1.0 - ignore))
n_train = max_rows - n_test

print('%s %s %s' % (n_train, n_test, max_rows))

7425 500 7925


In [9]:
# Encode the labels as one-hot rows, then take contiguous slices:
# rows [0, n_train) for training and rows [n_train, n_train + n_test) for testing.
Y_onehot = convert_binary_to_onehot(Y)

X_train, Y_train = X[:n_train], Y_onehot[:n_train]
X_test, Y_test = (X[n_train:n_train + n_test],
                  Y_onehot[n_train:n_train + n_test])

In [10]:
# Sanity check: report the shape/type of the raw arrays and of each split.
print('Initial Shapes:')
print('  X %s %s' % (X.shape, type(X)))
print('  Y %s %s' % (Y.shape, type(Y)))
print('  Y(one hot): %s %s' % (Y_onehot.shape, type(Y_onehot)))
print('')
print('Train and Test Data:')
print('  train -- X: %s Y: %s' % (X_train.shape, Y_train.shape))
print('  test  -- X: %s Y: %s' % (X_test.shape, Y_test.shape))

Initial Shapes:
  X (8806, 16233) <type 'numpy.ndarray'>
  Y (8806,) <type 'numpy.ndarray'>
  Y(one hot): (8806, 2) <type 'numpy.ndarray'>

Train and Test Data:
  train -- X: (7425, 16233) Y: (7425, 2)
  test  -- X: (500, 16233) Y: (500, 2)


# Start of Analysis

## Test Instance

In [12]:
# Quick trial run (only 2 iterations) to obtain a `predict` function for the
# bias/distance analyses below.
# dimensions=[feats, 100, 2] -- presumably input / hidden / output widths;
# confirm against Theano_NN.
predict, best_weights, _ = test_nn(
    modern_nn_model_h1,
    X_train, X_test, Y_train, Y_test,
    iterations=2,
    dimensions=[feats, 100, 2],
    update='rms_prop',
    uses_dropout=True,
    max_distance=50,
    lr=0.003,
    min_lr=0.000001,
)

Iteration: 0  Cost: 0.11
          55      445
           0        0
    Precision: 0.11 Recall: 1.00, F1: 0.20
    Best Prec: 0.11 Recall: 1.00, F1: 0.20  i: 0

Iteration: 1  Cost: 0.154
          55      423
           0       22
    Precision: 0.12 Recall: 1.00, F1: 0.21
    Best Prec: 0.12 Recall: 1.00, F1: 0.21  i: 1



In [17]:
analyze_bias(predict, X_test, Y_test, min_bias=-0.2, max_bias=0.05)

Bias: -0.2
Iteration: 0  Cost: 0.79
          29       79
          26      366
    Precision: 0.27 Recall: 0.53, F1: 0.36
Bias: -0.172222222222
Iteration: 0  Cost: 0.746
          36      108
          19      337
    Precision: 0.25 Recall: 0.65, F1: 0.36
Bias: -0.144444444444
Iteration: 0  Cost: 0.68
          37      142
          18      303
    Precision: 0.21 Recall: 0.67, F1: 0.32
Bias: -0.116666666667
Iteration: 0  Cost: 0.582
          41      195
          14      250
    Precision: 0.17 Recall: 0.75, F1: 0.28
Bias: -0.0888888888889
Iteration: 0  Cost: 0.504
          49      242
           6      203
    Precision: 0.17 Recall: 0.89, F1: 0.28
Bias: -0.0611111111111
Iteration: 0  Cost: 0.402
          53      297
           2      148
    Precision: 0.15 Recall: 0.96, F1: 0.26
Bias: -0.0333333333333
Iteration: 0  Cost: 0.3
          54      349
           1       96
    Precision: 0.13 Recall: 0.98, F1: 0.24
Bias: -0.00555555555556
Iteration: 0  Cost: 0.184
          55     

In [21]:
analyze_distance(predict, X_test, Y_test, bias=0.0)

Size: 100
Iteration: 0  Cost: 0.23
          16       77
           0        7
    Precision: 0.17 Recall: 1.00, F1: 0.29

Size: 200
Iteration: 0  Cost: 0.215
          33      157
           0       10
    Precision: 0.17 Recall: 1.00, F1: 0.30

Size: 300
Iteration: 0  Cost: 0.183333333333
          41      245
           0       14
    Precision: 0.14 Recall: 1.00, F1: 0.25

Size: 400
Iteration: 0  Cost: 0.17
          49      332
           0       19
    Precision: 0.13 Recall: 1.00, F1: 0.23

Size: 500
Iteration: 0  Cost: 0.154
          55      423
           0       22
    Precision: 0.12 Recall: 1.00, F1: 0.21



## Determine impact of starting LR

In [None]:
# Learning-rate sweep, run 1: starting lr = 0.003.
# Note: best_weights is reassigned by every run in this sweep.
predict1, best_weights, _ = test_nn(
    modern_nn_model_h1,
    X_train, X_test, Y_train, Y_test,
    iterations=2001,
    dimensions=[feats, 100, 2],
    update='rms_prop',
    uses_dropout=True,
    max_distance=50,
    lr=0.003,
    min_lr=0.000001,
)

Iteration: 0  Cost: 0.266
          55      367
           0       78
    Precision: 0.13 Recall: 1.00, F1: 0.23
    Best Prec: 0.13 Recall: 1.00, F1: 0.23  i: 0

Iteration: 1  Cost: 0.478
          52      258
           3      187
    Precision: 0.17 Recall: 0.95, F1: 0.28
    Best Prec: 0.17 Recall: 0.95, F1: 0.28  i: 1

Iteration: 2  Cost: 0.578
          48      204
           7      241
    Precision: 0.19 Recall: 0.87, F1: 0.31
    Best Prec: 0.19 Recall: 0.87, F1: 0.31  i: 2

Iteration: 3  Cost: 0.708
          37      128
          18      317
    Precision: 0.22 Recall: 0.67, F1: 0.34
    Best Prec: 0.22 Recall: 0.67, F1: 0.34  i: 3

Iteration: 5  Cost: 0.696
          39      136
          16      309
    Precision: 0.22 Recall: 0.71, F1: 0.34
    Best Prec: 0.22 Recall: 0.71, F1: 0.34  i: 5

Iteration: 8  Cost: 0.766
          32       94
          23      351
    Precision: 0.25 Recall: 0.58, F1: 0.35
    Best Prec: 0.25 Recall: 0.58, F1: 0.35  i: 8

Iteration: 10  Cost: 0

In [None]:
# Learning-rate sweep: starting lr = 0.001.
# Note: best_weights is reassigned by every run in this sweep.
predict2, best_weights, _ = test_nn(
    modern_nn_model_h1,
    X_train, X_test, Y_train, Y_test,
    iterations=2001,
    dimensions=[feats, 100, 2],
    update='rms_prop',
    uses_dropout=True,
    max_distance=50,
    lr=0.001,
    min_lr=0.000001,
)

In [None]:
# Learning-rate sweep: starting lr = 0.0001.
# Note: best_weights is reassigned by every run in this sweep.
predict4, best_weights, _ = test_nn(
    modern_nn_model_h1,
    X_train, X_test, Y_train, Y_test,
    iterations=2001,
    dimensions=[feats, 100, 2],
    update='rms_prop',
    uses_dropout=True,
    max_distance=50,
    lr=0.0001,
    min_lr=0.000001,
)

In [None]:
# Learning-rate sweep: starting lr = 0.00001, with a longer budget
# (iterations=20001, max_distance=100) than the higher-lr runs.
# Note: best_weights is reassigned by every run in this sweep.
predict6, best_weights, _ = test_nn(
    modern_nn_model_h1,
    X_train, X_test, Y_train, Y_test,
    iterations=20001,
    dimensions=[feats, 100, 2],
    update='rms_prop',
    uses_dropout=True,
    max_distance=100,
    lr=0.00001,
    min_lr=0.000001,
)

In [None]:
# Learning-rate sweep: starting lr = 0.000001 (min_lr lowered to 0.0000001 so
# it stays below the starting rate), iterations=20001, max_distance=100.
# Note: best_weights is reassigned by every run in this sweep.
predict8, best_weights, _ = test_nn(
    modern_nn_model_h1,
    X_train, X_test, Y_train, Y_test,
    iterations=20001,
    dimensions=[feats, 100, 2],
    update='rms_prop',
    uses_dropout=True,
    max_distance=100,
    lr=0.000001,
    min_lr=0.0000001,
)

### Now try with single output

Results are always 1, regardless of whether binary or categorical cross-entropy is used.