In [1]:
# Base directory that the file paths in this notebook are built from
# (the model-data/, results/ and scripts/ sub-directories are addressed relative to it).
REPO_ROOT = "/usr/src/app"

import math
import pickle
import time

import numpy as np
import pandas as pd

from sklearn.svm import *
from sklearn.naive_bayes import BernoulliNB
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

import theano
import theano.tensor as T
import lasagne

In [2]:
def load_model(model_type, train_size):
    """Load the pickled dataset for one feature type and training-set size.

    Reads ``<REPO_ROOT>/model-data/dataset_<model_type>_<train_size>.pickle``.

    Args:
        model_type: Feature-set name embedded in the file name (e.g. "RegEx").
        train_size: Training-set size embedded in the file name (formatted
            with ``%d``).

    Returns:
        The unpickled object. ``test_model`` indexes it with the keys
        "X_train", "Y_train", "X_test", "Y_test" and "shas_test".

    Raises:
        IOError/OSError: if the file does not exist or cannot be read.
    """
    path = "%s/model-data/dataset_%s_%d.pickle" % (REPO_ROOT, model_type, train_size)
    # Pickle streams are binary data: read them in "rb" mode so loading does
    # not depend on text-mode newline/encoding handling.
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(path, "rb") as f:
        return pickle.load(f)

def _classification_metrics(test_y, test_pred):
    """Return (accuracy, precision, recall, f1) for binary labels, 1 = positive."""
    test_y = np.asarray(test_y)
    test_pred = np.asarray(test_pred)

    true_positives = float(np.sum((test_y == 1) & (test_pred == 1)))
    accuracy = float(np.sum(test_y == test_pred)) / len(test_pred)
    # max(1, ...) keeps the ratios defined (as 0.0) when there are no
    # predicted / actual positives at all.
    precision = true_positives / float(max(1, np.sum(test_pred == 1)))
    recall = true_positives / float(max(1, np.sum(test_y == 1)))

    # F1 is the harmonic mean 2PR / (P + R). The denominator must only be
    # guarded against zero: clamping it to 1 would under-report F1 whenever
    # P + R < 1.
    pr_sum = precision + recall
    f1 = 2 * precision * recall / pr_sum if pr_sum > 0 else 0.0
    return accuracy, precision, recall, f1


def test_model(dataset, model_type, train_size, model, model_name):
    """Fit ``model`` on the training split, score it on the test split and record the result.

    Side effects:
        * prints a one-line summary (accuracy / precision / recall and F1);
        * appends ``[model_type, model_name, train_size, accuracy, precision,
          recall, f1]`` to the module-level ``output_table`` list;
        * writes ``results/model_errors_<model_type>_<model_name>_<train_size>.txt``
          containing, for up to 50 misclassified test samples, a header with
          the sample's sha and true label followed by the contents of
          ``scripts/<sha>.js``.

    Args:
        dataset: Mapping with the keys "X_train", "Y_train", "X_test",
            "Y_test" and "shas_test". Label value 1 is treated as the
            positive class.
        model_type: Feature-set name, used for reporting and in the file name.
        train_size: Training-set size, used for reporting and in the file name.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Short classifier name, used for reporting and in the file name.
    """
    max_saved_errors = 50

    model.fit(dataset["X_train"], dataset["Y_train"])
    test_pred = np.asarray(model.predict(dataset["X_test"]))
    test_y = np.asarray(dataset["Y_test"])
    shas = dataset["shas_test"]

    accuracy, precision, recall, f1 = _classification_metrics(test_y, test_pred)

    print("%10s %15s. Train set size %5d. %0.1f%% / %0.1f%% / %0.1f%% (%0.3f)" % (
        model_type,
        model_name,
        train_size,
        accuracy * 100,
        precision * 100,
        recall * 100,
        f1))

    output_table.append([
        model_type,
        model_name,
        train_size,
        accuracy,
        precision,
        recall,
        f1,
    ])

    # Save (at most) the first `max_saved_errors` misclassified samples so
    # they can be inspected by hand.
    is_error = test_y != test_pred
    error_shas = np.array(shas)[is_error][0:max_saved_errors]
    error_correct = test_y[is_error][0:max_saved_errors]

    with open("%s/results/model_errors_%s_%s_%d.txt" % (REPO_ROOT, model_type, model_name, train_size), "w") as fout:
        for sha, correct in zip(error_shas, error_correct):
            # "FLAG" reports the true label of the misclassified sample.
            fout.write("#### %s FLAG: %s ####\n\n" % (sha, "Yes" if correct > 0 else "No"))
            with open("%s/scripts/%s.js" % (REPO_ROOT, sha), "r") as fin:
                fout.write(fin.read())
            fout.write("\n\n")


In [3]:
def build_mlp(input_var, input_size):
    """Assemble the MLP: input -> dropout -> 40 tanh -> dropout -> 15 tanh -> dropout -> 2 softmax.

    Args:
        input_var: Symbolic variable bound to the network's input layer.
        input_size: Number of input features (second dimension of the input).

    Returns:
        The output layer (a 2-unit softmax ``DenseLayer``); the rest of the
        network is reachable through it.
    """
    layers = lasagne.layers

    # Input with a light (20%) dropout applied directly to the features.
    net = layers.InputLayer(shape=(None, input_size), input_var=input_var)
    net = layers.DropoutLayer(net, p=0.2)

    # Two tanh hidden layers, each followed by 50% dropout.
    for width in (40, 15):
        net = layers.DenseLayer(
            net,
            num_units=width,
            nonlinearity=lasagne.nonlinearities.tanh,
            W=lasagne.init.GlorotUniform())
        net = layers.DropoutLayer(net, p=0.5)

    # Two-way softmax output.
    return layers.DenseLayer(
        net,
        num_units=2,
        nonlinearity=lasagne.nonlinearities.softmax)

def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    """Yield ``(dense_inputs, targets)`` pairs of exactly ``batchsize`` rows each.

    Only full batches are produced: trailing rows that do not fill a whole
    batch are skipped, and nothing is yielded when there are fewer than
    ``batchsize`` rows.

    Args:
        inputs: Row-indexable matrix whose row selections provide
            ``.toarray()`` (e.g. a scipy sparse matrix).
        targets: Array of labels, one per row of ``inputs``; indexed with an
            integer index array.
        batchsize: Number of rows per batch.
        shuffle: When true, rows are visited in a random order drawn from
            ``np.random``.
    """
    n_rows = np.shape(inputs)[0]
    assert n_rows == len(targets)

    order = np.arange(n_rows)
    if shuffle:
        np.random.shuffle(order)

    for lo in range(0, n_rows - batchsize + 1, batchsize):
        rows = order[lo:lo + batchsize]
        dense_batch = inputs[rows].toarray()
        yield dense_batch, targets[rows]
    
def test_mlp(dataset, model_type, train_size):
    """Train the MLP from ``build_mlp`` with early stopping and record its accuracy.

    The network is trained with Adam on mini-batches of the training split.
    After every epoch the accuracy on the *test* split is measured; training
    stops once that accuracy has failed to improve for 5 consecutive epochs
    (or after 999 epochs).

    Side effects:
        * prints one progress line per epoch and a final summary line;
        * appends ``[model_type, "MLP", train_size, accuracy, 0, 0, 0]`` to
          the module-level ``output_table`` list. Precision, recall and F1
          are not computed here and are recorded as 0.

    NOTE(review): the test split doubles as the early-stopping validation
    set, so the reported accuracy is selected on the data it is measured on.
    The value reported is the accuracy of the last epoch run (the state the
    network is left in), not the best epoch.

    Args:
        dataset: Mapping with the keys "X_train", "Y_train", "X_test" and
            "Y_test"; the X matrices must provide ``.tocsc()``.
        model_type: Feature-set name, used for reporting only.
        train_size: Training-set size; also determines the batch size
            (``min(200, train_size // 10)``, at least 1).

    Raises:
        ZeroDivisionError: if the test split has fewer rows than one batch,
            because ``iterate_minibatches`` only yields full batches.
    """
    max_epochs = 999
    max_stale_epochs = 4  # stop on the 5th consecutive epoch without improvement

    input_var = T.matrix('inputs')
    target_var = T.lvector('targets')
    # Create neural network model
    network = build_mlp(input_var, np.shape(dataset["X_train"])[1])

    # Training objective: mean cross-entropy of the stochastic (dropout) pass.
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()

    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adam(loss, params)

    # Evaluation expressions use deterministic=True (dropout disabled).
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()

    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    X_train_flat = dataset["X_train"].tocsc()
    X_test_flat = dataset["X_test"].tocsc()

    best_accuracy = 0
    bad_count = 0
    # Floor division keeps the batch size an integer on every Python version;
    # the lower bound of 1 avoids a zero step for very small training sets.
    batch_size = max(1, min(200, train_size // 10))
    for epoch in range(max_epochs):
        start_time = time.time()

        # In each epoch, we do a full pass over the training data:
        for inputs, targets in iterate_minibatches(X_train_flat, dataset["Y_train"], batch_size, shuffle=True):
            train_fn(inputs, targets)

        # And a full pass over the validation data:
        val_acc = 0
        val_batches = 0
        for inputs, targets in iterate_minibatches(X_test_flat, dataset["Y_test"], batch_size, shuffle=False):
            _, acc = val_fn(inputs, targets)
            val_acc += acc
            val_batches += 1

        current_accuracy = val_acc / val_batches

        # Then we print the results for this epoch:
        print("Epoch {} took {:.3f}s - accuracy {:.2f} %".format(
            epoch + 1, time.time() - start_time, current_accuracy * 100))

        # Early stopping on the per-epoch accuracy.
        if current_accuracy > best_accuracy:
            best_accuracy = current_accuracy
            bad_count = 0
        else:
            bad_count += 1
            if bad_count > max_stale_epochs:
                break

    print("%10s %15s. Train set size %5d. %0.1f%% / %0.1f%% / %0.1f%% (%0.3f)" % (
        model_type,
        "MLP",
        train_size,
        current_accuracy * 100,
        0,
        0,
        0))
    output_table.append([
        model_type,
        "MLP",
        train_size,
        current_accuracy,
        0,
        0,
        0,
    ])

In [4]:
# Load the experiment metadata: the list of training-set sizes to sweep over
# and the size of the test set.
# Pickle streams are binary data, so the file is opened in "rb" mode.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("%s/model-data/metadata.pickle" % (REPO_ROOT,), "rb") as f:
    size_data = pickle.load(f)

TRAIN_SIZES = size_data["train_sizes"]
TEST_SIZE = size_data["test_size"]

# The operand is wrapped in a 1-tuple so the value is formatted as a whole
# even if it happens to be a tuple itself.
print("Training sizes: %s" % (TRAIN_SIZES,))
print("Test size: %d" % TEST_SIZE)

Training sizes: [300, 600, 1200, 2400, 4800, 9600, 19200]
Test size: 3588


In [5]:
# Benchmark every classifier on every feature set and training-set size.
# Each test_model call appends one row to output_table; the table is then
# written to results/linear_models.csv.
output_table = []

# Fixed seed for the estimators that use randomness, so that re-running the
# notebook reproduces the same numbers.
RANDOM_SEED = 0

FEATURE_TYPES = ["RegEx", "BiRegEx", "TriRegEx", "AST", "BiAST", "TriAST"]

for train_size in TRAIN_SIZES:
    for model_type in FEATURE_TYPES:
        dataset = load_model(model_type, train_size)

        # Fresh estimator instances for every dataset; the list order is the
        # row order in the output table.
        # NOTE(review): `n_iter` and loss="log" are the SGDClassifier spellings
        # this notebook was written against -- confirm against the installed
        # scikit-learn version before upgrading.
        classifiers = [
            (KNeighborsClassifier(2), "KNN"),
            (BernoulliNB(), "Bernoulli"),
            (linear_model.SGDClassifier(n_iter=1000, loss="log",
                                        random_state=RANDOM_SEED), "SGD"),
            (RandomForestClassifier(max_depth=15, n_estimators=100, max_features=30,
                                    random_state=RANDOM_SEED), "RandomForest"),
            (LinearSVC(random_state=RANDOM_SEED), "LinearSVC"),
        ]
        for model, model_name in classifiers:
            test_model(dataset, model_type, train_size, model, model_name)

        # The MLP benchmark is currently disabled; re-enable with:
        #test_mlp(dataset, model_type, train_size)

csv_header = "Model Type,Model,Training set,Accuracy,Precision,Recall,F1 score\n"
csv_rows = [",".join([str(s) for s in row]) for row in output_table]
output = csv_header + "\n".join(csv_rows)
with open("%s/results/linear_models.csv" % REPO_ROOT, "w") as f:
    f.write(output)

     RegEx             KNN. Train set size   300. 76.6% / 84.6% / 65.1% (0.736)
     RegEx       Bernoulli. Train set size   300. 58.7% / 54.9% / 97.0% (0.701)
     RegEx             SGD. Train set size   300. 79.8% / 84.1% / 73.6% (0.785)
     RegEx    RandomForest. Train set size   300. 70.2% / 64.2% / 91.4% (0.754)
     RegEx       LinearSVC. Train set size   300. 79.1% / 84.2% / 71.7% (0.775)
   BiRegEx             KNN. Train set size   300. 77.2% / 82.7% / 68.9% (0.752)
   BiRegEx       Bernoulli. Train set size   300. 58.6% / 54.9% / 97.1% (0.701)
   BiRegEx             SGD. Train set size   300. 79.8% / 82.7% / 75.5% (0.789)
   BiRegEx    RandomForest. Train set size   300. 70.3% / 64.5% / 90.1% (0.752)
   BiRegEx       LinearSVC. Train set size   300. 78.6% / 82.7% / 72.4% (0.772)
  TriRegEx             KNN. Train set size   300. 78.2% / 83.8% / 70.0% (0.763)
  TriRegEx       Bernoulli. Train set size   300. 58.3% / 54.7% / 96.5% (0.699)
  TriRegEx             SGD. Train set si