In [1]:
REPO_ROOT = "/usr/src/app"

import math
import pickle
import time

import numpy as np
import pandas as pd
import scipy as sc

from sklearn.svm import *
from sklearn.naive_bayes import BernoulliNB
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

import theano
import theano.tensor as T
import lasagne

Using gpu device 0: GRID K520 (CNMeM is disabled, cuDNN not available)


In [2]:
def load_model(model_type, train_size):
    """Load the pickled dataset for one feature type and training-set size.

    Reads ``<REPO_ROOT>/model-data/dataset_<model_type>_<train_size>.pickle``
    and returns the unpickled object. Callers in this notebook index the
    result with the keys ``X_train``, ``Y_train``, ``X_test``, ``Y_test`` and
    ``shas_test``.

    The file is opened in binary mode: a pickle stream is bytes, and text
    mode can corrupt it on platforms that translate newlines (and does not
    work at all on Python 3).

    NOTE(review): ``pickle.load`` can execute arbitrary code embedded in the
    file; only load files produced by this project's own pipeline.
    """
    path = "%s/model-data/dataset_%s_%d.pickle" % (REPO_ROOT, model_type, train_size)
    with open(path, "rb") as f:
        return pickle.load(f)

def test_model(dataset, model_type, train_size, model, model_name, output_errors):
    model.fit(dataset["X_train"], dataset["Y_train"])
    test_pred = model.predict(dataset["X_test"])
    test_y = dataset["Y_test"]
    shas = dataset["shas_test"]
        
    accuracy = (float(sum(test_y == test_pred))) / len(test_pred)
    precision = (float(sum((test_y == test_pred) & (test_pred == 1)))) / float(max(1, sum(test_pred == 1)))
    recall = (float(sum((test_y == test_pred) & (test_pred == 1)))) / float(sum(test_y == 1))
    f1 = 2 * (precision * recall) / max(1, precision + recall)

    print "%10s %15s. Train set size %5d. %0.1f%% / %0.1f%% / %0.1f%% (%0.3f)" % (
        model_type,
        model_name,
        train_size,
        accuracy * 100,
        precision * 100,
        recall * 100,
        f1)
        
    output_table.append([
        model_type,
        model_name,
        train_size, 
        accuracy,
        precision,
        recall,
        f1,
    ])
    
    if output_errors:
        # Save 10 errors
        error_shas = np.array(shas)[test_y != test_pred][0:50]
        error_correct = np.array(test_y)[test_y != test_pred][0:50]

        with open("%s/results/model_errors_%s_%s_%d.txt" % (REPO_ROOT, model_type, model_name, train_size), "w") as fout:
            for sha, correct in zip(error_shas, error_correct):
                fout.write("#### %s FLAG: %s ####\n\n" % (sha, "Yes" if correct > 0 else "No"))
                with open("%s/scripts/%s.js" % (REPO_ROOT, sha), "r") as fin:
                    fout.write(fin.read())
                fout.write("\n\n")


In [3]:
def build_mlp(input_var, input_size):
    """Assemble the Lasagne MLP used by this notebook and return its output layer.

    Layer stack, in order:
    input ``(None, input_size)`` -> dropout(0.2)
    -> dense(40, tanh, Glorot-uniform weights) -> dropout(0.5)
    -> dense(15, tanh, Glorot-uniform weights) -> dropout(0.5)
    -> dense(2, softmax).

    ``input_var`` is handed to the input layer as its symbolic input.
    """
    layers = lasagne.layers

    net = layers.InputLayer(shape=(None, input_size),
                            input_var=input_var)
    net = layers.DropoutLayer(net, p=0.2)

    # Two hidden blocks that differ only in width; each is followed by 50% dropout.
    for hidden_units in (40, 15):
        net = layers.DenseLayer(
            net, num_units=hidden_units,
            nonlinearity=lasagne.nonlinearities.tanh,
            W=lasagne.init.GlorotUniform())
        net = layers.DropoutLayer(net, p=0.5)

    # Two-unit softmax output.
    return layers.DenseLayer(
        net, num_units=2,
        nonlinearity=lasagne.nonlinearities.softmax)

def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    """Yield ``(input_batch, target_batch)`` pairs of exactly ``batchsize`` rows.

    Parameters
    ----------
    inputs : row-indexable 2-D container; either a ``numpy.ndarray`` or an
        object whose row selection exposes ``toarray()`` (e.g. a SciPy sparse
        matrix). Batches are always yielded as dense arrays.
    targets : sequence indexable by an integer index array, with one entry
        per row of ``inputs``.
    batchsize : number of rows per batch; coerced with ``int()`` so a float
        such as ``20.0`` is accepted.
    shuffle : when true, rows are visited in a random order drawn from the
        global NumPy random state.

    Only full batches are produced: trailing rows that do not fill a whole
    batch are skipped, and nothing is yielded when there are fewer than
    ``batchsize`` rows.

    Raises
    ------
    ValueError : if ``batchsize`` is smaller than 1.
    """
    n_rows = np.shape(inputs)[0]
    assert n_rows == len(targets)

    batchsize = int(batchsize)
    if batchsize < 1:
        raise ValueError("batchsize must be at least 1, got %r" % (batchsize,))

    indices = np.arange(n_rows)
    if shuffle:
        np.random.shuffle(indices)
    for start_idx in range(0, n_rows - batchsize + 1, batchsize):
        excerpt = indices[start_idx:start_idx + batchsize]
        # Select the rows once; repeating the fancy indexing just to test the
        # result type doubles the (expensive, for sparse input) slicing work.
        batch = inputs[excerpt]
        if not isinstance(batch, np.ndarray):
            batch = batch.toarray()
        yield batch, targets[excerpt]
    
def test_mlp(dataset, model_type, train_size):
    """Train the MLP from ``build_mlp`` with early stopping and record its test accuracy.

    Parameters
    ----------
    dataset : mapping with keys ``X_train``, ``Y_train``, ``X_test`` and
        ``Y_test``. The feature matrices may be NumPy arrays or objects
        exposing ``tocsr()`` (SciPy sparse matrices).
    model_type : label of the feature set, used for reporting only.
    train_size : training-set size; used for reporting and to derive the
        mini-batch size, ``min(200, train_size // 10)`` but never below 1.

    Side effects
    ------------
    Prints one line per epoch and a final summary line, and appends
    ``[model_type, "MLP", train_size, accuracy, 0, 0, 0]`` to the
    module-level ``output_table`` list, which must already exist.
    Precision, recall and F1 are not computed here; the zeros are placeholders.

    Notes
    -----
    * Training runs for at most 999 epochs and stops once the evaluation
      accuracy has failed to improve for 5 consecutive epochs.
    * The recorded accuracy is that of the last epoch run, not the best one
      seen (no parameters are checkpointed).
    * NOTE(review): the stopping criterion is evaluated on the test split, so
      the reported accuracy is not from data unseen during model selection.
    * ``iterate_minibatches`` only yields full batches, so test rows beyond
      the last full batch do not contribute to the accuracy.

    Raises
    ------
    ValueError : if the test split has fewer rows than one mini-batch, in
        which case no accuracy could be computed.
    """
    input_var = T.matrix('inputs')
    target_var = T.lvector('targets')
    # Create neural network model
    network = build_mlp(input_var, np.shape(dataset["X_train"])[1])

    # Training graph: stochastic forward pass, mean cross-entropy, Adam updates.
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()

    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adam(loss, params)

    # Evaluation graph: deterministic=True requests the deterministic forward pass.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()

    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    train_fn = theano.function([input_var, target_var], loss, updates=updates, allow_input_downcast=True)
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc], allow_input_downcast=True)

    # Mini-batches are taken by row, so sparse inputs are converted to CSR,
    # the format suited to row selection.
    if isinstance(dataset["X_train"], np.ndarray):
        X_train_flat = dataset["X_train"]
    else:
        X_train_flat = dataset["X_train"].tocsr()

    if isinstance(dataset["X_test"], np.ndarray):
        X_test_flat = dataset["X_test"]
    else:
        X_test_flat = dataset["X_test"].tocsr()

    # Floor division keeps the batch size an integer on both Python 2 and 3;
    # the lower bound avoids a zero batch size for tiny training sets.
    batch_size = max(1, min(200, train_size // 10))
    if np.shape(X_test_flat)[0] < batch_size:
        raise ValueError(
            "test split has %d rows, fewer than one batch of %d; cannot evaluate"
            % (np.shape(X_test_flat)[0], batch_size))

    best_accuracy = 0
    bad_count = 0
    for epoch in range(999):
        # In each epoch, we do a full pass over the training data:
        start_time = time.time()
        for inputs, targets in iterate_minibatches(X_train_flat, dataset["Y_train"], batch_size, shuffle=True):
            train_fn(inputs, targets)

        # And a full pass over the validation data:
        val_acc = 0
        val_batches = 0
        for inputs, targets in iterate_minibatches(X_test_flat, dataset["Y_test"], batch_size, shuffle=False):
            _, acc = val_fn(inputs, targets)
            val_acc += acc
            val_batches += 1

        # Unweighted mean of the per-batch accuracies (all batches have equal size).
        current_accuracy = val_acc / val_batches

        # Then we print the results for this epoch:
        print("Epoch {} took {:.3f}s - accuracy {:.2f} %".format(
            epoch + 1, time.time() - start_time, current_accuracy * 100))

        if current_accuracy > best_accuracy:
            best_accuracy = current_accuracy
            bad_count = 0
        else:
            bad_count += 1
            if bad_count > 4:
                break

    print("%10s %15s. Train set size %5d. %0.1f%% / %0.1f%% / %0.1f%% (%0.3f)" % (
        model_type,
        "MLP",
        train_size,
        current_accuracy * 100,
        0,
        0,
        0))
    output_table.append([
        model_type,
        "MLP",
        train_size,
        current_accuracy,
        0,
        0,
        0,
    ])

In [4]:
# Read the experiment metadata (available training-set sizes and the test-set
# size). Binary mode is required for pickle streams: text mode can corrupt
# them on newline-translating platforms and fails on Python 3.
# NOTE(review): pickle.load can run arbitrary code; only load project-generated files.
with open("%s/model-data/metadata.pickle" % (REPO_ROOT,), "rb") as f:
    size_data = pickle.load(f)

TRAIN_SIZES = size_data["train_sizes"]
TEST_SIZE = size_data["test_size"]

# Wrap in a 1-tuple so the value is formatted as a whole even if it is a tuple.
print("Training sizes: %s" % (TRAIN_SIZES,))
print("Test size: %d" % TEST_SIZE)

Training sizes: [300, 600, 1200, 2400, 4800, 9600, 19200]
Test size: 3588


In [None]:
# Evaluate every classifier on every script-feature representation and
# training-set size, then dump the collected rows to linear_models.csv.
output_table = []

for train_size in TRAIN_SIZES:
    # Misclassified samples are only written out for the largest training set.
    dump_errors = train_size == TRAIN_SIZES[-1]

    for model_type in ["RegEx", "BiRegEx", "TriRegEx", "AST", "BiAST", "TriAST", "Random2Vec", "Word2Vec", "AST2Vec"]:
        dataset = load_model(model_type, train_size)

        # Fresh estimator instances for each dataset, evaluated in this order.
        classifiers = [
            (KNeighborsClassifier(2), "KNN"),
            (BernoulliNB(), "Bernoulli"),
            (linear_model.SGDClassifier(n_iter=1000, loss="log"), "SGD"),
            (RandomForestClassifier(max_depth=15, n_estimators=100, max_features=30), "RandomForest"),
            (LinearSVC(), "LinearSVC"),
        ]
        for classifier, classifier_name in classifiers:
            test_model(dataset, model_type, train_size,
                       classifier, classifier_name,
                       dump_errors)

        test_mlp(dataset, model_type, train_size)

# Serialise the result rows as CSV: header line, then one comma-joined line per row.
csv_lines = [",".join(str(value) for value in result_row) for result_row in output_table]
output = "Model Type,Model,Training set,Accuracy,Precision,Recall,F1 score\n" + "\n".join(csv_lines)
with open("%s/results/linear_models.csv" % REPO_ROOT, "w") as f:
    f.write(output)

     RegEx             KNN. Train set size   300. 76.6% / 84.6% / 65.1% (0.736)
     RegEx       Bernoulli. Train set size   300. 58.7% / 54.9% / 97.0% (0.701)
     RegEx             SGD. Train set size   300. 79.9% / 84.1% / 73.7% (0.786)
     RegEx    RandomForest. Train set size   300. 72.0% / 66.4% / 89.1% (0.761)
     RegEx       LinearSVC. Train set size   300. 79.1% / 84.2% / 71.7% (0.775)
Epoch 1 took 13.446s - accuracy 68.74 %
Epoch 2 took 13.671s - accuracy 71.37 %
Epoch 3 took 13.543s - accuracy 71.01 %
Epoch 4 took 13.505s - accuracy 77.76 %
Epoch 5 took 13.767s - accuracy 77.79 %
Epoch 6 took 13.285s - accuracy 78.15 %
Epoch 7 took 13.719s - accuracy 78.80 %
Epoch 8 took 13.786s - accuracy 79.50 %
Epoch 9 took 13.382s - accuracy 80.73 %
Epoch 10 took 13.318s - accuracy 81.40 %
Epoch 11 took 13.285s - accuracy 81.29 %
Epoch 12 took 13.278s - accuracy 81.12 %
Epoch 13 took 13.280s - accuracy 81.76 %
Epoch 14 took 13.297s - accuracy 81.99 %
Epoch 15 took 13.547s - accuracy 81

In [14]:
# Evaluate every classifier on the URL feature representations for each
# training-set size, then dump the collected rows to url_models.csv.
output_table = []

for train_size in TRAIN_SIZES:
    # Misclassified samples are only written out for the largest training set.
    dump_errors = train_size == TRAIN_SIZES[-1]

    for model_type in ["Url3", "Url6", "Url12"]:
        dataset = load_model(model_type, train_size)

        # Fresh estimator instances for each dataset, evaluated in this order.
        classifiers = [
            (KNeighborsClassifier(2), "KNN"),
            (BernoulliNB(), "Bernoulli"),
            (linear_model.SGDClassifier(n_iter=1000, loss="log"), "SGD"),
            (RandomForestClassifier(max_depth=15, n_estimators=100, max_features=30), "RandomForest"),
            (LinearSVC(), "LinearSVC"),
        ]
        for classifier, classifier_name in classifiers:
            test_model(dataset, model_type, train_size,
                       classifier, classifier_name,
                       dump_errors)

        test_mlp(dataset, model_type, train_size)

# Serialise the result rows as CSV: header line, then one comma-joined line per row.
csv_lines = [",".join(str(value) for value in result_row) for result_row in output_table]
output = "Model Type,Model,Training set,Accuracy,Precision,Recall,F1 score\n" + "\n".join(csv_lines)
with open("%s/results/url_models.csv" % REPO_ROOT, "w") as f:
    f.write(output)

      Url3             KNN. Train set size   300. 79.7% / 80.6% / 78.3% (0.794)
      Url3       Bernoulli. Train set size   300. 71.7% / 93.0% / 46.9% (0.623)
      Url3             SGD. Train set size   300. 83.9% / 87.6% / 79.0% (0.831)
      Url3    RandomForest. Train set size   300. 75.0% / 91.6% / 55.0% (0.687)
      Url3       LinearSVC. Train set size   300. 83.9% / 88.8% / 77.5% (0.828)
Epoch 1 took 10.466s - accuracy 73.64 %
Epoch 2 took 10.460s - accuracy 75.97 %
Epoch 3 took 10.449s - accuracy 77.09 %
Epoch 4 took 10.476s - accuracy 78.88 %
Epoch 5 took 10.453s - accuracy 79.64 %
Epoch 6 took 10.449s - accuracy 80.95 %
Epoch 7 took 10.495s - accuracy 80.98 %
Epoch 8 took 10.467s - accuracy 81.46 %
Epoch 9 took 10.442s - accuracy 82.24 %
Epoch 10 took 10.430s - accuracy 82.83 %
Epoch 11 took 10.441s - accuracy 83.14 %
Epoch 12 took 10.437s - accuracy 83.17 %
Epoch 13 took 10.451s - accuracy 83.28 %
Epoch 14 took 10.478s - accuracy 83.47 %
Epoch 15 took 10.449s - accuracy 83

In [19]:
from sklearn.preprocessing import normalize

def concat_models(model_names, train_size):
    """Build one dataset by column-stacking the features of several feature sets.

    For each name in ``model_names`` the dataset is loaded with
    ``load_model(name, train_size)``; its train and test matrices are
    converted to CSR and every row is scaled to unit L2 norm before the
    matrices are stacked horizontally.

    Labels (``Y_train``, ``Y_test``) and ``shas_test`` are taken from the
    first dataset. Whether the other datasets carry identical labels is only
    printed ("Labels equal: ..."), not enforced, so check that output.

    Returns a dict with the keys ``X_train``, ``Y_train``, ``X_test``,
    ``Y_test`` and ``shas_test``; the feature matrices are SciPy sparse
    matrices as returned by ``scipy.sparse.hstack``.
    """
    # Use the public scipy.sparse namespace and import it explicitly:
    # ``import scipy`` alone does not guarantee the submodule is loaded, and
    # ``scipy.sparse.csr`` is a private module path.
    from scipy import sparse

    datasets = [load_model(name, train_size) for name in model_names]

    X_trains = [
        normalize(sparse.csr_matrix(dataset["X_train"]), norm='l2', axis=1)
        for dataset in datasets
    ]
    X_tests = [
        normalize(sparse.csr_matrix(dataset["X_test"]), norm='l2', axis=1)
        for dataset in datasets
    ]

    concat_dataset = {
        "X_train": sparse.hstack(X_trains),
        "Y_train": datasets[0]["Y_train"],
        "X_test": sparse.hstack(X_tests),
        "Y_test": datasets[0]["Y_test"],
        "shas_test": datasets[0]["shas_test"],
    }

    # Report the individual feature widths and the combined width.
    print("Datasets %s: %s = %s" % (
        model_names,
        " + ".join([str(np.shape(dataset["X_train"])[1]) for dataset in datasets]),
        np.shape(concat_dataset["X_train"])[1]))

    # Sanity check: every dataset should describe the same samples in the same order.
    print("Labels equal: %s %s" % (
        [np.array_equal(datasets[0]["Y_train"], dataset["Y_train"]) for dataset in datasets[1:]],
        [np.array_equal(datasets[0]["Y_test"], dataset["Y_test"]) for dataset in datasets[1:]]))

    return concat_dataset
    
# Start from an empty table so that combined_models.csv contains only the rows
# produced by this cell. Without the reset the rows left in output_table by
# the previously executed experiment cell would be written into this CSV too,
# and re-running the cell would duplicate its own rows.
output_table = []

train_size = TRAIN_SIZES[-1]
for model_names in [
        # Disabled combinations, kept for reference:
        #("BiRegEx", "Url6"),
        #("BiRegEx", "TriAST", "Url6"),
        ("RegEx", "Random2Vec"),
        ("RegEx", "AST"),
        ("BiRegEx", "Word2Vec"),
        ("BiRegEx", "TriAST"),
        ("Word2Vec", "AST2Vec") ]:
    model_type = "-".join(model_names)
    dataset = concat_models(model_names, train_size)

    test_model(dataset, model_type, train_size,
               RandomForestClassifier(max_depth=15, n_estimators=100, max_features=30), "RandomForest",
               False)

    test_model(dataset, model_type, train_size,
               LinearSVC(), "LinearSVC",
               False)

    test_mlp(dataset, model_type, train_size)

# Serialise the result rows as CSV: header line, then one comma-joined line per row.
output = ("Model Type,Model,Training set,Accuracy,Precision,Recall,F1 score\n" +
        "\n".join([",".join([str(s) for s in row]) for row in output_table]))
with open("%s/results/combined_models.csv" % REPO_ROOT, "w") as f:
    f.write(output)

Datasets ('RegEx', 'Random2Vec'): 500000 + 1600 = 501600
Labels equal: [True] [True]
RegEx-Random2Vec    RandomForest. Train set size 19200. 79.6% / 88.1% / 68.4% (0.770)
RegEx-Random2Vec       LinearSVC. Train set size 19200. 91.4% / 94.6% / 87.7% (0.910)
Epoch 1 took 87.264s - accuracy 82.76 %
Epoch 2 took 87.408s - accuracy 89.24 %
Epoch 3 took 87.362s - accuracy 91.18 %
Epoch 4 took 87.343s - accuracy 91.00 %
Epoch 5 took 87.280s - accuracy 91.65 %
Epoch 6 took 87.044s - accuracy 90.62 %
Epoch 7 took 87.590s - accuracy 90.68 %
Epoch 8 took 87.123s - accuracy 91.56 %
Epoch 9 took 84.771s - accuracy 89.68 %
Epoch 10 took 81.620s - accuracy 91.62 %
RegEx-Random2Vec             MLP. Train set size 19200. 91.6% / 0.0% / 0.0% (0.000)
Datasets ('RegEx', 'AST'): 500000 + 500000 = 1000000
Labels equal: [True] [True]
 RegEx-AST    RandomForest. Train set size 19200. 72.4% / 66.3% / 91.3% (0.768)
 RegEx-AST       LinearSVC. Train set size 19200. 91.9% / 93.4% / 90.2% (0.918)
Epoch 1 took 133.