In [1]:
import json
import random
import re
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn import linear_model

from scipy.sparse import coo_matrix, vstack

In [12]:
def tokenize_js(script):
    script = re.sub(r'(\/\*[^*]+\*\/)', "", script)
    script = re.sub(r'\/\/.+', "", script)
    return re.findall(r'(\"[^"]+\"|\'[^\']+\'|[\w\\\/\-_\"\']+|{|}|,|[\+\*]|\(|\)|\.|/\*.+\*\/)', script)

def parse_scripts(tbl):
    for item in tbl:
        with open("/scripts/%s.js" % item["sha"]) as f:
            yield f.read().decode(errors='replace')

def test_model(model, model_name):
    for train_size in TRAIN_SIZES:
        model.fit(X_train[train_size], Y_train[train_size])
        test_pred = model.predict(X_test[train_size])
        test_y = Y_test[train_size]

        print "%s. Train set size %d. Accuracy: %d/%d = %f%%" % (
            model_name,
            train_size,
            sum(test_y == test_pred),
            len(test_y),
            ((float(sum(test_y == test_pred)) / len(test_y)) * 100))

In [17]:
with open("/scripts/table_flag.json") as f:
    table = json.load(f)
    
positive_examples = [i for i, e in enumerate(table) if e["flag-any"] == 1]
negative_examples = [i for i, e in enumerate(table) if e["flag-any"] == 0]
random.shuffle(positive_examples)
random.shuffle(negative_examples)
negative_examples = negative_examples[:len(positive_examples)]
print "%d positive and %d negative examples." % (len(positive_examples), len(negative_examples))

TRAIN_SIZES = [100, 400, 800, 1600, 3200, 4666]
TEST_SIZE = 1000

train_tables = {}
for train_size in TRAIN_SIZES:
    train_tables[train_size] = (
        [table[i] for i in positive_examples[:train_size]] + 
        [table[i] for i in negative_examples[:train_size]])

test_table = (
    [table[i] for i in positive_examples[-TEST_SIZE:]] + 
    [table[i] for i in negative_examples[-TEST_SIZE:]])

5666 positive and 5666 negative examples.


In [18]:
X_train = {}
Y_train = {}
X_test = {}
Y_test = {}

for train_size in TRAIN_SIZES:
    vectorizer = TfidfVectorizer(tokenizer=tokenize_js)
    X_train[train_size] = vectorizer.fit_transform(parse_scripts(train_tables[train_size]))
    Y_train[train_size] = np.array([item["flag-any"] for item in train_tables[train_size]])
    X_test[train_size] = vectorizer.transform(parse_scripts(test_table))
    Y_test[train_size] = np.array([item["flag-any"] for item in test_table])

In [19]:
test_model(BernoulliNB(), "Bernoulli")

Bernoulli. Train set size 100. Accuracy: 1191/2000 = 59.550000%
Bernoulli. Train set size 400. Accuracy: 1368/2000 = 68.400000%
Bernoulli. Train set size 800. Accuracy: 1393/2000 = 69.650000%
Bernoulli. Train set size 1600. Accuracy: 1490/2000 = 74.500000%
Bernoulli. Train set size 3200. Accuracy: 1330/2000 = 66.500000%
Bernoulli. Train set size 4666. Accuracy: 1368/2000 = 68.400000%


In [20]:
test_model(SVC(), "SVC")

SVC. Train set size 100. Accuracy: 1240/2000 = 62.000000%
SVC. Train set size 400. Accuracy: 1200/2000 = 60.000000%
SVC. Train set size 800. Accuracy: 1190/2000 = 59.500000%
SVC. Train set size 1600. Accuracy: 1187/2000 = 59.350000%
SVC. Train set size 3200. Accuracy: 1189/2000 = 59.450000%
SVC. Train set size 4666. Accuracy: 1221/2000 = 61.050000%


In [21]:
test_model(linear_model.SGDClassifier(n_iter=1000), "SGD")

SGD. Train set size 100. Accuracy: 1433/2000 = 71.650000%
SGD. Train set size 400. Accuracy: 1535/2000 = 76.750000%
SGD. Train set size 800. Accuracy: 1622/2000 = 81.100000%
SGD. Train set size 1600. Accuracy: 1699/2000 = 84.950000%
SGD. Train set size 3200. Accuracy: 1680/2000 = 84.000000%
SGD. Train set size 4666. Accuracy: 1692/2000 = 84.600000%
