In [1]:
REPO_ROOT = "/usr/src/app"

import json
import pickle
import re

import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the balanced sample table. Later cells slice it and read the "sha" and
# "flag-any" fields of each entry.
with open("%s/scripts/table_balanced.json" % REPO_ROOT) as f:
    data_table = json.load(f)

TOTAL_SIZE = len(data_table)

# Training-set sizes start at 300 and double for as long as more than 1000
# rows would still be left over.
TRAIN_SIZES = []
size = 300
while size < TOTAL_SIZE - 1000:
    TRAIN_SIZES.append(size)
    size *= 2

# Fail with a readable message instead of an IndexError on TRAIN_SIZES[-1]
# when the table is too small to produce even the first training size.
if not TRAIN_SIZES:
    raise ValueError(
        "table_balanced.json has only %d rows; more than 1300 are needed "
        "to build a training set" % TOTAL_SIZE
    )

# Whatever lies beyond the largest training slice is held out for testing, so
# the test rows never overlap any of the training slices.
TEST_SIZE = TOTAL_SIZE - TRAIN_SIZES[-1]

data = {
    "train_sizes": TRAIN_SIZES,
    "test_size": TEST_SIZE
}
# Pickle streams are binary data: open the file in "wb", not text mode.
with open("%s/model-data/metadata.pickle" % (REPO_ROOT,), "wb") as f:
    pickle.dump(data, f)

print("Training sizes: %s" % (TRAIN_SIZES,))
print("Test size: %d" % TEST_SIZE)

Training sizes: [300, 600, 1200, 2400, 4800, 9600, 19200]
Test size: 3588


In [21]:
def vectorize_table(model_type, tokenizer, parser, table, train_size, test_size):
    """Build TF-IDF train/test matrices for one training size and pickle them.

    A ``TfidfVectorizer`` using ``tokenizer`` is fitted on the documents that
    ``parser`` yields for the first ``train_size`` rows of ``table`` and then
    applied to the documents for the last ``test_size`` rows. Labels are taken
    from each row's ``"flag-any"`` field.

    The resulting dict (keys ``X_train``, ``Y_train``, ``X_test``, ``Y_test``)
    is written to ``<REPO_ROOT>/model-data/dataset_<model_type>_<train_size>.pickle``.

    Args:
        model_type: Label embedded in the output file name.
        tokenizer: Callable passed to ``TfidfVectorizer(tokenizer=...)``.
        parser: Callable taking a list of rows and returning an iterable of
            documents, one per row.
        table: Sequence of rows; each row must provide ``"flag-any"``.
        train_size: Number of leading rows used for fitting.
        test_size: Number of trailing rows used for evaluation.

    Returns:
        None. The dataset is only written to disk.

    Note:
        The two slices overlap if ``train_size + test_size > len(table)``;
        keeping them disjoint is the caller's responsibility.
    """
    train_rows = table[:train_size]
    # ``table[-test_size:]`` returns the *whole* table when test_size is 0,
    # so compute the start index explicitly (clamped for oversized requests).
    test_rows = table[max(len(table) - test_size, 0):]

    vectorizer = TfidfVectorizer(tokenizer=tokenizer)

    # The vocabulary must be fitted on the training documents before the test
    # documents are transformed; do it in explicit statements rather than
    # relying on the evaluation order of a dict literal.
    x_train = vectorizer.fit_transform(parser(train_rows))
    x_test = vectorizer.transform(parser(test_rows))

    data = {
        "X_train": x_train,
        "Y_train": np.array([item["flag-any"] for item in train_rows]),
        "X_test": x_test,
        "Y_test": np.array([item["flag-any"] for item in test_rows]),
    }

    out_path = "%s/model-data/dataset_%s_%d.pickle" % (REPO_ROOT, model_type, train_size)
    # Pickle streams are binary data: open the file in "wb", not text mode.
    with open(out_path, "wb") as f:
        pickle.dump(data, f)

In [22]:
def tokenize_js(script):
    """Split JavaScript source text into a list of lower-cased tokens.

    Comments are stripped first, then tokens are extracted with a single
    regular expression that recognises quoted strings, runs of word/path
    characters, and a handful of punctuation characters. Anything the
    pattern does not recognise (e.g. ``=`` or ``;``) is dropped.

    Args:
        script: JavaScript source as a string.

    Returns:
        List of token strings, each lower-cased.

    Note:
        Comment stripping is purely textual: ``//`` or ``/*`` occurring
        inside a string literal (for example in a URL) is treated as the
        start of a comment as well.
    """
    # Block comments. The match is non-greedy and spans newlines, so comments
    # that themselves contain '*' (JSDoc "/** ... */", starred continuation
    # lines) and the empty "/**/" are removed too.
    script = re.sub(r'/\*[\s\S]*?\*/', "", script)
    # Line comments: everything from "//" to the end of that line.
    script = re.sub(r'\/\/.+', "", script)
    tokens = re.findall(r'(\"[^"]+\"|\'[^\']+\'|[\w\\\/\-_\"\']+|{|}|,|[\+\*]|\(|\)|\.|/\*.+\*\/)', script)
    return [t.lower() for t in tokens]

def parse_js(tbl):
    """Lazily yield the decoded source text of each script listed in ``tbl``.

    For every row, the file ``<REPO_ROOT>/scripts/<sha>.js`` is read, where
    ``<sha>`` is the row's ``"sha"`` field.

    Args:
        tbl: Iterable of rows, each providing a ``"sha"`` key.

    Yields:
        The file contents as text, one string per row, in row order.

    Raises:
        IOError: If a script file cannot be opened or read.
    """
    for item in tbl:
        path = "%s/scripts/%s.js" % (REPO_ROOT, item["sha"])
        # Read raw bytes and decode explicitly. Relying on the implicit
        # default codec turns every non-ASCII byte into U+FFFD on Python 2;
        # with UTF-8 only byte sequences that are genuinely invalid are
        # replaced.
        with open(path, "rb") as f:
            raw = f.read()
        yield raw.decode("utf-8", errors="replace")
            
# Build and pickle one TF-IDF dataset per training size, using the
# regex-based JavaScript tokenizer on the raw script sources.
for train_size in TRAIN_SIZES:
    print("RegEx Training size %d" % (train_size,))
    vectorize_table(
        model_type="RegEx",
        tokenizer=tokenize_js,
        parser=parse_js,
        table=data_table,
        train_size=train_size,
        test_size=TEST_SIZE,
    )

RegEx Training size 300
RegEx Training size 600
RegEx Training size 1200
RegEx Training size 2400
RegEx Training size 4800
RegEx Training size 9600
RegEx Training size 19200


In [24]:
def _tokenize_helper(node):
    if isinstance(node, basestring):
        return [node]
    if isinstance(node, bool):
        return ["True" if node else "False"]
    if isinstance(node, list):
        return [x for v in node for x in _tokenize_helper(v)]
    if node is None:
        return []
    
    lst = [".".join([
        "%s:%s" % (key, val)
        for key, val in node.iteritems()
        if not isinstance(val, dict) and not isinstance(val, list)
    ])]
    
    for key, val in node.iteritems():
        if isinstance(val, dict) or isinstance(val, list):
            lst += _tokenize_helper(val)
    
    return lst

def tokenize_ast(ast):
    """Turn a JSON-serialised AST into a list of lower-cased tokens.

    Args:
        ast: JSON text describing the AST.

    Returns:
        The tokens produced by ``_tokenize_helper`` for the decoded JSON,
        each lower-cased. If decoding raises for any reason (malformed JSON,
        or input nested deeply enough to exceed the interpreter's recursion
        limit), a message is printed and an empty list is returned.
    """
    try:
        tree = json.loads(ast)
    except Exception as err:
        # Best-effort: an undecodable document becomes an empty token list
        # instead of aborting the whole vectorisation run.
        print("Failed to parse JSON! %s" % str(err))
        return []
    else:
        return [token.lower() for token in _tokenize_helper(tree)]
    
def parse_ast(tbl):
    """Lazily yield the serialised AST text for each row of ``tbl``.

    For every row, ``<REPO_ROOT>/scripts-ast/<sha>.js.ast`` is read, where
    ``<sha>`` is the row's ``"sha"`` field. When that file cannot be opened
    or read, the placeholder ``"[]"`` is yielded instead, so the output
    always has one entry per row.

    Args:
        tbl: Iterable of rows, each providing a ``"sha"`` key.

    Yields:
        The file contents, or ``"[]"`` on ``IOError``.
    """
    for entry in tbl:
        try:
            ast_path = "%s/scripts-ast/%s.js.ast" % (REPO_ROOT, entry["sha"])
            with open(ast_path) as handle:
                yield handle.read()
        except IOError:
            # Missing or unreadable AST: substitute an empty JSON list.
            yield "[]"
            
# Build and pickle one TF-IDF dataset per training size, using the
# AST-based tokenizer on the pre-computed AST files.
for train_size in TRAIN_SIZES:
    print("AST Training size %d" % (train_size,))
    vectorize_table(
        model_type="AST",
        tokenizer=tokenize_ast,
        parser=parse_ast,
        table=data_table,
        train_size=train_size,
        test_size=TEST_SIZE,
    )

AST Training size 300
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
AST Training size 600
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
AST Training size 1200
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
AST Training size 2400
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
AST Training size 4800
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
AST Training size 9600
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Failed to pars