In [7]:
# Base directory against which this notebook resolves its input
# (scripts/table_balanced.json) and its outputs (model-data/*.pickle).
REPO_ROOT = "/usr/src/app"

import json
import pickle
import re

import numpy as np

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline

import util

In [8]:
# Load the labelled table (a JSON sequence of records).
with open("%s/scripts/table_balanced.json" % REPO_ROOT) as f:
    data_table = json.load(f)

TOTAL_SIZE = len(data_table)

# Smallest training-set size; every following size is double the previous one.
FIRST_TRAIN_SIZE = 300
# A training size is only kept while more than this many rows remain beyond it.
MIN_TEST_SIZE = 1000

TRAIN_SIZES = []
size = FIRST_TRAIN_SIZE
while size < TOTAL_SIZE - MIN_TEST_SIZE:
    TRAIN_SIZES.append(size)
    size *= 2

# Without this guard a too-small table fails below with an opaque IndexError
# on TRAIN_SIZES[-1].
if not TRAIN_SIZES:
    raise ValueError(
        "table has %d rows; more than %d are required to build a train/test split"
        % (TOTAL_SIZE, FIRST_TRAIN_SIZE + MIN_TEST_SIZE)
    )

# The test set is whatever is left after the largest training set, so the
# two never overlap even for the biggest training size.
TEST_SIZE = TOTAL_SIZE - TRAIN_SIZES[-1]

data = {
    "train_sizes": TRAIN_SIZES,
    "test_size": TEST_SIZE
}
# Pickle streams are binary data: open the file in binary mode so the output
# is written unmodified on every platform and Python version.
with open("%s/model-data/metadata.pickle" % (REPO_ROOT,), "wb") as f:
    pickle.dump(data, f)

# Single-argument print(...) produces the same output as the Python 2 print
# statement and also parses under Python 3.
print("Training sizes: %s" % TRAIN_SIZES)
print("Test size: %d" % TEST_SIZE)

Training sizes: [300, 600, 1200, 2400, 4800, 9600, 19200]
Test size: 3588


In [11]:
def vectorize_table(model_type, tokenizer, parser, table, train_size, test_size, ngram_range=(1,1)):
    """Vectorize a train/test split of ``table`` and pickle it under model-data/.

    The first ``train_size`` rows of ``table`` are the training set and the
    last ``test_size`` rows are the test set. Documents are hashed into a
    fixed-width feature space (500000 columns) and re-weighted with TF-IDF;
    the TF-IDF weights are fitted on the training rows only.

    Parameters
    ----------
    model_type : str
        Label used in the output file name.
    tokenizer : callable
        Passed to ``HashingVectorizer`` as its ``tokenizer``.
    parser : callable
        Called with a slice of ``table``; its result is fed to the vectorizer
        as the documents to transform.
    table : sequence of dict
        Records; each one is read through its "flag-any" (label) and "sha" keys.
    train_size : int
        Number of leading rows used for training.
    test_size : int
        Number of trailing rows used for testing.
    ngram_range : tuple of (int, int), optional
        Passed to ``HashingVectorizer`` as its ``ngram_range``.

    Returns
    -------
    None
        The dataset is written to
        ``<REPO_ROOT>/model-data/dataset_<model_type>_<train_size>.pickle``.
    """
    # Slice once and reuse, rather than re-slicing the table for every field.
    train_rows = table[:train_size]
    # NOTE(review): a test_size of 0 makes this slice the WHOLE table, and
    # train_size + test_size > len(table) makes the two sets overlap; the
    # caller is expected to pass sizes that avoid both.
    test_rows = table[-test_size:]

    # Feature hashing needs no fitted vocabulary, so the number of columns is
    # fixed by n_features regardless of how much data is seen.
    hasher = HashingVectorizer(tokenizer=tokenizer, ngram_range=ngram_range, n_features=500000)
    vectorizer = make_pipeline(hasher, TfidfTransformer())

    # Fit on the training rows first; the test rows are only transformed.
    X_train = vectorizer.fit_transform(parser(train_rows))
    X_test = vectorizer.transform(parser(test_rows))

    data = {
        "X_train": X_train,
        "Y_train": np.array([item["flag-any"] for item in train_rows]),
        "X_test": X_test,
        "Y_test": np.array([item["flag-any"] for item in test_rows]),
        "shas_test": [item["sha"] for item in test_rows]
    }

    print("Trained: %d features." % data["X_train"].shape[1])

    # Pickle streams are binary data: write them through a binary-mode file.
    with open("%s/model-data/dataset_%s_%d.pickle" % (REPO_ROOT, model_type, train_size), "wb") as f:
        pickle.dump(data, f)

In [12]:
# Build the regex-tokenized datasets for every training size, in three
# flavours: unigrams only, up to bigrams, and up to trigrams.
for train_size in TRAIN_SIZES:
    for model_type, ngram_range in (("RegEx", (1, 1)), ("BiRegEx", (1, 2)), ("TriRegEx", (1, 3))):
        print("%s Training size %d" % (model_type, train_size))
        vectorize_table(
            model_type,
            util.tokenize_js,
            util.parse_js,
            data_table,
            train_size,
            TEST_SIZE,
            ngram_range=ngram_range,
        )

RegEx Training size 300
Trained: 500000 features.
BiRegEx Training size 300
Trained: 500000 features.
TriRegEx Training size 300
Trained: 500000 features.
RegEx Training size 600
Trained: 500000 features.
BiRegEx Training size 600
Trained: 500000 features.
TriRegEx Training size 600
Trained: 500000 features.
RegEx Training size 1200
Trained: 500000 features.
BiRegEx Training size 1200
Trained: 500000 features.
TriRegEx Training size 1200
Trained: 500000 features.
RegEx Training size 2400
Trained: 500000 features.
BiRegEx Training size 2400
Trained: 500000 features.
TriRegEx Training size 2400
Trained: 500000 features.
RegEx Training size 4800
Trained: 500000 features.
BiRegEx Training size 4800
Trained: 500000 features.
TriRegEx Training size 4800
Trained: 500000 features.
RegEx Training size 9600
Trained: 500000 features.
BiRegEx Training size 9600
Trained: 500000 features.
TriRegEx Training size 9600
Trained: 500000 features.
RegEx Training size 19200
Trained: 500000 features.
BiRegE

In [13]:
# Build the AST-tokenized datasets for every training size, in three
# flavours: unigrams only, up to bigrams, and up to trigrams.
for train_size in TRAIN_SIZES:
    for model_type, ngram_range in (("AST", (1, 1)), ("BiAST", (1, 2)), ("TriAST", (1, 3))):
        print("%s Training size %d" % (model_type, train_size))
        vectorize_table(
            model_type,
            util.tokenize_ast,
            util.parse_ast,
            data_table,
            train_size,
            TEST_SIZE,
            ngram_range=ngram_range,
        )


AST Training size 300
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Trained: 500000 features.
BiAST Training size 300
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Trained: 500000 features.
TriAST Training size 300
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Trained: 500000 features.
AST Training size 600
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Trained: 500000 features.
BiAST Training size 600
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Trained: 500000 features.
TriAST Training size 600
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Failed to parse JSON! maximum recursion depth ex