In [1]:
REPO_ROOT = "/usr/src/app"

import json
import pickle
import re

import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

import util

In [2]:
# Load the balanced script table; every later cell slices this list.
with open("%s/scripts/table_balanced.json" % REPO_ROOT) as f:
    data_table = json.load(f)

TOTAL_SIZE = len(data_table)

# Training sizes double from 300 for as long as they stay below
# TOTAL_SIZE - 1000.
TRAIN_SIZES = []
size = 300
while size < TOTAL_SIZE - 1000:
    TRAIN_SIZES.append(size)
    size *= 2

# Fail with a readable message instead of an IndexError on TRAIN_SIZES[-1]
# when the table is too small to produce even the first training size.
if not TRAIN_SIZES:
    raise ValueError(
        "table_balanced.json has %d rows; more than 1300 are required "
        "to build a training schedule" % TOTAL_SIZE
    )

# The test set is whatever remains after the largest training size, so the
# head slice (train) and tail slice (test) never overlap.
TEST_SIZE = TOTAL_SIZE - TRAIN_SIZES[-1]

data = {
    "train_sizes": TRAIN_SIZES,
    "test_size": TEST_SIZE
}
# Pickle streams are binary data: open the file in binary mode so the bytes
# are written unmodified on every platform and Python version.
with open("%s/model-data/metadata.pickle" % (REPO_ROOT,), "wb") as f:
    pickle.dump(data, f)

print("Training sizes: %s" % TRAIN_SIZES)
print("Test size: %d" % TEST_SIZE)

Training sizes: [300, 600, 1200, 2400, 4800, 9600, 19200]
Test size: 3588


In [14]:
def vectorize_table(model_type, analyzer, parser, table, train_size, test_size, test_parser=None):
    """Fit a TF-IDF vectorizer on the head of `table` and pickle the result.

    The first `train_size` rows are the training set and the last `test_size`
    rows are the test set. Two files are written under
    `<REPO_ROOT>/model-data/`:

    * `dataset_<model_type>_<train_size>.pickle` -- dict with `X_train`,
      `Y_train`, `X_test`, `Y_test`, `shas_test` and `urls_test`.
    * `vocab_<model_type>_<train_size>.pickle` -- dict with the fitted
      vectorizer's `vocab` (`vocabulary_`) and `idf` (`idf_`).

    Args:
        model_type: Label embedded in the output file names.
        analyzer: Passed to `TfidfVectorizer(analyzer=...)`.
        parser: Callable applied to the training rows; its result is fed to
            `fit_transform`.
        table: Sequence of row dicts; each row must provide the keys
            "flag-any", "sha" and "url".
        train_size: Number of leading rows used for training.
        test_size: Number of trailing rows used for testing. A value of zero
            or less yields an empty test set.
        test_parser: Optional callable applied to the test rows instead of
            `parser`. Defaults to `parser`.

    Returns:
        None. The function prints the feature count and writes the two files.
    """
    if test_parser is None:
        test_parser = parser

    train_rows = table[:train_size]
    # `table[-0:]` is the *whole* table, which would silently put the
    # training rows into the test set; use an explicit empty slice instead.
    # NOTE(review): assumes `test_parser` accepts an empty sequence -- confirm.
    test_rows = table[-test_size:] if test_size > 0 else table[:0]

    vectorizer = TfidfVectorizer(analyzer=analyzer, max_features=500000, min_df=2)
    X_train = vectorizer.fit_transform(parser(train_rows))
    X_test = vectorizer.transform(test_parser(test_rows))

    data = {
        "X_train": X_train,
        "Y_train": np.array([item["flag-any"] for item in train_rows]),
        "X_test": X_test,
        "Y_test": np.array([item["flag-any"] for item in test_rows]),
        "shas_test": [item["sha"] for item in test_rows],
        "urls_test": [item["url"] for item in test_rows]
    }

    print("Trained: %d features." % X_train.shape[1])

    # Pickle streams are binary data, so both files are opened in binary mode.
    with open("%s/model-data/dataset_%s_%d.pickle" % (REPO_ROOT, model_type, train_size), "wb") as f:
        pickle.dump(data, f)

    with open("%s/model-data/vocab_%s_%d.pickle" % (REPO_ROOT, model_type, train_size), "wb") as f:
        pickle.dump({
            "vocab": vectorizer.vocabulary_,
            "idf": vectorizer.idf_
        }, f)


In [45]:
# Build the uni-, bi- and tri-gram datasets over util.tokenize_js tokens for
# every training size. Each n-gram analyzer is created right before its
# vectorize_table call, after the progress line has been printed.
for train_size in TRAIN_SIZES:
    for model_type, ngram in (("RegEx", 1), ("BiRegEx", 2), ("TriRegEx", 3)):
        print("%s Training size %d" % (model_type, train_size))
        if ngram == 1:
            analyzer = util.tokenize_js
        else:
            analyzer = util.ngramizer(util.tokenize_js, ngram)
        vectorize_table(model_type, analyzer, util.parse_js, data_table, train_size, TEST_SIZE)

RegEx Training size 300
Trained: 5911 features.
BiRegEx Training size 300
Trained: 38966 features.
TriRegEx Training size 300
Trained: 106503 features.
RegEx Training size 600
Trained: 9584 features.
BiRegEx Training size 600
Trained: 70319 features.
TriRegEx Training size 600
Trained: 204429 features.
RegEx Training size 1200
Trained: 15842 features.
BiRegEx Training size 1200
Trained: 120376 features.
TriRegEx Training size 1200
Trained: 374022 features.
RegEx Training size 2400
Trained: 23422 features.
BiRegEx Training size 2400
Trained: 191982 features.
TriRegEx Training size 2400
Trained: 500000 features.
RegEx Training size 4800
Trained: 38305 features.
BiRegEx Training size 4800
Trained: 300533 features.
TriRegEx Training size 4800
Trained: 500000 features.
RegEx Training size 9600
Trained: 58212 features.
BiRegEx Training size 9600
Trained: 455033 features.
TriRegEx Training size 9600
Trained: 500000 features.
RegEx Training size 19200
Trained: 113336 features.
BiRegEx Training

In [None]:
# Build the uni-, bi- and tri-gram datasets over util.tokenize_ast tokens
# (parsed with util.parse_ast) for every training size.
for train_size in TRAIN_SIZES:
    for prefix, ngram in (("", 1), ("Bi", 2), ("Tri", 3)):
        model_type = prefix + "AST"
        print("%s Training size %d" % (model_type, train_size))
        analyzer = util.tokenize_ast if ngram == 1 else util.ngramizer(util.tokenize_ast, ngram)
        vectorize_table(model_type, analyzer, util.parse_ast, data_table, train_size, TEST_SIZE)


AST Training size 9600
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Trained: 209970 features.
BiAST Training size 9600
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Trained: 500000 features.
TriAST Training size 9600
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Trained: 500000 features.
AST Training size 19200
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Trained: 408414 features.
BiAST Training size 19200
Failed 

In [20]:
# Build the URL n-gram datasets (n = 3, 6, 12) for every training size, using
# util.tokenize_url for tokens and util.parse_url to extract the input.
URL_NGRAM_WIDTHS = (3, 6, 12)
for train_size in TRAIN_SIZES:
    for width in URL_NGRAM_WIDTHS:
        model_type = "Url%d" % width
        print("%s Training size %d" % (model_type, train_size))
        vectorize_table(
            model_type,
            util.ngramizer(util.tokenize_url, width),
            util.parse_url,
            data_table,
            train_size,
            TEST_SIZE,
        )

Url3 Training size 300
Trained: 6772 features.
Url6 Training size 300
Trained: 23336 features.
Url12 Training size 300
Trained: 51990 features.
Url3 Training size 600
Trained: 10132 features.
Url6 Training size 600
Trained: 41612 features.
Url12 Training size 600
Trained: 102880 features.
Url3 Training size 1200
Trained: 15086 features.
Url6 Training size 1200
Trained: 70504 features.
Url12 Training size 1200
Trained: 181596 features.
Url3 Training size 2400
Trained: 22250 features.
Url6 Training size 2400
Trained: 127294 features.
Url12 Training size 2400
Trained: 368551 features.
Url3 Training size 4800
Trained: 30477 features.
Url6 Training size 4800
Trained: 219906 features.
Url12 Training size 4800
Trained: 500000 features.
Url3 Training size 9600
Trained: 40396 features.
Url6 Training size 9600
Trained: 358936 features.
Url12 Training size 9600
Trained: 500000 features.
Url3 Training size 19200
Trained: 52669 features.
Url6 Training size 19200
Trained: 500000 features.
Url12 Trai

In [8]:
# Length thresholds 2**5 .. 2**19; each one becomes a binary feature.
SIZE_CUTOFFS = [2 ** n for n in range(5, 20)]


def _size_features(script):
    """Return one 0/1 flag per cutoff: 1 when len(script) exceeds the cutoff."""
    return [1 if len(script) > c else 0 for c in SIZE_CUTOFFS]


def sizes_table(table, train_size, test_size):
    """Pickle a dataset whose only features are script-length indicators.

    Each script returned by `util.parse_js` is turned into a vector with one
    0/1 entry per value in `SIZE_CUTOFFS`. The result is written to
    `<REPO_ROOT>/model-data/dataset_FileSize_<train_size>.pickle` as a dict
    with `X_train`, `Y_train`, `X_test`, `Y_test`, `shas_test` and
    `urls_test`.

    Args:
        table: Sequence of row dicts; each row must provide the keys
            "flag-any", "sha" and "url".
        train_size: Number of leading rows used for training.
        test_size: Number of trailing rows used for testing. A value of zero
            or less yields an empty test set.

    Returns:
        None. The dataset is written to disk.
    """
    train_rows = table[:train_size]
    # `table[-0:]` is the *whole* table, which would silently put the
    # training rows into the test set; use an explicit empty slice instead.
    # NOTE(review): assumes util.parse_js accepts an empty sequence -- confirm.
    test_rows = table[-test_size:] if test_size > 0 else table[:0]

    X_train = np.array([_size_features(script) for script in util.parse_js(train_rows)])
    X_test = np.array([_size_features(script) for script in util.parse_js(test_rows)])

    data = {
        "X_train": X_train,
        "Y_train": np.array([item["flag-any"] for item in train_rows]),
        "X_test": X_test,
        "Y_test": np.array([item["flag-any"] for item in test_rows]),
        "shas_test": [item["sha"] for item in test_rows],
        "urls_test": [item["url"] for item in test_rows]
    }

    # Pickle streams are binary data, so the file is opened in binary mode.
    with open("%s/model-data/dataset_%s_%d.pickle" % (REPO_ROOT, "FileSize", train_size), "wb") as f:
        pickle.dump(data, f)


sizes_table(data_table, TRAIN_SIZES[-1], TEST_SIZE)

In [19]:
# Bigram datasets built from scripts passed through
# util.truncated_parse_js(cutoff), one per cutoff, at the largest training size.
SIZES = [
    ["1K", 1024],
    ["4K", 4096],
    ["16K", 16384],
    ["64K", 65536],
    ["256K", 262144],
    ["1M", 1048576],
]
for name, cutoff in SIZES:
    print("BiRegEx%s Training" % name)
    model_type = "BiRegEx%s" % name
    bigram_analyzer = util.ngramizer(util.tokenize_js, 2)
    truncating_parser = util.truncated_parse_js(cutoff)
    vectorize_table(model_type, bigram_analyzer, truncating_parser,
                    data_table, TRAIN_SIZES[-1], TEST_SIZE)
print("Done.")

BiRegEx1K Training
Trained: 101468 features.
BiRegEx4K Training
Trained: 184501 features.
BiRegEx16K Training
Trained: 309442 features.
BiRegEx64K Training
Trained: 455621 features.
BiRegEx256K Training
Trained: 500000 features.
BiRegEx1M Training
Trained: 500000 features.
Done.
