In [1]:
# Root directory under which this notebook reads "scripts/table_balanced.json"
# and writes its "model-data/*.pickle" outputs.
# NOTE(review): hardcoded absolute path -- presumably the working directory of
# the container this notebook runs in; confirm before running elsewhere.
REPO_ROOT = "/usr/src/app"

import json
import pickle
import re

import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

import util

In [2]:
# Load the sample table; every dataset built later is sliced out of it.
with open("%s/scripts/table_balanced.json" % REPO_ROOT) as f:
    data_table = json.load(f)

TOTAL_SIZE = len(data_table)

# Training sizes start at 300 and double for as long as more than 1000 rows
# would still be left over once the training rows are taken out.
TRAIN_SIZES = []
size = 300
while size < TOTAL_SIZE - 1000:
    TRAIN_SIZES.append(size)
    size *= 2

# Without this guard a small table surfaces as a bare IndexError on
# TRAIN_SIZES[-1] below, which says nothing about the actual cause.
if not TRAIN_SIZES:
    raise ValueError(
        "table_balanced.json has only %d rows; more than 1300 are required "
        "to build a train/test split" % TOTAL_SIZE
    )

# Everything that is not part of the largest training set is the test set.
TEST_SIZE = TOTAL_SIZE - TRAIN_SIZES[-1]

data = {
    "train_sizes": TRAIN_SIZES,
    "test_size": TEST_SIZE
}
# Pickle streams are bytes: open the target in binary mode so the file is
# written identically on every platform (and so this keeps working on Python 3).
with open("%s/model-data/metadata.pickle" % (REPO_ROOT,), "wb") as f:
    pickle.dump(data, f)

# Parenthesised single-argument form prints the same text on Python 2 and 3.
print("Training sizes: %s" % TRAIN_SIZES)
print("Test size: %d" % TEST_SIZE)

Training sizes: [300, 600, 1200, 2400, 4800, 9600, 19200]
Test size: 3588


In [3]:
def vectorize_table(model_type, tokenizer, parser, table, train_size, test_size, ngram_range=(1,1)):
    """Build a TF-IDF train/test dataset from ``table`` and pickle it to disk.

    The first ``train_size`` rows of ``table`` are used for training and the
    last ``test_size`` rows for testing. The vectorizer is fitted on the
    training rows only and then applied to the test rows.

    Args:
        model_type: Label used in the output file name.
        tokenizer: Callable passed to ``TfidfVectorizer(tokenizer=...)``.
        parser: Callable that receives a slice of ``table`` and returns the
            documents handed to the vectorizer.
        table: Sliceable sequence of rows; each row is indexed with the
            ``"flag-any"`` key to obtain its label.
        train_size: Number of leading rows used for training.
        test_size: Number of trailing rows used for testing.
        ngram_range: Forwarded to ``TfidfVectorizer``.

    Returns:
        None. The dataset (keys ``X_train``, ``Y_train``, ``X_test``,
        ``Y_test``) is written to
        ``<REPO_ROOT>/model-data/dataset_<model_type>_<train_size>.pickle``.
    """
    vectorizer = TfidfVectorizer(tokenizer=tokenizer, ngram_range=ngram_range)

    train_rows = table[:train_size]
    # Slice from an explicit start index: ``table[-test_size:]`` would return
    # the *entire* table when test_size == 0 (because -0 == 0), silently
    # putting every training row into the test set.
    test_rows = table[max(len(table) - test_size, 0):]

    # NOTE(review): labels are taken from every row, while the features come
    # from whatever ``parser`` returns. If the parser drops rows it cannot
    # handle, X and Y would have different lengths -- confirm against util.
    data = {
        "X_train": vectorizer.fit_transform(parser(train_rows)),
        "Y_train": np.array([item["flag-any"] for item in train_rows]),
        "X_test": vectorizer.transform(parser(test_rows)),
        "Y_test": np.array([item["flag-any"] for item in test_rows]),
    }

    # Pickle streams are bytes; binary mode keeps the file portable.
    with open("%s/model-data/dataset_%s_%d.pickle" % (REPO_ROOT, model_type, train_size), "wb") as f:
        pickle.dump(data, f)

In [22]:
# Write one "RegEx" dataset per training size, using the JS tokenizer/parser
# from util and the default unigram range.
for train_size in TRAIN_SIZES:
    print("RegEx Training size %d" % train_size)
    vectorize_table(
        "RegEx",
        util.tokenize_js,
        util.parse_js,
        data_table,
        train_size,
        TEST_SIZE,
    )

RegEx Training size 300
RegEx Training size 600
RegEx Training size 1200
RegEx Training size 2400
RegEx Training size 4800
RegEx Training size 9600
RegEx Training size 19200


In [4]:
# For every training size, write two datasets built with the AST
# tokenizer/parser from util: "AST" with the default n-gram range and
# "Bigram" with ngram_range=(1, 2).
for train_size in TRAIN_SIZES:
    print("AST Training size %d" % train_size)
    vectorize_table(
        "AST",
        util.tokenize_ast,
        util.parse_ast,
        data_table,
        train_size,
        TEST_SIZE,
    )

    print("Bigram Training size %d" % train_size)
    vectorize_table(
        "Bigram",
        util.tokenize_ast,
        util.parse_ast,
        data_table,
        train_size,
        TEST_SIZE,
        ngram_range=(1,2),
    )

Bigram Training size 300
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Bigram Training size 600
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Bigram Training size 1200
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Bigram Training size 2400
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Bigram Training size 4800
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Failed to parse JSON! maximum recursion depth exceeded while calling a Python object
Bigram Training size 9600
Failed to parse JSON! maximum recursion depth exceeded while calling a Python obj