In [1]:
# imports and setup
import json
from itertools import islice, imap

## Helper Functions

In [41]:
def parse_ast_line(line, sep="\t"):
    script_id, ast_json = line.strip().split(sep, 1)
    return script_id, json.loads(ast_json)
      
def parse_ast(lines):
    for line in lines:
        if len(line) > 1:
            yield parse_ast_line(line)
            
def filter_ast_by_type(ast, type):
    if _is_node_of_type(ast, type):
        yield ast
    elif isinstance(ast, dict):
        for _, val in ast.iteritems():
            for x in filter_ast_by_type(val, type):
                yield x
    elif isinstance(ast, list):
        for val in ast:
            for x in filter_ast_by_type(val, type):
                yield x
      
def _is_node_of_type(d, type):
    return isinstance(d, dict) and \
           "type" in d and \
           d["type"] == type
      
def ast_types(ast):
    if isinstance(ast, dict):
        if "type" in ast:
            yield ast["type"]
        for _, val in ast.iteritems():
            for x in ast_types(val):
                yield x
    elif isinstance(ast, list):
        for val in ast:
            for x in ast_types(val):
                yield x      

## Filter AST nodes with type of Identifier

Here, try one program, and print out all its identifiers.

In [3]:
AST_FILE = '../lang/ast.json11'

with open(AST_FILE, 'r') as f:
    _, one_program = parse_ast(f.readlines()).next()

In [44]:
set( ast_types(one_program) )

{u'ArrayExpression',
 u'AssignmentExpression',
 u'BinaryExpression',
 u'BlockStatement',
 u'CallExpression',
 u'EmptyStatement',
 u'ExpressionStatement',
 u'FunctionExpression',
 u'Identifier',
 u'IfStatement',
 u'Literal',
 u'LogicalExpression',
 u'MemberExpression',
 u'ObjectExpression',
 u'Program',
 u'Property',
 u'ReturnStatement',
 u'ThisExpression',
 u'UnaryExpression',
 u'VariableDeclaration',
 u'VariableDeclarator'}

In [63]:
list(filter_ast_by_type(one_program, "FunctionExpression"))[:10]

[{u'body': {u'body': [{u'argument': {u'arguments': [{u'left': {u'name': u'$n',
         u'type': u'Identifier'},
        u'operator': u'>',
        u'right': {u'raw': u'1', u'type': u'Literal', u'value': 1},
        u'type': u'BinaryExpression'}],
      u'callee': {u'name': u'Number', u'type': u'Identifier'},
      u'type': u'CallExpression'},
     u'type': u'ReturnStatement'}],
   u'type': u'BlockStatement'},
  u'expression': False,
  u'generator': False,
  u'id': None,
  u'params': [{u'name': u'$n', u'type': u'Identifier'}],
  u'type': u'FunctionExpression'},
 {u'body': {u'body': [{u'expression': {u'left': {u'computed': False,
       u'object': {u'arguments': [{u'raw': u"'autocomplete'",
          u'type': u'Literal',
          u'value': u'autocomplete'}],
        u'callee': {u'computed': False,
         u'object': {u'arguments': [{u'raw': u"'form-autocomplete'",
            u'type': u'Literal',
            u'value': u'form-autocomplete'}],
          u'callee': {u'computed': False,
 

## Naive word2vec

Use identifier names as vocabulary.

In [52]:
def should_keep_string(s):
    return 1 < len(s) < 16
def load_sentences(filename, sentences):
    with open(filename, 'r') as f:
        for id, program in parse_ast(f.readlines()):
            idents = filter_ast_by_type(program, type="Identifier")
            idents = list( imap(lambda d:d["name"], idents) )
            lits = filter_ast_by_type(program, type="Literal")
            lits = [l["value"] for l in lits \
                      if isinstance(l["value"], str) and should_keep_string(l['value'])]
            # sentence = idents + lits
            sentences.append((id, idents))
            sentences.append((id, lits))
    return sentences

In [6]:
import sys
paths = ['/home/yiran/.local/lib/python2.7/site-packages/PyOpenGL-3.0.1-py2.7.egg', '/home/yiran/.local/lib/python2.7/site-packages/ftputil-3.2-py2.7.egg', '/home/yiran/anaconda/lib/python27.zip', '/home/yiran/anaconda/lib/python2.7', '/home/yiran/anaconda/lib/python2.7/plat-linux2', '/home/yiran/anaconda/lib/python2.7/lib-tk', '/home/yiran/anaconda/lib/python2.7/lib-old', '/home/yiran/anaconda/lib/python2.7/lib-dynload', '/home/yiran/.local/lib/python2.7/site-packages', '/home/yiran/anaconda/lib/python2.7/site-packages/Sphinx-1.3.5-py2.7.egg', '/home/yiran/anaconda/lib/python2.7/site-packages/setuptools-19.6.2-py2.7.egg', '/home/yiran/anaconda/lib/python2.7/site-packages', '/home/yiran/anaconda/lib/python2.7/site-packages/cryptography-1.0.2-py2.7-linux-x86_64.egg']
for p in paths:
    sys.path.insert(0, p)

In [7]:
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import glob

Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: trial mode expires in 29 days


In [53]:
sentences = []
for f in glob.glob("../lang/ast.json*"):
    print ("Loading %s" % f)
    load_sentences(f, sentences)

Loading ../lang/ast.json26
Loading ../lang/ast.json45
Loading ../lang/ast.json40
Loading ../lang/ast.json36
Loading ../lang/ast.json18
Loading ../lang/ast.json02
Loading ../lang/ast.json38
Loading ../lang/ast.json47
Loading ../lang/ast.json53
Loading ../lang/ast.json66
Loading ../lang/ast.json72
Loading ../lang/ast.json04
Loading ../lang/ast.json48
Loading ../lang/ast.json65
Loading ../lang/ast.json64
Loading ../lang/ast.json49
Loading ../lang/ast.json33
Loading ../lang/ast.json30
Loading ../lang/ast.json10
Loading ../lang/ast.json51
Loading ../lang/ast.json60
Loading ../lang/ast.json31
Loading ../lang/ast.json34
Loading ../lang/ast.json75
Loading ../lang/ast.json43
Loading ../lang/ast.json73
Loading ../lang/ast.json57
Loading ../lang/ast.json14
Loading ../lang/ast.json42
Loading ../lang/ast.json08
Loading ../lang/ast.json76
Loading ../lang/ast.json63
Loading ../lang/ast.json23
Loading ../lang/ast.json71
Loading ../lang/ast.json07
Loading ../lang/ast.json50
Loading ../lang/ast.json25
L

In [54]:
docs = map(lambda (id,s): TaggedDocument(words=s, tags=[id]), sentences)

In [56]:
model = Doc2Vec(docs, size=100, window=5, 
                min_count=5, workers=2, dm=0,
                max_vocab_size=1000000,
                hs=0, negative=5,
                iter=15,
                dbow_words=1)

In [57]:
model.most_similar(positive="createElement")

[(u'appendChild', 0.7726303339004517),
 (u'applyBindings', 0.6737308502197266),
 (u'setAttribute', 0.6706061363220215),
 (u'renderImage', 0.6631907224655151),
 (u'header1', 0.6613990068435669),
 (u'getElementsByTagName', 0.6594642400741577),
 (u'cssText', 0.6588724255561829),
 (u'onErrorBannerClick', 0.6517629623413086),
 (u'utilsStyleElement', 0.6492123603820801),
 (u'CssUtils', 0.6470474600791931)]

In [58]:
model.most_similar("ajax")

[(u'$cart', 0.7283447980880737),
 (u'dataType', 0.7078338265419006),
 (u'useajax', 0.7076820731163025),
 (u'wpcf7AjaxSuccess', 0.6991125345230103),
 (u'url_edit_entreprise', 0.69660484790802),
 (u'url_edit_crea_offre', 0.6951600909233093),
 (u'epnConfig', 0.6921828985214233),
 (u'xmlhttprequest', 0.6858308911323547),
 (u'data_to_send', 0.68504798412323),
 (u'sel_affiliate_Id', 0.6821858882904053)]

In [65]:
import random
import re
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB,MultinomialNB
from sklearn import linear_model

In [17]:
with open("./table_flag.json") as f:
    table = json.load(f)
    
positive_examples = [i for i, e in enumerate(table) if e["flag-any"] == 1]
negative_examples = [i for i, e in enumerate(table) if e["flag-any"] == 0]
random.shuffle(positive_examples)
random.shuffle(negative_examples)
negative_examples = negative_examples[:len(positive_examples)]
print "%d positive and %d negative examples." % (len(positive_examples), len(negative_examples))

train_table = [table[i] for i in positive_examples[:200]] + [table[i] for i in negative_examples[:200]]
test_table = [table[i] for i in positive_examples[200:]] + [table[i] for i in negative_examples[200:]]

328 positive and 328 negative examples.


In [32]:
def get_features(table, model):
    X = np.zeros(shape=(len(table), model.vector_size))
    not_missing = []
    for i,cell in enumerate(table):
        if cell["sha"] in model.docvecs:
            not_missing.append(i)
            X[i] = model.docvecs[cell["sha"]]
    return X, np.array(not_missing)

In [59]:
X_train, train_keep = get_features(train_table, model)
X_test, test_keep = get_features(test_table, model)


Y_train = np.array([item["flag-any"] for item in train_table])
Y_test = np.array([item["flag-any"] for item in test_table])

X_train = X_train[train_keep]
X_test = X_test[test_keep]
Y_train = Y_train[train_keep]
Y_test = Y_test[test_keep]

In [64]:
X_train

array([[-0.37665585, -0.25804096, -0.09731802, ...,  0.59474295,
        -0.36818811,  0.49149922],
       [ 0.34879792,  0.73458415,  0.38026088, ...,  0.82438803,
        -0.28636995,  0.9368186 ],
       [-0.00947819,  0.23088087, -0.16749379, ...,  0.21576415,
        -0.07779026, -0.09591251],
       ..., 
       [-0.02744515,  0.17553316,  0.07467901, ...,  0.27111956,
         0.04442127,  0.12945922],
       [ 0.00728396, -0.22763309,  0.24821691, ...,  0.68039918,
        -0.08972689, -0.29166201],
       [-0.45122156, -0.0846123 ,  0.18062885, ...,  0.1151379 ,
         0.42196599,  0.08444414]])

In [68]:
model = BernoulliNB()
model.fit(X_train, Y_train)
test_pred = model.predict(X_test)

print "Baseline: %d/%d = %f%%" % (sum(Y_test == 0), len(Y_test), ((float(sum(Y_test == 0)) / len(Y_test)) * 100))
print "Accuracy: %d/%d = %f%%" % (sum(Y_test == test_pred), len(Y_test), ((float(sum(Y_test == test_pred)) / len(Y_test)) * 100))

Baseline: 123/250 = 49.200000%
Accuracy: 176/250 = 70.400000%


In [61]:
model = SVC()
model.fit(X_train, Y_train)
test_pred = model.predict(X_test)

print "Baseline: %d/%d = %f%%" % (sum(Y_test == 0), len(Y_test), ((float(sum(Y_test == 0)) / len(Y_test)) * 100))
print "Accuracy: %d/%d = %f%%" % (sum(Y_test == test_pred), len(Y_test), ((float(sum(Y_test == test_pred)) / len(Y_test)) * 100))

Baseline: 123/250 = 49.200000%
Accuracy: 160/250 = 64.000000%


In [62]:
model = linear_model.SGDClassifier(class_weight="auto")
model.fit(X_train, Y_train)
test_pred = model.predict(X_test)

print "Baseline: %d/%d = %f%%" % (sum(Y_test == 0), len(Y_test), ((float(sum(Y_test == 0)) / len(Y_test)) * 100))
print "Accuracy: %d/%d = %f%%" % (sum(Y_test == test_pred), len(Y_test), ((float(sum(Y_test == test_pred)) / len(Y_test)) * 100))

Baseline: 123/250 = 49.200000%
Accuracy: 173/250 = 69.200000%
