In [2]:
# imports and setup
import json
from itertools import islice, imap

## Helper Functions

In [3]:
def parse_ast_line(line, sep="\t"):
    script_id, ast_json = line.strip().split(sep, 1)
    return script_id, json.loads(ast_json)
      
def parse_ast(lines):
    for line in lines:
        if len(line) > 1:
            yield parse_ast_line(line)
            
def filter_ast_by_type(ast, type):
    if _is_node_of_type(ast, type):
        yield ast
    elif isinstance(ast, dict):
        for _, val in ast.iteritems():
            for x in filter_ast_by_type(val, type):
                yield x
    elif isinstance(ast, list):
        for val in ast:
            for x in filter_ast_by_type(val, type):
                yield x
      
def _is_node_of_type(d, type):
    return isinstance(d, dict) and \
           "type" in d and \
           d["type"] == type
      
def ast_types(ast):
    if isinstance(ast, dict):
        if "type" in ast:
            yield ast["type"]
        for _, val in ast.iteritems():
            for x in ast_types(val):
                yield x
    elif isinstance(ast, list):
        for val in ast:
            for x in ast_types(val):
                yield x      

## Filter AST nodes with type of Identifier

Here, try one program, and print out all its identifiers.

In [4]:
AST_FILE = '../lang/ast.json11'

with open(AST_FILE, 'r') as f:
    _, one_program = parse_ast(f.readlines()).next()

In [5]:
set( ast_types(one_program) )

{u'ArrayExpression',
 u'AssignmentExpression',
 u'BinaryExpression',
 u'BlockStatement',
 u'CallExpression',
 u'EmptyStatement',
 u'ExpressionStatement',
 u'FunctionExpression',
 u'Identifier',
 u'IfStatement',
 u'Literal',
 u'LogicalExpression',
 u'MemberExpression',
 u'ObjectExpression',
 u'Program',
 u'Property',
 u'ReturnStatement',
 u'ThisExpression',
 u'UnaryExpression',
 u'VariableDeclaration',
 u'VariableDeclarator'}

In [6]:
list(filter_ast_by_type(one_program, "Identifier"))[:10]

[{u'name': u'Number', u'type': u'Identifier'},
 {u'name': u'$n', u'type': u'Identifier'},
 {u'name': u'$n', u'type': u'Identifier'},
 {u'name': u'locale', u'type': u'Identifier'},
 {u'name': u'Drupal', u'type': u'Identifier'},
 {u'name': u'renderItem', u'type': u'Identifier'},
 {u'name': u'apachesolr_autocomplete', u'type': u'Identifier'},
 {u'name': u'Drupal', u'type': u'Identifier'},
 {u'name': u'_renderItem', u'type': u'Identifier'},
 {u'name': u'data', u'type': u'Identifier'}]

## Naive word2vec

Use identifier names as vocabulary.

In [7]:
def should_keep_string(s):
    return 1 < len(s) < 16
def load_sentences(filename, sentences):
    with open(filename, 'r') as f:
        for id, program in parse_ast(f.readlines()):
            idents = filter_ast_by_type(program, type="Identifier")
            idents = list( imap(lambda d:d["name"], idents) )
            lits = filter_ast_by_type(program, type="Literal")
            lits = [l["value"] for l in lits \
                      if isinstance(l["value"], str) and should_keep_string(l['value'])]
            # sentence = idents + lits
            sentences.append((id, idents))
            sentences.append((id, lits))
    return sentences

In [8]:
import sys
paths = ['/home/yiran/.local/lib/python2.7/site-packages/PyOpenGL-3.0.1-py2.7.egg', '/home/yiran/.local/lib/python2.7/site-packages/ftputil-3.2-py2.7.egg', '/home/yiran/anaconda/lib/python27.zip', '/home/yiran/anaconda/lib/python2.7', '/home/yiran/anaconda/lib/python2.7/plat-linux2', '/home/yiran/anaconda/lib/python2.7/lib-tk', '/home/yiran/anaconda/lib/python2.7/lib-old', '/home/yiran/anaconda/lib/python2.7/lib-dynload', '/home/yiran/.local/lib/python2.7/site-packages', '/home/yiran/anaconda/lib/python2.7/site-packages/Sphinx-1.3.5-py2.7.egg', '/home/yiran/anaconda/lib/python2.7/site-packages/setuptools-19.6.2-py2.7.egg', '/home/yiran/anaconda/lib/python2.7/site-packages', '/home/yiran/anaconda/lib/python2.7/site-packages/cryptography-1.0.2-py2.7-linux-x86_64.egg']
for p in paths:
    sys.path.insert(0, p)

In [9]:
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import glob

Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: trial mode expires in 30 days


In [10]:
sentences = []
for f in glob.glob("../lang/ast.json*"):
    print ("Loading %s" % f)
    load_sentences(f, sentences)

Loading ../lang/ast.json26
Loading ../lang/ast.json45
Loading ../lang/ast.json40
Loading ../lang/ast.json36
Loading ../lang/ast.json18
Loading ../lang/ast.json02
Loading ../lang/ast.json38
Loading ../lang/ast.json47
Loading ../lang/ast.json53
Loading ../lang/ast.json66
Loading ../lang/ast.json72
Loading ../lang/ast.json04
Loading ../lang/ast.json48
Loading ../lang/ast.json65
Loading ../lang/ast.json64
Loading ../lang/ast.json49
Loading ../lang/ast.json33
Loading ../lang/ast.json30
Loading ../lang/ast.json10
Loading ../lang/ast.json51
Loading ../lang/ast.json60
Loading ../lang/ast.json31
Loading ../lang/ast.json34
Loading ../lang/ast.json75
Loading ../lang/ast.json43
Loading ../lang/ast.json73
Loading ../lang/ast.json57
Loading ../lang/ast.json14
Loading ../lang/ast.json42
Loading ../lang/ast.json08
Loading ../lang/ast.json76
Loading ../lang/ast.json63
Loading ../lang/ast.json23
Loading ../lang/ast.json71
Loading ../lang/ast.json07
Loading ../lang/ast.json50
Loading ../lang/ast.json25
L

In [19]:
def is_chart(s):
    return len(s) == 1 and s.lower() in "abcdefghijklmnopqrstuvwxyz"

def remove_single_char(sentences):
    return [s.lower() for s in sentences if not is_char(s)]
  
docs = map(lambda (id,s): TaggedDocument(words=remove_single_char(s), tags=[id]), sentences)

In [20]:
model = Doc2Vec(docs, size=200, window=15, 
                min_count=5, workers=2, dm=0,
                max_vocab_size=1000000,
                hs=0, negative=5,
                iter=20,
                sample=5e-5,
                dbow_words=1)

In [22]:
model.most_similar(positive="createelement")

[(u'appendchild', 0.7830908298492432),
 (u'getelementsbytagname', 0.7050471305847168),
 (u'classname', 0.6742371320724487),
 (u'innerhtml', 0.6652939319610596),
 (u'removechild', 0.6607442498207092),
 (u'setattribute', 0.6391154527664185),
 (u'insertbefore', 0.6216260194778442),
 (u'firstchild', 0.6055293083190918),
 (u'document', 0.6001043319702148),
 (u'createtextnode', 0.5787196755409241)]

In [23]:
model.most_similar("ajax")

[(u'datatype', 0.7558028101921082),
 (u'success', 0.639014482498169),
 (u'beforesend', 0.617257833480835),
 (u'url', 0.6130246520042419),
 (u'eventresponse', 0.593224823474884),
 (u'keypressresponse', 0.590523898601532),
 (u'updatefilters', 0.571739137172699),
 (u'responsetext', 0.5300601124763489),
 (u'ajaxing', 0.5285371541976929),
 (u'new_mdp', 0.5220765471458435)]

In [24]:
doc2vec_model = model

In [28]:
import random
import re
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB,MultinomialNB
from sklearn import linear_model

In [29]:
with open("./table_flag.json") as f:
    table = json.load(f)
    
positive_examples = [i for i, e in enumerate(table) if e["flag-any"] == 1]
negative_examples = [i for i, e in enumerate(table) if e["flag-any"] == 0]
random.shuffle(positive_examples)
random.shuffle(negative_examples)
negative_examples = negative_examples[:len(positive_examples)]
print "%d positive and %d negative examples." % (len(positive_examples), len(negative_examples))

train_table = [table[i] for i in positive_examples[:200]] + [table[i] for i in negative_examples[:200]]
test_table = [table[i] for i in positive_examples[200:]] + [table[i] for i in negative_examples[200:]]

328 positive and 328 negative examples.


In [30]:
def get_features(table, model):
    X = np.zeros(shape=(len(table), model.vector_size))
    not_missing = []
    for i,cell in enumerate(table):
        if cell["sha"] in model.docvecs:
            not_missing.append(i)
            X[i] = model.docvecs[cell["sha"]]
    return X, np.array(not_missing)

In [31]:
X_train, train_keep = get_features(train_table, model)
X_test, test_keep = get_features(test_table, model)


Y_train = np.array([item["flag-any"] for item in train_table])
Y_test = np.array([item["flag-any"] for item in test_table])

X_train = X_train[train_keep]
X_test = X_test[test_keep]
Y_train = Y_train[train_keep]
Y_test = Y_test[test_keep]

In [32]:
X_train

array([[-0.05647162,  0.07979779,  0.13874571, ..., -0.32694811,
        -0.00296148, -0.07046972],
       [ 0.05981288,  0.02987484,  0.05384297, ..., -0.07373006,
         0.09519708, -0.05659505],
       [ 0.45967886,  0.56281298,  0.2791875 , ..., -0.15507841,
        -0.27003232, -0.47069818],
       ..., 
       [ 0.0357193 , -0.04053598, -0.00395663, ...,  0.01871301,
         0.00543078, -0.01476625],
       [ 0.03593505,  0.02288858,  0.01038239, ...,  0.0116191 ,
        -0.04680463, -0.02305219],
       [ 0.08578591,  0.24274816, -0.08316214, ..., -0.18644308,
        -0.2454115 ,  0.20583117]])

In [33]:
model = BernoulliNB()
model.fit(X_train, Y_train)
test_pred = model.predict(X_test)

print "Baseline: %d/%d = %f%%" % (sum(Y_test == 0), len(Y_test), ((float(sum(Y_test == 0)) / len(Y_test)) * 100))
print "Accuracy: %d/%d = %f%%" % (sum(Y_test == test_pred), len(Y_test), ((float(sum(Y_test == test_pred)) / len(Y_test)) * 100))

Baseline: 123/249 = 49.397590%
Accuracy: 192/249 = 77.108434%


In [34]:
model = SVC()
model.fit(X_train, Y_train)
test_pred = model.predict(X_test)

print "Baseline: %d/%d = %f%%" % (sum(Y_test == 0), len(Y_test), ((float(sum(Y_test == 0)) / len(Y_test)) * 100))
print "Accuracy: %d/%d = %f%%" % (sum(Y_test == test_pred), len(Y_test), ((float(sum(Y_test == test_pred)) / len(Y_test)) * 100))

Baseline: 123/249 = 49.397590%
Accuracy: 184/249 = 73.895582%


In [35]:
model = linear_model.SGDClassifier(class_weight="auto")
model.fit(X_train, Y_train)
test_pred = model.predict(X_test)

print "Baseline: %d/%d = %f%%" % (sum(Y_test == 0), len(Y_test), ((float(sum(Y_test == 0)) / len(Y_test)) * 100))
print "Accuracy: %d/%d = %f%%" % (sum(Y_test == test_pred), len(Y_test), ((float(sum(Y_test == test_pred)) / len(Y_test)) * 100))

Baseline: 123/249 = 49.397590%
Accuracy: 180/249 = 72.289157%
