In [1]:
# imports and setup
import json
from itertools import islice, imap

## Helper Functions

In [2]:
def parse_ast_line(line, sep="\t"):
    script_id, ast_json = line.strip().split(sep, 1)
    return script_id, json.loads(ast_json)
      
def parse_ast(lines):
    for line in lines:
        if len(line) > 1:
            yield parse_ast_line(line)
            
def filter_ast_by_type(ast, type):
    if _is_node_of_type(ast, type):
        yield ast
    elif isinstance(ast, dict):
        for _, val in ast.iteritems():
            for x in filter_ast_by_type(val, type):
                yield x
    elif isinstance(ast, list):
        for val in ast:
            for x in filter_ast_by_type(val, type):
                yield x
      
def _is_node_of_type(d, type):
    return isinstance(d, dict) and \
           "type" in d and \
           d["type"] == type

## Filter AST nodes with type of Identifier

Here, try one program, and print out all its identifiers.

In [3]:
AST_FILE = '../lang/ast.json11'

with open(AST_FILE, 'r') as f:
    _, one_program = parse_ast(f.readlines()).next()

In [4]:
list(filter_ast_by_type(one_program, "Identifier"))[:10]

[{u'name': u'Number', u'type': u'Identifier'},
 {u'name': u'$n', u'type': u'Identifier'},
 {u'name': u'$n', u'type': u'Identifier'},
 {u'name': u'locale', u'type': u'Identifier'},
 {u'name': u'Drupal', u'type': u'Identifier'},
 {u'name': u'renderItem', u'type': u'Identifier'},
 {u'name': u'apachesolr_autocomplete', u'type': u'Identifier'},
 {u'name': u'Drupal', u'type': u'Identifier'},
 {u'name': u'_renderItem', u'type': u'Identifier'},
 {u'name': u'data', u'type': u'Identifier'}]

## Naive word2vec

Use identifier names as vocabulary.

In [15]:
def load_sentences(filename, sentences):
    with open(filename, 'r') as f:
        for _, program in parse_ast(f.readlines()):
            sentence = filter_ast_by_type(program, type="Identifier")
            sentence = list( imap(lambda d:d["name"], sentence) )
            sentences.append(sentence)
    return sentences

In [6]:
import sys
paths = ['/home/yiran/.local/lib/python2.7/site-packages/PyOpenGL-3.0.1-py2.7.egg', '/home/yiran/.local/lib/python2.7/site-packages/ftputil-3.2-py2.7.egg', '/home/yiran/anaconda/lib/python27.zip', '/home/yiran/anaconda/lib/python2.7', '/home/yiran/anaconda/lib/python2.7/plat-linux2', '/home/yiran/anaconda/lib/python2.7/lib-tk', '/home/yiran/anaconda/lib/python2.7/lib-old', '/home/yiran/anaconda/lib/python2.7/lib-dynload', '/home/yiran/.local/lib/python2.7/site-packages', '/home/yiran/anaconda/lib/python2.7/site-packages/Sphinx-1.3.5-py2.7.egg', '/home/yiran/anaconda/lib/python2.7/site-packages/setuptools-19.6.2-py2.7.egg', '/home/yiran/anaconda/lib/python2.7/site-packages', '/home/yiran/anaconda/lib/python2.7/site-packages/cryptography-1.0.2-py2.7-linux-x86_64.egg']
for p in paths:
    sys.path.insert(0, p)

In [10]:
from gensim.models.word2vec import Word2Vec
import glob

In [None]:
sentences = []
for f in glob.glob("../lang/ast.json*"):
    print ("Loading %s" % f)
    load_sentences(f, sentences)

Loading ../lang/ast.json26
Loading ../lang/ast.json45
Loading ../lang/ast.json40
Loading ../lang/ast.json36
Loading ../lang/ast.json18
Loading ../lang/ast.json02
Loading ../lang/ast.json38
Loading ../lang/ast.json47
Loading ../lang/ast.json53
Loading ../lang/ast.json66
Loading ../lang/ast.json72
Loading ../lang/ast.json04
Loading ../lang/ast.json48

In [21]:
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)

In [22]:
model.vocab

{u'onMemberItemClick': <gensim.models.word2vec.Vocab at 0x7f6402ee3c90>,
 u'wrapperElement': <gensim.models.word2vec.Vocab at 0x7f63f97ea650>,
 u'canModerate': <gensim.models.word2vec.Vocab at 0x7f63f97ea290>,
 u'u0': <gensim.models.word2vec.Vocab at 0x7f63fedd9350>,
 u'_jQuery': <gensim.models.word2vec.Vocab at 0x7f6400582c90>,
 u'handleLoginClick': <gensim.models.word2vec.Vocab at 0x7f63ff4a3610>,
 u'removeMap': <gensim.models.word2vec.Vocab at 0x7f63fc188cd0>,
 u'show_description': <gensim.models.word2vec.Vocab at 0x7f63ffc79ad0>,
 u'canUserCreateAndDeleteUserGroups': <gensim.models.word2vec.Vocab at 0x7f64007e5d90>,
 u'callbackHash': <gensim.models.word2vec.Vocab at 0x7f641ae31650>,
 u'jqDeferred': <gensim.models.word2vec.Vocab at 0x7f64005827d0>,
 u'$prevHtml': <gensim.models.word2vec.Vocab at 0x7f6400582190>,
 u'yellow': <gensim.models.word2vec.Vocab at 0x7f63f97eadd0>,
 u'factory': <gensim.models.word2vec.Vocab at 0x7f641ef6d1d0>,
 u'DELAY': <gensim.models.word2vec.Vocab at 0x7f

In [24]:
model.most_similar(positive=["jQuery"], negative="lodash")

[(u'allowed', 0.6279492974281311),
 (u'$msg_input', 0.60919588804245),
 (u'Unknown', 0.6089186668395996),
 (u'focusMessageInput', 0.6046967506408691),
 (u'eagerLoadAssets', 0.5932580828666687),
 (u'is_our_app', 0.5896718502044678),
 (u'_bindUI', 0.5884132981300354),
 (u'LinearRing', 0.5816546678543091),
 (u'activateADA', 0.5615449547767639),
 (u'ABTest', 0.558937132358551)]