In [1]:
# imports and setup
import json
from itertools import islice, imap

## Helper Functions

In [2]:
def parse_ast_line(line, sep="\t"):
    script_id, ast_json = line.strip().split(sep, 1)
    return script_id, json.loads(ast_json)
      
def parse_ast(lines):
    for line in lines:
        if len(line) > 1:
            yield parse_ast_line(line)
            
def filter_ast_by_type(ast, type):
    if _is_node_of_type(ast, type):
        yield ast
    elif isinstance(ast, dict):
        for _, val in ast.iteritems():
            for x in filter_ast_by_type(val, type):
                yield x
    elif isinstance(ast, list):
        for val in ast:
            for x in filter_ast_by_type(val, type):
                yield x
      
def _is_node_of_type(d, type):
    return isinstance(d, dict) and \
           "type" in d and \
           d["type"] == type

## Filter AST nodes with type of Identifier

Here, try one program, and print out all its identifiers.

In [3]:
AST_FILE = '../lang/ast.json11'

with open(AST_FILE, 'r') as f:
    _, one_program = parse_ast(f.readlines()).next()

In [4]:
list(filter_ast_by_type(one_program, "Identifier"))[:10]

[{u'name': u'Number', u'type': u'Identifier'},
 {u'name': u'$n', u'type': u'Identifier'},
 {u'name': u'$n', u'type': u'Identifier'},
 {u'name': u'locale', u'type': u'Identifier'},
 {u'name': u'Drupal', u'type': u'Identifier'},
 {u'name': u'renderItem', u'type': u'Identifier'},
 {u'name': u'apachesolr_autocomplete', u'type': u'Identifier'},
 {u'name': u'Drupal', u'type': u'Identifier'},
 {u'name': u'_renderItem', u'type': u'Identifier'},
 {u'name': u'data', u'type': u'Identifier'}]

## Naive word2vec

Use identifier names as vocabulary.

In [15]:
def load_sentences(filename, sentences):
    with open(filename, 'r') as f:
        for _, program in parse_ast(f.readlines()):
            sentence = filter_ast_by_type(program, type="Identifier")
            sentence = list( imap(lambda d:d["name"], sentence) )
            sentences.append(sentence)
    return sentences

In [6]:
import sys
paths = ['/home/yiran/.local/lib/python2.7/site-packages/PyOpenGL-3.0.1-py2.7.egg', '/home/yiran/.local/lib/python2.7/site-packages/ftputil-3.2-py2.7.egg', '/home/yiran/anaconda/lib/python27.zip', '/home/yiran/anaconda/lib/python2.7', '/home/yiran/anaconda/lib/python2.7/plat-linux2', '/home/yiran/anaconda/lib/python2.7/lib-tk', '/home/yiran/anaconda/lib/python2.7/lib-old', '/home/yiran/anaconda/lib/python2.7/lib-dynload', '/home/yiran/.local/lib/python2.7/site-packages', '/home/yiran/anaconda/lib/python2.7/site-packages/Sphinx-1.3.5-py2.7.egg', '/home/yiran/anaconda/lib/python2.7/site-packages/setuptools-19.6.2-py2.7.egg', '/home/yiran/anaconda/lib/python2.7/site-packages', '/home/yiran/anaconda/lib/python2.7/site-packages/cryptography-1.0.2-py2.7-linux-x86_64.egg']
for p in paths:
    sys.path.insert(0, p)

In [10]:
from gensim.models.word2vec import Word2Vec
import glob

In [25]:
sentences = []
for f in glob.glob("../lang/ast.json*"):
    print ("Loading %s" % f)
    load_sentences(f, sentences)

Loading ../lang/ast.json26
Loading ../lang/ast.json45
Loading ../lang/ast.json40
Loading ../lang/ast.json36
Loading ../lang/ast.json18
Loading ../lang/ast.json02
Loading ../lang/ast.json38
Loading ../lang/ast.json47
Loading ../lang/ast.json53
Loading ../lang/ast.json66
Loading ../lang/ast.json72
Loading ../lang/ast.json04
Loading ../lang/ast.json48
Loading ../lang/ast.json65
Loading ../lang/ast.json64
Loading ../lang/ast.json49
Loading ../lang/ast.json33
Loading ../lang/ast.json30
Loading ../lang/ast.json10
Loading ../lang/ast.json51
Loading ../lang/ast.json60
Loading ../lang/ast.json31
Loading ../lang/ast.json34
Loading ../lang/ast.json75
Loading ../lang/ast.json43
Loading ../lang/ast.json73
Loading ../lang/ast.json57
Loading ../lang/ast.json14
Loading ../lang/ast.json42
Loading ../lang/ast.json08
Loading ../lang/ast.json76
Loading ../lang/ast.json63
Loading ../lang/ast.json23
Loading ../lang/ast.json71
Loading ../lang/ast.json07
Loading ../lang/ast.json50
Loading ../lang/ast.json25
L

In [26]:
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)

In [27]:
model.vocab

{u'buttonOptions': <gensim.models.word2vec.Vocab at 0x7f641497d8d0>,
 u'_jQuery': <gensim.models.word2vec.Vocab at 0x7f641497d910>,
 u'sending2': <gensim.models.word2vec.Vocab at 0x7f63b324da50>,
 u'show_description': <gensim.models.word2vec.Vocab at 0x7f63b274ba50>,
 u'median_corrected': <gensim.models.word2vec.Vocab at 0x7f63f4877e90>,
 u'$prevHtml': <gensim.models.word2vec.Vocab at 0x7f63fac194d0>,
 u'endCh': <gensim.models.word2vec.Vocab at 0x7f63f4877ed0>,
 u'fallbackNS': <gensim.models.word2vec.Vocab at 0x7f63fac19410>,
 u'serviceFlag': <gensim.models.word2vec.Vocab at 0x7f63b30d84d0>,
 u'mailAgence': <gensim.models.word2vec.Vocab at 0x7f63fac19110>,
 u'hanging': <gensim.models.word2vec.Vocab at 0x7f63f4877f10>,
 u'_show_authy': <gensim.models.word2vec.Vocab at 0x7f63f4877f50>,
 u'myToggleClass': <gensim.models.word2vec.Vocab at 0x7f63c09a85d0>,
 u'isLogged': <gensim.models.word2vec.Vocab at 0x7f63cf96a350>,
 u'localized': <gensim.models.word2vec.Vocab at 0x7f640b7c8090>,
 u'enab

In [33]:
model.most_similar(positive="createElement")

[(u'appendChild', 0.619277834892273),
 (u'createTextNode', 0.604455292224884),
 (u'createText', 0.576920211315155),
 (u'trcGetChildByClassName', 0.5707101225852966),
 (u'body', 0.5525359511375427),
 (u'$MessengerVideoPlayer1', 0.5305860042572021),
 (u'waitingScripts', 0.52565997838974),
 (u'byUrl', 0.5249150991439819),
 (u'currentScript', 0.5237258672714233),
 (u'safeFragment', 0.5164779424667358)]

In [40]:
model.most_similar("ajax")

[(u'getJSON', 0.7115294337272644),
 (u'success', 0.710839033126831),
 (u'dataType', 0.6964539289474487),
 (u'errorCb', 0.6803914308547974),
 (u'posturl', 0.6754568815231323),
 (u'url', 0.6729618310928345),
 (u'rfsh', 0.6606138944625854),
 (u'ajaxOptions', 0.6587070226669312),
 (u'dataParam', 0.6557852029800415),
 (u'$cf', 0.6461215019226074)]