# Experiments with Doc2Vec

Train a Doc2Vec model on tokenized Java source files and inspect the learned token embeddings.

## Setup

In [1]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec
import gensim.models.doc2vec
from code_embeddings.utils import tokenize
from javalang import tokenizer
from pathlib import Path
import multiprocessing

# gensim reports FAST_VERSION == -1 when its compiled (optimized) training routines
# could not be loaded; fail early rather than train with the slow fallback.
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

## Build and Train the Model

In [2]:
# Directory layout (all paths are relative to the notebook's working directory).
models_dir = Path('models')              # trained models are saved here
train_code_dir = Path('training_data')   # root searched for the Java training files
test_code_dir = Path('test_data')        # root searched for the per-task test programs

In [5]:
def read_train_corpus(max_files=100000, report_every=10000):
    """Yield one ``TaggedDocument`` per tokenizable Java file under the training directory.

    Every path matching ``java_projects/**/*.java`` below ``train_code_dir`` is
    examined. Each regular file is read, tokenized with the javalang tokenizer,
    and every token value is further split on single spaces. Files that cannot
    be read or tokenized, or that yield no tokens, are skipped (best effort).

    Args:
        max_files: Maximum number of matching paths to examine; ``None`` means
            no limit. Skipped paths count towards the limit.
        report_every: Print a progress line each time this many paths have been
            examined; ``0`` or ``None`` disables progress output.

    Yields:
        TaggedDocument: the token list, tagged with the file's base name.
    """
    java_files = train_code_dir.glob('./java_projects/**/*.java')
    for count, file in enumerate(java_files, start=1):
        if max_files is not None and count > max_files:
            break

        tokens = None
        # The glob can also match entries that are not regular files
        # (e.g. directories named like "*.java"); those are skipped.
        if file.is_file():
            try:
                # open() is inside the try so one unreadable file cannot abort
                # a scan over the whole corpus.
                with file.open() as f:
                    code = f.read()
                # Splitting on " " breaks multi-word tokens (such as string
                # literals) into several pieces.
                tokens = [piece
                          for t in tokenizer.tokenize(code)
                          for piece in t.value.split(" ")]
            except Exception:
                # Deliberate best effort: undecodable or unlexable files are dropped.
                tokens = None

        if tokens:
            # NOTE(review): the tag is only the base name, so files with the same
            # name in different projects share one tag -- confirm this is intended.
            yield TaggedDocument(tokens, [file.name])

        if report_every and count % report_every == 0:
            print("Processed %s records" % count)


# Materialise the generator into a list: the corpus is passed to both
# build_vocab() and train() below, so it must be re-iterable.
train_corpus = list(read_train_corpus())
print("Training corpus size: %s" % len(train_corpus))


Processed 10000 records


Processed 20000 records


Processed 30000 records


Processed 40000 records


Processed 50000 records


Processed 60000 records


Processed 70000 records


Processed 80000 records


Processed 90000 records


Processed 100000 records
Training corpus size: 99088


In [6]:
# NOTE(review): no seed is passed, so repeated runs are not expected to give
# identical vectors -- confirm whether reproducibility matters here.
model = Doc2Vec(dm=0,  # training algorithm: 1 = PV-DM, 0 = PV-DBOW
                min_count=2,  # Ignores all words with total frequency lower than this
                max_vocab_size=None,  # no cap on vocabulary size while building the vocab
                workers=multiprocessing.cpu_count(),  # one worker thread per reported CPU
                epochs=20,  # Number of iterations (epochs) over the corpus
                vector_size=50,  # dimensionality of the learned vectors
                dbow_words=1)  # also train word vectors alongside the DBOW doc vectors

In [7]:
# Build the model's vocabulary from the training documents; this must happen before train().
model.build_vocab(train_corpus)

In [8]:
# Train for the configured number of epochs; %time reports how long the call takes.
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 2h 57min 35s, sys: 1min 25s, total: 2h 59min 1s
Wall time: 49min 37s


In [9]:
# Create the output directory first so that a missing "models" folder cannot
# make the save fail after the long training run.
models_dir.mkdir(parents=True, exist_ok=True)
model.save(str(models_dir / "github-java-vectors.bin"))

In [10]:
# Sanity check: nearest neighbours of the "--" operator token among the learned word vectors.
model.wv.most_similar('--')

[('eleIndex', 0.8072444796562195),
 ('peekCount', 0.7907533049583435),
 ('pendingCallbacks', 0.7730069160461426),
 ('actualTargetIndex', 0.7661998867988586),
 ('srcByteCount', 0.7639729976654053),
 ('countLoadedFileds', 0.7629091739654541),
 ('updateSimpleLine', 0.7598205804824829),
 ('do', 0.7588541507720947),
 ('patIdxEnd', 0.7573974132537842),
 ('scanWhitespace', 0.7559382319450378)]

In [11]:
# Sanity check: nearest neighbours of the identifier "count" among the learned word vectors.
model.wv.most_similar('count')

[('counter', 0.7916474342346191),
 ('last', 0.7904374599456787),
 ('listsSize', 0.7793775796890259),
 ('firstHalf', 0.7724998593330383),
 ('increment', 0.7703016996383667),
 ('cnt', 0.7674715518951416),
 ('total', 0.7653804421424866),
 ('numHistory', 0.763133704662323),
 ('numTicks', 0.7631007432937622),
 ('bitmapLength', 0.7610165476799011)]

In [12]:
# Sanity check: nearest neighbours of the type name "NullPointerException".
model.wv.most_similar('NullPointerException')

[('IllegalArgumentException', 0.8226144313812256),
 ('IllegalStateException', 0.8146161437034607),
 ('"null', 0.7925406694412231),
 ('"DhDsaExchange', 0.782356321811676),
 ('allowed"', 0.781735897064209),
 ('throw', 0.7796768546104431),
 ('"gbeanType', 0.7724958062171936),
 ('\\"bundleContext\\"', 0.7692555785179138),
 ('cannot', 0.7691822052001953),
 ('\\"bundle\\"', 0.7661920189857483)]

In [None]:
def read_test_corpus():
    """Yield one ``TaggedDocument`` per tokenizable test implementation.

    Walks ``test_code_dir/Java/<task>/<implementation>``: every sub-directory of
    the ``Java`` directory is treated as a programming task and every regular
    file inside it as one implementation. Each file is tokenized with the
    javalang tokenizer and every token value is further split on single spaces.

    Files that cannot be read or tokenized, or that produce no tokens, are
    skipped. This keeps a failing file from being emitted with the token list
    of the previously processed file.

    Yields:
        TaggedDocument: the token list, tagged with the implementation's file name.
    """
    for programming_language in test_code_dir.glob('./Java'):
        if not programming_language.is_dir():
            continue
        for programming_task in programming_language.glob('./*'):
            if not programming_task.is_dir():
                continue
            for implementation in programming_task.glob('./*'):
                # Only regular files can be opened and tokenized.
                if not implementation.is_file():
                    continue
                try:
                    with implementation.open() as f:
                        code = f.read()
                    tokens = [piece
                              for t in tokenizer.tokenize(code)
                              for piece in t.value.split(" ")]
                except Exception:
                    # Best effort: drop this file instead of reusing stale tokens.
                    continue
                if tokens:
                    yield TaggedDocument(tokens, [implementation.name])
                
                
# Materialise the generator so the test documents can be indexed and reused.
test_corpus = list(read_test_corpus())