# Experiments with Doc2Vec

Try to represent code in Doc2Vec. 

## Setup

In [1]:
import multiprocessing
from pathlib import Path
import csv

import gensim.models.doc2vec
import regex
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from javalang import tokenizer

from code_embeddings.utils import tokenize

assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"


## Build and Train the Model

In [2]:
test_code_dir = Path('../test_data')
train_code_dir = Path('../training_data')
models_dir = Path('../models')

In [3]:
# doc2vec parameters
vector_size = 50
window_size = 16
min_count = 5
sampling_threshold = 1e-5
negative_size = 5
epochs = 20
dm = 0  # 0 = dbow; 1 = dmpv
worker_count = multiprocessing.cpu_count()  # number of parallel processes

In [4]:
def split_methods(code):
    """Parse Java files into separate methods

        :param code: Java code to parse.
        :rtype: map
    """
    pattern = r'(?:(?:public|private|static|protected)\s+)*\s*[\w\<\>\[\]]+\s+\w+\s*\([^{]+({(?:[^{}]+\/\*.*?\*\/|[^{}]+\/\/.*?$|[^{}]+|(?1))*+})'
    scanner = regex.finditer(pattern, code, regex.MULTILINE)
    return map(lambda match: match.group(0), scanner)

In [5]:
def read_train_corpus():
    count = -1
    dict = {}
    for i, file in enumerate(train_code_dir.glob('./java_projects/**/*.java')):
        if not file.is_file():  # oddly, some of these are not files
            continue
        with file.open() as f:
            try:
                code = f.read()
                methods = split_methods(code)
                for method in methods:
                    tokens = list(tokenizer.tokenize(method))
                    tokens = [token for t in tokens for token in t.value.split(" ")]
                    if tokens:
                        count += 1
                        path = file.__fspath__()
                        dict[count] = path
                        yield TaggedDocument(tokens, [count])
            except tokenizer.LexerError as e:
                # print("%s: %s" % (type(e).__name__, e))
                pass
            except UnicodeDecodeError as e:
                # print("%s: %s" % (type(e).__name__, e))
                pass
            except Exception as e:
                print("%s: %s" % (type(e).__name__, e))
                pass
        if i % 10000 == 0:
            print("Processed %s records" % i)
        if i > 100000:
            break

    # also include test corpus in training! 
    for programming_language in test_code_dir.glob('./Java'):
        if not programming_language.is_dir():
            continue
        for programming_task in programming_language.glob('./*'):
            if not programming_task.is_dir():
                continue
            for implementation in programming_task.glob('./*'):
                with implementation.open() as f:
                    try:
                        code = f.read()
                        tokens = list(tokenizer.tokenize(code))
                        tokens = [token for t in tokens for token in t.value.split(" ")]
                        if tokens:
                            count += 1
                            path = implementation.__fspath__()
                            dict[count] = path
                            yield TaggedDocument(tokens, [count])
                    except Exception as e:
                        print("Warning: %s" % e)
                        pass

    # save map to csv
    with open(str(models_dir / 'java_doc_map.csv'), 'w', newline='') as csvfile:
        w = csv.writer(csvfile)
        for key, val in dict.items():
            w.writerow([key, val])
                
   


%time train_corpus = list(read_train_corpus())
print("Training corpus size: %s" % len(train_corpus))


AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no a

AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no a

AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no a

AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no a

AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no a

AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no a

AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no a

AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no a

AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no a

AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no a

AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no attribute '__fspath__'
AttributeError: '_io.TextIOWrapper' object has no a

KeyboardInterrupt: 

In [None]:
def read_test_corpus():
    color_val = 0
    for programming_language in test_code_dir.glob('./Java'):
        if not programming_language.is_dir():
            continue
        for programming_task in programming_language.glob('./*'):
            if not programming_task.is_dir():
                continue
            color_val += 1
            for implementation in programming_task.glob('./*'):
                with implementation.open() as f:
                    try:
                        code = f.read()
                        tokens = list(tokenizer.tokenize(code))
                        tokens = [token for t in tokens for token in t.value.split(" ")]
                        if tokens:
                            yield TaggedDocument(tokens, [programming_task.name, implementation.name, programming_language.name, color_val])
                    except Exception as e:
                        print("Warning: %s" % e)
                        pass
                
                
test_corpus = list(read_test_corpus())
print("Test corpus size: %s" % len(test_corpus))

In [None]:
%%time 
model = Doc2Vec(train_corpus,
                vector_size=vector_size,
                window=window_size,
                min_count=min_count, 
                sample=sampling_threshold,
                negative=negative_size,
                dbow_words=1,
                epochs=epochs, 
                dm=dm,
                workers=worker_count)

In [None]:
model.save(str(models_dir / "github-java-vectors.bin"))

In [None]:
model.wv.most_similar('i')

In [None]:
model.wv.most_similar('count')

In [None]:
model.wv.most_similar('NullPointerException')

In [None]:
import scipy

test1 = [doc for doc in test_corpus if doc.tags[1] == 'sieve-of-eratosthenes-6.java'][0]
test2 = [doc for doc in test_corpus if doc.tags[1] == 'sieve-of-eratosthenes-6.java'][0]

test1_vector = model.infer_vector(test1.words, steps=200)
test2_vector = model.infer_vector(test2.words, steps=200)

dist = scipy.spatial.distance.cosine(test1_vector, test2_vector)
print(dist)


In [None]:
dict = {}
with open(str(models_dir / 'java_doc_map.csv'), newline='') as csvfile:
    r = csv.reader(csvfile)
    for row in r:
        dict[int(row[0])] = row[1]

sims = model.docvecs.most_similar([test1_vector])
print(dict[0])
f = Path(dict[sims[0][0]])
print(f.name)
print("Similarity: %s" % sims[0][1])
with f.open() as fin:
    print(fin.read(), end="")

In [None]:
docs = [{'name': doc.tags[0], 'vec': model.infer_vector(doc.words, steps=200), 'color': doc.tags[3]} for doc in test_corpus]
num_colors = docs[-1]['color']

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib.cm as mplcm
import matplotlib.colors as colors
import matplotlib
from sklearn.manifold import TSNE

matplotlib.rcParams['figure.figsize'] = (32, 16)

tsne = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
twoDimVecs = tsne.fit_transform([doc['vec'] for doc in docs])

cm = plt.get_cmap('gist_rainbow')
cNorm = colors.Normalize(vmin=0, vmax=num_colors-1)
scalarMap = mplcm.ScalarMappable(norm=cNorm, cmap=cm)

fig, ax = plt.subplots()
for doc, twoDimVec in zip(docs, twoDimVecs):
    ax.scatter(twoDimVec[0], twoDimVec[1], color=scalarMap.to_rgba(doc['color']))
    plt.annotate(doc['name'],
                 xy=(twoDimVec[0], twoDimVec[1]),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')
plt.show()