In [1]:
from itertools import groupby
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from utilities import read_dataset_folder, describe, statistics

import re
import csv
import os
import subprocess
import random
import numpy as np
import shutil

def extract_files(programs, destination):
    """Write every program to its own ``.py`` file inside ``destination``.

    ``programs`` is iterated as ``(source, task)`` pairs. The task number and
    the position of the pair in the sequence are both encoded in the file
    name (``taskTT-programNNNNN.py``). ``destination`` is created first and
    must not exist yet, because ``os.makedirs`` is called without
    ``exist_ok``.
    """
    os.makedirs(destination)
    for i, entry in enumerate(programs):
        source, task = entry
        filename = f'task{task:02d}-program{i:05d}.py'
        target = os.path.join(destination, filename)
        with open(target, 'w', encoding='utf-8') as handle:
            handle.write(source)

def mine_path_contexts(volume):
    """Run the ``voudy/astminer`` Docker image against ``volume``.

    The absolute path of ``volume`` is bind-mounted at ``/astminer/files``
    and the container is started with ``files/astminer.yml`` as its argument.
    Only the captured stdout is printed; stderr is captured but not shown and
    the exit code is not inspected.
    """
    mount = '%s:/astminer/files' % os.path.abspath(volume)
    command = ['docker', 'run', '-v', mount, '--rm', 'voudy/astminer', 'files/astminer.yml']
    result = subprocess.run(command, capture_output=True, encoding='utf8')
    print(result.stdout)

def read_csv_lines(file):
    """Return the rows of the space-delimited file at path ``file``.

    Each row is a list of strings; ``"`` is the quote character.
    """
    with open(file, 'r', encoding='utf-8', newline='') as handle:
        return [*csv.reader(handle, delimiter=' ', quotechar='"')]

def write_csv_lines(file, lines):
    """Write ``lines`` (an iterable of rows) to path ``file``.

    Fields are separated by a single space and every row ends with ``\\n``
    regardless of platform.
    """
    with open(file, 'w', encoding='utf-8', newline='') as handle:
        out = csv.writer(handle, delimiter=' ', lineterminator="\n")
        for row in lines:
            out.writerow(row)

# cat input | cut -d' ' -f1 | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > output
def target_histogram_file(input, output):
    """Count how often each first column value occurs in ``input``.

    Reads ``input`` with ``read_csv_lines`` and writes one ``value count``
    row per distinct first field to ``output``, ordered by the sorted value.
    """
    rows = read_csv_lines(input)
    labels = [row[0] for row in rows]
    labels.sort()
    histogram = []
    # groupby only merges adjacent equal items, hence the sort above.
    for label, run in groupby(labels):
        histogram.append([label, sum(1 for _ in run)])
    write_csv_lines(output, histogram)

# cat input | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f1,3 | tr ',' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > output
def origin_histogram_file(input, output):
    """Count the first and third comma-separated fields of every context.

    Every field after the first column of ``input`` is split on ``,``; the
    elements at even positions (``[::2]``) are converted to ``int`` and
    tallied. One ``id count`` row per distinct id is written to ``output``
    in ascending numeric order.
    """
    rows = read_csv_lines(input)
    ids = []
    for row in rows:
        for context in row[1:]:
            fields = context.split(',')
            ids.extend(int(field) for field in fields[::2])
    ids.sort()
    # groupby only merges adjacent equal items, hence the sort above.
    histogram = [[key, sum(1 for _ in run)] for key, run in groupby(ids)]
    write_csv_lines(output, histogram)

# cat input | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f2 | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > output
def path_histogram_file(input, output):
    """Count the second comma-separated field of every context.

    Every field after the first column of ``input`` is split on ``,``; the
    element at index 1 is converted to ``int`` and tallied. One ``id count``
    row per distinct id is written to ``output`` in ascending numeric order.
    """
    rows = read_csv_lines(input)
    ids = []
    for row in rows:
        for context in row[1:]:
            ids.append(int(context.split(',')[1]))
    ids.sort()
    # groupby only merges adjacent equal items, hence the sort above.
    histogram = [[key, sum(1 for _ in run)] for key, run in groupby(ids)]
    write_csv_lines(output, histogram)

def make_histograms(dataset_file):
    """Generate the three histogram files next to ``dataset_file``.

    The extension of ``dataset_file`` is stripped and ``.c2s`` is appended to
    form the input path; the outputs share the same stem with the suffixes
    ``.histo.tgt.c2v``, ``.histo.ori.c2v`` and ``.histo.path.c2v``.
    """
    stem = os.path.splitext(os.path.abspath(dataset_file))[0]
    contexts = stem + '.c2s'
    target_histogram_file(contexts, stem + '.histo.tgt.c2v')
    origin_histogram_file(contexts, stem + '.histo.ori.c2v')
    path_histogram_file(contexts, stem + '.histo.path.c2v')
    print(f'Successfully generated histogram files.')

def preprocess(dataset_file):
    """Run code2vec's ``preprocess.py`` on the files next to ``dataset_file``.

    Only the directory of ``dataset_file`` is used; the input and histogram
    file names are fixed to the ``path_contexts.*`` names. The same
    ``path_contexts.c2s`` file is passed as train, test and validation data.
    The script's stdout is printed; stderr is captured but not shown.

    The child process is started with ``cwd`` pointing at the code2vec
    checkout instead of calling ``os.chdir`` in this process, so the caller's
    working directory stays untouched even when launching the subprocess
    raises (previously the directory was only restored on success).
    """
    path = os.path.dirname(os.path.abspath(dataset_file))
    contexts = os.path.join(path, 'path_contexts.c2s')
    args = [
        'python', 'preprocess.py',
        '--train_data', contexts,
        '--test_data', contexts,
        '--val_data', contexts,
        '--max_contexts', '200',
        '--word_vocab_size', '1301136',
        '--path_vocab_size', '911417',
        '--target_vocab_size', '261245',
        '--word_histogram', os.path.join(path, 'path_contexts.histo.ori.c2v'),
        '--path_histogram', os.path.join(path, 'path_contexts.histo.path.c2v'),
        '--target_histogram', os.path.join(path, 'path_contexts.histo.tgt.c2v'),
        '--output_name', os.path.join(path, 'output'),
    ]
    # All data paths above are absolute, so only the script name depends on cwd.
    process = subprocess.run(
        args, capture_output=True, encoding='utf8', cwd='../external/code2vec'
    )
    print(process.stdout)

def train(folder):
    """Train code2vec on the preprocessed data in ``folder``.

    Returns early (after printing a notice) when any entry in ``folder``
    already contains ``saved_model`` in its name. Otherwise ``code2vec.py``
    is run with ``output`` as data prefix, ``output.val.c2v`` as test set and
    ``saved_model`` as save prefix; both stdout and stderr are printed.
    Backslashes in the paths are replaced with forward slashes before they
    are passed on.

    The child process is started with ``cwd`` pointing at the code2vec
    checkout instead of calling ``os.chdir`` in this process, so the caller's
    working directory stays untouched even when launching the subprocess
    raises (previously the directory was only restored on success).
    """
    folder = os.path.abspath(folder)
    if any('saved_model' in file for file in os.listdir(folder)):
        print('Network already trained, aborting')
        return
    args = [
        'python', 'code2vec.py',
        '--data', os.path.join(folder, 'output').replace('\\', '/'),
        '--test', os.path.join(folder, 'output.val.c2v').replace('\\', '/'),
        '--save', os.path.join(folder, 'saved_model').replace('\\', '/'),
    ]
    # All data paths above are absolute, so only the script name depends on cwd.
    process = subprocess.run(
        args, capture_output=True, encoding='utf8', cwd='../external/code2vec'
    )
    print(process.stdout)
    print(process.stderr)

def export_vectors(model):
    """Run code2vec with ``--export_code_vectors`` for the saved ``model``.

    ``model`` is the save prefix to load; the ``output.train.c2v`` file in
    the same directory is passed as the ``--test`` set. Backslashes in the
    paths are replaced with forward slashes and trailing slashes are
    stripped. The script's stdout is printed; stderr is captured but not
    shown.

    The child process is started with ``cwd`` pointing at the code2vec
    checkout instead of calling ``os.chdir`` in this process, so the caller's
    working directory stays untouched even when launching the subprocess
    raises (previously the directory was only restored on success).
    """
    model = os.path.abspath(model)
    train_file = os.path.join(os.path.dirname(model), 'output.train.c2v')
    args = [
        'python', 'code2vec.py',
        '--load', model.replace('\\', '/').rstrip('/'),
        '--test', train_file.replace('\\', '/').rstrip('/'),
        '--export_code_vectors',
    ]
    # All data paths above are absolute, so only the script name depends on cwd.
    process = subprocess.run(
        args, capture_output=True, encoding='utf8', cwd='../external/code2vec'
    )
    print(process.stdout)

def find_best_iteration(folder):
    """Return the highest ``saved_model_iter<N>`` number found in ``folder``.

    Despite the name, no score is consulted: the result is simply the
    largest ``N`` among the directory entries whose name contains
    ``saved_model_iter<N>``. Raises ``ValueError`` (from ``max``) when no
    entry matches.
    """
    pattern = re.compile(r'saved_model_iter([0-9]+)')
    iterations = []
    for name in os.listdir(os.path.abspath(folder)):
        found = pattern.search(name)
        if found is not None:
            iterations.append(int(found.group(1)))
    return max(iterations)

def cleanup(path):
    """Recursively delete the directory ``path``; do nothing if it is absent."""
    if not os.path.exists(path):
        return
    shutil.rmtree(path)

def dataset(count):
    """Build code2vec feature vectors and labels for ``count`` sampled programs.

    Runs the whole pipeline from scratch: wipes ``code2vec/input`` and
    ``code2vec/output``, samples ``count`` programs from the ``data`` folder
    with a fixed seed, writes them to disk, mines path contexts with
    astminer, builds the histogram files, runs code2vec preprocessing and
    training, and exports the code vectors with the highest-numbered saved
    iteration. The ``assert`` statements check that each stage produced the
    files the next stage needs.

    Returns a tuple ``(x, y)``: ``x`` is a float32 array parsed from
    ``output.train.c2v.vectors`` (one row per line of that file) and ``y``
    is an int array of shape ``(count, 1)`` holding the second element of
    each sampled program entry.
    """
    # Start from a clean slate so stale files cannot satisfy the asserts below.
    cleanup('code2vec/input')
    cleanup('code2vec/output')
    # Re-seed the global RNG on every call so the same count yields the same sample.
    random.seed(42)
    programs = read_dataset_folder('data')
    # Sort the sample by its second element (unpacked as `task` in extract_files).
    programs = sorted(random.sample(programs, count), key=lambda entry: entry[1])
    extract_files(programs, 'code2vec/input')
    assert os.path.exists('code2vec/input')
    assert len(os.listdir('code2vec/input')) == count
    mine_path_contexts('code2vec')
    assert os.path.exists('code2vec/output/py/data/path_contexts.c2s')
    make_histograms('code2vec/output/py/data/path_contexts.c2s')
    assert os.path.exists('code2vec/output/py/data/path_contexts.histo.tgt.c2v')
    assert os.path.exists('code2vec/output/py/data/path_contexts.histo.ori.c2v')
    assert os.path.exists('code2vec/output/py/data/path_contexts.histo.path.c2v')
    preprocess('code2vec/output/py/data/path_contexts.c2s')
    assert os.path.exists('code2vec/output/py/data/output.dict.c2v')
    assert os.path.exists('code2vec/output/py/data/output.train.c2v')
    assert os.path.exists('code2vec/output/py/data/output.test.c2v')
    assert os.path.exists('code2vec/output/py/data/output.val.c2v')
    train('code2vec/output/py/data')
    assert os.path.exists('code2vec/output/py/data/saved_model.meta')
    assert os.path.exists('code2vec/output/py/data/saved_model.index')
    # find_best_iteration returns the largest iteration number present on disk.
    iteration = find_best_iteration('code2vec/output/py/data')
    assert os.path.exists(f'code2vec/output/py/data/saved_model_iter{iteration}.meta')
    assert os.path.exists(f'code2vec/output/py/data/saved_model_iter{iteration}.index')
    export_vectors(f'code2vec/output/py/data/saved_model_iter{iteration}')
    assert os.path.exists('code2vec/output/py/data/output.train.c2v.vectors')
    # Labels as a column vector, in the (sorted) order of `programs`.
    y = np.array([klass for _, klass in programs], dtype=int).reshape((len(programs), 1))
    # NOTE(review): assumes the rows of the vectors file are in the same order as
    # `programs`; nothing here verifies that alignment — confirm.
    x = np.array([[float(e) for e in line] for line in read_csv_lines('code2vec/output/py/data/output.train.c2v.vectors')], dtype=np.float32)
    return x, y

# Pass the pipeline function itself (not its result) to utilities.describe.
# NOTE(review): presumably describe() calls dataset() and summarizes the
# returned arrays — confirm in utilities.
describe(dataset)

Working in 1 thread(s)
Parsing Python
100 file(s) found
Done!

Successfully generated histogram files.
File: c:\Users\prizr\Documents\GitHub\code-vectors\src\code2vec\output\py\data\path_contexts.c2s
Average total contexts: 300.0
Average final (after sampling) contexts: 200.0
Total examples: 100
Empty examples: 0
Max number of contexts per word: 300
File: c:\Users\prizr\Documents\GitHub\code-vectors\src\code2vec\output\py\data\path_contexts.c2s
Average total contexts: 300.0
Average final (after sampling) contexts: 200.0
Total examples: 100
Empty examples: 0
Max number of contexts per word: 300
File: c:\Users\prizr\Documents\GitHub\code-vectors\src\code2vec\output\py\data\path_contexts.c2s
Average total contexts: 300.0
Average final (after sampling) contexts: 200.0
Total examples: 100
Empty examples: 0
Max number of contexts per word: 300
Dictionaries saved to: c:\Users\prizr\Documents\GitHub\code-vectors\src\code2vec\output\py\data\output.dict.c2v

2024-07-12 15:43:11,130 INFO     
202

**Plots 100-1000**

In [2]:
# k-nearest-neighbours run; every hyper-parameter has a single candidate value.
# NOTE(review): the trailing 100, 1000 + 100, 100 look like start/stop/step for
# the dataset size (i.e. 100..1000) — confirm in utilities.statistics.
stats = statistics(dataset, KNeighborsClassifier, {
    'n_neighbors': [4],
    'weights': ('distance',),
    'metric': ('manhattan',)
}, 100, 1000 + 100, 100)

Working in 1 thread(s)
Parsing Python
100 file(s) found
Done!

Successfully generated histogram files.
File: c:\Users\prizr\Documents\GitHub\vecs\src\code2vec\output\py\data\path_contexts.c2s
Average total contexts: 300.0
Average final (after sampling) contexts: 200.0
Total examples: 100
Empty examples: 0
Max number of contexts per word: 300
File: c:\Users\prizr\Documents\GitHub\vecs\src\code2vec\output\py\data\path_contexts.c2s
Average total contexts: 300.0
Average final (after sampling) contexts: 200.0
Total examples: 100
Empty examples: 0
Max number of contexts per word: 300
File: c:\Users\prizr\Documents\GitHub\vecs\src\code2vec\output\py\data\path_contexts.c2s
Average total contexts: 300.0
Average final (after sampling) contexts: 200.0
Total examples: 100
Empty examples: 0
Max number of contexts per word: 300
Dictionaries saved to: c:\Users\prizr\Documents\GitHub\vecs\src\code2vec\output\py\data\output.dict.c2v

2023-07-02 12:14:43,901 INFO     
2023-07-02 12:14:43,901 INFO     
2

In [3]:
# SVC run; every hyper-parameter has a single candidate value.
# NOTE(review): the trailing 100, 1000 + 100, 100 look like start/stop/step for
# the dataset size (i.e. 100..1000) — confirm in utilities.statistics.
stats = statistics(dataset, SVC, {
    'C': [30],
    'kernel': ('rbf',),
}, 100, 1000 + 100, 100)

Working in 1 thread(s)
Parsing Python
100 file(s) found
Done!

Successfully generated histogram files.
File: c:\Users\prizr\Documents\GitHub\vecs\src\code2vec\output\py\data\path_contexts.c2s
Average total contexts: 300.0
Average final (after sampling) contexts: 200.0
Total examples: 100
Empty examples: 0
Max number of contexts per word: 300
File: c:\Users\prizr\Documents\GitHub\vecs\src\code2vec\output\py\data\path_contexts.c2s
Average total contexts: 300.0
Average final (after sampling) contexts: 200.0
Total examples: 100
Empty examples: 0
Max number of contexts per word: 300
File: c:\Users\prizr\Documents\GitHub\vecs\src\code2vec\output\py\data\path_contexts.c2s
Average total contexts: 300.0
Average final (after sampling) contexts: 200.0
Total examples: 100
Empty examples: 0
Max number of contexts per word: 300
Dictionaries saved to: c:\Users\prizr\Documents\GitHub\vecs\src\code2vec\output\py\data\output.dict.c2v

2023-07-02 12:27:55,723 INFO     
2023-07-02 12:27:55,723 INFO     
2

In [4]:
# Random-forest run; every hyper-parameter has a single candidate value.
# NOTE(review): the trailing 100, 1000 + 100, 100 look like start/stop/step for
# the dataset size (i.e. 100..1000) — confirm in utilities.statistics.
stats = statistics(dataset, RandomForestClassifier, {
    'n_estimators': [300],
    'max_depth': [40],
}, 100, 1000 + 100, 100)

Working in 1 thread(s)
Parsing Python
100 file(s) found
Done!

Successfully generated histogram files.
File: c:\Users\prizr\Documents\GitHub\vecs\src\code2vec\output\py\data\path_contexts.c2s
Average total contexts: 300.0
Average final (after sampling) contexts: 200.0
Total examples: 100
Empty examples: 0
Max number of contexts per word: 300
File: c:\Users\prizr\Documents\GitHub\vecs\src\code2vec\output\py\data\path_contexts.c2s
Average total contexts: 300.0
Average final (after sampling) contexts: 200.0
Total examples: 100
Empty examples: 0
Max number of contexts per word: 300
File: c:\Users\prizr\Documents\GitHub\vecs\src\code2vec\output\py\data\path_contexts.c2s
Average total contexts: 300.0
Average final (after sampling) contexts: 200.0
Total examples: 100
Empty examples: 0
Max number of contexts per word: 300
Dictionaries saved to: c:\Users\prizr\Documents\GitHub\vecs\src\code2vec\output\py\data\output.dict.c2v

2023-07-02 12:40:51,947 INFO     
2023-07-02 12:40:51,947 INFO     
2

In [5]:
# MLP run; every hyper-parameter has a single candidate value.
# NOTE(review): the trailing 100, 1000 + 100, 100 look like start/stop/step for
# the dataset size (i.e. 100..1000) — confirm in utilities.statistics.
stats = statistics(dataset, MLPClassifier, {
    'activation': ['relu'],
    'learning_rate_init': [0.001]
}, 100, 1000 + 100, 100)

Working in 1 thread(s)
Parsing Python
100 file(s) found
Done!

Successfully generated histogram files.
File: c:\Users\prizr\Documents\GitHub\vecs\src\code2vec\output\py\data\path_contexts.c2s
Average total contexts: 300.0
Average final (after sampling) contexts: 200.0
Total examples: 100
Empty examples: 0
Max number of contexts per word: 300
File: c:\Users\prizr\Documents\GitHub\vecs\src\code2vec\output\py\data\path_contexts.c2s
Average total contexts: 300.0
Average final (after sampling) contexts: 200.0
Total examples: 100
Empty examples: 0
Max number of contexts per word: 300
File: c:\Users\prizr\Documents\GitHub\vecs\src\code2vec\output\py\data\path_contexts.c2s
Average total contexts: 300.0
Average final (after sampling) contexts: 200.0
Total examples: 100
Empty examples: 0
Max number of contexts per word: 300
Dictionaries saved to: c:\Users\prizr\Documents\GitHub\vecs\src\code2vec\output\py\data\output.dict.c2v

2023-07-02 12:56:11,908 INFO     
2023-07-02 12:56:11,908 INFO     
2