In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from pprint import pprint
from warnings import catch_warnings, simplefilter

import random
import algorithms
import numpy as np
import csv
import os

# Name substitutions applied by rename(): a string value replaces the name,
# while None makes rename() reject the whole sequence.
# NOTE(review): the keys look like Python AST node type names - confirm.
MARKOV_MAPPINGS = {
    "Module": None,
    "keyword": None,
    "Return": "Assign",
    "Attribute": "Name",
}

def rename(names):
    """Translate every name through MARKOV_MAPPINGS.

    Args:
        names: Iterable of hashable names.

    Returns:
        A tuple with each name replaced by its mapping (names without a
        mapping are kept as they are), or None as soon as one name maps
        to None.
    """
    translated = []
    for original in names:
        mapped = MARKOV_MAPPINGS.get(original, original)
        if mapped is None:
            # A single rejected name discards the whole sequence.
            return None
        translated.append(mapped)
    return tuple(translated)
        
def read(fname):
    """Yield the single field of every data row in a one-column CSV file.

    The first row is treated as a header and skipped. An empty file yields
    nothing.

    Args:
        fname: Path of a UTF-8 encoded CSV file.

    Yields:
        str: The only field of each row that follows the header.

    Raises:
        ValueError: If a data row does not contain exactly one field.
    """
    # newline='' hands line-ending handling to the csv module, so line breaks
    # embedded in quoted fields are returned exactly as stored.
    with open(fname, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        # Skip the header. The default keeps an empty file from raising
        # StopIteration inside this generator (which would become RuntimeError).
        next(reader, None)
        for entry, in reader:
            yield entry

def read_folder(folder):
    """Collect labelled entries from every 'task' file in a folder.

    Files whose name contains 'task' are processed in sorted name order; the
    position of a file in that order is the label of all its entries.

    Args:
        folder: Directory to scan.

    Returns:
        A list of [entry, label] pairs, where entry comes from read() and
        label is the zero-based index of the file it was read from.
    """
    task_files = sorted(name for name in os.listdir(folder) if 'task' in name)
    return [
        [code, label]
        for label, name in enumerate(task_files)
        for code in read(f'{folder}/{name}')
    ]

def dataset(as_markov, folder='data', count=2000):
    """Draw a fixed-seed sample of programs and turn it into arrays.

    Args:
        as_markov: Passed through unchanged to algorithms.vectorize.
        folder: Directory handed to read_folder().
        count: Number of programs to sample; random.sample raises ValueError
            when it exceeds the number of available programs.

    Returns:
        A tuple (h, x, y): h is the first item produced by
        algorithms.vectorize, x is a float32 array built from the remaining
        items, and y is an integer array of labels shaped (count, 1).
    """
    # Reseeding the global generator makes the sample repeatable across calls.
    random.seed(42)
    sampled = random.sample(read_folder(folder), count)
    print(f'Total programs: {len(sampled)}')
    sources = (source for source, _ in sampled)
    stream = algorithms.vectorize(sources, as_markov)
    # The first yielded item is returned separately from the vectors.
    # NOTE(review): presumably a description of the components - confirm in algorithms.
    header = next(stream)
    features = np.array(list(stream), dtype=np.float32)
    labels = np.array([label for _, label in sampled], dtype=int)
    return header, features, labels.reshape((len(sampled), 1))

def score(grid_search, scorers):
    """Summarise the cross-validation results of a fitted grid search.

    For every requested scorer, plus 'fit_time' and 'score_time', the
    candidate with the highest mean is selected and its mean and standard
    deviation are reported together, so each pair describes one candidate.

    Args:
        grid_search: Object exposing a cv_results_ mapping with
            'mean_test_<scorer>' / 'std_test_<scorer>' and
            'mean_fit_time' / 'std_fit_time' / 'mean_score_time' /
            'std_score_time' entries.
        scorers: Iterable of scorer names (without the 'test_' prefix).

    Returns:
        A dict ordered by key that maps each scorer name and the two timing
        keys to a (mean, std) tuple of floats rounded to 3 decimals.

    Raises:
        KeyError: If cv_results_ lacks an entry for a requested scorer.
    """
    timing_keys = ['fit_time', 'score_time']
    # (cv_results_ column suffix, output key): scorers are stored as 'test_<name>'.
    columns = [(f'test_{name}', name) for name in scorers]
    columns += [(key, key) for key in timing_keys]
    results = grid_search.cv_results_
    output = {}
    for column, key in columns:
        means = results[f'mean_{column}']
        stds = results[f'std_{column}']
        # Take the std of the same candidate that supplies the maximum mean
        # instead of averaging stds over unrelated candidates.
        best = np.argmax(means)
        # Plain floats keep the printed summary free of numpy scalar reprs.
        output[key] = (round(float(means[best]), 3), round(float(stds[best]), 3))
    return dict(sorted(output.items()))

def find(x, y, estimator, params, verbose=2):
    """Run a 5-fold multi-metric grid search and print its summary.

    Args:
        x: Feature matrix passed to GridSearchCV.fit.
        y: Targets passed to GridSearchCV.fit.
        estimator: Zero-argument callable (e.g. an estimator class) that
            creates the estimator to tune.
        params: Parameter grid for GridSearchCV.
        verbose: Verbosity level forwarded to GridSearchCV.

    Returns:
        The fitted GridSearchCV instance, refit on 'f1_macro'.
    """
    metrics = [
        'precision_weighted', 'recall_weighted', 'f1_weighted',
        'precision_macro', 'recall_macro', 'f1_macro',
        'accuracy',
    ]
    search = GridSearchCV(
        estimator=estimator(),
        param_grid=params,
        scoring={metric: metric for metric in metrics},
        refit='f1_macro',
        cv=5,
        verbose=verbose,
    )
    # Every warning raised while fitting is silenced for the duration of fit().
    with catch_warnings():
        simplefilter('ignore')
        search.fit(x, y)
    print()
    pprint(search.best_params_)
    pprint(score(search, metrics))
    return search

def statistics(classifier, parameters, *limits):
    """Score a classifier on a series of sample sizes.

    Args:
        classifier: Estimator factory forwarded to find().
        parameters: Parameter grid forwarded to find().
        *limits: Arguments for range(); each produced value is used as the
            number of programs sampled by dataset().

    Returns:
        A list with one score() summary (accuracy and macro precision,
        recall, F1, plus timings) per sample size, in range order.
    """
    metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
    collected = []
    for size in range(*limits):
        # The first element returned by dataset() is not needed here.
        features, labels = dataset(markov_mapper, count=size)[1:]
        search = find(features, labels, classifier, parameters)
        collected.append(score(search, metrics))
    pprint(collected, width=400)
    return collected

def markov_mapper(vertices, edges):
    """Call algorithms.markov with a labeller that applies rename().

    Args:
        vertices: Indexable collection; vertices[n] is the sequence of names
            handed to rename() for key n.
        edges: Forwarded unchanged to algorithms.markov.

    Returns:
        Whatever algorithms.markov returns.
    """
    def renamed_label(node):
        # Look the node up and translate its names through MARKOV_MAPPINGS.
        return rename(vertices[node])

    return algorithms.markov(vertices, edges, renamed_label)

def describe(dataset):
    """Print how sparse the first vector of a 100-program sample is.

    Args:
        dataset: Callable invoked as dataset(markov_mapper, count=100) that
            returns a 3-tuple whose second element is the vector array.
    """
    _, vectors, _ = dataset(markov_mapper, count=100)
    first = vectors[0]
    print(f'Components: {len(first)}.')
    print(f'Non-zero components: {np.count_nonzero(first != 0)}.')
    print(f'Zero components: {np.count_nonzero(first == 0)}.')

# Print the component counts of the first vector from a 100-program sample.
describe(dataset)

Total programs: 100
Dims: 66
Vectorized 100 chains...
Components: 4356.
Non-zero components: 19.
Zero components: 4337.


**Plots: 100 to 1000 programs (step 100)**

In [2]:
# KNeighborsClassifier with a single-candidate grid, scored once per sample
# size in range(100, 1100, 100), i.e. 100, 200, ..., 1000 programs.
stats = statistics(KNeighborsClassifier, {
    'n_neighbors': [4],
    'weights': ('distance',),
    'metric': ('manhattan',)
}, 100, 1000 + 100, 100)

Total programs: 100
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s

{'metric': 'manhattan', 'n_neighbors': 4, 'weights': 'distance'}
{'accuracy': (0.8, 0.055),
 'f1_macro': (0.754, 0.051),
 'f1_weighted': (0.784, 0.071),
 'fit_time': (0.002, 0.0),
 'precision_macro': (0.788, 0.053),
 'precision_weighted': (0.834, 0.072),
 'recall_macro': (0.782, 0.035),
 'recall_weighted': (0.8, 0.055),
 'score_time': (0.048, 0.045)}
Total programs: 200
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[C

In [3]:
# SVC with a single-candidate grid, scored once per sample size in
# range(100, 1100, 100), i.e. 100, 200, ..., 1000 programs.
stats = statistics(SVC, {
    'C': [30],
    'kernel': ('rbf',),
}, 100, 1000 + 100, 100)

Total programs: 100
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s

{'C': 30, 'kernel': 'rbf'}
{'accuracy': (0.84, 0.073),
 'f1_macro': (0.77, 0.096),
 'f1_weighted': (0.823, 0.09),
 'fit_time': (0.017, 0.0),
 'precision_macro': (0.786, 0.089),
 'precision_weighted': (0.851, 0.089),
 'recall_macro': (0.8, 0.08),
 'recall_weighted': (0.84, 0.073),
 'score_time': (0.011, 0.0)}
Total programs: 200
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C

In [4]:
# RandomForestClassifier with a single-candidate grid, scored once per sample
# size in range(100, 1100, 100), i.e. 100, 200, ..., 1000 programs.
stats = statistics(RandomForestClassifier, {
    'n_estimators': [300],
    'max_depth': [40],
}, 100, 1000 + 100, 100)

Total programs: 100
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.4s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.4s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.4s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.4s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.4s

{'max_depth': 40, 'n_estimators': 300}
{'accuracy': (0.92, 0.04),
 'f1_macro': (0.878, 0.061),
 'f1_weighted': (0.903, 0.048),
 'fit_time': (0.452, 0.016),
 'precision_macro': (0.882, 0.059),
 'precision_weighted': (0.91, 0.045),
 'recall_macro': (0.894, 0.053),
 'recall_weighted': (0.92, 0.04),
 'score_time': (0.031, 0.001)}
Total programs: 200
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.5s
[CV] END ..................

In [5]:
# MLPClassifier with a single-candidate grid, scored once per sample size in
# range(100, 1100, 100), i.e. 100, 200, ..., 1000 programs.
stats = statistics(MLPClassifier, {
    'activation': ['relu'],
    'learning_rate_init': [0.001]
}, 100, 1000 + 100, 100)

Total programs: 100
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   2.1s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   2.1s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   2.1s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   2.1s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   2.2s

{'activation': 'relu', 'learning_rate_init': 0.001}
{'accuracy': (0.86, 0.073),
 'f1_macro': (0.795, 0.1),
 'f1_weighted': (0.842, 0.081),
 'fit_time': (2.236, 0.047),
 'precision_macro': (0.796, 0.105),
 'precision_weighted': (0.848, 0.083),
 'recall_macro': (0.82, 0.091),
 'recall_weighted': (0.86, 0.073),
 'score_time': (0.009, 0.001)}
Total programs: 200
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   3.0s
[CV] END .....