In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from utilities import read_dataset_folder, describe, statistics

import random
import numpy as np
import algorithms

import dis
import ast
import pprint
import graphviz
import types

def _jump_only(instr, following):
    """Successor offsets for an instruction that always transfers to its target."""
    return [instr.argval]


def _jump_or_fall_through(instr, following):
    """Successor offsets for an instruction that may jump or run the next one."""
    return [instr.argval, following.offset]


# Successor rules for control-transfer instructions, keyed by lower-cased
# opname. Each rule is called as rule(instruction, next_instruction) and
# returns the list of bytecode offsets control may reach next. Opnames that
# are not listed here are treated by the caller as plain fall-through.
#
# `jump_if_not_exc_match` (CPython 3.9/3.10 only) is a conditional jump, so it
# belongs with the jump-or-fall-through group rather than the unconditional one.
OPS = {
    **dict.fromkeys(
        ('jump_backward', 'jump_forward', 'jump_absolute'),
        _jump_only,
    ),
    **dict.fromkeys(
        (
            'jump_if_not_exc_match',
            'for_iter',
            'pop_jump_if_true',
            'pop_jump_if_false',
            'pop_jump_if_none',
            'pop_jump_if_not_none',
        ),
        _jump_or_fall_through,
    ),
}

def window(seq, n):
    """Return every contiguous length-``n`` slice of ``seq``, in order.

    The result is a list of slices (so each element has whatever type slicing
    ``seq`` produces). When ``seq`` is shorter than ``n`` the list is empty.
    """
    slices = []
    last_start = len(seq) - n
    for start in range(last_start + 1):
        slices.append(seq[start:start + n])
    return slices

def walk(co, names, edges, init=1):
    """Record the instruction graph of ``co`` and of every nested code object.

    Every instruction gets the id ``init * 10000 + offset``. ``names`` maps
    that id to the instruction's opname, and ``edges`` maps it to the list of
    successor ids. Both dicts are filled in place. Code objects found as an
    instruction argument are walked recursively under the next free index.

    The final instruction of a code object is named but never receives an
    ``edges`` entry, because only (instruction, next instruction) pairs are
    visited.

    Returns the highest index used by this call or any nested call.

    NOTE(review): opnames missing from OPS (including return/raise style
    instructions) get an edge to the textually next instruction; confirm
    this is the intended graph.
    """
    last_index = init
    base = init * 10000
    instructions = list(dis.Bytecode(co))
    # Visit each instruction together with the one that follows it.
    for current, following in zip(instructions, instructions[1:]):
        src = base + current.offset
        dst = base + following.offset
        names[src] = current.opname
        names[dst] = following.opname
        successors_of = OPS.get(current.opname.lower())
        if successors_of is None:
            # Not a known control-transfer opname: fall through.
            edges[src] = [dst]
        else:
            edges[src] = [base + offset for offset in successors_of(current, following)]
        if isinstance(current.argval, types.CodeType):
            # Nested function/class/comprehension body: give it its own id range.
            last_index = walk(current.argval, names, edges, last_index + 1)
    return last_index

def markov(vertices, edges, type):
    """Collapse a vertex-level graph into weighted transitions between kinds.

    Parameters:
        vertices: iterable of vertex ids.
        edges: mapping from a source vertex id to an iterable of target ids.
        type: callable returning the kind of a vertex id; falsy kinds are
            ignored. (The name shadows the builtin inside this function.)

    Returns:
        ``(vertypes, chain_edges)`` where ``vertypes`` is the set of truthy
        kinds found among ``vertices`` and ``chain_edges`` is a set of
        ``(source_kind, target_kind, weight)`` tuples. For one source kind,
        the distinct targets of all sources of that kind are pooled; the
        weight is the share of those targets that have ``target_kind``.
        Targets with a falsy kind still count in the denominator.
    """
    kind_of = type  # local alias so the shadowed builtin is not confusing
    vertypes = {kind_of(vertex) for vertex in vertices if kind_of(vertex)}
    chain_edges = set()
    for source_kind in vertypes:
        # Pool the distinct targets reachable from any source of this kind.
        reachable = set()
        for src in edges:
            for dst in edges[src]:
                if kind_of(src) and kind_of(src) == source_kind:
                    reachable.add(dst)
        # Count the pooled targets per (truthy) kind.
        tally = {}
        for dst in reachable:
            dst_kind = kind_of(dst)
            if dst_kind:
                tally[dst_kind] = tally.get(dst_kind, 0) + 1
        for dst_kind, hits in tally.items():
            chain_edges.add((source_kind, dst_kind, hits / len(reachable)))
    return vertypes, chain_edges

def control_flow_graph(src):
    """Compile ``src`` as a module and return its bytecode graph.

    Returns ``(names, edges)`` as filled in by ``walk``: ``names`` maps an
    instruction id to its opname and ``edges`` maps an instruction id to the
    list of its successor ids. Parsing or compilation errors propagate.
    """
    names = {}
    edges = {}
    module_code = compile(ast.parse(src), '', 'exec')
    walk(module_code, names, edges)
    return names, edges

def vectorize(codes):
    """Generate a shared opname vocabulary followed by one vector per program.

    This is a generator. The first item yielded is the sorted list of every
    opname kind seen across all of ``codes``; each following item is the
    result of ``algorithms.vector(algorithms.adjacency(chain, vocabulary))``
    for one program, in input order. All programs are compiled before the
    first item is produced, because the vocabulary must be complete.
    """
    chains = []
    seen_kinds = set()
    for source in codes:
        names, cfg_edges = control_flow_graph(source)
        kinds, transitions = markov(names.keys(), cfg_edges, names.get)
        chains.append(transitions)
        seen_kinds.update(kinds)
    vocabulary = sorted(seen_kinds)
    yield vocabulary
    for transitions in chains:
        yield algorithms.vector(algorithms.adjacency(transitions, vocabulary))

def dataset(folder='data', count=2000):
    """Sample ``count`` labelled programs from ``folder`` and vectorize them.

    The global ``random`` generator is re-seeded with 42 on every call, so the
    same sample is drawn each time for a given folder content and ``count``.

    Returns ``(h, x, y)``: ``h`` is the first item produced by ``vectorize``
    (the vocabulary), ``x`` is a float32 array built from the remaining
    items, and ``y`` is an int array of the labels reshaped to one column.

    NOTE(review): assumes ``read_dataset_folder`` returns a sequence of
    ``(code, label)`` pairs with at least ``count`` entries; verify.
    """
    random.seed(42)
    sampled = random.sample(read_dataset_folder(folder), count)
    print(f'Total programs: {len(sampled)}')
    sources = [code for code, _ in sampled]
    stream = vectorize(sources)
    # The generator's first item is the header; the rest are feature vectors.
    h = next(stream)
    x = np.array(list(stream), dtype=np.float32)
    labels = [klass for _, klass in sampled]
    y = np.array(labels, dtype=int).reshape((len(sampled), 1))
    return h, x, y

# NOTE(review): `describe` comes from utilities; judging by the recorded output
# it calls `dataset` (with 100 programs here) and reports how many vector
# components are zero / non-zero. Confirm against its implementation.
describe(dataset)

Total programs: 100
Components: 5329.
Non-zero components: 33.
Zero components: 5296.


**Plots 100-1000**

In [2]:
# K-nearest-neighbours run. The dict is the hyper-parameter grid; every entry
# holds a single value, so exactly one candidate is fitted (see recorded output).
# NOTE(review): the trailing 100, 1000 + 100, 100 look like a range-style
# start/stop/step over dataset sizes (the output shows 100, then 200 programs).
# Confirm against utilities.statistics.
stats = statistics(dataset, KNeighborsClassifier, {
    'n_neighbors': [4],
    'weights': ('distance',),
    'metric': ('manhattan',)
}, 100, 1000 + 100, 100)

Total programs: 100
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s

{'metric': 'manhattan', 'n_neighbors': 4, 'weights': 'distance'}
{'accuracy': (0.81, 0.102),
 'f1_macro': (0.767, 0.092),
 'f1_weighted': (0.791, 0.096),
 'fit_time': (0.001, 0.0),
 'precision_macro': (0.793, 0.079),
 'precision_weighted': (0.828, 0.071),
 'recall_macro': (0.793, 0.089),
 'recall_weighted': (0.81, 0.102),
 'score_time': (0.009, 0.0)}
Total programs: 200
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[C

In [3]:
# Support-vector classifier run with a single-candidate grid (C=30, RBF kernel).
# NOTE(review): the trailing 100, 1000 + 100, 100 look like a range-style
# start/stop/step over dataset sizes. Confirm against utilities.statistics.
# This rebinds `stats`, discarding whatever it held before.
stats = statistics(dataset, SVC, {
    'C': [30],
    'kernel': ('rbf',),
}, 100, 1000 + 100, 100)

Total programs: 100
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s

{'C': 30, 'kernel': 'rbf'}
{'accuracy': (0.82, 0.051),
 'f1_macro': (0.745, 0.065),
 'f1_weighted': (0.783, 0.061),
 'fit_time': (0.014, 0.0),
 'precision_macro': (0.738, 0.071),
 'precision_weighted': (0.783, 0.065),
 'recall_macro': (0.788, 0.053),
 'recall_weighted': (0.82, 0.051),
 'score_time': (0.006, 0.0)}
Total programs: 200
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ...................................C=30, kernel=rbf; total time=   0.1s
[CV] END ...............................

In [4]:
# Random-forest run with a single-candidate grid (300 trees, max depth 40).
# NOTE(review): the trailing 100, 1000 + 100, 100 look like a range-style
# start/stop/step over dataset sizes. Confirm against utilities.statistics.
# This rebinds `stats`, discarding whatever it held before.
stats = statistics(dataset, RandomForestClassifier, {
    'n_estimators': [300],
    'max_depth': [40],
}, 100, 1000 + 100, 100)

Total programs: 100
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.2s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.2s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.2s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.2s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.2s

{'max_depth': 40, 'n_estimators': 300}
{'accuracy': (0.87, 0.075),
 'f1_macro': (0.816, 0.122),
 'f1_weighted': (0.848, 0.091),
 'fit_time': (0.206, 0.003),
 'precision_macro': (0.831, 0.121),
 'precision_weighted': (0.868, 0.088),
 'recall_macro': (0.841, 0.104),
 'recall_weighted': (0.87, 0.075),
 'score_time': (0.016, 0.0)}
Total programs: 200
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.3s
[CV] END .................

In [5]:
# Multi-layer perceptron run with a single-candidate grid (ReLU, lr 0.001).
# NOTE(review): the trailing 100, 1000 + 100, 100 look like a range-style
# start/stop/step over dataset sizes. Confirm against utilities.statistics.
# This rebinds `stats`, discarding whatever it held before.
stats = statistics(dataset, MLPClassifier, {
    'activation': ['relu'],
    'learning_rate_init': [0.001]
}, 100, 1000 + 100, 100)

Total programs: 100
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   1.0s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   1.0s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   1.0s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   1.1s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   1.0s

{'activation': 'relu', 'learning_rate_init': 0.001}
{'accuracy': (0.86, 0.058),
 'f1_macro': (0.825, 0.095),
 'f1_weighted': (0.838, 0.073),
 'fit_time': (1.033, 0.015),
 'precision_macro': (0.837, 0.105),
 'precision_weighted': (0.857, 0.082),
 'recall_macro': (0.851, 0.078),
 'recall_weighted': (0.86, 0.058),
 'score_time': (0.004, 0.0)}
Total programs: 200
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   1.6s
[CV] END ....