In [1]:
import ast
import random
from collections import Counter

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from utilities import read_dataset_folder, describe, statistics

class Collector(ast.NodeVisitor):
    """AST visitor that logs the class name of every node it is asked to visit.

    After ``visit(tree)`` returns, ``self.nodes`` holds one entry per visited
    node (duplicates included), in the order the traversal reached them.
    """

    def __init__(self):
        # Flat, ordered log of node-type names; one entry per visited node.
        self.nodes = []

    def visit(self, node):
        """Record ``node``'s type name, then continue the normal dispatch.

        Delegating to ``NodeVisitor.visit`` is what makes the walk recursive:
        the default handler visits the children, which lands back here for
        each of them. Like the base-class call it wraps, the result is not
        propagated; this method returns ``None``.
        """
        self.nodes.append(node.__class__.__name__)
        super().visit(node)

def vectorize(codes):
    """Turn Python source strings into normalised AST node-type histograms.

    Every program is parsed with ``ast.parse`` and each node of its tree is
    counted by class name. The vocabulary is the union of the node-type names
    seen across all of ``codes``. It is sorted so the feature columns come out
    in the same order on every run; iterating a ``set`` of strings directly
    depends on hash randomisation and would shuffle the columns between
    kernels.

    Args:
        codes: Iterable of Python source strings.

    Returns:
        A list with one vector per program, in input order. Each vector is a
        list of floats of length ``len(vocabulary)``; entry ``j`` is the
        fraction of that program's AST nodes whose type is the ``j``-th name
        of the sorted vocabulary. An empty ``codes`` yields ``[]``.

    Raises:
        SyntaxError: Propagated from ``ast.parse`` for source that does not
            parse.
    """
    # One Counter per program. ``ast.walk`` reaches the same nodes as a
    # ``NodeVisitor`` traversal; only the order differs, and counts ignore it.
    histograms = []
    for code in codes:
        tree = ast.parse(code)
        histograms.append(Counter(type(node).__name__ for node in ast.walk(tree)))

    # Sorted union of all observed node-type names -> stable column order.
    nodes = sorted(set().union(*histograms))
    print(f'AST hist dim: {len(nodes)}')

    vecs = []
    for histogram in histograms:
        # Never zero: even empty source parses to a single ``Module`` node.
        total = sum(histogram.values())
        # Counter returns 0 for names absent from this program.
        vecs.append([histogram[name] / total for name in nodes])
    return vecs

def dataset(folder='data', count=2000):
    """Draw a fixed-seed random sample of programs and vectorise it.

    The global ``random`` generator is re-seeded with 42 on every call, so the
    same ``folder`` and ``count`` select the same programs each time (and any
    later use of the global generator continues from that seeded state).

    Args:
        folder: Passed straight to ``read_dataset_folder``.
            NOTE(review): that helper is expected to return a sequence of
            ``(program, label)`` pairs, as the unpacking below implies; confirm.
        count: Number of items to sample. ``random.sample`` raises
            ``ValueError`` if this exceeds the number of available items.

    Returns:
        A 3-tuple ``(None, x, y)``: the first slot is always ``None``, ``x`` is
        a ``float32`` array built from ``vectorize(programs)``, and ``y`` is an
        ``int`` array reshaped to ``(len(labels), 1)``.
    """
    random.seed(42)
    picked = random.sample(read_dataset_folder(folder), count)

    # Transpose the list of pairs into parallel tuples.
    programs, labels = zip(*picked)
    print(f'Total programs: {len(programs)}')

    features = vectorize(programs)
    x = np.array(list(features), dtype=np.float32)

    n_labels = len(labels)
    y = np.array(labels, dtype=int).reshape((n_labels, 1))
    return None, x, y

# Hand the `dataset` loader itself (not its result) to `describe` from `utilities`.
# NOTE(review): `describe` is opaque from here; the recorded output below
# ("Total programs: 100") suggests it calls `dataset` with count=100 rather
# than the default of 2000 — confirm.
describe(dataset)

Total programs: 100
AST hist dim: 75
Components: 75.
Non-zero components: 19.
Zero components: 56.


**Plots for dataset sizes 100–1000 (step 100)**

In [2]:
# k-nearest-neighbours run with a one-candidate grid: k=4, distance-weighted
# votes, Manhattan metric.
# NOTE(review): the trailing 100, 1000 + 100, 100 look like range-style
# start/stop/step over the dataset size (the log shows 100, then 200 programs);
# confirm against `utilities.statistics`.
stats = statistics(dataset, KNeighborsClassifier, {
    'n_neighbors': [4],
    'weights': ('distance',),
    'metric': ('manhattan',)
}, 100, 1000 + 100, 100)

Total programs: 100
AST hist dim: 75
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s

{'metric': 'manhattan', 'n_neighbors': 4, 'weights': 'distance'}
{'accuracy': (0.83, 0.06),
 'f1_macro': (0.798, 0.071),
 'f1_weighted': (0.821, 0.054),
 'fit_time': (0.0, 0.0),
 'precision_macro': (0.837, 0.071),
 'precision_weighted': (0.87, 0.042),
 'recall_macro': (0.818, 0.064),
 'recall_weighted': (0.83, 0.06),
 'score_time': (0.003, 0.0)}
Total programs: 200
AST hist dim: 82
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..metric=manhattan, n_neighbors=4, weights=di

In [3]:
# Support-vector classifier run with a one-candidate grid: C=30, RBF kernel.
# NOTE(review): the trailing 100, 1000 + 100, 100 look like range-style
# start/stop/step over the dataset size (the log shows 100, then 200 programs);
# confirm against `utilities.statistics`.
stats = statistics(dataset, SVC, {
    'C': [30],
    'kernel': ('rbf',),
}, 100, 1000 + 100, 100)

Total programs: 100
AST hist dim: 75
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s

{'C': 30, 'kernel': 'rbf'}
{'accuracy': (0.85, 0.105),
 'f1_macro': (0.828, 0.135),
 'f1_weighted': (0.846, 0.107),
 'fit_time': (0.001, 0.0),
 'precision_macro': (0.849, 0.142),
 'precision_weighted': (0.869, 0.111),
 'recall_macro': (0.832, 0.132),
 'recall_weighted': (0.85, 0.105),
 'score_time': (0.002, 0.0)}
Total programs: 200
AST hist dim: 82
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] E

In [4]:
# Random-forest run with a one-candidate grid: 300 trees, max depth 40.
# NOTE(review): the trailing 100, 1000 + 100, 100 look like range-style
# start/stop/step over the dataset size (the log shows 100, then 200 programs);
# confirm against `utilities.statistics`.
stats = statistics(dataset, RandomForestClassifier, {
    'n_estimators': [300],
    'max_depth': [40],
}, 100, 1000 + 100, 100)

Total programs: 100
AST hist dim: 75
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.2s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.2s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.2s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.2s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.2s

{'max_depth': 40, 'n_estimators': 300}
{'accuracy': (0.9, 0.055),
 'f1_macro': (0.868, 0.098),
 'f1_weighted': (0.887, 0.07),
 'fit_time': (0.192, 0.005),
 'precision_macro': (0.881, 0.112),
 'precision_weighted': (0.905, 0.082),
 'recall_macro': (0.884, 0.083),
 'recall_weighted': (0.9, 0.055),
 'score_time': (0.017, 0.0)}
Total programs: 200
AST hist dim: 82
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .....................max_depth=40, n_estimators=300; total time=   

In [6]:
# Multi-layer-perceptron run with a one-candidate grid: ReLU activation,
# initial learning rate 0.001.
# NOTE(review): the trailing 100, 1000 + 100, 100 look like range-style
# start/stop/step over the dataset size (the log shows 100, then 200 programs);
# confirm against `utilities.statistics`.
stats = statistics(dataset, MLPClassifier, {
    'activation': ['relu'],
    'learning_rate_init': [0.001],
}, 100, 1000 + 100, 100)

Total programs: 100
AST hist dim: 75
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   0.3s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   0.1s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   0.1s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   0.1s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   0.1s

{'activation': 'relu', 'learning_rate_init': 0.001}
{'accuracy': (0.52, 0.108),
 'f1_macro': (0.339, 0.085),
 'f1_weighted': (0.407, 0.115),
 'fit_time': (0.208, 0.08),
 'precision_macro': (0.314, 0.093),
 'precision_weighted': (0.374, 0.12),
 'recall_macro': (0.43, 0.069),
 'recall_weighted': (0.52, 0.108),
 'score_time': (0.004, 0.005)}
Total programs: 200
AST hist dim: 82
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..........activation=relu, learning_rate_init=0.001;