In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from utilities import read_dataset_folder, describe, statistics

import random
import markov
import numpy as np

MM = {
    "Module": None,
    "keyword": None,
    "Return": "Assign",
    "Attribute": "Name",
}

def rename(names):
    new = ()
    for name in names:
        value = MM.get(name, name)
        if value is None:
            return None
        new += (value,)
    return new

def markov_n2(vertices, edges):
    vertices, edges = markov.lift(vertices, edges, vertices.get)
    return markov.markov(vertices, edges, lambda n: rename(vertices[n]))

def dataset(folder='data', count=2000):
    random.seed(42)
    programs, labels = zip(*random.sample(read_dataset_folder(folder), count))
    print(f'Total programs: {len(programs)}')
    vectors = markov.vectorize(programs, markov.graph, markov_n2)
    h = next(vectors)
    x = np.array(list(vectors), dtype=np.float32)
    y = np.array(labels, dtype=int).reshape((len(labels), 1))
    return h, x, y

describe(dataset)

Total programs: 100
Vectorized 100 chains...
Components: 36864.
Non-zero components: 25.
Zero components: 36839.


**Plots 100-1000**

In [2]:
stats = statistics(dataset, KNeighborsClassifier, {
    'n_neighbors': [4],
    'weights': ('distance',),
    'metric': ('manhattan',)
}, 100, 1000 + 100, 100)

Total programs: 100
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.1s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s

{'metric': 'manhattan', 'n_neighbors': 4, 'weights': 'distance'}
{'accuracy': (0.7, 0.105),
 'f1_macro': (0.621, 0.113),
 'f1_weighted': (0.667, 0.128),
 'fit_time': (0.008, 0.001),
 'precision_macro': (0.633, 0.127),
 'precision_weighted': (0.685, 0.151),
 'recall_macro': (0.661, 0.091),
 'recall_weighted': (0.7, 0.105),
 'score_time': (0.094, 0.047)}
Total programs: 200
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.2s


In [3]:
stats = statistics(dataset, SVC, {
    'C': [30],
    'kernel': ('rbf',),
}, 100, 1000 + 100, 100)

Total programs: 100
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ...................................C=30, kernel=rbf; total time=   1.2s
[CV] END ...................................C=30, kernel=rbf; total time=   1.1s
[CV] END ...................................C=30, kernel=rbf; total time=   1.1s
[CV] END ...................................C=30, kernel=rbf; total time=   1.1s
[CV] END ...................................C=30, kernel=rbf; total time=   1.1s

{'C': 30, 'kernel': 'rbf'}
{'accuracy': (0.83, 0.068),
 'f1_macro': (0.75, 0.077),
 'f1_weighted': (0.808, 0.086),
 'fit_time': (1.049, 0.015),
 'precision_macro': (0.754, 0.082),
 'precision_weighted': (0.819, 0.095),
 'recall_macro': (0.776, 0.065),
 'recall_weighted': (0.83, 0.068),
 'score_time': (0.189, 0.005)}
Total programs: 200
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ...................................C=30, kernel=rbf; total time=   4.6s
[CV] END ............................

In [4]:
stats = statistics(dataset, RandomForestClassifier, {
    'n_estimators': [300],
    'max_depth': [40],
}, 100, 1000 + 100, 100)

Total programs: 100
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .....................max_depth=40, n_estimators=300; total time=   1.1s
[CV] END .....................max_depth=40, n_estimators=300; total time=   1.3s
[CV] END .....................max_depth=40, n_estimators=300; total time=   1.1s
[CV] END .....................max_depth=40, n_estimators=300; total time=   1.1s
[CV] END .....................max_depth=40, n_estimators=300; total time=   1.1s

{'max_depth': 40, 'n_estimators': 300}
{'accuracy': (0.91, 0.058),
 'f1_macro': (0.871, 0.094),
 'f1_weighted': (0.893, 0.072),
 'fit_time': (1.22, 0.089),
 'precision_macro': (0.882, 0.094),
 'precision_weighted': (0.91, 0.066),
 'recall_macro': (0.888, 0.081),
 'recall_weighted': (0.91, 0.058),
 'score_time': (0.037, 0.005)}
Total programs: 200
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .....................max_depth=40, n_estimators=300; total time=   3.0s
[CV] END .................

In [5]:
stats = statistics(dataset, MLPClassifier, {
    'activation': ['relu'],
    'learning_rate_init': [0.001]
}, 100, 1000 + 100, 100)

Total programs: 100
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=  22.8s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=  22.9s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=  22.6s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=  21.8s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=  21.1s

{'activation': 'relu', 'learning_rate_init': 0.001}
{'accuracy': (0.88, 0.081),
 'f1_macro': (0.812, 0.114),
 'f1_weighted': (0.858, 0.092),
 'fit_time': (22.309, 0.684),
 'precision_macro': (0.816, 0.108),
 'precision_weighted': (0.868, 0.084),
 'recall_macro': (0.839, 0.104),
 'recall_weighted': (0.88, 0.081),
 'score_time': (0.014, 0.007)}
Total programs: 200
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=  39.5s
[CV] END .