In [1]:
import ast
import dis
import random
import types
from collections import Counter

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from utilities import read_dataset_folder, describe, statistics

def instructions(co):
    """Return the opcode names of ``co`` as a flat list, in disassembly order.

    Whenever an instruction's argument value is itself a code object, that
    code object's opcode names are inserted immediately after the instruction
    that references it, recursively.
    """
    def walk(code):
        # Pre-order traversal: emit the instruction, then descend into any
        # code object it carries before moving on to the next instruction.
        for ins in dis.Bytecode(code):
            yield ins.opname
            if isinstance(ins.argval, types.CodeType):
                yield from walk(ins.argval)

    return list(walk(co))

def vectorize(codes):
    """Turn Python source strings into relative opcode-frequency vectors.

    Each source is parsed, compiled and flattened to its opcode names with
    ``instructions``. The vocabulary is every opcode name that occurs in at
    least one of the given programs, so the vector dimension depends on the
    batch passed in. Component ``j`` of a program's vector is the share of
    that program's instructions whose name is the ``j``-th vocabulary entry.

    Parameters
    ----------
    codes : iterable of str
        Python source texts. Errors raised by ``ast.parse`` / ``compile``
        propagate to the caller.

    Returns
    -------
    list of list of float
        One vector per program, in input order. Columns follow the
        alphabetically sorted vocabulary.
    """
    isets = [instructions(compile(ast.parse(code), '', 'exec')) for code in codes]

    # Sort the vocabulary: iterating a bare set of strings depends on hash
    # randomization, which would shuffle the feature columns between kernel
    # runs and make order-sensitive models irreproducible despite fixed seeds.
    ops = sorted({op for iset in isets for op in iset})
    print(f'Dim (opcode count): {len(ops)}')

    vecs = []
    for iset in isets:
        total = len(iset)
        # One counting pass per program instead of rescanning it per opcode.
        counts = Counter(iset)
        vecs.append([counts[op] / total for op in ops])
    return vecs

def dataset(folder='data', count=2000):
    """Draw a fixed random sample of labelled programs and vectorize it.

    Parameters
    ----------
    folder : str
        Passed straight to ``read_dataset_folder``.
    count : int
        Number of programs to sample without replacement.

    Returns
    -------
    tuple
        ``(None, x, y)`` where ``x`` is a float32 array built from the
        opcode-frequency vectors and ``y`` is an int array of labels with
        shape ``(count, 1)``. The first slot is always ``None``.
    """
    # Reseeds the module-level generator on every call, so the same `count`
    # always yields the same sample (and global `random` state is reset).
    random.seed(42)
    corpus = read_dataset_folder(folder)
    programs = random.sample(corpus, count)
    n_programs = len(programs)
    print(f'Total programs: {n_programs}')

    # Each sampled item is a (source, label) pair.
    sources = [source for source, _ in programs]
    features = vectorize(sources)
    x = np.array(list(features), dtype=np.float32)

    labels = [label for _, label in programs]
    y = np.array(labels, dtype=int).reshape((n_programs, 1))
    return None, x, y

# Report on the feature matrix produced by `dataset`. The function itself is
# passed (not its result); `describe` lives in the project-local `utilities`
# module, so how it invokes `dataset` is not visible here.
describe(dataset)

Total programs: 100
Dim (opcode count): 73
Components: 73.
Non-zero components: 15.
Zero components: 58.


**Plots 100-1000**

In [3]:
# k-nearest-neighbours on the opcode-frequency vectors, with a one-point
# parameter grid: k=4, distance-weighted votes, Manhattan metric.
# NOTE(review): the trailing 100, 1000 + 100, 100 look like start/stop/step of
# the sample sizes swept (the recorded output shows 100, then 200 programs) —
# confirm against utilities.statistics.
stats = statistics(dataset, KNeighborsClassifier, {
    'n_neighbors': [4],
    'weights': ('distance',),
    'metric': ('manhattan',)
}, 100, 1000 + 100, 100)

Total programs: 100
Dim (opcode count): 73
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.3s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s

{'metric': 'manhattan', 'n_neighbors': 4, 'weights': 'distance'}
{'accuracy': (0.84, 0.107),
 'f1_macro': (0.783, 0.127),
 'f1_weighted': (0.821, 0.111),
 'fit_time': (0.001, 0.0),
 'precision_macro': (0.787, 0.125),
 'precision_weighted': (0.828, 0.107),
 'recall_macro': (0.803, 0.121),
 'recall_weighted': (0.84, 0.107),
 'score_time': (0.098, 0.154)}
Total programs: 200
Dim (opcode count): 82
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..metric=manhattan, n_neig

In [4]:
# Support-vector classifier with a one-point parameter grid: C=30, RBF kernel.
# NOTE(review): the trailing 100, 1000 + 100, 100 look like start/stop/step of
# the sample sizes swept (the recorded output shows 100, then 200 programs) —
# confirm against utilities.statistics.
stats = statistics(dataset, SVC, {
    'C': [30],
    'kernel': ('rbf',),
}, 100, 1000 + 100, 100)

Total programs: 100
Dim (opcode count): 73
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s

{'C': 30, 'kernel': 'rbf'}
{'accuracy': (0.87, 0.093),
 'f1_macro': (0.836, 0.113),
 'f1_weighted': (0.856, 0.101),
 'fit_time': (0.002, 0.0),
 'precision_macro': (0.836, 0.116),
 'precision_weighted': (0.857, 0.103),
 'recall_macro': (0.848, 0.103),
 'recall_weighted': (0.87, 0.093),
 'score_time': (0.01, 0.006)}
Total programs: 200
Dim (opcode count): 82
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ...................................C=30, kernel=rbf; total time= 

In [5]:
# Random forest with a one-point parameter grid: 300 trees, max depth 40.
# NOTE(review): the trailing 100, 1000 + 100, 100 look like start/stop/step of
# the sample sizes swept (the recorded output shows 100, then 200 programs) —
# confirm against utilities.statistics.
stats = statistics(dataset, RandomForestClassifier, {
    'n_estimators': [300],
    'max_depth': [40],
}, 100, 1000 + 100, 100)

Total programs: 100
Dim (opcode count): 73
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.4s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.3s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.3s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.3s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.3s

{'max_depth': 40, 'n_estimators': 300}
{'accuracy': (0.86, 0.073),
 'f1_macro': (0.806, 0.103),
 'f1_weighted': (0.841, 0.083),
 'fit_time': (0.38, 0.027),
 'precision_macro': (0.816, 0.102),
 'precision_weighted': (0.855, 0.08),
 'recall_macro': (0.824, 0.093),
 'recall_weighted': (0.86, 0.073),
 'score_time': (0.03, 0.001)}
Total programs: 200
Dim (opcode count): 82
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .....................max_depth=40, n_estimators=300; 

In [16]:
# Multi-layer perceptron with a one-point parameter grid: ReLU activation,
# initial learning rate 0.001.
# NOTE(review): the trailing 100, 1000 + 100, 100 look like start/stop/step of
# the sample sizes swept (the recorded output shows 100, then 200 programs) —
# confirm against utilities.statistics.
# NOTE(review): the recorded scores at 100 programs are far below the other
# three models (accuracy 0.59 vs 0.84-0.87); possibly a convergence issue on
# these small relative-frequency inputs — worth checking.
stats = statistics(dataset, MLPClassifier, {
    'activation': ['relu'],
    'learning_rate_init': [0.001],
}, 100, 1000 + 100, 100)

Total programs: 100
Dim (opcode count): 73
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   0.1s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   0.1s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   0.1s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   0.1s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   0.1s

{'activation': 'relu', 'learning_rate_init': 0.001}
{'accuracy': (0.59, 0.097),
 'f1_macro': (0.456, 0.114),
 'f1_weighted': (0.495, 0.127),
 'fit_time': (0.181, 0.016),
 'precision_macro': (0.433, 0.13),
 'precision_weighted': (0.465, 0.146),
 'recall_macro': (0.535, 0.081),
 'recall_weighted': (0.59, 0.097),
 'score_time': (0.007, 0.0)}
Total programs: 200
Dim (opcode count): 82
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..........activation=relu, learning_rate