In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from utilities import read_dataset_folder, describe, statistics

import keyword
import string
import random
import numpy as np
import io
import tokenize

from gensim.models import Word2Vec

def tokens(code):
    """Tokenize Python source and anonymise its identifiers.

    Every token produced by ``tokenize.generate_tokens`` contributes its
    string to the result (this includes NEWLINE/INDENT/DEDENT/ENDMARKER
    tokens, whose strings may be empty). NAME tokens that are not Python
    keywords are replaced by a random 5-letter alias; the same identifier
    always maps to the same alias within one call, and distinct identifiers
    never share an alias. The mapping is local to the call, so the same
    identifier gets unrelated aliases in different snippets.

    Args:
        code: Python source text.

    Returns:
        list[str]: token strings in source order, identifiers anonymised.

    Raises:
        Whatever ``tokenize.generate_tokens`` raises for source it cannot
        tokenize (e.g. ``tokenize.TokenError``); nothing is caught here.
    """
    aliases = {}      # original identifier -> random alias
    taken = set()     # aliases already handed out in this call
    result = []
    for token in tokenize.generate_tokens(io.StringIO(code).readline):
        value = token.string
        if token.type == tokenize.NAME and not keyword.iskeyword(value):
            if value not in aliases:
                alias = ''.join(random.choices(string.ascii_letters, k=5))
                # Redraw on the (rare) chance that the alias is already used
                # for another identifier or happens to spell a keyword;
                # otherwise two different names would become indistinguishable.
                while alias in taken or keyword.iskeyword(alias):
                    alias = ''.join(random.choices(string.ascii_letters, k=5))
                taken.add(alias)
                aliases[value] = alias
            value = aliases[value]
        result.append(value)
    return result

def vectorize(snippets: list[str]):
    """Embed each snippet as the mean of its Word2Vec token vectors.

    A Word2Vec model is trained on the tokenised snippets themselves; each
    snippet is then represented by the element-wise mean of the vectors of
    its tokens.

    Args:
        snippets: Source-code strings; each one is tokenised with ``tokens``.

    Returns:
        A list with one 1-D vector per snippet, in input order. An empty
        input yields an empty list. A snippet none of whose tokens is in the
        model vocabulary yields an all-zero vector instead of NaN.
    """
    sentences = [tokens(s) for s in snippets]
    if not sentences:
        # Nothing to train on; avoid handing Word2Vec an empty corpus.
        return []

    # NOTE(review): no seed is passed and workers=3, so per the gensim docs
    # the embeddings are not reproducible between runs — confirm whether
    # that is acceptable for the reported metrics.
    model = Word2Vec(sentences, min_count=1, workers=3)
    zero_vector = np.zeros(model.wv.vector_size, dtype=np.float32)

    sentence_vectors = []
    for done, sentence in enumerate(sentences, start=1):
        word_vectors = [model.wv[word] for word in sentence if word in model.wv]
        if word_vectors:
            sentence_vectors.append(np.mean(word_vectors, axis=0))
        else:
            # np.mean([]) would give a scalar NaN and break stacking later.
            sentence_vectors.append(zero_vector.copy())
        # Report after the snippet is actually processed so the count is exact.
        if not done % 500:
            print(f'Processed {done} snippets...')
    return sentence_vectors

def dataset(folder='data', count=2000):
    """Build a feature matrix and label column from a random sample of programs.

    Reseeds the global ``random`` generator with 42, draws ``count`` items
    from ``read_dataset_folder(folder)`` without replacement, and vectorises
    the sampled programs with ``vectorize``.

    Args:
        folder: Dataset location passed to ``read_dataset_folder``.
        count: Number of (program, label) pairs to sample.

    Returns:
        A 3-tuple ``(None, x, y)``: ``x`` is a float32 array built from the
        snippet vectors and ``y`` is an int array of shape
        ``(len(labels), 1)``. The first element is always ``None``.
    """
    # Fixed seed: the sample (and any later use of the global RNG) is repeatable.
    random.seed(42)
    sampled = random.sample(read_dataset_folder(folder), count)
    programs, labels = zip(*sampled)
    print(f'Total programs: {len(programs)}')

    embedded = vectorize(programs)
    features = np.array(list(embedded), dtype=np.float32)
    # Labels are returned as a column vector, one row per program.
    targets = np.array(labels, dtype=int).reshape((len(labels), 1))
    return None, features, targets

# Pass the `dataset` factory to the imported `describe` helper. The recorded
# output reports total/non-zero/zero "components" — presumably of the feature
# vectors; confirm against utilities.describe.
describe(dataset)

Total programs: 100
Components: 100.
Non-zero components: 100.
Zero components: 0.


**Plots: dataset sizes 100–1000**

In [2]:
# k-nearest-neighbours with a single-candidate parameter grid.
# The trailing numbers look like (start, stop, step) for the dataset size,
# i.e. 100, 200, ..., 1000 — confirm against utilities.statistics.
stats = statistics(
    dataset,
    KNeighborsClassifier,
    {
        'n_neighbors': [4],
        'weights': ('distance',),
        'metric': ('manhattan',),
    },
    100,
    1000 + 100,
    100,
)

Total programs: 100
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s

{'metric': 'manhattan', 'n_neighbors': 4, 'weights': 'distance'}
{'accuracy': (0.39, 0.107),
 'f1_macro': (0.318, 0.075),
 'f1_weighted': (0.333, 0.09),
 'fit_time': (0.001, 0.0),
 'precision_macro': (0.303, 0.083),
 'precision_weighted': (0.317, 0.086),
 'recall_macro': (0.372, 0.079),
 'recall_weighted': (0.39, 0.107),
 'score_time': (0.045, 0.046)}
Total programs: 200
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..metric=manhattan, n_neighbors=4, weights=distance; total time=   0.0s
[

In [3]:
# RBF-kernel SVM with a single-candidate parameter grid.
# The trailing numbers look like (start, stop, step) for the dataset size,
# i.e. 100, 200, ..., 1000 — confirm against utilities.statistics.
stats = statistics(
    dataset,
    SVC,
    {
        'C': [30],
        'kernel': ('rbf',),
    },
    100,
    1000 + 100,
    100,
)

Total programs: 100
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s

{'C': 30, 'kernel': 'rbf'}
{'accuracy': (0.25, 0.032),
 'f1_macro': (0.134, 0.058),
 'f1_weighted': (0.141, 0.048),
 'fit_time': (0.002, 0.0),
 'precision_macro': (0.117, 0.062),
 'precision_weighted': (0.118, 0.053),
 'recall_macro': (0.219, 0.041),
 'recall_weighted': (0.25, 0.032),
 'score_time': (0.01, 0.006)}
Total programs: 200
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ...................................C=30, kernel=rbf; total time=   0.0s
[CV] END ..............................

In [2]:
# Random forest with a single-candidate parameter grid.
# The trailing numbers look like (start, stop, step) for the dataset size,
# i.e. 100, 200, ..., 1000 — confirm against utilities.statistics.
# NOTE(review): this cell's recorded execution count is out of sequence with
# its neighbours; re-run the notebook top to bottom before trusting `stats`.
stats = statistics(
    dataset,
    RandomForestClassifier,
    {
        'n_estimators': [300],
        'max_depth': [40],
    },
    100,
    1000 + 100,
    100,
)

Total programs: 100
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.5s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.5s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.4s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.4s
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.4s

{'max_depth': 40, 'n_estimators': 300}
{'accuracy': (0.65, 0.063),
 'f1_macro': (0.578, 0.054),
 'f1_weighted': (0.617, 0.068),
 'fit_time': (0.516, 0.023),
 'precision_macro': (0.59, 0.05),
 'precision_weighted': (0.638, 0.06),
 'recall_macro': (0.614, 0.054),
 'recall_weighted': (0.65, 0.063),
 'score_time': (0.034, 0.004)}
Total programs: 200
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .....................max_depth=40, n_estimators=300; total time=   0.6s
[CV] END ..................

In [5]:
# Multi-layer perceptron with a single-candidate parameter grid.
# The trailing numbers look like (start, stop, step) for the dataset size,
# i.e. 100, 200, ..., 1000 — confirm against utilities.statistics.
stats = statistics(
    dataset,
    MLPClassifier,
    {
        'activation': ['relu'],
        'learning_rate_init': [0.001],
    },
    100,
    1000 + 100,
    100,
)

Total programs: 100
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   0.1s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   0.1s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   0.1s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   0.0s
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   0.1s

{'activation': 'relu', 'learning_rate_init': 0.001}
{'accuracy': (0.19, 0.073),
 'f1_macro': (0.068, 0.045),
 'f1_weighted': (0.083, 0.056),
 'fit_time': (0.155, 0.056),
 'precision_macro': (0.051, 0.037),
 'precision_weighted': (0.061, 0.045),
 'recall_macro': (0.148, 0.053),
 'recall_weighted': (0.19, 0.073),
 'score_time': (0.007, 0.001)}
Total programs: 200
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ..........activation=relu, learning_rate_init=0.001; total time=   0.0s
[CV] END ..