## Import & install libs

In [1]:
import sys
sys.path.append("..")

from src.preprocessing import preprocess_snippet
import random
import os
import re
from tqdm import tqdm
from IPython.display import clear_output
from typing import List, Union

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything()

## Load & preprocess data

In [3]:
def read_data(path):
    dml = pd.read_csv(path, index_col=None).drop('Unnamed: 0', axis=1)
    print('ALL SNIPPETS: ', dml.shape[0], 'UNIQUE: ', len(dml['code_block'].unique()))
    dml = dml.dropna()
    print('WITHOUT NANS: ', dml.shape[0])
    dml = dml[dml['marks'] == 5]
    print('CLEAR DATA MARKS', dml.shape[0])
    dml = dml[(dml['graph_vertex_id'] != 53) & (dml['graph_vertex_id'] != 84)]
    print('CLEAR DATA CLASSES', dml.shape[0])
    return dml

In [4]:
path = '../data/code4ml/markup_data.csv'
dml = read_data(path)

ALL SNIPPETS:  7947 UNIQUE:  7839
WITHOUT NANS:  7947
CLEAR DATA MARKS 5371
CLEAR DATA CLASSES 5152


In [5]:
all_classes = sorted(dml['graph_vertex_id'].unique())
print(len(all_classes))

65


In [6]:
def ind(el):
  return list(all_classes).index(el)

dml['code_block'] = dml['code_block'].apply(preprocess_snippet)
X = dml[['code_block']]
y = dml['graph_vertex_id'].apply(ind)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
print(X_train.shape)

(3091, 1)


## SVM, Logreg, RF, GB

In [8]:
def id(x: str) -> str:
    return x

def get_score(scoring, tfidf, model, params):
    param_grid = params

    column_transformer = ColumnTransformer(
        [('tfidf', TfidfVectorizer(analyzer='word', tokenizer=id, preprocessor=id, token_pattern=None), tfidf),
         ],
         remainder='passthrough'
    )
    pipeline = Pipeline([
           ('trans', column_transformer),
           ('model', model),
    ])
    grid = GridSearchCV(pipeline, param_grid, refit=True, scoring=scoring, error_score="raise", verbose=3, cv=StratifiedKFold(n_splits=5, shuffle=True))
    grid.fit(X_train, y_train)

    return grid

In [9]:
def calc(model, params):
    res = get_score('f1_weighted', 'code_block', model, params)
    clear_output(True)
    print(res.best_score_, res.best_params_, res.cv_results_['std_test_score'][res.best_index_])
    print('TEST accuracy: ', accuracy_score(res.best_estimator_.predict(X_test), y_test))
    print('TEST f1: ', f1_score(res.best_estimator_.predict(X_test), y_test, average='weighted'))
    print('TEST precision: ', precision_score(res.best_estimator_.predict(X_test), y_test, average='weighted'))
    print('TEST recall: ', recall_score(res.best_estimator_.predict(X_test), y_test, average='weighted'))

In [None]:
%%time
params = {
    'model__C' : np.logspace(-2, 2, 10),
    'model__kernel' : ['linear', 'poly', 'rbf'],
    'model__gamma' : np.logspace(-2, 2, 10)
}
calc(SVC(), params)

0.8084157587575141 {'model__C': 1.6681005372000592, 'model__gamma': 0.01, 'model__kernel': 'linear'} 0.012514011311451587
TEST accuracy:  0.8321203299369239
TEST f1:  0.8388761266509127
TEST precision:  0.8562938679052116
TEST recall:  0.8321203299369239
CPU times: user 1h 4min 36s, sys: 4.82 s, total: 1h 4min 41s
Wall time: 1h 5min 14s


In [None]:
%%time
params = {
    'model__C' : np.logspace(-2, 2, 10),
    'model__penalty' : ['l2', 'none']
}
calc(LogisticRegression(), params)

0.8190582579205635 {'model__C': 100.0, 'model__penalty': 'l2'} 0.017074106927489334
TEST accuracy:  0.8282387190684134
TEST f1:  0.835118834890067
TEST precision:  0.8526421262445221
TEST recall:  0.8282387190684134
CPU times: user 16min 37s, sys: 17min 43s, total: 34min 21s
Wall time: 20min 29s


In [None]:
%%time
params = {
    'model__n_estimators' : [100, 200, 400, 800],
    'model__max_depth' : [5, 20, None],
    'model__min_samples_split' : [2, 8, 32],
    'model__min_samples_leaf' : [1, 4, 16]
}
calc(RandomForestClassifier(), params)

0.7994822366743681 {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 8, 'model__n_estimators': 100} 0.01591747405653803
TEST accuracy:  0.8190198932557011
TEST f1:  0.8326939926621841
TEST precision:  0.863889552535101
TEST recall:  0.8190198932557011
CPU times: user 17min 45s, sys: 8.76 s, total: 17min 54s
Wall time: 18min


In [12]:
%%time
params = {
    'model__n_estimators' : [20, 40],
    'model__max_depth' : [2, 4, None],
    'model__learning_rate' : [0.01, 0.1, 1]
}
calc(GradientBoostingClassifier(), params)

0.7540195084095841 {'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__n_estimators': 40} 0.01212267145213408
TEST accuracy:  0.754973313925279
TEST f1:  0.7513297710949846
TEST precision:  0.7719663463640357
TEST recall:  0.754973313925279
CPU times: user 5h 19min 17s, sys: 19.1 s, total: 5h 19min 36s
Wall time: 5h 20min 18s


# Without preprocessing

In [10]:
dml = read_data(path)

ALL SNIPPETS:  7947 UNIQUE:  7839
WITHOUT NANS:  7947
CLEAR DATA MARKS 5371
CLEAR DATA CLASSES 5152


In [11]:
def preprocess_snippet(snippet: str, format='list') -> Union[str, List[str]]:
    """Performs string preprocessing."""
    return list(snippet.split())

In [12]:
def ind(el):
  return list(all_classes).index(el)

dml['code_block'] = dml['code_block'].apply(preprocess_snippet)
X = dml[['code_block']]
y = dml['graph_vertex_id'].apply(ind)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [None]:
%%time
params = {
    'model__C' : np.logspace(-2, 2, 10),
    'model__kernel' : ['linear', 'poly', 'rbf'],
    'model__gamma' : np.logspace(-2, 2, 10)
}
calc(SVC(), params)

0.4616242900463945 {'model__C': 4.6415888336127775, 'model__gamma': 0.01, 'model__kernel': 'linear'} 0.03232717072794601
TEST accuracy:  0.4556040756914119
TEST f1:  0.4580374957834386
TEST precision:  0.6054280646649305
TEST recall:  0.4556040756914119
CPU times: user 52min 19s, sys: 4.3 s, total: 52min 23s
Wall time: 52min 37s


In [None]:
%%time
params = {
    'model__C' : np.logspace(-2, 2, 10),
    'model__penalty' : ['l2', 'none']
}
calc(LogisticRegression(), params)

0.4680019302883675 {'model__C': 100.0, 'model__penalty': 'l2'} 0.012781667739560217
TEST accuracy:  0.48083454633672973
TEST f1:  0.4883760260567769
TEST precision:  0.6396780567990958
TEST recall:  0.48083454633672973
CPU times: user 30min 54s, sys: 25min 5s, total: 56min
Wall time: 33min 35s


In [None]:
%%time
params = {
    'model__n_estimators' : [100, 200, 400, 800],
    'model__max_depth' : [5, 20, None],
    'model__min_samples_split' : [2, 8, 32],
    'model__min_samples_leaf' : [1, 4, 16]
}
calc(RandomForestClassifier(), params)

0.4027315245732564 {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 400} 0.018066803150690133
TEST accuracy:  0.43619602134885976
TEST f1:  0.45278979391172897
TEST precision:  0.6497581909788717
TEST recall:  0.43619602134885976
CPU times: user 19min 44s, sys: 18.8 s, total: 20min 3s
Wall time: 20min 9s


In [14]:
%%time
params = {
    'model__n_estimators' : [20, 40],
    'model__max_depth' : [2, 4, None],
    'model__learning_rate' : [0.01, 0.1, 1]
}
calc(GradientBoostingClassifier(), params)

0.3940721598074911 {'model__learning_rate': 0.1, 'model__max_depth': None, 'model__n_estimators': 40} 0.009855488212037395
TEST accuracy:  0.368753032508491
TEST f1:  0.3498236807939366
TEST precision:  0.5495949091686895
TEST recall:  0.368753032508491
CPU times: user 7h 21min 5s, sys: 7.17 s, total: 7h 21min 13s
Wall time: 7h 22min 21s
