## Import & install libs

In [25]:
import random
import os
import re
from tqdm import tqdm
from IPython.display import clear_output
from typing import List, Union

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import torch
from torch import nn

import warnings
warnings.filterwarnings("ignore")

In [26]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
seed_everything()

In [27]:
# graphics settings:

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')
%config InlineBackend.figure_format = 'retina'
plt.rcParams['font.size'] = 14
plt.rcParams['figure.figsize'] = 10, 7
sns.set_style('darkgrid')

## Load & preprocess data

In [28]:
def read_data(path):
    dml = pd.read_csv(path, index_col=None).drop('Unnamed: 0', axis=1)
    print('ALL SNIPPETS: ', dml.shape[0], 'UNIQUE: ', len(dml['code_block'].unique()))
    dml = dml.dropna()
    print('WITHOUT NANS: ', dml.shape[0])
    dml = dml[dml['marks'] == 5]
    print('CLEAR DATA MARKS', dml.shape[0])
    dml = dml[(dml['graph_vertex_id'] != 53) & (dml['graph_vertex_id'] != 84)]
    print('CLEAR DATA CLASSES', dml.shape[0])
    return dml

In [29]:
path = 'snippets.csv'
dml = read_data(path)

ALL SNIPPETS:  7947 UNIQUE:  7839
WITHOUT NANS:  7947
CLEAR DATA MARKS 5371
CLEAR DATA CLASSES 5152


In [30]:
all_classes = sorted(dml['graph_vertex_id'].unique())
print(len(all_classes))

65


In [31]:
def from_bytes_to_str(snippet: str) -> str:
    """Converts a string from byte to regular format."""
    format_ = f'b"'
    format_hat = f"b'"
    if snippet[:2] == format_ or snippet[:2] == format_hat:
        n = len(snippet)
        snippet = snippet[2: n - 1]
    return snippet

def correct_special_symbols(snippet: str) -> str:
    """Fixes special characters."""
    snippet = snippet.replace('\\n', '\n')
    snippet = snippet.replace('<br>', '\n')
    snippet = snippet.replace('\n', ' \n ')
    snippet = snippet.replace('\\"', '\"')
    snippet = snippet.replace("\\'", "\'")
    snippet = snippet.replace('\'\'\'', ' \'\'\' ')
    snippet = snippet.replace('\"\"\"', ' \"\"\" ')
    return snippet

def delete_short_comments(snippet: str) -> str:
    """Deletes short comments."""
    snippet = re.sub('#.*', '', snippet)
    return snippet

def delete_imports(snippet: str) -> str:
    """Deletes imports modules from code."""
    im_snippet = re.sub('from .* import .*', '', snippet)
    im_snippet = re.sub('import .*', '', im_snippet)
    if len(im_snippet.replace(' ', '').replace('\n', '')) != 0:
        snippet = im_snippet
    return snippet

def add_spaces(snippet: str, symbols: List[str]) -> str:
    """Adds spaces to the beginning and end of each character."""
    for symb in symbols:
        snippet = snippet.replace(symb, ' ' + symb + ' ')
    return snippet

def delete_empty_lines(snippet: str) -> List[str]:
    """Deletes empty lines in snippet."""
    snippet = snippet.split(sep=' ')
    while '' in snippet:
        snippet.remove('')

    if len(snippet) == 0:
        snippet = ['\n']

    new_snippet = [snippet[0]]
    for i in range(1, len(snippet)):
        if snippet[i] == '\n' and new_snippet[-1] == '\n':
            continue
        else:
            new_snippet.append(snippet[i])
    if len(new_snippet) > 1 and new_snippet[0] == '\n':
        new_snippet = new_snippet[1:]
    return new_snippet

def delete_long_comments(snippet: List[str]) -> List[str]:
    """Deletes long comments."""
    res_snippet = []
    comment_flag = 0
    opened = ''
    for item in snippet:
        if item == '\'\'\'' or item == "\\'\\'\\'" or item == '\"\"\"' or item == '\\"\\"\\"':
            comment_flag += 1
            comment_flag %= 2
            if comment_flag == 1:
                opened = item
            else:
                if item != opened:
                    comment_flag = 1
            continue
        if comment_flag != 1:
            res_snippet.append(item)
    if len(res_snippet) == 0:
        res_snippet = ['\n']
    return res_snippet

def preprocess_snippet(snippet: str, format='list') -> Union[str, List[str]]:
    """Performs string preprocessing."""
    functions = [from_bytes_to_str, correct_special_symbols, delete_short_comments, delete_imports]
    for function in functions:
        snippet = function(snippet)
    to_replace = ['.', '(', ')', '\n', '[', ']', '_']
    snippet = add_spaces(snippet, to_replace)
    new_snippet = delete_empty_lines(snippet)
    res_snippet = delete_long_comments(new_snippet)
    if format == 'str':
        return ' '.join(res_snippet)
    return res_snippet

In [32]:
def ind(el):
  return list(all_classes).index(el)

dml['code_block'] = dml['code_block'].apply(preprocess_snippet)
X = dml[['code_block']]
y = dml['graph_vertex_id'].apply(ind)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
print(X_train.shape)

(3091, 1)


## SVM, Logreg, RF, GB

In [53]:
def id(x: str) -> str:
    return x

def get_score(scoring, tfidf, model, params):
    param_grid = params

    column_transformer = ColumnTransformer(
        [('tfidf', TfidfVectorizer(analyzer='word', tokenizer=id, preprocessor=id, token_pattern=None), tfidf),
         ],
         remainder='passthrough'
    )
    pipeline = Pipeline([
           ('trans', column_transformer),
           ('model', model),
    ])
    grid = GridSearchCV(pipeline, param_grid, refit=True, scoring=scoring, error_score="raise", verbose=3, cv=StratifiedKFold(n_splits=5, shuffle=True))
    grid.fit(X_train, y_train)

    return grid

In [54]:
def calc(model, params):
    res = get_score('f1_weighted', 'code_block', model, params)
    clear_output(True)
    print(res.best_score_, res.best_params_, res.cv_results_['std_test_score'][res.best_index_])
    print('TEST accuracy: ', accuracy_score(res.best_estimator_.predict(X_test), y_test))
    print('TEST f1: ', f1_score(res.best_estimator_.predict(X_test), y_test, average='weighted'))
    print('TEST precision: ', precision_score(res.best_estimator_.predict(X_test), y_test, average='weighted'))
    print('TEST recall: ', recall_score(res.best_estimator_.predict(X_test), y_test, average='weighted'))

In [55]:
%%time
params = {
    'model__C' : np.logspace(-2, 2, 10),
    'model__kernel' : ['linear', 'poly', 'rbf'],
    'model__gamma' : np.logspace(-2, 2, 10)
}
calc(SVC(), params)

0.8084157587575141 {'model__C': 1.6681005372000592, 'model__gamma': 0.01, 'model__kernel': 'linear'} 0.012514011311451587
TEST accuracy:  0.8321203299369239
TEST f1:  0.8388761266509127
TEST precision:  0.8562938679052116
TEST recall:  0.8321203299369239
CPU times: user 1h 4min 36s, sys: 4.82 s, total: 1h 4min 41s
Wall time: 1h 5min 14s


In [56]:
%%time
params = {
    'model__C' : np.logspace(-2, 2, 10),
    'model__penalty' : ['l2', 'none']
}
calc(LogisticRegression(), params)

0.8190582579205635 {'model__C': 100.0, 'model__penalty': 'l2'} 0.017074106927489334
TEST accuracy:  0.8282387190684134
TEST f1:  0.835118834890067
TEST precision:  0.8526421262445221
TEST recall:  0.8282387190684134
CPU times: user 16min 37s, sys: 17min 43s, total: 34min 21s
Wall time: 20min 29s


In [57]:
%%time
params = {
    'model__n_estimators' : [100, 200, 400, 800],
    'model__max_depth' : [5, 20, None],
    'model__min_samples_split' : [2, 8, 32],
    'model__min_samples_leaf' : [1, 4, 16]
}
calc(RandomForestClassifier(), params)

0.7994822366743681 {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 8, 'model__n_estimators': 100} 0.01591747405653803
TEST accuracy:  0.8190198932557011
TEST f1:  0.8326939926621841
TEST precision:  0.863889552535101
TEST recall:  0.8190198932557011
CPU times: user 17min 45s, sys: 8.76 s, total: 17min 54s
Wall time: 18min


In [58]:
%%time
params = {
    'model__n_estimators' : [5, 10, 20],
    'model__max_depth' : [1, 2, 4],
    'model__learning_rate' : [0.01,0.1,1]
}
calc(GradientBoostingClassifier(), params)

0.7471641725904492 {'model__learning_rate': 0.1, 'model__max_depth': 2, 'model__n_estimators': 20} 0.016908909716347783
TEST accuracy:  0.7540029112081513
TEST f1:  0.7533480621906988
TEST precision:  0.7724027933455473
TEST recall:  0.7540029112081513
CPU times: user 28min 39s, sys: 1.97 s, total: 28min 41s
Wall time: 28min 47s


# Without preprocessing

In [59]:
path = 'snippets.csv'
dml = read_data(path)

ALL SNIPPETS:  7947 UNIQUE:  7839
WITHOUT NANS:  7947
CLEAR DATA MARKS 5371
CLEAR DATA CLASSES 5152


In [60]:
def preprocess_snippet(snippet: str, format='list') -> Union[str, List[str]]:
    """Performs string preprocessing."""
    return list(snippet.split())

In [61]:
def ind(el):
  return list(all_classes).index(el)

dml['code_block'] = dml['code_block'].apply(preprocess_snippet)
X = dml[['code_block']]
y = dml['graph_vertex_id'].apply(ind)

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [63]:
%%time
params = {
    'model__C' : np.logspace(-2, 2, 10),
    'model__kernel' : ['linear', 'poly', 'rbf'],
    'model__gamma' : np.logspace(-2, 2, 10)
}
calc(SVC(), params)

0.4616242900463945 {'model__C': 4.6415888336127775, 'model__gamma': 0.01, 'model__kernel': 'linear'} 0.03232717072794601
TEST accuracy:  0.4556040756914119
TEST f1:  0.4580374957834386
TEST precision:  0.6054280646649305
TEST recall:  0.4556040756914119
CPU times: user 52min 19s, sys: 4.3 s, total: 52min 23s
Wall time: 52min 37s


In [64]:
%%time
params = {
    'model__C' : np.logspace(-2, 2, 10),
    'model__penalty' : ['l2', 'none']
}
calc(LogisticRegression(), params)

0.4680019302883675 {'model__C': 100.0, 'model__penalty': 'l2'} 0.012781667739560217
TEST accuracy:  0.48083454633672973
TEST f1:  0.4883760260567769
TEST precision:  0.6396780567990958
TEST recall:  0.48083454633672973
CPU times: user 30min 54s, sys: 25min 5s, total: 56min
Wall time: 33min 35s


In [65]:
%%time
params = {
    'model__n_estimators' : [100, 200, 400, 800],
    'model__max_depth' : [5, 20, None],
    'model__min_samples_split' : [2, 8, 32],
    'model__min_samples_leaf' : [1, 4, 16]
}
calc(RandomForestClassifier(), params)

0.4027315245732564 {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 400} 0.018066803150690133
TEST accuracy:  0.43619602134885976
TEST f1:  0.45278979391172897
TEST precision:  0.6497581909788717
TEST recall:  0.43619602134885976
CPU times: user 19min 44s, sys: 18.8 s, total: 20min 3s
Wall time: 20min 9s


In [66]:
%%time
params = {
    'model__n_estimators' : [5, 10, 20],
    'model__max_depth' : [1, 2, 4],
    'model__learning_rate' : [0.01,0.1,1]
}
calc(GradientBoostingClassifier(), params)

0.384518381614674 {'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__n_estimators': 20} 0.02230671192243451
TEST accuracy:  0.3799126637554585
TEST f1:  0.3917574613868245
TEST precision:  0.6281470150329166
TEST recall:  0.3799126637554585
CPU times: user 26min 36s, sys: 2.28 s, total: 26min 38s
Wall time: 26min 45s
