## Import & install libs

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import random
import torch
from torch import nn
import os
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from tqdm import tqdm
from IPython.display import clear_output

warnings.filterwarnings("ignore")

In [None]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
seed_everything()

In [None]:
# graphics settings:

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')
%config InlineBackend.figure_format = 'retina'
plt.rcParams['font.size'] = 14
plt.rcParams['figure.figsize'] = 10, 7
sns.set_style('darkgrid')

## Load & preprocess data

In [None]:
def read_data(path):
    dml = pd.read_csv(path, index_col=None).drop('Unnamed: 0', axis=1)
    print('ALL SNIPPETS: ', dml.shape[0], 'UNIQUE: ', len(dml['code_block'].unique()))
    dml = dml.dropna()
    print('WITHOUT NANS: ', dml.shape[0])
    dml = dml[dml['marks'] == 5]
    print('CLEAR DATA MARKS', dml.shape[0])
    dml = dml[(dml['graph_vertex_id'] != 53) & (dml['graph_vertex_id'] != 84)]
    print('CLEAR DATA CLASSES', dml.shape[0])
    return dml

In [None]:
path = 'snippets.csv'
dml = read_data(path)

ALL SNIPPETS:  7947 UNIQUE:  7839
WITHOUT NANS:  7947
CLEAR DATA MARKS 5371
CLEAR DATA CLASSES 5152


In [None]:
all_classes = sorted(dml['graph_vertex_id'].unique())
print(len(all_classes))

65


In [None]:
import re
from typing import List, Union


def from_bytes_to_str(snippet: str) -> str:
    """Converts a string from byte to regular format."""
    format_ = f'b"'
    format_hat = f"b'"
    if snippet[:2] == format_ or snippet[:2] == format_hat:
        n = len(snippet)
        snippet = snippet[2: n - 1]
    return snippet

def correct_special_symbols(snippet: str) -> str:
    """Fixes special characters."""
    snippet = snippet.replace('\\n', '\n')
    snippet = snippet.replace('<br>', '\n')
    snippet = snippet.replace('\n', ' \n ')
    snippet = snippet.replace('\\"', '\"')
    snippet = snippet.replace("\\'", "\'")
    snippet = snippet.replace('\'\'\'', ' \'\'\' ')
    snippet = snippet.replace('\"\"\"', ' \"\"\" ')
    return snippet

def delete_short_comments(snippet: str) -> str:
    """Deletes short comments."""
    snippet = re.sub('#.*', '', snippet)
    return snippet

def delete_imports(snippet: str) -> str:
    """Deletes imports modules from code."""
    im_snippet = re.sub('from .* import .*', '', snippet)
    im_snippet = re.sub('import .*', '', im_snippet)
    if len(im_snippet.replace(' ', '').replace('\n', '')) != 0:
        snippet = im_snippet
    return snippet

def add_spaces(snippet: str, symbols: List[str]) -> str:
    """Adds spaces to the beginning and end of each character."""
    for symb in symbols:
        snippet = snippet.replace(symb, ' ' + symb + ' ')
    return snippet

def delete_empty_lines(snippet: str) -> List[str]:
    """Deletes empty lines in snippet."""
    snippet = snippet.split(sep=' ')
    while '' in snippet:
        snippet.remove('')

    if len(snippet) == 0:
        snippet = ['\n']

    new_snippet = [snippet[0]]
    for i in range(1, len(snippet)):
        if snippet[i] == '\n' and new_snippet[-1] == '\n':
            continue
        else:
            new_snippet.append(snippet[i])
    if len(new_snippet) > 1 and new_snippet[0] == '\n':
        new_snippet = new_snippet[1:]
    return new_snippet

def delete_long_comments(snippet: List[str]) -> List[str]:
    """Deletes long comments."""
    res_snippet = []
    comment_flag = 0
    opened = ''
    for item in snippet:
        if item == '\'\'\'' or item == "\\'\\'\\'" or item == '\"\"\"' or item == '\\"\\"\\"':
            comment_flag += 1
            comment_flag %= 2
            if comment_flag == 1:
                opened = item
            else:
                if item != opened:
                    comment_flag = 1
            continue
        if comment_flag != 1:
            res_snippet.append(item)
    if len(res_snippet) == 0:
        res_snippet = ['\n']
    return res_snippet

def preprocess_snippet(snippet: str, format='list') -> Union[str, List[str]]:
    """Performs string preprocessing."""
    functions = [from_bytes_to_str, correct_special_symbols, delete_short_comments, delete_imports]
    for function in functions:
        snippet = function(snippet)
    to_replace = ['.', '(', ')', '\n', '[', ']', '_']
    snippet = add_spaces(snippet, to_replace)
    new_snippet = delete_empty_lines(snippet)
    res_snippet = delete_long_comments(new_snippet)
    if format == 'str':
        return ' '.join(res_snippet)
    return res_snippet

In [None]:
def ind(el):
  return list(all_classes).index(el)

dml['code_block'] = dml['code_block'].apply(preprocess_snippet)
dml = shuffle(dml, random_state=42)
X = dml[['code_block']]
y = dml['graph_vertex_id'].apply(ind)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
print(X_train.shape)

(3091, 1)


## SVM, Logreg, RF, GB

In [None]:
def id(x: str) -> str:
    return x

def get_score(scoring, tfidf, model, params):
    param_grid = params
    
    column_transformer = ColumnTransformer(
        [('tfidf', TfidfVectorizer(analyzer='word', tokenizer=id, preprocessor=id, token_pattern=None), tfidf),
         ],
         remainder='passthrough'
    )
    pipeline = Pipeline([
           ('trans', column_transformer),
           ('model', model),
    ])
    grid = GridSearchCV(pipeline, param_grid, refit=True, scoring=scoring, error_score="raise", verbose=10, cv=StratifiedKFold(n_splits=5, shuffle=True))
    grid.fit(X_train, y_train)
    
    return grid

In [None]:
def calc(model, params):
    res_ac = get_score('accuracy', 'code_block', model, params)
    res_f1 = get_score('f1_micro', 'code_block', model, params)
    clear_output(True)
    print(res_ac.best_score_, res_ac.best_params_, res_ac.cv_results_['std_test_score'][res_ac.best_index_])
    print('TEST accuracy: ', accuracy_score(res_ac.best_estimator_.predict(X_test), y_test))
    print(res_f1.best_score_, res_f1.best_params_, res_f1.cv_results_['std_test_score'][res_f1.best_index_])
    print('TEST f1: ', f1_score(res_f1.best_estimator_.predict(X_test), y_test, average='micro'))

In [None]:
%%time
params = {
    'model__C' : np.logspace(-2, 2, 10),
    'model__kernel' : ['linear', 'poly', 'rbf'],
    'model__gamma' : np.logspace(-2, 2, 10)
}
calc(SVC(), params)

0.8233563896251915 {'model__C': 4.6415888336127775, 'model__gamma': 0.01, 'model__kernel': 'linear'} 0.004572130718817808
TEST accuracy:  0.8282387190684134
0.8211014738250964 {'model__C': 4.6415888336127775, 'model__gamma': 0.01, 'model__kernel': 'linear'} 0.021035146628719522
TEST f1:  0.8282387190684134
CPU times: user 2h 8min 33s, sys: 12 s, total: 2h 8min 45s
Wall time: 2h 8min 35s


In [None]:
%%time
params = {
    'model__C' : np.logspace(-2, 2, 10),
    'model__penalty' : ['l2', 'none']
}
calc(LogisticRegression(), params)

0.8301561658589123 {'model__C': 100.0, 'model__penalty': 'l2'} 0.010953289940674609
TEST accuracy:  0.8277535177098496
0.8223907440228786 {'model__C': 35.93813663804626, 'model__penalty': 'l2'} 0.008329934479417603
TEST f1:  0.8282387190684134
CPU times: user 36min 25s, sys: 23min 29s, total: 59min 54s
Wall time: 32min 45s


In [None]:
%%time
params = {
    'model__n_estimators' : [100, 200, 400, 800],
    'model__max_depth' : [5, 20, None],
    'model__min_samples_split' : [2, 8, 32],
    'model__min_samples_leaf' : [1, 4, 16] 
}
calc(RandomForestClassifier(), params)

0.8191576349786429 {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 800} 0.015466587861485385
TEST accuracy:  0.8233867054827754
0.8191576349786429 {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 400} 0.012297956004606212
TEST f1:  0.8238719068413392
CPU times: user 32min 26s, sys: 38.4 s, total: 33min 4s
Wall time: 33min 3s


In [None]:
%%time
params = {
    'model__n_estimators' : [5, 10, 20],
    'model__max_depth' : [1, 2, 4],
    'model__learning_rate' : [0.01,0.1,1] 
}
calc(GradientBoostingClassifier(), params)

0.7424878836833603 {'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__n_estimators': 20} 0.017667263263528615
TEST accuracy:  0.7530325084910238
0.7444259715273095 {'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__n_estimators': 20} 0.01267680336415881
TEST f1:  0.750121300339641
CPU times: user 54min 22s, sys: 17.2 s, total: 54min 39s
Wall time: 54min 31s


# Without preprocessing

In [None]:
path = 'snippets.csv'
dml = read_data(path)

ALL SNIPPETS:  7947 UNIQUE:  7839
WITHOUT NANS:  7947
CLEAR DATA MARKS 5371
CLEAR DATA CLASSES 5152


In [None]:
import re
from typing import List, Union

def preprocess_snippet(snippet: str, format='list') -> Union[str, List[str]]:
    """Performs string preprocessing."""
    return list(snippet.split())

In [None]:
def ind(el):
  return list(all_classes).index(el)

dml['code_block'] = dml['code_block'].apply(preprocess_snippet)
dml = shuffle(dml, random_state=42)
X = dml[['code_block']]
y = dml['graph_vertex_id'].apply(ind)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [None]:
%%time
params = {
    'model__C' : np.logspace(-2, 2, 10),
    'model__kernel' : ['linear', 'poly', 'rbf'],
    'model__gamma' : np.logspace(-2, 2, 10)
}
calc(SVC(), params)

0.4545472131164683 {'model__C': 100.0, 'model__gamma': 0.027825594022071243, 'model__kernel': 'rbf'} 0.009677837279327571
TEST accuracy:  0.47064531780688984
0.4532626482843714 {'model__C': 4.6415888336127775, 'model__gamma': 0.01, 'model__kernel': 'linear'} 0.0221946189281898
TEST f1:  0.47016011644832606
CPU times: user 1h 41min 33s, sys: 7.22 s, total: 1h 41min 40s
Wall time: 1h 41min 38s


In [None]:
%%time
params = {
    'model__C' : np.logspace(-2, 2, 10),
    'model__penalty' : ['l2', 'none']
} 
calc(LogisticRegression(), params)

0.46458009839442466 {'model__C': 100.0, 'model__penalty': 'l2'} 0.012325150833644062
TEST accuracy:  0.4754973313925279
0.4619848278097568 {'model__C': 100.0, 'model__penalty': 'l2'} 0.011582858042331124
TEST f1:  0.4754973313925279
CPU times: user 1h 2min 5s, sys: 53min 23s, total: 1h 55min 29s
Wall time: 1h 2min 19s


In [20]:
%%time
params = {
    'model__n_estimators' : [100, 200, 400, 800],
    'model__max_depth' : [5, 20, None],
    'model__min_samples_split' : [2, 8, 32],
    'model__min_samples_leaf' : [1, 4, 16] 
}
calc(RandomForestClassifier(), params)

0.4280225439298168 {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 8, 'model__n_estimators': 400} 0.012700644979203497
TEST accuracy:  0.44347404172731686
0.4289965546266815 {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100} 0.01566675061942488
TEST f1:  0.43619602134885976
CPU times: user 33min 1s, sys: 39.3 s, total: 33min 40s
Wall time: 33min 50s


In [21]:
%%time
params = {
    'model__n_estimators' : [5, 10, 20],
    'model__max_depth' : [1, 2, 4],
    'model__learning_rate' : [0.01,0.1,1]   
}
calc(GradientBoostingClassifier(), params)

0.38207778492296274 {'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__n_estimators': 20} 0.008556993264833444
TEST accuracy:  0.40950994662785056
0.3846678273235357 {'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__n_estimators': 20} 0.009225484327261137
TEST f1:  0.3993207180980107
CPU times: user 44min 33s, sys: 16.9 s, total: 44min 50s
Wall time: 44min 55s
