# Predicting tags for the given code
### Simple models (SVM, LogReg, Naive Bayes, LGBM)
Usage: Define constants and run all to get the result

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy

from sklearn.metrics import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

import pickle
import dagshub

In [2]:
def load_code_blocks(DATASET_PATH, CODE_COLUMN):
    df = pd.read_csv(DATASET_PATH, encoding='utf-8', comment='#', sep=',')#, quoting=csv.QUOTE_NONE, error_bad_lines=False)#, sep=','
    print("Data of {} shape has been loaded".format(df.shape))
    print(df.columns)
    code_blocks = df[CODE_COLUMN]
    # test_size = 0.1
    # test_rows = round(df.shape[0]*test_size)
    # train_rows = df.shape[0] - test_rows
    # train_code_blocks = df[CODE_COLUMN][0:test_rows]
    # test_code_blocks = df[CODE_COLUMN][train_rows:]
    return df, code_blocks

In [3]:
def tfidf_fit_transform(code_blocks, params, TFIDF_DIR):
    vectorizer = TfidfVectorizer(params)
    tfidf = vectorizer.fit(code_blocks)
    pickle.dump(tfidf, open("tfidf.pickle", "wb"))
    code_blocks_tfidf = tfidf.transform(code_blocks)
    print("tfidf trained, saved and transformed")
    return code_blocks_tfidf

In [4]:
def tfidf_transform(code_blocks, params, TFIDF_DIR):
    tfidf = pickle.load(open(TFIDF_DIR, 'rb'))
    code_blocks_tfidf = tfidf.transform(code_blocks.astype(str))
    print("tfidf loaded and transformed")
    return code_blocks_tfidf

In [5]:
def SVM_evaluate(df, code_blocks, tfidf_params, TFIDF_DIR, SVM_params):
    code_blocks_tfidf = tfidf_fit_transform(code_blocks, tfidf_params, TFIDF_DIR)
    X_train, X_test, y_train, y_test = train_test_split(code_blocks_tfidf, df[TAGS_TO_PREDICT], test_size=0.3)
    # grid = {"C": [100]}
    # cv = KFold(n_splits=2, shuffle=True, random_state=241)
    # model = SVC(kernel="linear", random_state=241)
    # gs = GridSearchCV(model, grid, scoring="accuracy", cv=cv, verbose=1, n_jobs=-1)
    # gs.fit(X_train[:25000], y_train.ravel()[:25000])
    # C = gs.best_params_.get('C')
    model = SVC(**SVM_params)
    # model.set_params(SVM_params)
    print(model.get_params())
    model.fit(X_train, y_train.ravel())
    # a faster option:
    # 1: usage of BagginClassifier decreased the fitting time from 38 mins to 8
    # n_estimators = 10
    clf = model
    # clf = BaggingClassifier(model, max_samples=1.0 / n_estimators, n_estimators=n_estimators)
    # clf.fit(X_train, y_train.ravel())
    pickle.dump(clf, open(MODEL_DIR, 'wb'))
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    metrics = {'test_accuracy': accuracy
               , 'test_f1_score': f1}
    plot_confusion_matrix(model, X_test, y_test)
    return metrics

In [39]:
def get_metrics(X, y, TAGS_TO_PREDICT, MODEL_DIR):
    clf = pickle.load(open(MODEL_DIR, 'rb'))
    print("the model has been loaded")
    # result = loaded_model.score(X, y)
    y_pred = clf.predict(X)
    print("predictions were calculated")
    accuracy = clf.score(X, y)
    f1 = f1_score(y_pred, y, average='weighted')
    print(f'Mean Accuracy {round(accuracy*100, 2)}%')
    print(f'F1-score {round(f1*100, 2)}%')
    # errors = y - y_pred
    # plt.hist(errors)
    # plot_precision_recall_curve(clf, X, y)
    # plot_confusion_matrix(clf, X, y, values_format='d')
    # def mean_confidence_interval(data, confidence=0.95):
    #     a = 1.0 * np.array(data)
    #     n = len(a)
    #     m, se = np.mean(a), scipy.stats.sem(a)
    #     h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    #     return m, m-h, m+h
    # conf_interval = mean_confidence_interval(errors, 0.95)
    # print(conf_interval)
    metrics = {'test_accuracy': accuracy
               , 'test_f1_score': f1
            #    , 'conf_interval': conf_interval
               }
    return X, y, y_pred, metrics
    print(result)

In [40]:
def get_predictions(X, TAGS_TO_PREDICT, MODEL_DIR):
    clf = pickle.load(open(MODEL_DIR, 'rb'))
    print("the model has been loaded")
    y_pred = clf.predict(X)
    print("predictions were calculated")
    return y_pred

# Constants

In [53]:
GRAPH_VER = 5
MODEL = 'svm'
CHUNK_SIZE = 10
DATASET_PATH = './data/golden_884_set.csv'
MODEL_DIR = './models/{}_regex_graph_v{}.sav'.format(MODEL, GRAPH_VER)
TFIDF_DIR = './models/tfidf_{}_graph_v{}.pickle'.format(MODEL, GRAPH_VER)
CODE_COLUMN = 'code_block'
TAGS_TO_PREDICT = ['import', 'data_import', 'data_export', 'preprocessing',
                    'visualization', 'model', 'deep_learning_model', 'train', 'predict']
SCRIPT_DIR = './predict_tag.ipynb'

### Validation or Evaluation

In [54]:
if __name__ == '__main__':
    TASK = 'model validation'
    df, code_blocks = load_code_blocks(DATASET_PATH, CODE_COLUMN)
    nrows = df.shape[0]
    print("loaded")
    tfidf_params = {'min_df': 5
            , 'max_df': 0.3
            , 'smooth_idf': True}
    SVM_params = {'C': 100
            , 'kernel': "linear"
            , 'verbose': 1
            , 'random_state': 241}
    meta = {'DATASET_PATH': DATASET_PATH
           ,'MODEL_DIR': MODEL_DIR
           ,'TFIDF_DIR': TFIDF_DIR
           ,'nrows': nrows
           ,'label': TAGS_TO_PREDICT
           ,'model': MODEL
           ,'graph_ver': GRAPH_VER
           ,'script_dir': SCRIPT_DIR
           ,'task': TASK}
    code_blocks_tfidf = tfidf_transform(code_blocks, tfidf_params, TFIDF_DIR)
    with dagshub.dagshub_logger() as logger:
        # metrics = SVM_evaluate(df, code_blocks, tfidf_params, TFIDF_DIR, SVM_params)
        _, y, y_pred, metrics = get_metrics(code_blocks_tfidf, df[TAGS_TO_PREDICT], TAGS_TO_PREDICT, MODEL_DIR)
        logger.log_hyperparams(meta)
        # logger.log_hyperparams(tfidf_params)
        # logger.log_hyperparams(SVM_params)
        logger.log_metrics(metrics)
    print("finished")

Data of (884, 19) shape has been loaded
Index(['code_block', 'import', 'data_import', 'data_export', 'preprocessing',
       'visualization', 'model', 'deep_learning_model', 'train', 'predict',
       'import_regex_v5', 'data_import_regex_v5', 'data_export_regex_v5',
       'preprocessing_regex_v5', 'visualization_regex_v5', 'model_regex_v5',
       'deep_learning_model_regex_v5', 'train_regex_v5', 'predict_regex_v5'],
      dtype='object')
loaded
tfidf loaded and transformed
the model has been loaded
predictions were calculated
Mean Accuracy -6.49%
F1-score 58.6%
finished


In [33]:
y

Unnamed: 0,import,data_import,data_export,preprocessing,visualization,model,deep_learning_model,train,predict
0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,1
4,1,0,0,0,1,0,0,0,0
5,1,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0
9,1,0,0,0,1,0,0,0,0


In [55]:
for i, tag in enumerate(TAGS_TO_PREDICT):
    tag_results = (round(f1_score(y.iloc[:, i], y_pred[:, i], average='weighted'),4),\
                    round(precision_score(y.iloc[:, i], y_pred[:, i], average='weighted'),4),\
                    round(recall_score(y.iloc[:, i], y_pred[:, i], average='weighted'),4))
    print(tag)
    print(tag_results)
    print('------')

import
(0.9931, 0.9933, 0.9932)
------
data_import
(0.9491, 0.956, 0.9536)
------
data_export
(0.8824, 0.9206, 0.9129)
------
preprocessing
(0.7175, 0.7124, 0.724)
------
visualization
(0.8225, 0.8346, 0.8156)
------
model
(0.8654, 0.8915, 0.8846)
------
deep_learning_model
(0.899, 0.9464, 0.8654)
------
train
(0.9526, 0.9547, 0.9514)
------
predict
(0.9189, 0.9212, 0.9299)
------


### Validation (different chunk_sizes)

In [10]:
# if __name__ == '__main__':
#     all_metrics = []
#     chunk_sizes = [5, 10, 15, 20, 25, 30, 40]
#     for i in chunk_sizes:
#         df, corpus = load_code_blocks(DATASET_PATH, CODE_COLUMN)
#         nrows = df.shape[0]
#         print("loaded")
#         params = {'min_df': 5
#                 , 'max_df': 0.3
#                 , 'smooth_idf': True}
#         meta = {'DATASET_PATH': DATASET_PATH
#               ,'MODEL_DIR': MODEL_DIR
#               ,'TFIDF_DIR': TFIDF_DIR
#               ,'nrows': nrows
#               ,'label': TAG_TO_PREDICT
#               ,'model': MODEL
#               ,'graph_ver': GRAPH_VER
#               ,'script_dir': SCRIPT_DIR
#               ,'task': TASK}
#         features = tfidf_transform(corpus, params, TFIDF_DIR)
#         print("tfidf-ed")
#         # metrics = logreg_evaluate(df, features, TAG_TO_PREDICT)
#         _, _, _, metrics = get_metrics(features, df[TAG_TO_PREDICT], TAG_TO_PREDICT, MODEL_DIR)
#         all_metrics.append(metrics)
#         # with dagshub.dagshub_logger() as logger:
#         #     logger.log_hyperparams(meta)
#         #     logger.log_hyperparams(params)
#         #     logger.log_metrics(metrics)
#         print("finished")

In [11]:
# all_acc = [all_metrics[i]['test_accuracy'] for i in range(len(all_metrics))]
# all_f1 = [all_metrics[i]['test_f1_score'] for i in range(len(all_metrics))]
# plt.scatter(chunk_sizes, all_acc)
# plt.scatter(chunk_sizes, all_f1)

## Errors Analysis

In [12]:
# VAL_CHUNK_SIZE = 10
# VAL_CODE_COLUMN = 'code'
# VAL_TAG_TO_PREDICT = 'tag'
# VAL_DATASET_PATH = './data/chunks_{}_validate.csv'.format(VAL_CHUNK_SIZE)

In [13]:
# def analyze_predictions(X, y, TAG_TO_PREDICT, MODEL_DIR):
#     clf = pickle.load(open(MODEL_DIR, 'rb'))
#     # result = loaded_model.score(X, y)
#     y_pred = clf.predict(X)
#     accuracy = accuracy_score(y_pred, y)
#     f1 = f1_score(y_pred, y, average='weighted')
#     print(f'Mean Accuracy {round(accuracy*100, 2)}%')
#     print(f'F1-score {round(f1*100, 2)}%')
#     errors = y - y_pred
#     plt.hist(errors)
#     plot_precision_recall_curve(clf, X, y)
#     plot_confusion_matrix(clf, X, y, values_format='d')
#     def mean_confidence_interval(data, confidence=0.95):
#         a = 1.0 * np.array(data)
#         n = len(a)
#         m, se = np.mean(a), scipy.stats.sem(a)
#         h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
#         return m, m-h, m+h
#     conf_interval = mean_confidence_interval(errors, 0.95)
#     print(conf_interval)
#     metrics = {'test_accuracy': accuracy
#                , 'test_f1_score': f1}
#     return X, y, y_pred
# if __name__ == '__main__':
#     df, corpus = load_code_blocks(VAL_DATASET_PATH, VAL_CODE_COLUMN)
#     nrows = df.shape[0]
#     print("loaded")
#     params = {'min_df': 5
#              , 'max_df': 0.3
#              , 'smooth_idf': True}
#     features = tfidf_transform(corpus, params, TFIDF_DIR)
#     print("tfidf-ed")
#     _, _, y_pred = analyze_predictions(features, df[VAL_TAG_TO_PREDICT], VAL_TAG_TO_PREDICT, MODEL_DIR)
#     print("finished")
#     df[PREDICT_COL] = y_pred

In [14]:
# _, y, y_pred, _ = get_metrics(features, df[VAL_TAG_TO_PREDICT], VAL_TAG_TO_PREDICT, MODEL_DIR)

In [15]:
# pd.set_option('max_colwidth', 500)
# pd.set_option('max_rows', 500)

In [16]:
# ## False Negatives (preprocessing y/n)
# df[(df[VAL_TAG_TO_PREDICT] == 1)&(df[PREDICT_COL] == 0)][VAL_CODE_COLUMN].to_csv('./model_interpretation_results/errors_analysis/{}_validation_{}_chunksize_{}_conf_matrix_FN.txt'.format(MODEL, GRAPH_VER, VAL_CHUNK_SIZE), index=False)

In [17]:
## False Positives (preprocessing y/n)
# df[(df[VAL_TAG_TO_PREDICT] == 0)&(df[PREDICT_COL] == 1)][VAL_CODE_COLUMN].to_csv('./model_interpretation_results/errors_analysis/{}_validation_{}_chunksize_{}_conf_matrix_FP.txt'.format(MODEL, GRAPH_VER, VAL_CHUNK_SIZE), index=False)

In [18]:
## True Negatives (preprocessing y/n)
# df[(df[VAL_TAG_TO_PREDICT] == 0)&(df[PREDICT_COL] == 0)][VAL_CODE_COLUMN].to_csv('./model_interpretation_results/errors_analysis/{}_validation_{}_chunksize_{}_conf_matrix_TN.txt'.format(MODEL, GRAPH_VER, VAL_CHUNK_SIZE), index=False)

In [19]:
# ## True Positives (preprocessing y/n)
# df[(df[VAL_TAG_TO_PREDICT] == 1)&(df[PREDICT_COL] == 1)][VAL_CODE_COLUMN].to_csv('./model_interpretation_results/errors_analysis/{}_validation_{}_chunksize_{}_conf_matrix_TP.txt'.format(MODEL, GRAPH_VER, VAL_CHUNK_SIZE), index=False)

### Inference

In [20]:
# if __name__ == '__main__':
#     # GRAPH_VER = 5
#     # MODEL = 'logreg'
#     # CHUNK_SIZE = 10
#     # DATASET_PATH = './data/github_chunks_{}.csv'.format(CHUNK_SIZE)
#     CODE_COLUMN = '0'
#     OUTPUT_DATASET_PATH = './data/golden_{}_{}_v{}.csv'.format(CHUNK_SIZE, MODEL, GRAPH_VER)
#     MODEL_DIR = './models/{}_regex_graph_v{}.sav'.format(MODEL, GRAPH_VER)
#     TFIDF_DIR = './models/tfidf_{}_graph_v{}.pickle'.format(MODEL, GRAPH_VER)

#     df, code_blocks = load_code_blocks(DATASET_PATH, CODE_COLUMN)
#     df.dropna(axis=0, inplace=True)

#     print("loaded")
#     tfidf_params = {'min_df': 5
#                     , 'max_df': 0.3
#                     , 'smooth_idf': True}
                    
#     code_blocks_tfidf = tfidf_transform(code_blocks, tfidf_params, TFIDF_DIR)
#     y_pred = get_predictions(code_blocks_tfidf, TAGS_TO_PREDICT, MODEL_DIR)
    
#     for i, tag in enumerate(TAGS_TO_PREDICT):
#         df[tag+''] = pd.Series(y_pred[:, i])
#     print("finished")
#     df.to_csv(OUTPUT_DATASET_PATH, index=False)
#     print("saved") 