In [1]:
import pandas as pd
import numpy as np
import pickle
from datetime import datetime

from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier

import dagshub

def load_code_blocks(DATASET_PATH, CODE_COLUMN):
    df = pd.read_csv(DATASET_PATH, encoding='utf-8', comment='#', sep=',')#, quoting=csv.QUOTE_NONE, error_bad_lines=False)#, sep=','
    print(df.head())
    code_blocks = df[CODE_COLUMN]
    # test_size = 0.1
    # test_rows = round(df.shape[0]*test_size)
    # train_rows = df.shape[0] - test_rows
    # train_code_blocks = df[CODE_COLUMN][0:test_rows]
    # test_code_blocks = df[CODE_COLUMN][train_rows:]
    return df, code_blocks

def tfidf_fit_transform(code_blocks, params, TFIDF_DIR):
    vectorizer = TfidfVectorizer(**params)
    tfidf = vectorizer.fit(code_blocks)
    pickle.dump(tfidf, open(TFIDF_DIR, "wb"))
    print('TF-IDF model has been saved')
    code_blocks_tfidf = tfidf.transform(code_blocks)
    return code_blocks_tfidf

def SVM_evaluate(df, code_blocks, tfidf_params, TFIDF_DIR, SVM_params):
    code_blocks_tfidf = tfidf_fit_transform(code_blocks, tfidf_params, TFIDF_DIR)
    X_train, X_test, y_train, y_test = train_test_split(code_blocks_tfidf, df[TAG_TO_PREDICT], test_size=0.3)
    # grid = {"C": [100]}
    # cv = KFold(n_splits=2, shuffle=True, random_state=241)
    model = SVC(kernel="linear", random_state=241)
    # gs = GridSearchCV(model, grid, scoring="accuracy", cv=cv, verbose=1, n_jobs=-1)
    # gs.fit(X_train[:25000], y_train.ravel()[:25000])
    # C = gs.best_params_.get('C')
    # model = SVC(**SVM_params)
    print("Train SVM params:", model.get_params())
    n_estimators = 10
    clf = BaggingClassifier(model, max_samples=1.0 / n_estimators, n_estimators=n_estimators)
    # clf = model
    print("starting training..")
    clf.fit(X_train, y_train)
    print("saving the model")
    pickle.dump(clf, open(MODEL_DIR, 'wb'))
    print("predicting on the test..")
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    # confus_matrix = confusion_matrix(model, X_test, y_test)
    metrics = {'test_accuracy': accuracy
            , 'test_f1_score': f1}
    print(metrics)
    return metrics

In [2]:
# if __name__ == '__main__':
#     DATASET_PATH = './data/code_blocks_regex_graph_v2.1.csv'
#     MODEL_DIR = './models/svm_regex_{}.sav'.format('graph_v2.1')
#     TFIDF_DIR = './models/tfidf_svm_graph_v2.1.pickle'
#     CODE_COLUMN = 'code_block'
#     TAG_TO_PREDICT = 'preprocessing'
#     SCRIPT_DIR = __file__
    
#     df, code_blocks = load_code_blocks(DATASET_PATH, CODE_COLUMN)
#     nrows = df.shape[0]
#     print("loaded")
#     tfidf_params = {'min_df': 5
#             , 'max_df': 0.3
#             , 'smooth_idf': True}
#     SVM_params = {'C':100
#             , 'kernel':"linear"
#             , 'random_state':241}
#     data_meta = {'DATASET_PATH': DATASET_PATH
#                 ,'nrows': nrows
#                 ,'label': TAG_TO_PREDICT
#                 ,'model': MODEL_DIR
#                 ,'source': SCRIPT_DIR}

#     with dagshub.dagshub_logger() as logger:
#         print("evaluating..")
#         metrics = SVM_evaluate(df, code_blocks, tfidf_params, TFIDF_DIR, SVM_params)
#         print("saving the results..")
#         logger.log_hyperparams(data_meta)
#         logger.log_hyperparams(tfidf_params)
#         logger.log_hyperparams(SVM_params)
#         logger.log_metrics(metrics)
#     print("finished")

In [4]:
if __name__ == '__main__':
    GRAPH_VER = 3
    SCRIPT_DIR = 'svm_classifier.ipynb'
    CODE_COLUMN = 'code_block'
    TAG_TO_PREDICT = 'preprocessing'
    
    DATASET_PATH = './data/code_blocks_regex_graph_v{}.csv'.format(GRAPH_VER)
    MODEL_DIR = './models/svm_regex_graph_v{}.sav'.format(GRAPH_VER)
    TFIDF_DIR = './models/tfidf_svm_graph_v{}.pickle'.format(GRAPH_VER)
    clf = pickle.load(open(MODEL_DIR, 'rb'))

In [5]:
tfidf = pickle.load(open(TFIDF_DIR, 'rb'))
len(tfidf.vocabulary_)

48993

In [6]:
clf = clf.estimators_[0]
print('w = ',clf.coef_)
print('b = ',clf.intercept_)
print('Indices of support vectors = ', clf.support_)
print('Support vectors = ', clf.support_vectors_)
print('Number of support vectors for each class = ', clf.n_support_)
print('Coefficients of the support vector in the decision function = ', np.abs(clf.dual_coef_))

w =    (0, 12322)	0.41240137688069894
  (0, 45677)	0.14447384058117496
  (0, 45601)	0.07223692029058748
  (0, 19960)	0.07080398660761765
  (0, 9337)	0.07309440252163847
  (0, 44372)	0.026009618046510273
  (0, 43491)	0.0727909900758762
  (0, 43089)	0.026596843885727774
  (0, 23432)	0.052019236093020546
  (0, 21654)	0.20610744607782908
  (0, 21596)	0.05391407030537303
  (0, 15939)	0.024389997805882472
  (0, 9454)	0.025150385673394008
  (0, 9208)	0.025150385673394008
  (0, 6883)	0.10403847218604109
  (0, 37327)	0.9537033802756059
  (0, 11996)	0.5180699643103804
  (0, 11940)	0.273153987376577
  (0, 40734)	0.4509928638812952
  (0, 19900)	0.41823889126148917
  (0, 37172)	0.5023584794032238
  (0, 45792)	0.061292530946072905
  (0, 28278)	0.11433629896972944
  (0, 20692)	0.059038576772007086
  (0, 19357)	0.1898257913090543
  :	:
  (0, 48166)	-0.01991314818584338
  (0, 47410)	-0.036440586355025166
  (0, 47249)	-0.017982636811067463
  (0, 46042)	0.20371293186857614
  (0, 43231)	-0.045110268001248

In [7]:
w = clf.coef_
weights = w.toarray()[0]

In [8]:
vocab = list(tfidf.vocabulary_.keys())
vocab_freq = list(tfidf.vocabulary_.values())

In [10]:
interpret = pd.DataFrame()
interpret['vocab'] = vocab
interpret['vocab_freq'] = vocab_freq
interpret['weights'] = weights

In [17]:
interpret.to_csv('./model_interpretation/svm_interpret_v{}.csv'.format(GRAPH_VER), index=False)

In [12]:
interpret.sort_values(by='weights', ascending=False)[interpret['vocab_freq'] > 48000].head(10)

Unnamed: 0,vocab,vocab_freq,weights
42649,y_val1,48302,5.496884
37343,y_testl11,48174,3.836949
39589,y_test_cc_lgb,48135,3.516085
21676,y_values,48331,1.580852
41962,y_trfm,48278,1.015931
12312,y_pred_in,48013,0.956726
27964,y_pred_prob_yes,48026,0.940454
42648,y_train4,48189,0.895213
282,zeros,48648,0.88244
21766,ys_train,48565,0.77785


In [13]:
interpret.sort_values(by='vocab_freq', ascending=False)[interpret['weights'] > 3]

Unnamed: 0,vocab,vocab_freq,weights
42649,y_val1,48302,5.496884
37343,y_testl11,48174,3.836949
39589,y_test_cc_lgb,48135,3.516085
9856,traumatized,44062,3.271442
9434,trainset,43933,3.736686
43744,switzerland_nan,40674,9.559917
9848,storm,39953,9.684152
41675,personal_yards,31962,4.383437
35928,nan_new,28789,4.212826
33641,modelu2os,27808,8.978572
