In [1]:
import pandas as pd
import numpy as np
import pickle
from datetime import datetime

from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier

import dagshub

def load_code_blocks(DATASET_PATH, CODE_COLUMN):
    df = pd.read_csv(DATASET_PATH, encoding='utf-8', comment='#', sep=',')#, quoting=csv.QUOTE_NONE, error_bad_lines=False)#, sep=','
    print(df.head())
    code_blocks = df[CODE_COLUMN]
    # test_size = 0.1
    # test_rows = round(df.shape[0]*test_size)
    # train_rows = df.shape[0] - test_rows
    # train_code_blocks = df[CODE_COLUMN][0:test_rows]
    # test_code_blocks = df[CODE_COLUMN][train_rows:]
    return df, code_blocks

def tfidf_fit_transform(code_blocks, params, TFIDF_DIR):
    vectorizer = TfidfVectorizer(**params)
    tfidf = vectorizer.fit(code_blocks)
    pickle.dump(tfidf, open(TFIDF_DIR, "wb"))
    print('TF-IDF model has been saved')
    code_blocks_tfidf = tfidf.transform(code_blocks)
    return code_blocks_tfidf

def SVM_evaluate(df, code_blocks, tfidf_params, TFIDF_DIR, SVM_params):
    code_blocks_tfidf = tfidf_fit_transform(code_blocks, tfidf_params, TFIDF_DIR)
    X_train, X_test, y_train, y_test = train_test_split(code_blocks_tfidf, df[TAG_TO_PREDICT], test_size=0.3)
    # grid = {"C": [100]}
    # cv = KFold(n_splits=2, shuffle=True, random_state=241)
    model = SVC(kernel="linear", random_state=241)
    # gs = GridSearchCV(model, grid, scoring="accuracy", cv=cv, verbose=1, n_jobs=-1)
    # gs.fit(X_train[:25000], y_train.ravel()[:25000])
    # C = gs.best_params_.get('C')
    # model = SVC(**SVM_params)
    print("Train SVM params:", model.get_params())
    n_estimators = 10
    clf = BaggingClassifier(model, max_samples=1.0 / n_estimators, n_estimators=n_estimators)
    # clf = model
    print("starting training..")
    clf.fit(X_train, y_train)
    print("saving the model")
    pickle.dump(clf, open(MODEL_DIR, 'wb'))
    print("predicting on the test..")
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    # confus_matrix = confusion_matrix(model, X_test, y_test)
    metrics = {'test_accuracy': accuracy
            , 'test_f1_score': f1}
    print(metrics)
    return metrics

In [None]:
# if __name__ == '__main__':
#     DATASET_PATH = './data/code_blocks_regex_graph_v2.1.csv'
#     MODEL_DIR = './models/svm_regex_{}.sav'.format('graph_v2.1')
#     TFIDF_DIR = './models/tfidf_svm_graph_v2.1.pickle'
#     CODE_COLUMN = 'code_block'
#     TAG_TO_PREDICT = 'preprocessing'
#     SCRIPT_DIR = __file__
    
#     df, code_blocks = load_code_blocks(DATASET_PATH, CODE_COLUMN)
#     nrows = df.shape[0]
#     print("loaded")
#     tfidf_params = {'min_df': 5
#             , 'max_df': 0.3
#             , 'smooth_idf': True}
#     SVM_params = {'C':100
#             , 'kernel':"linear"
#             , 'random_state':241}
#     data_meta = {'DATASET_PATH': DATASET_PATH
#                 ,'nrows': nrows
#                 ,'label': TAG_TO_PREDICT
#                 ,'model': MODEL_DIR
#                 ,'source': SCRIPT_DIR}

#     with dagshub.dagshub_logger() as logger:
#         print("evaluating..")
#         metrics = SVM_evaluate(df, code_blocks, tfidf_params, TFIDF_DIR, SVM_params)
#         print("saving the results..")
#         logger.log_hyperparams(data_meta)
#         logger.log_hyperparams(tfidf_params)
#         logger.log_hyperparams(SVM_params)
#         logger.log_metrics(metrics)
#     print("finished")

In [11]:
if __name__ == '__main__':
    DATASET_PATH = './data/code_blocks_regex_graph_v2.1.csv'
    MODEL_DIR = './models/svm_regex_{}.sav'.format('graph_v2.1')
    TFIDF_DIR = './models/tfidf_svm_graph_v2.1.pickle'
    CODE_COLUMN = 'code_block'
    TAG_TO_PREDICT = 'preprocessing'
    SCRIPT_DIR = 'svm_classifier.ipynb'
    clf = pickle.load(open(MODEL_DIR, 'rb'))

In [64]:
# tfidf = pickle.load(open(TFIDF_DIR, 'rb'))
# len(tfidf.vocabulary_)

In [61]:
clf = clf.estimators_[0]
print('w = ',clf.coef_)
print('b = ',clf.intercept_)
print('Indices of support vectors = ', clf.support_)
print('Support vectors = ', clf.support_vectors_)
print('Number of support vectors for each class = ', clf.n_support_)
print('Coefficients of the support vector in the decision function = ', np.abs(clf.dual_coef_))

w =    (0, 37211)	0.035646233761817
  (0, 34323)	0.030950524230484572
  (0, 29584)	0.035646233761817
  (0, 26147)	0.029679575247906122
  (0, 24917)	0.035646233761817
  (0, 19800)	0.06721916326465512
  (0, 18078)	0.03250528522413417
  (0, 31751)	0.3790859004196737
  (0, 13342)	0.15071739978159376
  (0, 45380)	0.2723816199314875
  (0, 45377)	0.2785312498516836
  (0, 31798)	0.025936232700084456
  (0, 4482)	0.028515522724737143
  (0, 41031)	0.32591415458340406
  (0, 36801)	0.13961101849195767
  (0, 31696)	0.13961101849195767
  (0, 27633)	0.12935753551689436
  (0, 16699)	0.12774161988093802
  (0, 37329)	0.08375070222986938
  (0, 37311)	0.0830840198551464
  (0, 36534)	0.2585720973751989
  (0, 34463)	0.8537643986305498
  (0, 41406)	0.27096391350885946
  (0, 11954)	0.24268623564190253
  (0, 42145)	0.24137468565479947
  :	:
  (0, 41919)	0.1413290554429865
  (0, 40426)	-0.12128050809560675
  (0, 39987)	0.43936220103801604
  (0, 38565)	-0.15207254814850366
  (0, 38001)	-0.13588208562448223
  (0, 

In [127]:
w = clf.coef_
weights = w.toarray()[0]

In [156]:
vocab = list(tfidf.vocabulary_.keys())
vocab_freq = list(tfidf.vocabulary_.values())

In [None]:
# interpret.to_csv('svm_interpret.csv', index=False)

In [190]:
interpret.sort_values(by='weights', ascending=False)[interpret['vocab_freq'] > 48000].head(10)

Unnamed: 0,vocab,vocab_freq,weights
39589,y_test_cc_lgb,48135,4.754719
37343,y_testl11,48174,3.691251
21676,y_values,48331,2.548121
42649,y_val1,48302,0.913669
41924,y_trains,48272,0.742393
37990,y_truth_sp,48292,0.685098
35413,yy_train,48596,0.682025
35295,y_test_fatal,48146,0.657406
42438,y_sel_test,48105,0.634224
22481,yaxis_title_text,48376,0.616938


In [186]:
interpret.sort_values(by='vocab_freq', ascending=False)[interpret['weights'] > 3]

Unnamed: 0,vocab,vocab_freq,weights
37343,y_testl11,48174,3.691251
39589,y_test_cc_lgb,48135,4.754719
9856,traumatized,44062,5.444746
9434,trainset,43933,4.694664
43744,switzerland_nan,40674,5.509675
9848,storm,39953,11.267548
29650,print_results,33812,5.796592
41675,personal_yards,31962,6.011538
35928,nan_new,28789,3.898947
33641,modelu2os,27808,11.50936


In [196]:
interpret['weights']

0        0.208923
1       -0.202469
2       -0.256652
3        0.000000
4       -0.019655
           ...   
48988    0.033672
48989    0.033672
48990    0.033672
48991    0.033672
48992    0.033672
Name: weights, Length: 48993, dtype: float64

In [157]:
interpret = pd.DataFrame()
interpret['vocab'] = vocab
interpret['vocab_freq'] = vocab_freq
interpret['weights'] = weights