In [1]:
import pandas as pd
import re
from nltk.tokenize import RegexpTokenizer
import numpy as np
import matplotlib.pyplot as plt
import sys

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier

import dagshub

In [2]:
def load_corpus(DATASET_PATH, CODE_COLUMN):
    """Read the dataset CSV and extract the text column to be vectorized.

    Args:
        DATASET_PATH: Path (or file-like object) of the CSV to read as UTF-8.
        CODE_COLUMN: Name of the column returned as the corpus.

    Returns:
        tuple: ``(df, corpus)`` where ``df`` is the full DataFrame and
        ``corpus`` is ``df[CODE_COLUMN]``.

    Raises:
        KeyError: If ``CODE_COLUMN`` is not a column of the loaded frame.
    """
    # comment='#' tells pandas to treat '#' as a comment marker while parsing.
    # NOTE(review): confirm this does not truncate code blocks containing '#'.
    df = pd.read_csv(DATASET_PATH, encoding='utf-8', comment='#')
    # No split is done here: the previous positional train/test slices were
    # never returned (and the train slice used the test row count).
    # SVM_evaluate performs the actual split with train_test_split.
    corpus = df[CODE_COLUMN]
    return df, corpus

In [3]:
def tfidf_transform(corpus, params):
    """Fit a TF-IDF vectorizer on ``corpus`` and return the resulting features.

    Args:
        corpus: Iterable of text documents to vectorize.
        params: Mapping of ``TfidfVectorizer`` keyword arguments (for example
            ``min_df`` / ``max_df``). ``None`` or an empty mapping keeps the
            defaults below; supplied keys override them.

    Returns:
        The matrix returned by ``TfidfVectorizer.fit_transform(corpus)``.
    """
    # Baseline settings. Previously these were the only settings ever applied
    # because `params` was accepted but never used.
    vectorizer_kwargs = {'ngram_range': (1, 2), 'smooth_idf': True}
    vectorizer_kwargs.update(params or {})
    # The settings must be unpacked as keywords: a dict passed positionally is
    # not interpreted as vectorizer options.
    vectorizer = TfidfVectorizer(**vectorizer_kwargs)
    return vectorizer.fit_transform(corpus)

In [6]:
def SVM_evaluate(df, features, label_column=None, random_state=241):
    """Tune, fit and score a bagged linear SVM on the given features.

    Steps: hold out 30% of the rows for testing, grid-search ``C`` with
    5-fold CV on the first 25000 training rows, fit a bagging ensemble of
    linear SVCs with the best ``C`` on the full training split, then score it
    on the test split and draw a confusion matrix.

    Args:
        df: DataFrame holding the label column; rows must align with
            ``features``.
        features: Feature matrix (e.g. the output of ``tfidf_transform``).
        label_column: Name of the label column in ``df``. When ``None`` the
            module-level ``TAG_TO_PREDICT`` is used, as before.
        random_state: Seed used for the split, CV shuffling and estimators so
            a rerun reproduces the same numbers.

    Returns:
        dict: ``{'test_accuracy': ..., 'test_f1_score': ...}``.
    """
    if label_column is None:
        # Backward-compatible fallback to the constant set by the driver cell.
        label_column = TAG_TO_PREDICT

    X_train, X_test, y_train, y_test = train_test_split(
        features, df[label_column], test_size=0.3, random_state=random_state
    )
    y_train = y_train.to_numpy()

    # Grid search only sees a prefix of the training split to bound its cost.
    grid_search_rows = 25000
    grid = {"C": [1, 10, 100]}
    cv = KFold(n_splits=5, shuffle=True, random_state=random_state)
    gs = GridSearchCV(
        SVC(kernel="linear", random_state=random_state),
        grid,
        scoring="accuracy",
        cv=cv,
        verbose=1,
        n_jobs=-1,
    )
    gs.fit(X_train[:grid_search_rows], y_train[:grid_search_rows])
    best_C = gs.best_params_['C']

    # Each bagged SVC trains on 1/n_estimators of the data. Per the original
    # author's note this cut fitting time from 38 to 8 minutes versus a
    # single SVC on the full training split.
    n_estimators = 10
    clf = BaggingClassifier(
        SVC(C=best_C, kernel='linear', random_state=random_state),
        max_samples=1.0 / n_estimators,
        n_estimators=n_estimators,
        random_state=random_state,
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # The result dict must not be named `metrics`: binding that name anywhere
    # in the function makes it local and hides the sklearn.metrics module,
    # which raised UnboundLocalError on the first scoring call.
    # NOTE(review): f1_score uses its default averaging, which presumes a
    # binary label -- confirm for the chosen label column.
    scores = {
        'test_accuracy': metrics.accuracy_score(y_test, y_pred),
        'test_f1_score': metrics.f1_score(y_test, y_pred),
    }

    # Plot from the fitted ensemble's predictions (the grid-search template
    # SVC was never fitted). ConfusionMatrixDisplay is used instead of the
    # plot_confusion_matrix helper, which newer scikit-learn no longer ships.
    cm = metrics.confusion_matrix(y_test, y_pred, labels=clf.classes_)
    metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_).plot()
    return scores

In [None]:
if __name__ == '__main__':
    # Driver: load the CSV, build TF-IDF features, evaluate the SVM and log
    # the run (metadata, TF-IDF settings, metrics) through the DagsHub logger.
    DATASET_PATH = './data/code_blocks_regex.csv'
    CODE_COLUMN = 'code_block'
    # Read as a module-level name by SVM_evaluate to pick the label column.
    TAG_TO_PREDICT = 'preprocessing'

    df, corpus = load_corpus(DATASET_PATH, CODE_COLUMN)
    nrows = df.shape[0]
    print("loaded")

    tfidf_params = {
        'min_df': 5,
        'max_df': 0.3,
        'smooth_idf': True,
    }
    data_meta = {
        'DATASET_PATH': DATASET_PATH,
        'nrows': nrows,
        'label': TAG_TO_PREDICT,
        'model': 'SVM',
    }
    features = tfidf_transform(corpus, tfidf_params)
    print("tfidf-ed")

    with dagshub.dagshub_logger() as logger:
        # Do not call the result `metrics`: at notebook scope that would
        # rebind the imported sklearn `metrics` module.
        eval_metrics = SVM_evaluate(df, features)
        logger.log_hyperparams(data_meta)
        # Log the TF-IDF settings passed to tfidf_transform. This previously
        # referenced an undefined name `params` and raised NameError after
        # the whole fit had finished.
        logger.log_hyperparams(tfidf_params)
        logger.log_metrics(eval_metrics)
    print("finished")

loaded
tfidf-ed
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
