In [10]:
import numpy as np
import pandas as pd
import os
from collections import Counter
import matplotlib.pyplot as plt
import statistics
from CFS import cfs
import NTK
import time
import warnings
import tools
from imblearn.over_sampling import SMOTE

# From https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
# and https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score, recall_score, classification_report

In [2]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below, so API removals will surface as hard errors instead.
warnings.filterwarnings('ignore')

In [3]:
# Directory containing the defect CSV files; run_on_dataset joins it with the
# file names listed in file_dic. The path is relative to the notebook's
# working directory.
base_path = '../ghost-dl/data/defect/'

In [4]:
# Project name -> CSV file names, listed in ascending version order.
# run_on_dataset trains on every file but the last and tests on the last one.
# Each entry below is "<project>", "<space-separated versions>"; the file
# names are assembled as "<project>-<version>.csv".
file_dic = {
    project: [f"{project}-{version}.csv" for version in versions.split()]
    for project, versions in (
        ("ivy", "1.1 1.4 2.0"),
        ("lucene", "2.0 2.2 2.4"),
        ("poi", "1.5 2.0 2.5 3.0"),
        ("synapse", "1.0 1.1 1.2"),
        ("velocity", "1.4 1.5 1.6"),
        ("camel", "1.0 1.2 1.4 1.6"),
        ("jedit", "3.2 4.0 4.1 4.2 4.3"),
        ("log4j", "1.0 1.1 1.2"),
        ("xalan", "2.4 2.5 2.6 2.7"),
        ("xerces", "1.2 1.3 1.4"),
    )
}

In [5]:
# Hyperparameter grid used by run_on_dataset.
# MAX_DEP is passed to NTK.kernel_value_batch when the kernels are computed.
MAX_DEP = 3
# Depth indices 0 .. MAX_DEP-1 iterated by the grid search.
DEP_LIST = list(range(MAX_DEP))
# Candidate cost values: powers of ten from 10^-2 to 10^2.
C_LIST = [10.0 ** i for i in range(-2, 3)]
# Classifier routine called in run_on_dataset with the train/validation kernel
# blocks, the labels and a cost value; its first two return values are used
# there as (predictions, accuracy).
alg = tools.svm

In [17]:
def run_on_dataset(filename, metric='d2h', epochs=10, layers=4, draw_roc=False, weighted=False,
                   random_state=None):
    """Grid-search a kernel classifier on one project and print its test report.

    Every file listed for ``filename`` in ``file_dic`` except the last is
    concatenated into the training set; the last file is the test set.
    Features are selected with ``cfs``, the training set is oversampled with
    SMOTE, kernels are computed by ``NTK.kernel_value_batch`` over the stacked
    train+test rows, and ``alg`` is evaluated for every (depth, fixed depth, C)
    combination from ``DEP_LIST`` / ``C_LIST``.  The best combination is then
    re-run and its ``classification_report`` is printed.

    Parameters
    ----------
    filename : str
        Key into ``file_dic`` (for example ``'ivy'``).
    metric, epochs, layers, draw_roc, weighted
        Not used by this function; kept so existing callers keep working.
    random_state : int or None, optional
        Seed forwarded to ``SMOTE``.  The default ``None`` keeps the previous
        (unseeded) behaviour; pass an integer for reproducible oversampling.

    Returns
    -------
    None
        Results are printed only.

    Raises
    ------
    ValueError
        If the grid search evaluated no configuration (empty ``DEP_LIST`` or
        ``C_LIST``).

    Notes
    -----
    NOTE(review): the hyperparameters are chosen by accuracy on the test
    release itself, so the printed scores are optimistic.  Consider selecting
    them on a validation split taken from the training releases.
    """
    paths = [os.path.join(base_path, file_name) for file_name in file_dic[filename]]
    train_df = pd.concat([pd.read_csv(path) for path in paths[:-1]], ignore_index=True)
    test_df = pd.read_csv(paths[-1])

    # Drop the first three columns.
    # NOTE(review): presumably identifier columns (name/version) - confirm.
    train_df, test_df = train_df.iloc[:, 3:], test_df.iloc[:, 3:]

    # Use the row count: Series.count() skips NaN and would move the
    # train/test boundary if any 'bug' value were missing.
    train_size = len(train_df)

    df = pd.concat([train_df, test_df], ignore_index=True)
    # Binarise the label: 0 stays 0, anything else becomes 1.
    df['bug'] = (df['bug'] != 0).astype(int)

    train_data = df.iloc[:train_size, :]
    test_data = df.iloc[train_size:, :]

    # NOTE(review): [:-2] excludes the last TWO columns. If 'bug' is the last
    # column this also drops the feature right before it - confirm intended.
    X_train = train_data[train_data.columns[:-2]]
    y_train = train_data['bug']

    # cfs presumably returns positional indices of the selected features
    # (TODO confirm). Flatten to 1-D so the Index lookup does not depend on
    # how NumPy interprets a list-wrapped index array.
    selected = np.asarray(cfs(X_train.values, y_train.values)).ravel()
    cols = X_train.columns[selected].tolist()
    X_train = X_train[cols]

    X_test = test_data[cols]
    y_test = test_data['bug']

    # Oversample the training set only. fit_resample replaces the removed
    # fit_sample alias.
    sm = SMOTE(random_state=random_state)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    # Works whether the sampler returned a DataFrame or an ndarray.
    X_train = np.asarray(X_train)

    # Row positions of the (resampled) training and the test samples inside
    # the stacked matrix passed to the kernel computation.
    n_train = len(X_train)
    train_fold = list(range(n_train))
    val_fold = list(range(n_train, n_train + len(X_test)))

    Ks = NTK.kernel_value_batch(np.vstack((X_train, X_test)), MAX_DEP)

    # Labels in the same row order as the stacked matrix.
    y_all = np.hstack((y_train, y_test))

    print(Ks.shape, len(train_fold), len(val_fold))
    print(y_all.shape)

    def _evaluate(kernel, cost):
        # Train on the train/train kernel block and predict on the val/train
        # block. NOTE(review): the trailing 2 is presumably the number of
        # classes - confirm against tools.svm.
        return alg(kernel[train_fold][:, train_fold], kernel[val_fold][:, train_fold],
                   y_all[train_fold], y_all[val_fold], cost, 2)

    # Start below any reachable accuracy so the first evaluated configuration
    # is always recorded; later ones replace it only when strictly better.
    best_acc = float('-inf')
    best_value = None
    best_dep = 0
    best_fix = 0

    # enumerate kernels and cost values to find the best hyperparameters
    for dep in DEP_LIST:
        print('depth:', dep)
        for fix_dep in range(dep + 1):
            print('|\tfix_dep:', fix_dep)
            K = Ks[dep][fix_dep]
            for value in C_LIST:
                print('|\t|\tC:', value, flush=True)
                preds, acc = _evaluate(K, value)
                if acc > best_acc:
                    best_acc = acc
                    best_value = value
                    best_dep = dep
                    best_fix = fix_dep

    if best_value is None:
        raise ValueError("grid search evaluated no configuration; check DEP_LIST and C_LIST")

    K = Ks[best_dep][best_fix]

    print ("best acc:", best_acc, "\tC:", best_value, "\tdep:", best_dep, "\tfix:", best_fix, flush=True)
    print(classification_report(y_test, _evaluate(K, best_value)[0]))

In [18]:
# Run the full pipeline for the 'ivy' project: train on every listed ivy file
# except the last one and evaluate on the last one.
run_on_dataset('ivy')

(3, 3, 898, 898) 546 352
(898,)
depth: 0
|	fix_dep: 0
|	|	C: 0.01
|	|	C: 0.1
|	|	C: 1.0
|	|	C: 10.0
|	|	C: 100.0
depth: 1
|	fix_dep: 0
|	|	C: 0.01
|	|	C: 0.1
|	|	C: 1.0
|	|	C: 10.0
|	|	C: 100.0
|	fix_dep: 1
|	|	C: 0.01
|	|	C: 0.1
|	|	C: 1.0
|	|	C: 10.0
|	|	C: 100.0
depth: 2
|	fix_dep: 0
|	|	C: 0.01
|	|	C: 0.1
|	|	C: 1.0
|	|	C: 10.0
|	|	C: 100.0
|	fix_dep: 1
|	|	C: 0.01
|	|	C: 0.1
|	|	C: 1.0
|	|	C: 10.0
|	|	C: 100.0
|	fix_dep: 2
|	|	C: 0.01
|	|	C: 0.1
|	|	C: 1.0
|	|	C: 10.0
|	|	C: 100.0
best acc: 0.8409090909090909 	C: 0.01 	dep: 0 	fix: 0
              precision    recall  f1-score   support

           0       0.94      0.87      0.91       312
           1       0.38      0.60      0.46        40

    accuracy                           0.84       352
   macro avg       0.66      0.74      0.68       352
weighted avg       0.88      0.84      0.86       352

