In [3]:
import numpy as np
import pandas as pd
import os
from collections import Counter
import matplotlib.pyplot as plt
import statistics

import time
import warnings

# From https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
# and https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17
from sklearn.model_selection import train_test_split
from keras.layers import Dense, BatchNormalization
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import label_binarize
from keras.utils import to_categorical
from keras.models import Sequential
from keras.optimizers import SGD
from sklearn.neighbors import LocalOutlierFactor

from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score, recall_score
from keras import backend as K
import tensorflow as tf

In [2]:
warnings.filterwarnings('ignore')

In [4]:
def get_auc(actual, preds, classes):
    """
    ROC AUC computed from binarized true labels and binarized predictions.

    Variables:
        actual: true labels
        preds: predicted labels (the values are binarized, not used as scores)
        classes: list of all label values, e.g. [0, 1]

    Returns:
        The value of roc_auc_score on the two binarized label arrays.
    """
    # `classes` is keyword-only in current scikit-learn releases; passing it
    # by name also works on older releases that accepted it positionally.
    return roc_auc_score(label_binarize(actual, classes=classes),
                         label_binarize(preds, classes=classes))

In [5]:
def get_fpr(actual, preds):
    """
    False positive rate, fp / (tn + fp), for binary labels 0 and 1.

    Returns 0 when there are no actual negatives (tn + fp == 0).
    """
    # labels=[0,1] pins the matrix to 2x2; row 0 holds the actual negatives.
    matrix = confusion_matrix(actual, preds, labels=[0,1])
    tn, fp = matrix[0, 0], matrix[0, 1]

    negatives = tn + fp
    if negatives == 0:
        return 0

    return fp * 1.0 / negatives

In [9]:
def get_recall(true):
    """
    Cumulative recall curve over a sequence of binary labels.

    Variables:
        true: iterable of labels; entries equal to 1 count as positives

    Returns:
        A list with one float per input entry: the fraction of all positives
        seen up to and including that position. Every entry is 0.0 when the
        input contains no positives.
    """
    # Iterate over the values instead of indexing with true[i]: on a pandas
    # Series true[i] is a label lookup, which fails (or silently misaligns)
    # whenever the index does not start at 0.
    labels = list(true)
    total_true = float(sum(1 for label in labels if label == 1))

    hit = 0.0
    recall = []
    for label in labels:
        if label == 1:
            hit += 1
        recall.append(hit / total_true if total_true else 0.0)
    return recall

In [6]:
# from https://gist.github.com/wassname/ce364fddfc8a025bfab4348cf5de852d
def weighted_categorical_crossentropy(weights):
    """
    A class-weighted cross-entropy loss built on K.binary_crossentropy.

    Variables:
        weights: numpy array of shape (C,) where C is the number of classes

    Behaviour:
        * Model output with one column per class: the element-wise binary
          cross-entropy is multiplied by `weights`, so column c is scaled by
          weights[c].
        * Model output with a single column and C == 2 (labels encoded 0/1):
          each sample is scaled by weights[1] when its true label is 1 and by
          weights[0] when it is 0. Multiplying the (batch, 1) loss by the
          (2,) weight vector directly would broadcast to (batch, 2) and scale
          every sample identically, which is not a class weighting.

    Usage:
        weights = np.array([0.5,2,10]) # Class one at 0.5, class 2 twice the normal weights, class 3 10x.
        loss = weighted_categorical_crossentropy(weights)
        model.compile(loss=loss,optimizer='adam')
    """

    weights = K.variable(weights)
    # True only for a [negative_weight, positive_weight] pair.
    has_two_weights = K.int_shape(weights) == (2,)

    def loss(y_true, y_pred):
        # Make sure the arithmetic on y_true below is done in float.
        y_true = K.cast(y_true, K.dtype(y_pred))
        per_output = K.binary_crossentropy(y_true, y_pred)

        pred_shape = K.int_shape(y_pred)
        if has_two_weights and pred_shape and pred_shape[-1] == 1:
            # Single sigmoid output: pick the weight from the true label.
            per_sample = y_true * weights[1] + (1. - y_true) * weights[0]
            return K.mean(per_output * per_sample)

        return K.mean(per_output * weights)

    return loss

In [18]:
# Directory holding the defect CSV files, relative to the notebook's working
# directory; the trailing slash is part of the value.
base_path = "../../Dodge/data/defect/"

In [8]:
# Maps each project name to its release CSVs ("<project>-<version>.csv"), in
# the order listed. run_on_dataset trains on all but the last file of a
# project and tests on the last one.
file_dic = {
    project: ["{}-{}.csv".format(project, version) for version in versions]
    for project, versions in (
        ("ivy", ("1.1", "1.4", "2.0")),
        ("lucene", ("2.0", "2.2", "2.4")),
        ("poi", ("1.5", "2.0", "2.5", "3.0")),
        ("synapse", ("1.0", "1.1", "1.2")),
        ("velocity", ("1.4", "1.5", "1.6")),
        ("camel", ("1.0", "1.2", "1.4", "1.6")),
        ("jedit", ("3.2", "4.0", "4.1", "4.2", "4.3")),
        ("log4j", ("1.0", "1.1", "1.2")),
        ("xalan", ("2.4", "2.5", "2.6", "2.7")),
        ("xerces", ("1.2", "1.3", "1.4")),
    )
}

In [35]:
def get_inliers(X, y):
    """
    Keep only the rows that LocalOutlierFactor labels as inliers.

    Variables:
        X: feature rows
        y: labels, one per row of X

    Returns:
        (X_inliers, y_inliers) as numpy arrays, restricted to the rows for
        which fit_predict returned 1.
    """
    # Neighbourhood size is the integer part of sqrt(number of rows).
    k = int(np.sqrt(len(X)))
    lof = LocalOutlierFactor(n_neighbors=k, metric='euclidean')

    keep = np.flatnonzero(lof.fit_predict(X) == 1)

    return np.array(X)[keep], np.array(y)[keep]

In [36]:
def run_on_dataset(filename, metric='d2h', epochs=10, layers=4, draw_roc=False, weighted=False):
    """
    Train a small dense network on all but the last release of a project and
    evaluate it on the last release.

    Variables:
        filename: key of `file_dic`; the listed CSVs are read from `base_path`
        metric: one of 'fpr', 'recall', 'auc'. Any other value, including the
            default 'd2h' (which has no implementation here), raises
            ValueError before any data is loaded or any model is trained.
        epochs: number of training epochs
        layers: total number of Dense layers, counting the sigmoid output layer
        draw_roc: if True, print the AUC and the metric and plot a ROC curve
        weighted: if True, weight the positive class by 1 / (fraction of
            positives in the training set)

    Returns:
        (history, metric_): the Keras History object returned by fit and the
        value of the requested metric on the test release.

    Raises:
        ValueError: if `metric` is not one of the supported names.
    """
    supported_metrics = ('fpr', 'recall', 'auc')
    if metric not in supported_metrics:
        # Fail fast: previously an unknown metric only surfaced after
        # training, as an unbound `metric_` variable.
        raise ValueError('Unsupported metric %r; expected one of %s'
                         % (metric, ', '.join(supported_metrics)))

    paths = [os.path.join(base_path, file_name) for file_name in file_dic[filename]]
    train_df = pd.concat([pd.read_csv(path) for path in paths[:-1]], ignore_index=True)
    test_df = pd.read_csv(paths[-1])

    # Drop the first three columns of every file.
    train_df, test_df = train_df.iloc[:, 3:], test_df.iloc[:, 3:]
    # Row count, not Series.count(): count() skips NaN and would misplace the
    # train/test boundary below if any 'bug' value were missing.
    train_size = len(train_df)
    df = pd.concat([train_df, test_df], ignore_index=True)
    # Binarize the target: 0 stays 0, everything else becomes 1.
    df['bug'] = (df['bug'] != 0).astype(int)

    train_data = df.iloc[:train_size, :]
    test_data = df.iloc[train_size:, :]

    # NOTE(review): the last TWO columns are excluded from the features;
    # presumably 'bug' is the last one -- confirm that dropping the column
    # before it is intended.
    feature_columns = train_data.columns[:-2]
    X_train = train_data[feature_columns]
    y_train = train_data['bug']
    X_test = test_data[feature_columns]
    y_test = test_data['bug']

    # Outlier removal is applied to the training set only.
    X_train, y_train = get_inliers(X_train, y_train)

    # Fraction of positive samples left in the training set.
    frac = float(np.mean(y_train))
    if weighted and frac > 0:
        weights = np.array([1., 1. / frac])
    else:
        # Unweighted, or no positives at all (1 / frac would be undefined).
        weights = np.array([1., 1.])

    model = Sequential()
    model.add(Dense(20, input_shape=(X_train.shape[1],), activation='relu', name='layer1'))

    for i in range(layers - 2):
        model.add(Dense(20, activation='relu', name='layer'+str(i+2)))

    model.add(Dense(1, activation='sigmoid', name='layer'+str(layers)))
    model.compile(loss=weighted_categorical_crossentropy(weights), optimizer='adam', metrics=['accuracy'])

    batch_size = 64

    # With patience=20 early stopping can only trigger when epochs > 20.
    early_stopping = EarlyStopping(monitor='val_loss', patience=20, min_delta=0.0001)
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
                        validation_split=0.1, verbose=0, callbacks=[early_stopping])

    # Equivalent of Sequential.predict_classes for a single sigmoid output;
    # predict_classes is not available in recent TensorFlow/Keras releases.
    y_pred = (model.predict(X_test, verbose=0) > 0.5).astype('int32')

    if metric == 'fpr':
        metric_ = get_fpr(y_test, y_pred)
    elif metric == 'recall':
        metric_ = recall_score(y_test, y_pred)
    else:  # metric == 'auc', guaranteed by the check at the top
        metric_ = get_auc(y_test, y_pred, classes=[0,1])

    if draw_roc:
        # The curve is drawn from the hard 0/1 predictions, not from scores.
        fpr, tpr, _ = roc_curve(y_test, y_pred)
        print('AUC =', auc(fpr, tpr))
        print(metric, '=', metric_)
        plt.plot(fpr, tpr, color='darkorange')
        plt.plot([0, 1], [0, 1], color='navy', linestyle='--')

    return history, metric_

In [39]:
# For every project and every metric, train and evaluate 20 independent
# models and report the mean and the median of the metric, rounded to two
# decimals.
for dataset in file_dic:
    print(dataset)
    print('=' * len(dataset))
    for metric in ('fpr', 'recall', 'auc'):
        # run_on_dataset returns (history, metric value); keep the value only.
        values = [
            run_on_dataset(filename=dataset, metric=metric, epochs=10, layers=4)[1]
            for _run in range(20)
        ]

        print(metric, '-', np.round(np.mean(values), 2), np.round(np.median(values), 2))

    print()

ivy
===
fpr - 0.15 0.1
recall - 0.24 0.18
auc - 0.56 0.55

lucene
======
fpr - 0.44 0.43
recall - 0.6 0.59
auc - 0.58 0.58

poi
===
fpr - 0.15 0.11
recall - 0.37 0.31
auc - 0.62 0.64

synapse
=======
fpr - 0.11 0.07
recall - 0.18 0.15
auc - 0.55 0.56

velocity
========
fpr - 0.9 0.91
recall - 0.89 0.9
auc - 0.51 0.51

camel
=====
fpr - 0.04 0.03
recall - 0.1 0.06
auc - 0.54 0.53

jedit
=====
fpr - 0.19 0.19
recall - 0.31 0.36
auc - 0.58 0.6

log4j
=====
fpr - 0.25 0.22
recall - 0.29 0.25
auc - 0.54 0.55

xalan
=====
fpr - 0.05 0.09
recall - 0.25 0.24
auc - 0.62 0.62

xerces
======
fpr - 0.01 0.0
recall - 0.05 0.03
auc - 0.52 0.52

