In [104]:
import numpy as np
import pandas as pd
import os
from collections import Counter
import matplotlib.pyplot as plt
import statistics

import time
import warnings

# From https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
# and https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17
from sklearn.model_selection import train_test_split
from keras.layers import Dense, BatchNormalization, Dropout
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import label_binarize
from keras.utils import to_categorical
from keras.models import Sequential
from keras.optimizers import SGD

from random import randrange, choice
from sklearn.neighbors import NearestNeighbors
import statistics

from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score, recall_score, accuracy_score
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from keras import backend as K
import tensorflow as tf

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# from https://gist.github.com/wassname/ce364fddfc8a025bfab4348cf5de852d
def weighted_categorical_crossentropy(weights):
    """
    A weighted version of keras.objectives.categorical_crossentropy
    
    Variables:
        weights: numpy array of shape (C,) where C is the number of classes
    
    Usage:
        weights = np.array([0.5,2,10]) # Class one at 0.5, class 2 twice the normal weights, class 3 10x.
        loss = weighted_categorical_crossentropy(weights)
        model.compile(loss=loss,optimizer='adam')
    """
    
    weights = K.variable(weights)
        
    def loss(y_true, y_pred):
        return K.mean(
            K.binary_crossentropy(y_true, y_pred) * weights)
    
    return loss

In [11]:
base_path = '../../../Dodge/data/defect/'

In [5]:
file_dic = {"ivy": ["ivy-1.1.csv", "ivy-1.4.csv", "ivy-2.0.csv"],
            "lucene": ["lucene-2.0.csv", "lucene-2.2.csv", "lucene-2.4.csv"],
            "poi": ["poi-1.5.csv", "poi-2.0.csv", "poi-2.5.csv", "poi-3.0.csv"],
            "synapse": ["synapse-1.0.csv", "synapse-1.1.csv", "synapse-1.2.csv"],
            "velocity": ["velocity-1.4.csv", "velocity-1.5.csv", "velocity-1.6.csv"],
            "camel": ["camel-1.0.csv", "camel-1.2.csv", "camel-1.4.csv", "camel-1.6.csv"],
            "jedit": ["jedit-3.2.csv", "jedit-4.0.csv", "jedit-4.1.csv", "jedit-4.2.csv", "jedit-4.3.csv"],
            "log4j": ["log4j-1.0.csv", "log4j-1.1.csv", "log4j-1.2.csv"],
            "xalan": ["xalan-2.4.csv", "xalan-2.5.csv", "xalan-2.6.csv", "xalan-2.7.csv"],
            "xerces": ["xerces-1.2.csv", "xerces-1.3.csv", "xerces-1.4.csv"]
           }

In [108]:
# fit model on dataset
def fit_model(trainX, trainy):
    frac = sum(trainy) * 1.0 / len(trainy)
    weights = np.array([1., 10. / frac])
    
    # define model
    model = Sequential()
    model.add(Dense(20, input_shape=(trainX.shape[1],), activation='relu'))
    
    n_layers = np.random.randint(2, 5)
    for i in range(n_layers):
        model.add(Dense(20, activation='relu'))
        if np.random.random(1) <= 0.5:
            model.add(BatchNormalization())
        if np.random.random(1) <= 0.5:
            model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss=weighted_categorical_crossentropy(weights), optimizer='adam', metrics=['accuracy'])
    # fit model
    model.fit(trainX, trainy, epochs=20, verbose=0)
    return model
 
# make an ensemble prediction for multi-class classification
def ensemble_predictions(members, testX):
    # make predictions
    yhats = [model.predict(testX) for model in members]
    yhats = np.array(yhats)
    # sum across ensemble members
    summed = np.sum(yhats, axis=0)
    # argmax across classes
    result = np.argmax(summed, axis=1)
    return result
 
# evaluate a specific number of members in an ensemble
def evaluate_n_members(members, n_members, testX, testy):
    # select a subset of members
    subset = members[:n_members]
    print(len(subset))
    # make prediction
    yhat = ensemble_predictions(subset, testX)
    # calculate accuracy
    return accuracy_score(testy, yhat)

In [19]:
# From https://stats.stackexchange.com/a/217753
def SMOTE(T, N, k):
    """
    Returns (N/100) * n_minority_samples synthetic minority samples.

    Parameters
    ----------
    T : array-like, shape = [n_minority_samples, n_features]
        Holds the minority samples
    N : percetange of new synthetic samples: 
        n_synthetic_samples = N/100 * n_minority_samples. Can be < 100.
    k : int. Number of nearest neighbours. 

    Returns
    -------
    S : array, shape = [(N/100) * n_minority_samples, n_features]
    """    
    n_minority_samples, n_features = T.shape

    if N < 100:
        #create synthetic samples only for a subset of T.
        #TODO: select random minortiy samples
        N = 100
        pass

    if (N % 100) != 0:
        raise ValueError("N must be < 100 or multiple of 100")

    N = int(N // 100)
    n_synthetic_samples = N * n_minority_samples
    S = np.zeros(shape=(n_synthetic_samples, n_features))

    #Learn nearest neighbours
    neigh = NearestNeighbors(n_neighbors = k)
    neigh.fit(T)

    #Calculate synthetic samples
    for i in range(n_minority_samples):
        nn = neigh.kneighbors(T[i].reshape(1,-1), return_distance=False)
        for n in range(N):
            nn_index = choice(nn[0])
            #NOTE: nn includes T[i], we don't want to select it 
            while nn_index == i:
                nn_index = choice(nn[0])

            dif = T[nn_index] - T[i]
            gap = np.random.random()
            S[n + i * N, :] = T[i,:] + gap * dif[:]

    return S

In [24]:
def run_on_dataset(filename, epochs=10, layers=4, weighted=True):
    paths = [os.path.join(base_path, file_name) for file_name in file_dic[filename]]
    train_df = pd.concat([pd.read_csv(path) for path in paths[:-1]], ignore_index=True)
    test_df = pd.read_csv(paths[-1])
    
    train_df, test_df = train_df.iloc[:, 3:], test_df.iloc[:, 3:]
    train_size = train_df["bug"].count()
    df = pd.concat([train_df, test_df], ignore_index=True)
    df['bug'] = df['bug'].apply(lambda x: 0 if x == 0 else 1)
    
    train_data = df.iloc[:train_size, :]
    test_data = df.iloc[train_size:, :]
    
    X_train = np.array(train_data[train_data.columns[:-2]])
    y_train = np.array(train_data['bug'])
    X_test = np.array(test_data[test_data.columns[:-2]])
    y_test = np.array(test_data['bug'])
    
    frac = sum(y_train) * 1.0 / len(y_train)
    if weighted:
        weights = np.array([1., 10. / frac])
    else:
        weights = np.array([1., 1.])
                
    model = Sequential()
    model.add(Dense(20, input_shape=(X_train.shape[1],), activation='relu', name='layer1'))
    
    for i in range(layers - 2):
        model.add(Dense(20, activation='relu', name='layer'+str(i+2)))
        model.add(BatchNormalization())
        
    model.add(Dense(1, activation='sigmoid', name='layer'+str(layers)))
    model.compile(loss=weighted_categorical_crossentropy(weights), optimizer='adam', metrics=['accuracy'])

    batch_size = 64

    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1, verbose=0, callbacks=[EarlyStopping(monitor='val_loss', patience=20, min_delta=0.0001)])
    
    y_pred = model.predict_classes(X_test)
    
    print('Recall:', recall_score(y_test, y_pred))
    print('AUC:', roc_auc_score(y_test, y_pred))
    
    synthetic_data = SMOTE(X_train, 500, 5)
    preds = model.predict_classes(synthetic_data)
    
    student = DecisionTreeClassifier()
    student.fit(synthetic_data, preds)
    
    student_preds = student.predict(X_test)
    print('Recall:', recall_score(y_test, student_preds))
    print('AUC:', roc_auc_score(y_test, student_preds))

In [34]:
run_on_dataset('ivy', layers=5, epochs=10)

Recall: 0.675
AUC: 0.7445512820512821
Recall: 0.725
AUC: 0.7631410256410257


## Next step: Ensemble!

In [50]:
def generate_rubbish_map(*args, **kwargs):
    return {
        "n_layers": 5,
        "n_epochs": 10
    }

In [77]:
def run_ensemble_on_dataset(filename, epochs=10, layers=4, weighted=True):
    paths = [os.path.join(base_path, file_name) for file_name in file_dic[filename]]
    train_df = pd.concat([pd.read_csv(path) for path in paths[:-1]], ignore_index=True)
    test_df = pd.read_csv(paths[-1])
    
    train_df, test_df = train_df.iloc[:, 3:], test_df.iloc[:, 3:]
    train_size = train_df["bug"].count()
    df = pd.concat([train_df, test_df], ignore_index=True)
    df['bug'] = df['bug'].apply(lambda x: 0 if x == 0 else 1)
    
    train_data = df.iloc[:train_size, :]
    test_data = df.iloc[train_size:, :]
    
    X_train = np.array(train_data[train_data.columns[:-2]])
    y_train = np.array(train_data['bug'])
    X_test = np.array(test_data[test_data.columns[:-2]])
    y_test = np.array(test_data['bug'])
                
    n_members = 10
    members = [fit_model(X_train, y_train) for _ in range(n_members)]

    batch_size = 64

    y_pred = ensemble_predictions(members, X_test)
    
    print('Recall:', recall_score(y_test, y_pred))
    print('AUC:', roc_auc_score(y_test, y_pred))
    
    synthetic_data = SMOTE(X_train, 500, 5)
    preds = ensemble_predictions(members, synthetic_data)
    
    student = DecisionTreeClassifier()
    student.fit(synthetic_data, preds)
    
    student_preds = student.predict(X_test)
    print('Recall:', recall_score(y_test, student_preds))
    print('AUC:', roc_auc_score(y_test, student_preds))

In [78]:
run_ensemble_on_dataset('ivy')

Recall: 0.0
AUC: 0.5
Recall: 0.0
AUC: 0.5


## Distillation to forest

The naive ensemble didn't work

In [109]:
def run_forest_on_dataset(filename, epochs=10, layers=4, weighted=True):
    paths = [os.path.join(base_path, file_name) for file_name in file_dic[filename]]
    train_df = pd.concat([pd.read_csv(path) for path in paths[:-1]], ignore_index=True)
    test_df = pd.read_csv(paths[-1])
    
    train_df, test_df = train_df.iloc[:, 3:], test_df.iloc[:, 3:]
    train_size = train_df["bug"].count()
    df = pd.concat([train_df, test_df], ignore_index=True)
    df['bug'] = df['bug'].apply(lambda x: 0 if x == 0 else 1)
    
    train_data = df.iloc[:train_size, :]
    test_data = df.iloc[train_size:, :]
    
    X_train = np.array(train_data[train_data.columns[:-2]])
    y_train = np.array(train_data['bug'])
    X_test = np.array(test_data[test_data.columns[:-2]])
    y_test = np.array(test_data['bug'])
                
    n_members = 21
    members = [fit_model(X_train, y_train) for _ in range(n_members)]
    
    synthetic_data = SMOTE(X_train, 1000, 3)
    preds = np.array([model.predict_classes(synthetic_data) for model in members])
    
    students = [DecisionTreeClassifier()] * n_members
    for learner, pred in zip(students, preds):
        learner.fit(synthetic_data, pred)
        
    student_preds = np.array([learner.predict_proba(X_test) for learner in students])
    student_preds = np.apply_along_axis(np.argmax, 2, student_preds)
    student_preds = np.apply_along_axis(statistics.mean, 0, student_preds)
    print('Recall:', recall_score(y_test, student_preds))
    print('AUC:', roc_auc_score(y_test, student_preds))

In [110]:
run_forest_on_dataset('ivy')

Recall: 0.075
AUC: 0.5150641025641026
