In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from keras.models import Sequential
from keras.layers import Dense
import keras.backend as K
from tqdm import tqdm_notebook
import warnings
import numpy as np
import pandas as pd
import os

In [2]:
warnings.filterwarnings('ignore')

In [3]:
base_path = '../../../Dodge/data/defect/'

file_dic = {"ivy": ["ivy-1.1.csv", "ivy-1.4.csv", "ivy-2.0.csv"],
            "lucene": ["lucene-2.0.csv", "lucene-2.2.csv", "lucene-2.4.csv"],
            "poi": ["poi-1.5.csv", "poi-2.0.csv", "poi-2.5.csv", "poi-3.0.csv"],
            "synapse": ["synapse-1.0.csv", "synapse-1.1.csv", "synapse-1.2.csv"],
            "velocity": ["velocity-1.4.csv", "velocity-1.5.csv", "velocity-1.6.csv"],
            "camel": ["camel-1.0.csv", "camel-1.2.csv", "camel-1.4.csv", "camel-1.6.csv"],
            "jedit": ["jedit-3.2.csv", "jedit-4.0.csv", "jedit-4.1.csv", "jedit-4.2.csv", "jedit-4.3.csv"],
            "log4j": ["log4j-1.0.csv", "log4j-1.1.csv", "log4j-1.2.csv"],
            "xalan": ["xalan-2.4.csv", "xalan-2.5.csv", "xalan-2.6.csv", "xalan-2.7.csv"],
            "xerces": ["xerces-1.2.csv", "xerces-1.3.csv", "xerces-1.4.csv"]
           }

In [59]:
# from https://gist.github.com/wassname/ce364fddfc8a025bfab4348cf5de852d
def weighted_categorical_crossentropy(weights):
    """
    A weighted version of keras.objectives.categorical_crossentropy
    
    Variables:
        weights: numpy array of shape (C,) where C is the number of classes
    
    Usage:
        weights = np.array([0.5,2,10]) # Class one at 0.5, class 2 twice the normal weights, class 3 10x.
        loss = weighted_categorical_crossentropy(weights)
        model.compile(loss=loss,optimizer='adam')
    """
    
    weights = K.variable(weights)
        
    def loss(y_true, y_pred):
        return K.mean(
            K.binary_crossentropy(y_true, y_pred) * weights)
    
    return loss

In [84]:
def DL(frac):
    n_layers = np.random.randint(1, 11)
    n_units = np.random.randint(1, 21)
    
    model = Sequential()
    
    for i in range(n_layers):
        model.add(Dense(n_units, activation='relu'))
    
    model.add(Dense(1, activation='sigmoid'))
    
    weights = np.array([10./frac, 1.])
    
    model.compile(loss=weighted_categorical_crossentropy(weights), optimizer='sgd')
    return model

In [85]:
# from https://stackoverflow.com/questions/30564015/how-to-generate-random-points-in-a-circular-distribution
def fuzz_data(X, y, radii=(0., 1, .1)):
    idx = np.where(y == 1)[0]
    frac = len(idx) * 1. / len(y)
    print('debug: weight =', 1./frac)
    
    fuzzed_x = []
    fuzzed_y = []
    
    for row in X[idx]:
        for i, r in enumerate(np.arange(*radii)):
            for j in range(int((1./frac) / pow(np.sqrt(2), i))):
                fuzzed_x.append([val - r for val in row])
                fuzzed_x.append([val + r for val in row])
                fuzzed_y.append(1)
                fuzzed_y.append(1)
    
    return np.concatenate((X, np.array(fuzzed_x)), axis=0), np.concatenate((y, np.array(fuzzed_y)))

In [86]:
def committee(filename):
    paths = [os.path.join(base_path, file_name) for file_name in file_dic[filename]]
    train_df = pd.concat([pd.read_csv(path) for path in paths[:-1]], ignore_index=True)
    test_df = pd.read_csv(paths[-1])
    
    train_df, test_df = train_df.iloc[:, 3:], test_df.iloc[:, 3:]
    train_size = train_df["bug"].count()
    df = pd.concat([train_df, test_df], ignore_index=True)
    df['bug'] = df['bug'].apply(lambda x: 0 if x == 0 else 1)
    
    train_data = df.iloc[:train_size, :]
    test_data = df.iloc[train_size:, :]
    
    X_train = np.array(train_data[train_data.columns[:-2]])
    y_train = np.array(train_data['bug'])
    X_test = np.array(test_data[test_data.columns[:-2]])
    y_test = np.array(test_data['bug'])
    
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train)
    #X_train, y_train = fuzz_data(X_train, y_train)
    
    print('Fuzzed data. Number of samples =', len(X_train))
    
    frac = sum(y_train) * 1.0 / len(y_train)
    
    committee = [RandomForestClassifier(), RandomForestClassifier(), RandomForestClassifier(), 
                 KNeighborsClassifier(n_neighbors=2), KNeighborsClassifier(n_neighbors=3),
                 KNeighborsClassifier(n_neighbors=4), KNeighborsClassifier(n_neighbors=5),
                 LogisticRegression(), DL(frac), DL(frac), DL(frac), DL(frac), DL(frac), DL(frac), DL(frac),
                 SVC(probability=True), GaussianNB()]
    
    pb = tqdm_notebook(total=len(committee))
    for learner in committee:
        if learner.__class__.__name__ == 'Sequential':
            learner.fit(X_train, y_train, epochs=50, verbose=0)
        else:
            learner.fit(X_train, y_train)
        pb.update(1)
        
    print('Trained committee.')
    
    preds = []
    for learner in committee:
        if learner.__class__.__name__ == 'Sequential':
            preds.append(learner.evaluate(X_val, y_val))
        else:
            preds.append(learner.score(X_val, y_val))
    
    print('Evaluated committee:', preds)
        
    test_preds = []
    
    pbar = tqdm_notebook(total=len(X_test))
    for sample in X_test: 
        classified = False
        most_conf = -1
        most_conf_idx = -1
        for index in reversed(np.argsort(preds)):            
            prob = committee[index].predict_proba(sample.reshape(1, -1))[0][0]
            
            if prob > most_conf:
                most_conf = prob
                most_conf_idx = index
            
            if prob > 0.4 and prob < 0.5:
                continue
        
        if not classified:
            if committee[most_conf_idx].__class__.__name__ == 'Sequential':
                test_preds.append(int(committee[most_conf_idx].predict_classes(sample.reshape(1,-1))))
            else:
                test_preds.append(committee[most_conf_idx].predict(sample.reshape(1,-1))[0])
        pbar.update(1)
    
    test_preds = np.array(test_preds)
    print(y_test)
    print(test_preds.squeeze())
    print(classification_report(y_test, test_preds.squeeze()))
    return test_preds

In [87]:
committee('ivy')

Fuzzed data. Number of samples = 264


HBox(children=(IntProgress(value=0, max=17), HTML(value='')))

Trained committee.
Evaluated committee: [0.7386363636363636, 0.7386363636363636, 0.7613636363636364, 0.7159090909090909, 0.7159090909090909, 0.7386363636363636, 0.75, 0.7727272727272727, nan, 14.81309647993608, 13.448060382496227, 14.073207161643289, 13.41886754469438, nan, nan, 0.75, 0.7954545454545454]


HBox(children=(IntProgress(value=0, max=352), HTML(value='')))

[1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,