In [26]:
import itertools
import time

import numpy as np
import pandas as pd

from keras import backend as K
from keras.callbacks import LearningRateScheduler
from keras.layers import BatchNormalization, Dense, Activation
from keras.models import Sequential

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score, precision_score, recall_score

In [2]:
# Show the kernel's working directory; the relative data path defined in a
# later cell ('../../data/...') is resolved against it.
!pwd

/Users/ryedida/Desktop/menzies/dnn-less-data/code


In [10]:
# Data path
# Directory holding the defect-prediction CSV files, relative to the
# notebook's working directory. The trailing slash matters: the file name is
# appended by plain string concatenation when the dataset is read.
base_path = (
    '../../data/Change-Level-Prediction-Data-20191107T052353Z-001/Change-Level-Prediction-Data/ICSE-2016-PROMISE DefectData/'
)

# CSV file used as the training set.
train_data_fname = 'ant-1.5.csv'

In [11]:
# Read dataset
# The full path is the directory prefix immediately followed by the file name.
train_df = pd.read_csv(f'{base_path}{train_data_fname}')

In [12]:
# Parameters for deep learners
#   batch_size  - mini-batch size used when fitting the Keras models and when
#                 estimating the Lipschitz constant for the LR schedule
#   num_classes - number of target classes, used in the Lipschitz constant
batch_size, num_classes = 128, 2

In [13]:
# Get X and y
# Features: every column except the 'bug' label and the
# 'name' / 'version' / 'name.1' columns.
x_train = train_df.drop(columns=['bug', 'name', 'version', 'name.1'])

# Target: the 'bug' column.
y_train = train_df['bug']

In [14]:
# Remove the rows whose 'bug' value is greater than 1.
#
# The rows are selected by index *label* rather than by position: `drop`
# is label-based, so positions from np.where would only be correct while the
# index happens to be 0..n-1.
idx = y_train.index[y_train > 1]
print(len(idx))
print(len(x_train))

# Rebind instead of dropping in place: y_train was taken from train_df, and
# in-place mutation of such a Series can trigger pandas' chained-assignment
# warnings.
x_train = x_train.drop(index=idx)
y_train = y_train.drop(index=idx)

3
293


## Helper functions for neural nets

In [15]:
def parabola(x):
    """Return x raised to the second power (x squared)."""
    return pow(x, 2)

In [18]:
def get_model(units=2, n_layers=1, deep_learner=False, batchnorm=False, activation='relu'):
    """
    Returns an untrained model.
    
    Params:
    =======
    units: int - Number of input features; also used as the width of every
        hidden layer. Only for deep learners.
    n_layers: int - Number of hidden layers (at least one is always built).
        Only for deep learners.
    deep_learner: boolean - To use deep learner, set to True.
    batchnorm: boolean - If True, adds Batch Normalization after every hidden
        Dense layer.
    activation: str or callable - Activation of the hidden layers. Either a
        name Keras can resolve, a callable, or the name of one of this
        notebook's custom activations ('parabola').
    
    Returns:
    ========
    An uncompiled keras Sequential (hidden layers, then Dense(1) followed by
    a sigmoid Activation), or a default AdaBoostClassifier from sklearn.
    """
    if not deep_learner:
        return AdaBoostClassifier()

    # Experiment configs may carry the layer count as a string (e.g. '1').
    n_layers = int(n_layers)

    # Keras can only turn a string into an activation if it knows the name
    # (built-in or registered custom object); the notebook's own activation
    # functions are therefore resolved to callables here.
    custom_activations = {'parabola': parabola}
    if isinstance(activation, str):
        activation = custom_activations.get(activation, activation)

    layers = []

    # One Dense -> [BatchNormalization] -> Activation set per hidden layer.
    # max(..., 1): the first hidden layer set is built unconditionally.
    for layer_no in range(max(n_layers, 1)):
        if layer_no == 0:
            # Only the first layer declares the input shape.
            layers.append(Dense(units, input_shape=(units,)))
        else:
            layers.append(Dense(units))

        if batchnorm:
            layers.append(BatchNormalization())

        layers.append(Activation(activation))

    # Output layer. Dense and sigmoid are kept as two separate layers so that
    # model.layers[-2] is the pre-sigmoid Dense layer.
    layers.append(Dense(1))
    layers.append(Activation('sigmoid'))

    return Sequential(layers)

In [20]:
def resample(x, y, n_desired):
    """
    Picks n_desired samples from x and y, trying to maintain a balanced dataset
    and oversampling when necessary.
    
    floor(n_desired / 2) samples are drawn from class 0 and the remaining
    ceil(n_desired / 2) from class 1. Sampling is with replacement, so a class
    with fewer members than requested is oversampled.
    
    Args:
    =====
    x, y: dataset (array-likes of equal length); y holds the labels 0 and 1.
    n_desired: int - total number of samples to return.
    
    Returns:
    ========
    (x_picked, y_picked): np.ndarray pair; all class-0 picks come first,
    followed by all class-1 picks.
    
    Raises:
    =======
    ValueError if samples are requested from a class that has no members.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    n_desired = int(n_desired)

    # Integer arithmetic: np.random.choice rejects the floats that
    # np.floor / np.ceil return as a sample count.
    n_class0 = n_desired // 2
    n_class1 = n_desired - n_class0

    picks = []
    for label, n_needed in ((0, n_class0), (1, n_class1)):
        class_idx = np.where(y == label)[0]
        if n_needed > 0 and len(class_idx) == 0:
            raise ValueError(
                'Cannot draw %d samples of class %d: no such samples in y'
                % (n_needed, label))
        picks.append(np.random.choice(class_idx, n_needed))

    pick_idx = np.concatenate(picks)
    return x[pick_idx], y[pick_idx]

In [21]:
def run_one_experiment(x_resampled, y_resampled, x_test, y_test, deep_learner=False, n_layers=1, batchnorm=False, lipschitz_lr=False,
                       reduce_data=False, embedding_dims=2, k=7, oversample=False, oversampler=None,
                       activation='relu'):
    """
    Runs one experiment: builds a model, trains it on the (already resampled)
    training data and scores it on the test data.
    
    Params:
    =======
    x_resampled, y_resampled: np.array - Training data.
    x_test, y_test: np.array - Test data; also passed to Keras as validation
        data during training.
    deep_learner: boolean - If True, uses a deep learner, else AdaBoost.
    n_layers: int - Number of hidden layers. Deep learners only.
    batchnorm: boolean - If True, uses Batch Normalization in deep learner.
    lipschitz_lr: boolean - If True, trains with SGD and the LipschitzLR
        schedule; otherwise trains with Adam. Deep learners only.
    activation: str or callable - Hidden-layer activation. Deep learners only.
    reduce_data, embedding_dims, k, oversample, oversampler: accepted so that a
        full experiment config can be passed through, but not used here.
    
    Returns:
    ========
    prec, recall, f1, time - Precision, recall, and F-1 scores of the model, along with
    the training time in seconds.
    """

    # Step 3: Get the model
    model = get_model(units=x_resampled.shape[1], n_layers=n_layers,
                      deep_learner=deep_learner, batchnorm=batchnorm,
                      activation=activation)

    # Step 4: Train model
    if deep_learner:
        fit_kwargs = {
            'batch_size': batch_size,
            'verbose': 0,
            'validation_data': (x_test, y_test),
            'epochs': 100,
        }

        if lipschitz_lr:
            model.compile(loss='binary_crossentropy', optimizer='sgd')

            # Maps the network input to the output of the second-to-last
            # layer of the model.
            func = K.function([model.layers[0].input], [model.layers[-2].output])

            ####################
            # LipschitzLR code #
            ####################
            def lr_schedule(epoch):
                """Learning Rate Schedule
                # Arguments
                    epoch (int): The number of epochs
                # Returns
                    lr (float32): learning rate, 1 / K_ with
                        K_ = (num_classes - 1) * Kz / (num_classes * batch_size)
                        where Kz is the largest norm of `func`'s output over
                        the mini-batches of the training data.
                """
                Kz = 0.
                n_batches = (len(x_resampled) - 1) // batch_size + 1
                for i in range(n_batches):
                    xb = x_resampled[i * batch_size:(i + 1) * batch_size]
                    Kz = max(Kz, np.linalg.norm(func([xb])))

                K_ = ((num_classes - 1) * Kz) / (num_classes * batch_size)
                # If every batch norm is 0, Kz is still the Python float 0.0
                # and this raises ZeroDivisionError, aborting the fit.
                return 1 / K_
            ####################

            fit_kwargs['callbacks'] = [LearningRateScheduler(lr_schedule)]
        else:
            model.compile(loss='binary_crossentropy', optimizer='adam')

        start = time.perf_counter()
        model.fit(x_resampled, y_resampled, **fit_kwargs)
        end = time.perf_counter()
    else:
        start = time.perf_counter()
        model.fit(x_resampled, y_resampled)
        end = time.perf_counter()

    # Step 5: Evaluate model
    if deep_learner:
        # Threshold the sigmoid output at 0.5. This is the replacement for
        # Sequential.predict_classes, which newer Keras releases removed.
        y_pred = (model.predict(x_test, verbose=0) > 0.5).astype('int32').ravel()
    else:
        y_pred = model.predict(x_test)

    prec, recall, f1 = [score(y_test, y_pred)
                        for score in (precision_score, recall_score, f1_score)]

    return prec, recall, f1, (end - start)

In [22]:
def run_20_experiments(reduce_data=False, n_desired=100,
                       embedding_dims=2, k=7, lipschitz_lr=False, activation='relu',
                       max_retries=50, **kwargs):
    """
    Trains and evaluates the same configuration 20 times and prints the
    per-run precision, recall, F1 and training time.
    
    The data is taken from the notebook-level variables x_train, y_train,
    x_test and y_test, which are read but not modified.
    NOTE(review): x_test / y_test are not defined in the cells visible here -
    confirm they are created before this function is called.
    
    Params:
    =======
    reduce_data: boolean - If True, reduces the features using Ivis (fitted on
        the training data, then applied to the test data).
        NOTE(review): `Ivis` is not imported in the cells visible here -
        confirm the import exists before enabling this.
    n_desired: int - Number of training samples to draw with `resample`. The
        sample is drawn once and shared by all 20 runs.
    embedding_dims: int - Number of dimensions Ivis reduces the data to.
    k: int - k used in Ivis reduction.
    lipschitz_lr: boolean - If True, uses LipschitzLR; a failed run is then
        retried instead of aborting.
    activation: str or callable - Hidden-layer activation of the deep learner.
    max_retries: int or None - Maximum number of consecutive failed runs
        tolerated when lipschitz_lr is True before the error is re-raised.
        None retries forever.
    **kwargs: forwarded to run_one_experiment.
    
    Returns:
    ========
    None. Results are printed.
    """
    n_runs = 20
    prec, recall, f1, times = [], [], [], []

    # Step 1: Reduce dimensions --> (x_*_reduced, y_*)
    # Local array copies only; the notebook-level variables are not rebound.
    x_train_reduced = np.array(x_train)
    x_test_reduced = np.array(x_test)
    y_train_arr = np.array(y_train)
    y_test_arr = np.array(y_test)

    if reduce_data:
        # int(): experiment configs may carry these values as strings.
        ivis = Ivis(embedding_dims=int(embedding_dims), k=int(k), verbose=0)
        x_train_reduced = ivis.fit_transform(x_train_reduced)
        x_test_reduced = ivis.transform(x_test_reduced)

    # Step 2: Oversample data --> (x_resampled, y_resampled)
    x_resampled, y_resampled = resample(x_train_reduced, y_train_arr, n_desired)

    consecutive_failures = 0
    while len(f1) < n_runs:
        try:
            precision, rec, f1_, time_ = run_one_experiment(x_resampled, y_resampled,
                                                            x_test_reduced, y_test_arr,
                                                            lipschitz_lr=lipschitz_lr,
                                                            activation=activation, **kwargs)
        except Exception:
            # KeyboardInterrupt is not an Exception, so it still propagates.
            # Only LipschitzLR runs are retried; anything else fails at once.
            if not lipschitz_lr:
                raise
            consecutive_failures += 1
            # Bound the retries so a deterministic error cannot loop forever.
            if max_retries is not None and consecutive_failures > max_retries:
                raise
            print('Retrying...')
        else:
            prec.append(precision)
            recall.append(rec)
            f1.append(f1_)
            times.append(time_)
            consecutive_failures = 0

    print('Precision:', prec, '\nRecall:', recall, '\nF1:', f1, '\nRuntime:', times)

In [28]:
def get_experimental_config():
    """
    Yields the experiment configurations that still have to be run.
    
    Every configuration is a dict of keyword arguments for
    run_20_experiments. The numeric settings (n_layers, embedding_dims, k)
    are ints, because the code that consumes them does integer arithmetic
    on them (e.g. range(n_layers - 1)).
    
    Yields:
    =======
    dict with the keys 'deep_learner', 'n_layers', 'lipschitz_lr',
    'reduce_data', 'embedding_dims', 'k', 'oversample' and 'activation',
    in that order.
    """
    # Settings shared by every configuration.
    deep_learner = True
    n_layers = 1
    lipschitz_lr = False

    # Settings that vary:
    # (reduce_data, embedding_dims, k, oversample, activation)
    variants = [
        (True, 2, 4, True, 'relu'),
        (False, 2, 3, True, 'parabola'),
        (True, 2, 3, True, 'relu'),
        (True, 3, 3, True, 'relu'),
        (True, 2, 5, True, 'relu'),
        (True, 3, 5, True, 'relu'),
        (True, 3, 4, True, 'relu'),
        (False, 2, 3, True, 'relu'),
        (False, 2, 3, False, 'parabola'),
    ]

    # Configurations that were already run and should be skipped. Entries must
    # be dicts equal to the ones built below (same keys, int values).
    complete = []

    for reduce_data, embedding_dims, k, oversample, activation in variants:
        config = {
            'deep_learner': deep_learner,
            'n_layers': n_layers,
            'lipschitz_lr': lipschitz_lr,
            'reduce_data': reduce_data,
            'embedding_dims': embedding_dims,
            'k': k,
            'oversample': oversample,
            'activation': activation,
        }
        if config not in complete:
            yield config

In [29]:
# Number of pending configurations, and the total number of training runs
# they amount to (each configuration is run 20 times).
count = sum(1 for _ in get_experimental_config())
print(count * 20)

0


In [None]:
# Run every pending configuration: print a numbered header with the
# configuration, run it through run_20_experiments (which prints the scores),
# then print a blank line as a separator.
for i, config in enumerate(get_experimental_config()):
    # '\b' is a backspace character, presumably meant to erase the space that
    # print() inserts before the colon - how it renders depends on the front end.
    print('Experiment', i + 1, '\b:', config)
    run_20_experiments(**config)
    print()