In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from raise_utils.data import Data
from raise_utils.hyperparams import DODGE
from raise_utils.learners import *
from keras.utils.np_utils import to_categorical
from raise_utils.transform import Transform
from raise_utils.metrics import ClassificationMetrics
from raise_utils.hooks import Hook

In [2]:
def split_data(filename: str, data: Data, n_classes: int):
    if n_classes == 2:
        if filename == 'firefox.csv':
            data.y_train = data.y_train < 4
            data.y_test = data.y_test < 4
        elif filename == 'chromium.csv':
            data.y_train = data.y_train < 5
            data.y_test = data.y_test < 5
        else:
            data.y_train = data.y_train < 6
            data.y_test = data.y_test < 6
    elif n_classes == 3:
        data.y_train = np.where(data.y_train < 2, 0,
                                np.where(data.y_train < 6, 1, 2))
        data.y_test = np.where(
            data.y_test < 2, 0, np.where(data.y_test < 6, 1, 2))
    elif n_classes == 5:
        data.y_train = np.where(data.y_train < 1, 0, np.where(data.y_train < 3, 1, np.where(
            data.y_train < 6, 2, np.where(data.y_train < 21, 3, 4))))
        data.y_test = np.where(data.y_test < 1, 0, np.where(data.y_test < 3, 1, np.where(
            data.y_test < 6, 2, np.where(data.y_test < 21, 3, 4))))
    elif n_classes == 7:
        data.y_train = np.where(data.y_train < 1, 0, np.where(data.y_train < 2, 1, np.where(data.y_train < 3, 2, np.where(
            data.y_train < 6, 3, np.where(data.y_train < 11, 4, np.where(data.y_train < 21, 5, 6))))))
        data.y_test = np.where(data.y_test < 1, 0, np.where(data.y_test < 2, 1, np.where(data.y_test < 3, 2, np.where(
            data.y_test < 6, 3, np.where(data.y_test < 11, 4, np.where(data.y_test < 21, 5, 6))))))
    else:
        data.y_train = np.where(data.y_train < 1, 0, np.where(data.y_train < 2, 1, np.where(data.y_train < 3, 2, np.where(data.y_train < 4, 3, np.where(
            data.y_train < 6, 4, np.where(data.y_train < 8, 5, np.where(data.y_train < 11, 6, np.where(data.y_train < 21, 7, 8))))))))
        data.y_test = np.where(data.y_test < 1, 0, np.where(data.y_test < 2, 1, np.where(data.y_test < 3, 2, np.where(data.y_test < 4, 3, np.where(
            data.y_test < 6, 4, np.where(data.y_test < 8, 5, np.where(data.y_test < 11, 6, np.where(data.y_test < 21, 7, 8))))))))

    #if n_classes > 2:
    #    data.y_train = to_categorical(data.y_train, num_classes=n_classes)
    #    data.y_test = to_categorical(data.y_test, num_classes=n_classes)
    
    return data

In [3]:
def run(filename, n_classes):
    df = pd.read_csv(f'./Bug-Related-Activity-Logs/{filename}.csv')
    df.drop(['Unnamed: 0', 'bugID'], axis=1, inplace=True)
    _df = df[['s1', 's2', 's3', 's4', 's5', 's6', 's8', 'y']]
    _df['s70'] = df['s7'].apply(lambda x: eval(x)[0])
    _df['s71'] = df['s7'].apply(lambda x: eval(x)[1])
    _df['s72'] = df['s7'].apply(lambda x: eval(x)[2])
    _df['s90'] = df['s9'].apply(lambda x: eval(x)[0])
    _df['s91'] = df['s9'].apply(lambda x: eval(x)[1])
    
    if filename == 'firefox':
        _df['s92'] = df['s9'].apply(lambda x: eval(x)[2])
    
    x = _df.drop('y', axis=1)
    y = _df['y']
    
    data = Data(*train_test_split(x, y))
    data = split_data(filename, data, n_classes)
    
    top1 = []
    top2 = []
    def hook(model, x_test, y_test):
        m = ClassificationMetrics(y_test, model.predict(x_test))
        m.add_metric('accuracy')
        t1 = m.get_metrics()[0]
        print("Top-1 Accuracy =", t1)
        top1.append(t1)
        t2 = get_top2(model, x_test, y_test)
        print("Top-2 Accuracy =", t2)
        top2.append(t2)
    
    config = {
        "n_runs": 20,
        "transforms": ["normalize", "standardize", "robust", "maxabs", "minmax"] * 30,
        "metrics": ["accuracy"],
        "random": True,
        "learners": [],
        "log_path": "./dodge-log/",
        "data": [data],
        "name": f"{filename}-{n_classes}",
        "post_train_hooks": [Hook(name="top2", function=hook)]
    }
    for _ in range(50):
        config["learners"].extend([
            DecisionTree(random=True),
            RandomForest(random=True),
            LogisticRegressionClassifier(random=True),
            NaiveBayes(random=True)
        ])

    dodge = DODGE(config)
    dodge.optimize()
    print("\nMedian top-1 =", np.median(top1))
    print("Median top-2 =", np.median(top2))

In [63]:
!mkdir dodge-log

mkdir: dodge-log: File exists


In [64]:
for file in ['firefox', 'eclipse', 'chromium']:
    print('Running:', file)
    print('=' * 30)
    for n_class in [2, 3, 5, 7, 9]:
        if os.path.exists(f'./dodge-log/{file}-{n_class}.txt'):
            continue
        print(n_class, 'classes')
        print('=' * 30)
        run(file, n_class)

Running: firefox
Running: eclipse
3 classes
{'hooks': None, 'learner': DecisionTreeClassifier(criterion='entropy', splitter='random'), 'name': 'rf', 'random': True, 'random_map': {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': RandomForestClassifier(n_estimators=40), 'name': 'rf', 'random': True, 'random_map': {'criterion': ['gini', 'entropy'], 'n_estimators': (10, 100)}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': LogisticRegression(C=0.1, penalty='l1', solver='liblinear'), 'name': 'rf', 'random': True, 'random_map': {'penalty': ['l1', 'l2'], 'C': [0.1, 1.0, 10.0, 100.0, 1000.0]}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': GaussianNB(), 'name': 'rf', 'random': True, 'random_map': {}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': D

{'hooks': None, 'learner': DecisionTreeClassifier(criterion='entropy', splitter='random'), 'name': 'rf', 'random': True, 'random_map': {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': RandomForestClassifier(criterion='entropy', n_estimators=14), 'name': 'rf', 'random': True, 'random_map': {'criterion': ['gini', 'entropy'], 'n_estimators': (10, 100)}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': LogisticRegression(C=10.0, solver='liblinear'), 'name': 'rf', 'random': True, 'random_map': {'penalty': ['l1', 'l2'], 'C': [0.1, 1.0, 10.0, 100.0, 1000.0]}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': GaussianNB(), 'name': 'rf', 'random': True, 'random_map': {}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': DecisionTreeClassifier(criterion='ent

{'hooks': None, 'learner': DecisionTreeClassifier(criterion='entropy'), 'name': 'rf', 'random': True, 'random_map': {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': RandomForestClassifier(n_estimators=84), 'name': 'rf', 'random': True, 'random_map': {'criterion': ['gini', 'entropy'], 'n_estimators': (10, 100)}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': LogisticRegression(C=10.0, solver='liblinear'), 'name': 'rf', 'random': True, 'random_map': {'penalty': ['l1', 'l2'], 'C': [0.1, 1.0, 10.0, 100.0, 1000.0]}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': GaussianNB(), 'name': 'rf', 'random': True, 'random_map': {}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': DecisionTreeClassifier(criterion='entropy', splitter='random'), 'name': 'rf',

{'hooks': None, 'learner': DecisionTreeClassifier(criterion='entropy'), 'name': 'rf', 'random': True, 'random_map': {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': RandomForestClassifier(n_estimators=33), 'name': 'rf', 'random': True, 'random_map': {'criterion': ['gini', 'entropy'], 'n_estimators': (10, 100)}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': LogisticRegression(C=100.0, solver='liblinear'), 'name': 'rf', 'random': True, 'random_map': {'penalty': ['l1', 'l2'], 'C': [0.1, 1.0, 10.0, 100.0, 1000.0]}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': GaussianNB(), 'name': 'rf', 'random': True, 'random_map': {}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': DecisionTreeClassifier(criterion='entropy'), 'name': 'rf', 'random': True, '

{'hooks': None, 'learner': DecisionTreeClassifier(splitter='random'), 'name': 'rf', 'random': True, 'random_map': {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': RandomForestClassifier(criterion='entropy', n_estimators=55), 'name': 'rf', 'random': True, 'random_map': {'criterion': ['gini', 'entropy'], 'n_estimators': (10, 100)}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': LogisticRegression(C=100.0, solver='liblinear'), 'name': 'rf', 'random': True, 'random_map': {'penalty': ['l1', 'l2'], 'C': [0.1, 1.0, 10.0, 100.0, 1000.0]}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': GaussianNB(), 'name': 'rf', 'random': True, 'random_map': {}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': DecisionTreeClassifier(criterion='entropy', splitter='ran

{'hooks': None, 'learner': DecisionTreeClassifier(splitter='random'), 'name': 'rf', 'random': True, 'random_map': {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': RandomForestClassifier(n_estimators=12), 'name': 'rf', 'random': True, 'random_map': {'criterion': ['gini', 'entropy'], 'n_estimators': (10, 100)}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': LogisticRegression(C=10.0, solver='liblinear'), 'name': 'rf', 'random': True, 'random_map': {'penalty': ['l1', 'l2'], 'C': [0.1, 1.0, 10.0, 100.0, 1000.0]}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': GaussianNB(), 'name': 'rf', 'random': True, 'random_map': {}, 'x_test': None, 'x_train': None, 'y_test': None, 'y_train': None}
{'hooks': None, 'learner': DecisionTreeClassifier(), 'name': 'rf', 'random': True, 'random_map': {'criteri

KeyboardInterrupt: 

## Untuned learners

In [4]:
!mkdir untuned-nondl-log

In [4]:
from raise_utils.experiments import Experiment
from raise_utils.hooks import Hook

In [5]:
def get_top2(model, data):
    y_test = np.array(data.y_test)
    preds = model.learner.predict_proba(data.x_test)
    best_n = np.argsort(preds, axis=1)[:,-2:]
    correct = 0
    total = len(y_test)
    
    for i, pred in enumerate(best_n):
        if y_test[i] in pred:
            correct += 1
    return round(correct / total, 3)

In [6]:
def get_data(filename, n_classes):
    df = pd.read_csv(f'./Bug-Related-Activity-Logs/{filename}.csv')
    df.drop(['Unnamed: 0', 'bugID'], axis=1, inplace=True)
    _df = df[['s1', 's2', 's3', 's4', 's5', 's6', 's8', 'y']]
    _df['s70'] = df['s7'].apply(lambda x: eval(x)[0])
    _df['s71'] = df['s7'].apply(lambda x: eval(x)[1])
    _df['s72'] = df['s7'].apply(lambda x: eval(x)[2])
    _df['s90'] = df['s9'].apply(lambda x: eval(x)[0])
    _df['s91'] = df['s9'].apply(lambda x: eval(x)[1])
    
    if filename == 'firefox':
        _df['s92'] = df['s9'].apply(lambda x: eval(x)[2])
    
    x = _df.drop('y', axis=1)
    y = _df['y']
    
    data = Data(*train_test_split(x, y))
    data = split_data(filename, data, n_classes)
    return data

In [27]:
for filename, n_class, data in [(f, c, get_data(f, c)) for f in ["firefox", "eclipse", "chromium"] for c in [2, 3, 5, 7, 9]]:    
    print("Running", filename, n_class, "classes.")
    for learner in [DecisionTree(name="dt"), LogisticRegressionClassifier(name="lr"), RandomForest()]:
        print("Learner:", learner.__name__)
        top1 = []
        top2 = []
        for i in range(20):
            tr = Transform("normalize")
            tr.apply(data)
            learner.set_data(*data)
            learner.fit()
            
            preds = learner.predict(data.x_test)
            metr = ClassificationMetrics(data.y_test, preds)
            metr.add_metric('accuracy')
            print("Accuracy =", metr.get_metrics()[0])
            t2 = get_top2(learner, data)
            print("Top-2 Acc =", t2)
            top1.append(metr.get_metrics()[0])
            top2.append(t2)
        
        print("\nMedian top-1 accuracy =", round(np.median(top1), 3))
        print("Median top-2 accuracy =", round(np.median(top2), 3))
        print()
    print()
print()

Running firefox 2 classes.
Learner: dt
Accuracy = 0.6475058568081824
Top-2 Acc = 1.0
Accuracy = 0.6480772527284155
Top-2 Acc = 1.0
Accuracy = 0.6473915776241358
Top-2 Acc = 1.0
Accuracy = 0.6488200674247185
Top-2 Acc = 1.0
Accuracy = 0.6471630192560425
Top-2 Acc = 1.0
Accuracy = 0.6468201817039027
Top-2 Acc = 1.0
Accuracy = 0.6469344608879493
Top-2 Acc = 1.0
Accuracy = 0.6479058339523456
Top-2 Acc = 1.0
Accuracy = 0.647791554768299
Top-2 Acc = 1.0
Accuracy = 0.6475058568081824
Top-2 Acc = 1.0
Accuracy = 0.6459630878235529
Top-2 Acc = 1.0
Accuracy = 0.6471058796640192
Top-2 Acc = 1.0
Accuracy = 0.6473915776241358
Top-2 Acc = 1.0
Accuracy = 0.646705902519856
Top-2 Acc = 1.0
Accuracy = 0.6458488086395063
Top-2 Acc = 1.0
Accuracy = 0.6482486715044855
Top-2 Acc = 1.0
Accuracy = 0.6464202045597395
Top-2 Acc = 1.0
Accuracy = 0.6443060396548769
Top-2 Acc = 1.0
Accuracy = 0.647620135992229
Top-2 Acc = 1.0
Accuracy = 0.6464773441517627
Top-2 Acc = 1.0

Median top-1 accuracy = 0.647
Median top-2 

IndexError: tuple index out of range

In [28]:
for filename, n_class, data in [(f, c, get_data(f, c)) for f in ["firefox", "eclipse", "chromium"] for c in [3, 5, 7, 9]]:    
    print("Running", filename, n_class, "classes.")
    for learner in [DecisionTree(name="dt"), LogisticRegressionClassifier(name="lr"), RandomForest()]:
        print("Learner:", learner.__name__)
        top1 = []
        top2 = []
        for i in range(20):
            tr = Transform("normalize")
            tr.apply(data)
            learner.set_data(*data)
            learner.fit()
            
            preds = learner.predict(data.x_test)
            metr = ClassificationMetrics(data.y_test, preds)
            metr.add_metric('accuracy')
            print("Accuracy =", metr.get_metrics()[0])
            t2 = get_top2(learner, data)
            print("Top-2 Acc =", t2)
            top1.append(metr.get_metrics()[0])
            top2.append(t2)
        
        print("\nMedian top-1 accuracy =", round(np.median(top1), 3))
        print("Median top-2 accuracy =", round(np.median(top2), 3))
        print()
    print()
print()

Running firefox 3 classes.
Learner: dt
Accuracy = 0.40026284212330726
Top-2 Acc = 0.642
Accuracy = 0.4010056568196103
Top-2 Acc = 0.642
Accuracy = 0.40294840294840295
Top-2 Acc = 0.642
Accuracy = 0.40180561110793667
Top-2 Acc = 0.642
Accuracy = 0.40071995885949374
Top-2 Acc = 0.642
Accuracy = 0.40191989029198333
Top-2 Acc = 0.642
Accuracy = 0.40157705273984345
Top-2 Acc = 0.642
Accuracy = 0.40191989029198333
Top-2 Acc = 0.642
Accuracy = 0.40037712130735387
Top-2 Acc = 0.642
Accuracy = 0.40294840294840295
Top-2 Acc = 0.642
Accuracy = 0.40311982172447286
Top-2 Acc = 0.642
Accuracy = 0.40380549682875266
Top-2 Acc = 0.642
Accuracy = 0.40054854008342383
Top-2 Acc = 0.642
Accuracy = 0.40186275069996
Top-2 Acc = 0.642
Accuracy = 0.40031998171533056
Top-2 Acc = 0.642
Accuracy = 0.4035197988686361
Top-2 Acc = 0.642
Accuracy = 0.4001485629392606
Top-2 Acc = 0.642
Accuracy = 0.4000914233472373
Top-2 Acc = 0.642
Accuracy = 0.4023198674361465
Top-2 Acc = 0.642
Accuracy = 0.3998057253871207
Top-2 Ac

Accuracy = 0.1746757328152677
Top-2 Acc = 0.299
Accuracy = 0.1746757328152677
Top-2 Acc = 0.299

Median top-1 accuracy = 0.175
Median top-2 accuracy = 0.299

Learner: rf
Accuracy = 0.17884692303296953
Top-2 Acc = 0.299
Accuracy = 0.17810410833666648
Top-2 Acc = 0.299
Accuracy = 0.1782183875207131
Top-2 Acc = 0.299
Accuracy = 0.17890406262499287
Top-2 Acc = 0.299
Accuracy = 0.17878978344094623
Top-2 Acc = 0.299
Accuracy = 0.17787554996857322
Top-2 Acc = 0.299
Accuracy = 0.1781612479286898
Top-2 Acc = 0.299
Accuracy = 0.1777612707845266
Top-2 Acc = 0.299
Accuracy = 0.1769041769041769
Top-2 Acc = 0.299
Accuracy = 0.17850408548082966
Top-2 Acc = 0.299
Accuracy = 0.17924690017713274
Top-2 Acc = 0.299
Accuracy = 0.17936117936117937
Top-2 Acc = 0.299
Accuracy = 0.1786183646648763
Top-2 Acc = 0.299
Accuracy = 0.17758985200845667
Top-2 Acc = 0.299
Accuracy = 0.17747557282441004
Top-2 Acc = 0.299
Accuracy = 0.178561225072853
Top-2 Acc = 0.299
Accuracy = 0.17810410833666648
Top-2 Acc = 0.299
Accu

Accuracy = 0.23170104565453403
Top-2 Acc = 0.381
Accuracy = 0.23050111422204445
Top-2 Acc = 0.381
Accuracy = 0.23175818524655734
Top-2 Acc = 0.381
Accuracy = 0.23141534769441746
Top-2 Acc = 0.381
Accuracy = 0.23095823095823095
Top-2 Acc = 0.381
Accuracy = 0.23095823095823095
Top-2 Acc = 0.381
Accuracy = 0.23215816239072054
Top-2 Acc = 0.381

Median top-1 accuracy = 0.231
Median top-2 accuracy = 0.381

Learner: lr
Accuracy = 0.20970230272555854
Top-2 Acc = 0.381
Accuracy = 0.20970230272555854
Top-2 Acc = 0.381
Accuracy = 0.20970230272555854
Top-2 Acc = 0.381
Accuracy = 0.20970230272555854
Top-2 Acc = 0.381
Accuracy = 0.20970230272555854
Top-2 Acc = 0.381
Accuracy = 0.20970230272555854
Top-2 Acc = 0.381
Accuracy = 0.20970230272555854
Top-2 Acc = 0.381
Accuracy = 0.20970230272555854
Top-2 Acc = 0.381
Accuracy = 0.20970230272555854
Top-2 Acc = 0.381
Accuracy = 0.20970230272555854
Top-2 Acc = 0.381
Accuracy = 0.20970230272555854
Top-2 Acc = 0.381
Accuracy = 0.20970230272555854
Top-2 Acc = 0

Top-2 Acc = 0.291
Accuracy = 0.14810582252442717
Top-2 Acc = 0.291
Accuracy = 0.14839152048454374
Top-2 Acc = 0.291
Accuracy = 0.14787726415633393
Top-2 Acc = 0.291
Accuracy = 0.14879149762870694
Top-2 Acc = 0.291
Accuracy = 0.14736300782812412
Top-2 Acc = 0.291
Accuracy = 0.1481629621164505
Top-2 Acc = 0.291
Accuracy = 0.14667733272384434
Top-2 Acc = 0.291
Accuracy = 0.14736300782812412
Top-2 Acc = 0.291
Accuracy = 0.14759156619621736
Top-2 Acc = 0.291
Accuracy = 0.14782012456431062
Top-2 Acc = 0.291
Accuracy = 0.14702017027598424
Top-2 Acc = 0.291
Accuracy = 0.14713444946003085
Top-2 Acc = 0.291

Median top-1 accuracy = 0.148
Median top-2 accuracy = 0.291


Running chromium 3 classes.
Learner: dt
Accuracy = 0.3378092680418262
Top-2 Acc = 0.555
Accuracy = 0.33512370721673046
Top-2 Acc = 0.555
Accuracy = 0.33552368436089364
Top-2 Acc = 0.555
Accuracy = 0.33495228844066055
Top-2 Acc = 0.555
Accuracy = 0.33638077824124335
Top-2 Acc = 0.555
Accuracy = 0.33752357008170963
Top-2 Acc = 0.555

Accuracy = 0.11965030569681732
Top-2 Acc = 0.272
Accuracy = 0.11970744528884064
Top-2 Acc = 0.272
Accuracy = 0.11965030569681732
Top-2 Acc = 0.272
Accuracy = 0.11970744528884064
Top-2 Acc = 0.272
Accuracy = 0.11970744528884064
Top-2 Acc = 0.272
Accuracy = 0.11970744528884064
Top-2 Acc = 0.272
Accuracy = 0.11970744528884064
Top-2 Acc = 0.272
Accuracy = 0.11970744528884064
Top-2 Acc = 0.272
Accuracy = 0.11970744528884064
Top-2 Acc = 0.272
Accuracy = 0.11970744528884064
Top-2 Acc = 0.272
Accuracy = 0.11970744528884064
Top-2 Acc = 0.272
Accuracy = 0.11970744528884064
Top-2 Acc = 0.272
Accuracy = 0.11965030569681732
Top-2 Acc = 0.272
Accuracy = 0.11970744528884064
Top-2 Acc = 0.272
Accuracy = 0.11970744528884064
Top-2 Acc = 0.272
Accuracy = 0.11970744528884064
Top-2 Acc = 0.272
Accuracy = 0.11965030569681732
Top-2 Acc = 0.272

Median top-1 accuracy = 0.12
Median top-2 accuracy = 0.272

Learner: rf
Accuracy = 0.14153476944174617
Top-2 Acc = 0.272
Accuracy = 0.1433632363864922
Top-2 Acc = 0.2

In [61]:
def get_top2(model, x_test, y_test):
    y_test = np.array(y_test)
    preds = model.learner.predict_proba(x_test)
    best_n = np.argsort(preds, axis=1)[:,-2:]
    correct = 0
    total = len(y_test)
    
    for i, pred in enumerate(best_n):
        if y_test[i] in pred:
            correct += 1
    return round(correct / total, 3)

In [38]:
for filename, n_class, data in [(f, c, get_data(f, c)) for f in ["firefox", "eclipse", "chromium"] for c in [2, 3, 5, 7, 9]]:    
    print("Running", filename, n_class, "classes.")
    tr = Transform("normalize")
    tr.apply(data)
    data.y_train = to_categorical(data.y_train, num_classes=n_class)
    data.y_test = to_categorical(data.y_test, num_classes=n_class)
    top1 = []
    top2 = []
    for i in range(20):
        learner = MulticlassDL(n_classes=n_class, n_layers=2, n_units=10, n_epochs=30, verbose=0)
        
        learner.set_data(*data)
        learner.fit()
            
        preds = learner.predict(data.x_test)
        metr = ClassificationMetrics(data.y_test, preds)
        metr.add_metric('accuracy')
        print("Accuracy =", metr.get_metrics()[0])
        t2 = get_top2(learner, data)
        print("Top-2 Acc =", t2)
        top1.append(metr.get_metrics()[0])
        top2.append(t2)
        
    print("\nMedian top-1 accuracy =", round(np.median(top1), 3))
    print("Median top-2 accuracy =", round(np.median(top2), 3))
    print()

Running firefox 2 classes.
Accuracy = 0.704588309239472
Top-2 Acc = 1.0
Accuracy = 0.7076167076167076
Top-2 Acc = 1.0
Accuracy = 0.7040169133192389
Top-2 Acc = 1.0
Accuracy = 0.7045311696474487
Top-2 Acc = 1.0
Accuracy = 0.702759842294726
Top-2 Acc = 1.0
Accuracy = 0.7038454945431689
Top-2 Acc = 1.0
Accuracy = 0.7031598194388892
Top-2 Acc = 1.0
Accuracy = 0.704759728015542
Top-2 Acc = 1.0
Accuracy = 0.705502542711845
Top-2 Acc = 1.0
Accuracy = 0.7086452202731273
Top-2 Acc = 1.0
Accuracy = 0.704931146791612
Top-2 Acc = 1.0
Accuracy = 0.7059596594480315
Top-2 Acc = 1.0
Accuracy = 0.7063024970001714
Top-2 Acc = 1.0
Accuracy = 0.7052739843437518
Top-2 Acc = 1.0
Accuracy = 0.7047025884235186
Top-2 Acc = 1.0
Accuracy = 0.7047025884235186
Top-2 Acc = 1.0
Accuracy = 0.7089880578252671
Top-2 Acc = 1.0
Accuracy = 0.7044168904634021
Top-2 Acc = 1.0
Accuracy = 0.7055596823038683
Top-2 Acc = 1.0
Accuracy = 0.7040740529112622
Top-2 Acc = 1.0

Median top-1 accuracy = 0.705
Median top-2 accuracy = 1.0

Top-2 Acc = 0.592
Accuracy = 0.32323867207588136
Top-2 Acc = 0.6

Median top-1 accuracy = 0.324
Median top-2 accuracy = 0.594

Running eclipse 7 classes.
Accuracy = 0.29826867036169363
Top-2 Acc = 0.471
Accuracy = 0.29946860179418316
Top-2 Acc = 0.466
Accuracy = 0.3013542083309525
Top-2 Acc = 0.474
Accuracy = 0.29941146220215986
Top-2 Acc = 0.472
Accuracy = 0.305639677732701
Top-2 Acc = 0.48
Accuracy = 0.29941146220215986
Top-2 Acc = 0.475
Accuracy = 0.30152562710702246
Top-2 Acc = 0.472
Accuracy = 0.2991257642420433
Top-2 Acc = 0.47
Accuracy = 0.30181132506713904
Top-2 Acc = 0.475
Accuracy = 0.3021541626192789
Top-2 Acc = 0.472
Accuracy = 0.297240157705274
Top-2 Acc = 0.473
Accuracy = 0.2968973201531341
Top-2 Acc = 0.471
Accuracy = 0.3023255813953488
Top-2 Acc = 0.471
Accuracy = 0.30323981486772184
Top-2 Acc = 0.478
Accuracy = 0.2959830866807611
Top-2 Acc = 0.47
Accuracy = 0.30409690874807155
Top-2 Acc = 0.476
Accuracy = 0.2964402034169476
Top-2 Acc = 0.468
Accuracy = 0.30112564996285

In [39]:
from raise_utils.interpret.sk import Rx

In [49]:
data = {"dl": [0.534, 0.529, 0.528, 0.528, 0.532, 0.533, 0.530, 0.529, 0.534, 0.531, 0.531, 0.532, 0.533, 0.530, 0.529],"simple": [0.5467687560710817, 0.5476829895434546, 0.544997428718359, 0.44008913776355635, 0.5448831495343123, 0.5460259413747786, 0.5459688017827553, 0.5477972687275013, 0.547397291583338, 0.5482543854636878]}

In [50]:
Rx.show(Rx.sk(Rx.data(**data)))

   1         dl (                         |*                       ), 0.528,  0.529,  0.531,  0.532,  0.534
   2     simple (                         | *                      ), 0.545,  0.546,  0.547,  0.548,  0.548


In [52]:
np.median(
    [0.7210445117421862, 0.6487629278326953, 0.6873892920404548, 0.6892748985772241, 0.6956173932918119, 0.6466487629278327, 0.7082452431289641, 0.695103136963602, 0.6956745328838352, 0.6932746700188561, 0.6485915090566253, 0.6966459059482315, 0.6963602079881149, 0.6937889263470659, 0.6950459973715788, 0.6904176904176904, 0.6893891777612707, 0.6440203416947603, 0.697103022684418, 0.642877549854294]
)

0.693531798182961

In [7]:
for filename, n_class, data in [(f, 2, get_data(f, 2)) for f in ["eclipse", "chromium"]]:    
    print("Running", filename, n_class, "classes.")
    for learner in [DecisionTree(name="dt"), LogisticRegressionClassifier(name="lr"), RandomForest()]:
        print("Learner:", learner.__name__)
        top1 = []
        top2 = []
        for i in range(20):
            tr = Transform("normalize")
            tr.apply(data)
            learner.set_data(*data)
            learner.fit()
            
            preds = learner.predict(data.x_test)
            metr = ClassificationMetrics(data.y_test, preds)
            metr.add_metric('accuracy')
            print("Accuracy =", metr.get_metrics()[0])
            t2 = get_top2(learner, data)
            print("Top-2 Acc =", t2)
            top1.append(metr.get_metrics()[0])
            top2.append(t2)
        
        print("\nMedian top-1 accuracy =", round(np.median(top1), 3))
        print("Median top-2 accuracy =", round(np.median(top2), 3))
        print()
    print()
print()

Running eclipse 2 classes.
Learner: dt
Accuracy = 0.6195645963087824
Top-2 Acc = 1.0
Accuracy = 0.6179075481401063
Top-2 Acc = 1.0
Accuracy = 0.6193360379406891
Top-2 Acc = 1.0
Accuracy = 0.6176789897720131
Top-2 Acc = 1.0
Accuracy = 0.6211073652934118
Top-2 Acc = 1.0
Accuracy = 0.6176218501799897
Top-2 Acc = 1.0
Accuracy = 0.6199074338609223
Top-2 Acc = 1.0
Accuracy = 0.6177361293640363
Top-2 Acc = 1.0
Accuracy = 0.6180789669161763
Top-2 Acc = 1.0
Accuracy = 0.6185360836523627
Top-2 Acc = 1.0
Accuracy = 0.6170504542597566
Top-2 Acc = 1.0
Accuracy = 0.6196217359008057
Top-2 Acc = 1.0
Accuracy = 0.6175647105879664
Top-2 Acc = 1.0
Accuracy = 0.6207645277412719
Top-2 Acc = 1.0
Accuracy = 0.6175647105879664
Top-2 Acc = 1.0
Accuracy = 0.6187075024284326
Top-2 Acc = 1.0
Accuracy = 0.6148220101708474
Top-2 Acc = 1.0
Accuracy = 0.6194503171247357
Top-2 Acc = 1.0
Accuracy = 0.6193360379406891
Top-2 Acc = 1.0
Accuracy = 0.6151077081309639
Top-2 Acc = 1.0

Median top-1 accuracy = 0.618
Median top