In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import rankdata
from statistics import median

from raise_utils.interpret import ResultsInterpreter
from raise_utils.interpret.sk import Rx

In [26]:
# Experiment grid. get_scores() combines one entry from each list into the
# log filename '<dataset>-<time>-<treatment>.txt'.
datasets = [
    'cloudstack',
    'cocoon',
    'hadoop',
    'deeplearning',
    'ofbiz',
    'qpid',
    'hive',
    'node',
]
times = [
    '1 day',
    '7 days',
    '14 days',
    '30 days',
    '90 days',
    '180 days',
    '365 days',
]
treatments = [
    'none',
    'weighted',
    'all',
]

In [46]:
def get_scores(data, time, full=False):
    """Select the best non-DL learner for one dataset / time window.

    For every entry in the module-level ``treatments`` list, the log file
    ``../nondl/nondl-log/{data}-{time}-{treatment}.txt`` is read with
    ``ResultsInterpreter._read_file``. Each learner found in a log is scored
    with ``d2h = sqrt(1.5 * (1 - median(pd))**2 + median(pf)**2)`` and the
    learner with the smallest ``d2h`` across all treatments is kept.

    Parameters
    ----------
    data : str
        Dataset name used in the log filename.
    time : str
        Time-window label used in the log filename (e.g. ``'7 days'``).
    full : bool, default False
        When true, print Scott-Knott style rankings via ``Rx`` for the
        winner's raw 'pd', 'pf', 'auc' and 'prec' series instead of
        returning the summary tuple.

    Returns
    -------
    tuple or None
        ``(median pd, median pf, median auc, d2h)`` of the winning learner
        when ``full`` is false. If no learner scores below the initial
        threshold of 2.0 the tuple is ``(0.0, 0.0, 0.0, 2.0)``.
        ``None`` when ``full`` is true.
    """
    # Starting threshold; presumably chosen to exceed any reachable d2h
    # (sqrt(2.5) is the maximum if pd and pf lie in [0, 1] -- TODO confirm).
    best = 2.
    # Named best_* rather than pd/pf so the pandas alias `pd` is not shadowed.
    best_pd, best_pf, best_auc = 0., 0., 0.
    pd_r, pf_r, auc_r, prec_r = [], [], [], []
    for treatment in treatments:
        filename = f'../nondl/nondl-log/{data}-{time}-{treatment}.txt'
        r = ResultsInterpreter(None)
        line = r._read_file(filename)

        for learner in line.keys():
            runs = line[learner]
            p, q = median(runs['pd']), median(runs['pf'])
            d2h = np.sqrt(1.5 * (1. - p) ** 2 + q ** 2)
            if d2h < best:
                best = d2h
                best_pd, best_pf = p, q
                best_auc = median(runs['auc'])
                # Keep the raw per-run series of the current winner for the
                # ranking output produced when full=True.
                pd_r, pf_r = runs['pd'], runs['pf']
                auc_r, prec_r = runs['auc'], runs['prec']

    if full:
        # Change the DL part later by adding that in above
        # i.e., add file_dl = f'....' and then doing the
        # same thing we did for non-DL.
        # Until then the "DL" entry is a placeholder that repeats the
        # non-DL series, one ranking per metric: pd, pf, auc, prec.
        for series in (pd_r, pf_r, auc_r, prec_r):
            Rx.show(Rx.sk(Rx.data(**{"Non-DL": series, "DL": series})))
    else:
        return best_pd, best_pf, best_auc, best

In [66]:
def cell_formatter(val):
    """Render one numeric CSV cell: a leading comma, then ``val`` rounded to 2 places."""
    return f',{round(val, 2)}'


# Print a CSV report with two rows per dataset/time pair:
#   <data>-<time>,Best non-DL,<auc>,<d2h>,<pd>,<pf>
#   ,Best DL,,,,
for data in datasets:
    for time in times:
        # Unpack into `pd_score`, not `pd`: at cell scope the name `pd` is the
        # pandas module, and rebinding it to a float would break every later
        # cell that uses pandas.
        pd_score, pf, auc, d2h = get_scores(data, time)

        print(f'{data}-{time},Best non-DL'
              f'{cell_formatter(auc)}{cell_formatter(d2h)}'
              f'{cell_formatter(pd_score)}{cell_formatter(pf)}')
        # The DL results are not available yet, so emit an empty placeholder
        # row that keeps the column layout.
        print(',Best DL,,,,')



cloudstack-1 day,Best non-DL,0.76,0.38,0.83,0.31
,Best DL,,,,
cloudstack-7 days,Best non-DL,0.61,0.62,0.77,0.55
,Best DL,,,,
cloudstack-14 days,Best non-DL,0.64,0.58,0.6,0.31
,Best DL,,,,
cloudstack-30 days,Best non-DL,0.65,0.65,0.5,0.21
,Best DL,,,,
cloudstack-90 days,Best non-DL,0.68,0.5,0.69,0.33
,Best DL,,,,
cloudstack-180 days,Best non-DL,0.79,0.33,0.87,0.28
,Best DL,,,,
cloudstack-365 days,Best non-DL,0.54,0.74,0.5,0.41
,Best DL,,,,
cocoon-1 day,Best non-DL,0.88,0.23,1.0,0.23
,Best DL,,,,
cocoon-7 days,Best non-DL,0.83,0.27,0.89,0.23
,Best DL,,,,
cocoon-14 days,Best non-DL,0.89,0.23,1.0,0.23
,Best DL,,,,
cocoon-30 days,Best non-DL,0.88,0.24,1.0,0.24
,Best DL,,,,
cocoon-90 days,Best non-DL,0.83,0.3,0.94,0.29
,Best DL,,,,
cocoon-180 days,Best non-DL,0.86,0.29,1.0,0.29
,Best DL,,,,
cocoon-365 days,Best non-DL,0.81,0.37,1.0,0.37
,Best DL,,,,
hadoop-1 day,Best non-DL,0.95,0.11,1.0,0.11
,Best DL,,,,
hadoop-7 days,Best non-DL,0.76,0.42,0.93,0.41
,Best DL,,,,
hadoop-14 days,Best non-DL,0