In [1]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

In [2]:
import sys
sys.path.append('../../')

from matplotlib import pyplot
import numpy as np
import pandas
from matplotlib import pyplot as plt
from scipy.stats import rankdata
from IPython.display import display, HTML

from bayesian_benchmarks.database_utils import Database
from bayesian_benchmarks.data import classification_datasets, _ALL_REGRESSION_DATATSETS, _ALL_CLASSIFICATION_DATATSETS
ALL_DATATSETS = {}
ALL_DATATSETS.update(_ALL_REGRESSION_DATATSETS)
ALL_DATATSETS.update(_ALL_CLASSIFICATION_DATATSETS)
from bayesian_benchmarks.data import regression_datasets


In [3]:
def rankarray(A):
    ranks = []
    for a in A:
        ranks.append(rankdata(a))
    return np.array(ranks)


def read_regression_classification(db_loc, fs, models_names, datasets, task):
    if task == 'classification':
        fields = ['dataset', 'N', 'D', 'K'] + [m[1] for m in models_names]
    else:
        fields = ['dataset', 'N', 'D'] + [m[1] for m in models_names]

    results = {}
    for f in fs:
        results[f] = {'table':{f:[] for f in fields}, 'vals':[]}

    with Database(db_loc) as db:

        for dataset in datasets:
            for f in fs:
                results[f]['table']['dataset'].append(dataset[:10])
                results[f]['table']['N'].append(ALL_DATATSETS[dataset].N)
                results[f]['table']['D'].append(ALL_DATATSETS[dataset].D)
                if task == 'classification':
                    results[f]['table']['K'].append(ALL_DATATSETS[dataset].K)

            row = {f:[] for f in fs}
            for model, name in models_names:
                res = db.read(task, fs, {'model':model, 
                                         'dataset':dataset})
                    
                if len(res) == 0:
                    for f in fs:
                        results[f]['table'][name].append('')
                        row[f].append(np.nan)
                else:
                    print('{} {} {}'.format(model, dataset, len(res)))
                    for i, f in enumerate(fs):
                        L = [float(l[i]) for l in res]
                        m = np.average(L)
                        std = np.std(L) if len(L) > 1 else np.nan
                        if m < 1000 and m > -1000:
                            r = '{:.3f}({:.3f})'.format(m, std)
                            row[f].append(m)
                        else:
                            r = 'nan'
                            row[f].append(np.nan)

                        results[f]['table'][name].append(r)

            #             stderr = np.std(L)/float(len(L))**0.5
            #             r = '{:.3f} ({:.3f})'.format(m, stderr)
            for f in fs:   
                results[f]['vals'].append(row[f])


    for f in fs:
        if 'unnormalized' not in f:
            vals = np.array(results[f]['vals'])

            avgs = np.nanmean(vals, 0)
            meds = np.nanmedian(vals, 0)
            rks = np.nanmean(rankarray(vals), 0)

            for s, n in [[avgs, 'avg'], [meds, 'median'], [rks, 'avg rank']]:
                results[f]['table']['dataset'].append(n)
                results[f]['table']['N'].append('')
                results[f]['table']['D'].append('')
                if task == 'classification':
                    results[f]['table']['K'].append('')
                for ss, name in zip(s, [m[1] for m in models_names]):
                    results[f]['table'][name].append('{:.3f}'.format(ss))
    
    return results, fields


In [4]:
regression_datasets = ['boston', 'energy', 'concrete', 'power']
models_names = [['RegNet', 'SGD'], ['RegNetcovariancelow_rank_gaussian', 'SWAG']]

fs = ['test_loglik', 'test_rmse', 'test_loglik_unnormalized', 'test_rmse_unnormalized', 'test_calibration']

results, fields = read_regression_classification('../tasks/uci_small.db', 
                                                 fs, models_names, regression_datasets, 'regression')


RegNet boston 20
RegNetcovariancelow_rank_gaussian boston 20
RegNet energy 20
RegNetcovariancelow_rank_gaussian energy 20
RegNet concrete 20
RegNetcovariancelow_rank_gaussian concrete 20
RegNet power 20
RegNetcovariancelow_rank_gaussian power 20


In [7]:
print('normalised test loglikelihood')
display(HTML(pandas.DataFrame(results['test_loglik']['table'], columns=fields).to_html(index=False)))
# print(pandas.DataFrame(results['test_loglik']['table'], columns=fields).to_latex())

print('unnormalized test loglikelihood')
display(HTML(pandas.DataFrame(results['test_loglik_unnormalized']['table'], columns=fields).to_html(index=False)))


print('normalised test rmse')
display(HTML(pandas.DataFrame(results['test_rmse']['table'], columns=fields).to_html(index=False)))

print('normalised test rmse')
display(HTML(pandas.DataFrame(results['test_rmse_unnormalized']['table'], columns=fields).to_html(index=False)))

print('normalised test calibration')
display(HTML(pandas.DataFrame(results['test_calibration']['table'], columns=fields).to_html(index=False)))

normalised test loglikelihood


dataset,N,D,SGD,SWAG
boston,506.0,13.0,-0.318(0.240),-0.251(0.183)
energy,768.0,8.0,-0.021(1.476),-0.890(1.331)
concrete,1030.0,8.0,-0.211(0.126),-0.235(0.100)
power,9568.0,4.0,0.065(0.041),0.062(0.038)
avg,,,-0.121,-0.328
median,,,-0.116,-0.243
avg rank,,,1.750,1.250


unnormalized test loglikelihood


dataset,N,D,SGD,SWAG
boston,506,13,-2.536(0.240),-2.469(0.183)
energy,768,8,-2.332(1.476),-3.201(1.331)
concrete,1030,8,-3.026(0.126),-3.050(0.100)
power,9568,4,-2.772(0.041),-2.775(0.038)


normalised test rmse


dataset,N,D,SGD,SWAG
boston,506.0,13.0,0.336(0.097),0.347(0.104)
energy,768.0,8.0,0.973(2.723),1.833(4.293)
concrete,1030.0,8.0,0.326(0.033),0.342(0.030)
power,9568.0,4.0,0.230(0.008),0.230(0.008)
avg,,,0.466,0.688
median,,,0.331,0.345
avg rank,,,1.000,2.000


normalised test rmse


dataset,N,D,SGD,SWAG
boston,506,13,3.090(0.896),3.187(0.954)
energy,768,8,9.812(27.461),18.486(43.287)
concrete,1030,8,5.449(0.557),5.713(0.505)
power,9568,4,3.919(0.142),3.927(0.140)


normalised test calibration


dataset,N,D,SGD,SWAG
boston,506.0,13.0,0.913(0.039),0.936(0.036)
energy,768.0,8.0,0.983(0.023),0.970(0.040)
concrete,1030.0,8.0,0.909(0.032),0.930(0.023)
power,9568.0,4.0,0.956(0.006),0.957(0.005)
avg,,,0.940,0.948
median,,,0.934,0.946
avg rank,,,1.250,1.750


In [9]:
fs = ['test_loglik', 'test_acc']
results, fields = read_regression_classification(fs, models_names, classification_datasets, 'classification')


TypeError: read_regression_classification() missing 1 required positional argument: 'task'

In [7]:
print('test loglikelihood')
display(HTML(pandas.DataFrame(results['test_loglik']['table'], columns=fields).to_html(index=False)))
# print(pandas.DataFrame(results['test_loglik']['table'], columns=fields).to_latex())

print('test accuracy')
display(HTML(pandas.DataFrame(results['test_acc']['table'], columns=fields).to_html(index=False)))



test loglikelihood


dataset,N,D,K,lin,SVGP,SVGP_mb,DGP,svm,knn,gbm,ab,mlp
abalone,4177.0,9.0,3.0,-0.760(0.039),-2.150(0.181),-2.074(0.165),-1.825(0.170),-0.746(0.040),-2.564(0.324),-0.654(0.030),-1.053(0.007),-0.708(0.038)
acute-infl,120.0,7.0,2.0,-0.049(0.015),-0.008(0.002),-0.055(0.007),-0.063(0.008),-0.018(0.001),-0.000(0.000),-0.000(0.000),-0.025(0.063),-0.030(0.008)
acute-neph,120.0,7.0,2.0,-0.031(0.008),-0.007(0.002),-0.045(0.005),-0.051(0.006),-0.019(0.001),-0.000(0.000),-0.000(0.000),-0.085(0.256),-0.017(0.004)
adult,48842.0,15.0,2.0,-0.342(0.005),,-0.321(0.004),,-0.359(0.005),-1.146(0.048),-0.290(0.004),-0.665(0.000),-0.315(0.005)
annealing,898.0,32.0,5.0,-0.365(0.061),-0.743(0.358),-0.597(0.169),-1.051(0.333),-0.342(0.086),-0.769(0.394),-0.104(0.062),-1.219(0.021),-0.279(0.052)
arrhythmia,452.0,263.0,13.0,-1.343(0.298),-1.162(0.285),-1.179(0.301),-1.289(0.254),-1.071(0.143),-5.885(1.417),-1.331(0.344),-2.568(0.360),-1.333(0.287)
audiology-,196.0,60.0,18.0,-1.072(0.229),-0.914(0.338),-1.148(0.310),-1.140(0.311),-1.453(0.216),-4.130(1.691),-0.979(0.378),-2.904(0.326),-0.867(0.215)
balance-sc,625.0,5.0,3.0,-0.363(0.107),-0.039(0.046),-0.052(0.040),-0.018(0.014),-0.220(0.074),-2.087(0.818),-0.397(0.119),-0.994(0.009),-0.139(0.050)
balloons,16.0,5.0,2.0,-0.708(0.355),,,-0.615(0.119),-0.617(0.139),-0.576(0.161),-1.630(2.162),-2.676(2.902),-0.652(0.463)
bank,4521.0,17.0,2.0,-0.271(0.028),-0.252(0.024),-0.254(0.024),-0.253(0.024),-0.286(0.029),-1.143(0.224),-0.235(0.020),-0.646(0.002),-0.281(0.036)


test accuracy


dataset,N,D,K,lin,SVGP,SVGP_mb,DGP,svm,knn,gbm,ab,mlp
abalone,4177.0,9.0,3.0,0.636(0.029),0.664(0.024),0.672(0.025),0.669(0.026),0.661(0.024),0.633(0.029),0.696(0.022),0.688(0.024),0.668(0.028)
acute-infl,120.0,7.0,2.0,1.000(0.000),1.000(0.000),1.000(0.000),1.000(0.000),1.000(0.000),1.000(0.000),1.000(0.000),0.983(0.050),1.000(0.000)
acute-neph,120.0,7.0,2.0,1.000(0.000),1.000(0.000),1.000(0.000),1.000(0.000),1.000(0.000),1.000(0.000),1.000(0.000),0.992(0.025),1.000(0.000)
adult,48842.0,15.0,2.0,0.843(0.005),,0.849(0.005),,0.849(0.004),0.830(0.003),0.868(0.004),0.859(0.005),0.854(0.004)
annealing,898.0,32.0,5.0,0.848(0.041),0.892(0.046),0.898(0.034),0.862(0.043),0.876(0.044),0.880(0.041),0.967(0.017),0.870(0.036),0.890(0.047)
arrhythmia,452.0,263.0,13.0,0.707(0.092),0.761(0.056),0.761(0.053),0.754(0.043),0.674(0.051),0.615(0.048),0.754(0.061),0.613(0.070),0.678(0.056)
audiology-,196.0,60.0,18.0,0.795(0.096),0.755(0.069),0.745(0.065),0.730(0.081),0.630(0.105),0.535(0.081),0.810(0.073),0.265(0.125),0.705(0.072)
balance-sc,625.0,5.0,3.0,0.863(0.048),0.986(0.019),0.976(0.024),0.997(0.006),0.916(0.036),0.835(0.062),0.870(0.049),0.908(0.042),0.970(0.026)
balloons,16.0,5.0,2.0,0.500(0.316),,,0.600(0.300),0.600(0.300),0.600(0.300),0.650(0.450),0.400(0.300),0.550(0.350)
bank,4521.0,17.0,2.0,0.892(0.018),0.892(0.015),0.891(0.016),0.891(0.017),0.892(0.013),0.890(0.014),0.900(0.011),0.892(0.015),0.894(0.013)


In [8]:
# fields = ['dataset', 'N', 'D']

                
# colours = ['C{}'.format(i) for i in range(10)]

# fields = fields + [m[1] for m in models_names]
# results = {f:[] for f in fields}


# for dataset in regression_datasets:
    
#     fig, axs = plt.subplots(1, 2, figsize=(10, 5))

#     results['dataset'].append(dataset)
#     results['N'].append(ALL_REGRESSION_DATATSETS[dataset].N)
#     results['D'].append(ALL_REGRESSION_DATATSETS[dataset].D)

#     for (model, name), c in zip(models_names, colours):
#         with Database('../results/results.db') as db:
#             d = {'model':model, 'dataset':dataset}

#             res = db.read('active_learning_continuous', ['total_loglik', 'total_rmse'], d) 
#         if len(res)>0:
#             test_ll = res[0][0]
#             test_acc = res[0][1]

#             axs[0].plot(test_ll, label=model, color=c)r
#             axs[1].plot(test_acc, label=model, color=c)
#     axs[0].set_ylim(-10, 10)
#     plt.title('{} {} {}'.format(dataset,
#                                    ALL_REGRESSION_DATATSETS[dataset].N,
#                                    ALL_REGRESSION_DATATSETS[dataset].D))
#     plt.legend()
#     plt.show()


In [9]:

# fields = ['dataset', 'N', 'D', 'K']

# models_names = [['linear', 'lin'],
#                 ['variationally_sparse_gp', 'SVGP'],
#                 ['deep_gp_doubly_stochastic','DGP'],
#                 ['svm', 'svm'],
#                 ['knn', 'knn'],
#                 ['naive_bayes', 'nb'],
#                 ['decision_tree', 'dt'],
#                 ['random_forest', 'rf'],
#                 ['gradient_boosting_machine', 'gbm'],
#                 ['adaboost', 'ab'],
#                 ['mlp', 'mlp'],
#                 ]
                
# colours = ['C{}'.format(i) for i in range(10)]

# fields = fields + [m[1] for m in models_names]
# results = {f:[] for f in fields}


# for dataset in classification_datasets[:4]:  # don't show them all...
    
#     fig, axs = plt.subplots(1, 2, figsize=(10, 5))

#     results['dataset'].append(dataset)
#     results['N'].append(ALL_CLASSIFICATION_DATATSETS[dataset].N)
#     results['D'].append(ALL_CLASSIFICATION_DATATSETS[dataset].D)
#     results['K'].append(ALL_CLASSIFICATION_DATATSETS[dataset].K)

#     for (model, name), c in zip(models_names, colours):
#         with Database('../results/results.db') as db:
#             d = {'model':model, 'dataset':dataset}

#             res = db.read('active_learning_discrete', ['test_loglik', 'total_acc'], d) 
#         if len(res)>0:
#             test_ll = res[0][0]
#             test_acc = res[0][1]

#             axs[0].plot(test_ll, label=model, color=c)
#             axs[1].plot(test_acc, label=model, color=c)

#     plt.title('{} {} {} {}'.format(dataset,
#                                    ALL_CLASSIFICATION_DATATSETS[dataset].N,
#                                    ALL_CLASSIFICATION_DATATSETS[dataset].D,
#                                    ALL_CLASSIFICATION_DATATSETS[dataset].K))
#     plt.legend()
#     plt.show()