In [248]:
%run utils.ipynb

import pickle
import numpy as np

from os import listdir, path
from collections import defaultdict
from sklearn.metrics import mean_squared_error, explained_variance_score, mean_absolute_error

In [81]:
datasets = {'agg': {'x': {'cont':         x_cont_agg, 
                          'disc':         x_disc_agg}, 
                    'y': {# 'abs':          y_abs_agg['y'],
                          # 'abs_binned':   y_abs_agg['binned_y'],
                          'share':        y_share_agg['y'],
                          'share_binned': y_share_agg['binned_y']}
                   },
            'ind': {'x': {'cont':         x_cont_ind,
                          'disc':         x_disc_ind},
                    'y': {# 'abs':          y_abs_ind,
                          # 'abs_binned':   y_abs_bin_ind,
                          'share':        y_share_ind,
                          'share_binned': y_share_bin_ind}
                   }
           }

results_dir = 'gp_results'

In [82]:
files = [f for f in listdir(results_dir) if f != '.ipynb_checkpoints']

In [265]:
def scale_pred(pred):
    factor = 1/sum(pred)
    scaled = []
    for p in pred:
        scaled.append(p * factor)
    return tuple(scaled)

In [262]:
def parse_results(file):
    all_mse = []
    all_results = []
    all_var = []
    filename = path.splitext(file)[0].split('_')
    agg_level = filename[1]
    dist = filename[2]

    with open(path.join(results_dir, file), 'rb') as f:
        results = pickle.load(f)
    
    for fold in results['All results']:
        y_true = []
        y_pred = []
        results = {}
        for x in fold:
            pred = tuple([p for [p] in fold[x]])
            pred = scale_pred(pred)
            if agg_level == 'ind': x = tuple(list(x[1:]) + [x[0]])
            true_info = noc_dict[dist][x]
            noc = true_info['noc']
            if len(pred) == 3: true_dist = true_info['share']
            else:              true_dist = true_info['share_binned']
            y_true.append(true_dist)
            y_pred.append(pred)
            results[noc + ', ' + str(int(x[-1]))] = ({'y_true': true_dist,
                                                      'y_pred': pred})    
        mse = mean_squared_error(y_true, y_pred)
        var = explained_variance_score(y_true, y_pred)
        all_mse.append(mse)
        all_results.append(results)
        all_var.append(var)
        
    return all_mse, all_results, all_var

In [263]:
def check_results(file):
    mapping = {2: 'increase',
               1: 'constant',
               0: 'decrease'}
    wrong_noc = defaultdict(int)
    wrong_preds = defaultdict(list)
    correct = defaultdict(int)
    # first key is true, second key is predicted, value is number of times
    missed = {'decrease': defaultdict(int),
              'constant': defaultdict(int),
              'increase': defaultdict(int)}

    for fold in results[file]:
        for noc in fold:
            pred = mapping[np.argmax(fold[noc]['y_pred'])]
            true = mapping[np.argmax(fold[noc]['y_true'])]
            if pred != true: 
                wrong_noc[noc.split(',')[0]] += 1
                wrong_preds[noc.split(',')[0]].append(mean_absolute_error(
                    (fold[noc]['y_true']), (fold[noc]['y_pred'])
                ))
                missed[true][pred] += 1
            else: correct[true] += 1
    
    for key in wrong_preds:
        val = wrong_preds[key]
        wrong_preds[key] = sum(val)/len(val)
    
    return wrong_noc, correct, missed, wrong_preds

In [158]:
def get_top(n): return sorted(model_mse, key=model_mse.get)[:n]

In [266]:
model_mse = {}
results = {}

for file in files:
    mse, result, var = parse_results(file)
    avg_mse = sum(mse)/len(mse)
    avg_var = sum(var)/len(var)
    model_mse[file] = (avg_mse, avg_var)
    results[file] = result

In [267]:
for file in get_top(5):
    print(file)
    wrong_noc, correct, missed, wrong_preds = check_results(file)
    print('Number correct:')
    for item in correct.items():
        print(item)
    for true in missed:
        for pred in missed[true]:
            print('True:', true, 'Pred:', pred, 'Num:', missed[true][pred])

RBF_agg_cont_share.pkl
Number correct:
('constant', 22)
('increase', 1)
True: constant Pred: decrease Num: 2
True: constant Pred: increase Num: 2
True: decrease Pred: constant Num: 37
True: decrease Pred: increase Num: 5
True: increase Pred: constant Num: 47
True: increase Pred: decrease Num: 4
Matern52_agg_disc_share.pkl
Number correct:
('constant', 26)
True: decrease Pred: constant Num: 42
True: increase Pred: constant Num: 52
RBF_agg_disc_share.pkl
Number correct:
('constant', 26)
True: decrease Pred: constant Num: 42
True: increase Pred: constant Num: 52
Matern32_agg_disc_share.pkl
Number correct:
('constant', 26)
True: decrease Pred: constant Num: 42
True: increase Pred: constant Num: 52
Matern12_agg_disc_share.pkl
Number correct:
('constant', 26)
True: decrease Pred: constant Num: 42
True: increase Pred: constant Num: 52


## Scratchpad

In [None]:
# check which ones it's bad at

In [268]:
top = sorted(model_mse, key=model_mse.get)
for key in top:
    print(key, ':', model_mse[key])

RBF_agg_cont_share.pkl : (0.0668662270074651, -0.0670994265851075)
Matern52_agg_disc_share.pkl : (0.06758139983618208, -0.0640736796970455)
RBF_agg_disc_share.pkl : (0.0675821149832898, -0.06409487904270102)
Matern32_agg_disc_share.pkl : (0.06758309596655827, -0.06405143590509274)
Matern12_agg_disc_share.pkl : (0.067588693196657, -0.06403916208368224)
RBF_ind_disc_share_binned.pkl : (0.06812075990703201, 0.02940764305215563)
RBF_ind_cont_share_binned.pkl : (0.06812075990703201, 0.029407643052155708)
Matern52_ind_disc_share_binned.pkl : (0.06851196691297687, 0.008539976301254725)
Matern52_ind_cont_share_binned.pkl : (0.0685119669129769, 0.008539976301254804)
Matern32_ind_disc_share_binned.pkl : (0.06868473112836478, 0.009923170047500818)
Matern32_ind_cont_share_binned.pkl : (0.06868473112836478, 0.009923170047500906)
Matern12_ind_disc_share_binned.pkl : (0.06912622731861667, 0.0021023057102859677)
Matern12_ind_cont_share_binned.pkl : (0.06912622731861667, 0.0021023057102861342)
Matern12

In [269]:
for file in files:
    with open(path.join(results_dir, file), 'rb') as f:
        res = pickle.load(f)
    print(file, ':', res['Avg. score'])

RBF_agg_disc_share.pkl : 0.23333333333333334
RBF_agg_cont_share_binned.pkl : 0.7416666666666666
Matern12_ind_disc_share_binned.pkl : 0.6825873072908646
Matern52_agg_disc_share.pkl : 0.23333333333333334
Matern12_agg_cont_share.pkl : 0.2416666666666667
Matern32_agg_disc_share_binned.pkl : 0.7416666666666666
Matern32_agg_cont_share.pkl : 0.16666666666666669
Matern12_agg_cont_share_binned.pkl : 0.6666666666666666
Matern52_agg_cont_share.pkl : 0.16666666666666669
Matern12_agg_disc_share.pkl : 0.23333333333333334
RBF_ind_disc_share_binned.pkl : 0.6825873072908646
RBF_agg_cont_share.pkl : 0.20833333333333334
Matern32_agg_disc_share.pkl : 0.23333333333333334
Matern32_ind_cont_share_binned.pkl : 0.6825873072908646
Matern52_agg_disc_share_binned.pkl : 0.7333333333333332
Matern52_ind_cont_share_binned.pkl : 0.6825873072908646
Matern52_ind_disc_share_binned.pkl : 0.6825873072908646
Matern52_agg_cont_share_binned.pkl : 0.7416666666666666
Matern52_ind_disc_share.pkl : 0.2918193795664151
Matern12_ind

In [270]:
wrong_noc, correct, missed, wrong_preds = check_results('RBF_agg_cont_share.pkl')

In [271]:
wrong_noc

defaultdict(int,
            {'Accounting technicians and bookkeepers': 1,
             'Air transport ramp attendants': 1,
             'Airline ticket and service agents': 1,
             'Carpenters': 1,
             'Central control and process operators': 1,
             'Chefs': 1,
             'Computer network technicians': 6,
             'Cooks': 1,
             'Court clerks': 5,
             'Electrical mechanics': 6,
             'Financial managers': 6,
             'Fish and seafood plant workers': 1,
             'Fishermen/women': 1,
             'Forestry technologists and technicians': 4,
             'Furniture and fixture assemblers and inspectors': 6,
             'General farm workers': 1,
             'Graphic designers and illustrators': 1,
             'Health policy researchers': 1,
             'Industrial and manufacturing engineers': 5,
             'Instructors of persons with disabilities': 6,
             'Labourers in food': 5,
             'Light duty

In [272]:
correct

defaultdict(int, {'constant': 22, 'increase': 1})

In [273]:
missed

{'constant': defaultdict(int, {'decrease': 2, 'increase': 2}),
 'decrease': defaultdict(int, {'constant': 37, 'increase': 5}),
 'increase': defaultdict(int, {'constant': 47, 'decrease': 4})}

In [278]:
for key in sorted(wrong_preds, key=wrong_preds.get, reverse=True):
    print(key, ':', wrong_preds[key])

Chefs : 0.48120432064878615
Graphic designers and illustrators : 0.41891504293436715
Light duty cleaners : 0.4169030906144027
Fishermen/women : 0.3867865792733272
Furniture and fixture assemblers and inspectors : 0.35649530930799106
Cooks : 0.35120417677475696
Registered nurses and registered psychiatric nurses : 0.33544756207022886
Health policy researchers : 0.325241617043669
Shippers and receivers : 0.298938258722563
Technical sales specialists - wholesale trade : 0.2914329545347268
Airline ticket and service agents : 0.2838346841366573
Plastics processing machine operators : 0.27912561439084754
Operators and attendants in amusement : 0.2778199072013778
Electrical mechanics : 0.2667312503407426
Computer network technicians : 0.2601658052208458
Paramedical occupations : 0.25668144323786274
Medical administrative assistants : 0.25369024581704963
Painters and decorators (except interior decorators) : 0.247360303423348
Oil and gas well drillers : 0.24337949422069793
Supervisors : 0.2346