In [1]:
import numpy as np
import pandas as pd

def get_acc(predictions, df_truth):
    """Return the fraction of ground-truth items whose top-scoring class is correct.

    For every row of ``predictions`` the class(es) attaining the row maximum
    receive the credit for that row.  When several classes tie for the maximum
    the credit is split evenly among them, so a k-way tie that includes the
    true class contributes 1/k instead of 1.

    Parameters
    ----------
    predictions : numpy.ndarray
        2-D array indexed as ``[item, class]`` holding one score per class.
    df_truth : pandas.DataFrame
        Must provide integer columns ``item`` (row index into ``predictions``)
        and ``truth`` (column index of the correct class).

    Returns
    -------
    float
        Summed credit over the rows listed in ``df_truth`` divided by the
        number of those rows.
    """
    # Builtin ``float`` replaces the ``np.float`` alias, which was removed in NumPy 1.24.
    score = (predictions == predictions.max(axis=1, keepdims=True)).astype(float)
    # Normalise each row so tied maxima share a total credit of 1.
    score /= score.sum(axis=1, keepdims=True)
    # Explicit column lookup: attribute access could resolve to a DataFrame
    # attribute instead of the column.
    items = df_truth['item'].values
    labels = df_truth['truth'].values
    return score[items, labels].sum() / df_truth.shape[0]

In [2]:
# Root directory of all benchmark collections.  The trailing slash matters:
# paths are built from it by plain string concatenation.
data_path = '../data/'

# Benchmarks to evaluate, as (directory relative to data_path, short label) pairs.
# Entries left commented out are currently excluded from the run.
datasets = [
    # --- crowdscale2013 (disabled) ---
    # ('crowdscale2013/sentiment', 'senti'),
    # ('crowdscale2013/fact_eval', 'fact'),

    # --- active-crowd-toolkit ---
    ('active-crowd-toolkit/CF', 'CF'),
    ('active-crowd-toolkit/CF_amt', 'CF_amt'),
    ('active-crowd-toolkit/MS', 'MS'),
    ('active-crowd-toolkit/SP', 'SP'),
    ('active-crowd-toolkit/SP_amt', 'SP_amt'),
    ('active-crowd-toolkit/ZenCrowd_all', 'ZC_all'),
    ('active-crowd-toolkit/ZenCrowd_in', 'ZC_in'),
    ('active-crowd-toolkit/ZenCrowd_us', 'ZC_us'),

    # --- crowd_truth_inference ---
    # ('crowd_truth_inference/d_Duck Identification', 'duck'),
    ('crowd_truth_inference/d_jn-product', 'product'),
    ('crowd_truth_inference/d_sentiment', 'senti_1k'),
    # ('crowd_truth_inference/s4_Dog data', 'dog'),
    ('crowd_truth_inference/s4_Face Sentiment Identification', 'face'),
    # ('crowd_truth_inference/s4_Relevance', 'rel'),
    ('crowd_truth_inference/s5_AdultContent', 'adult'),

    # --- SpectralMethodsMeetEM ---
    ('SpectralMethodsMeetEM/bluebird', 'bird'),
    ('SpectralMethodsMeetEM/dog', 'dog'),
    ('SpectralMethodsMeetEM/rte', 'rte'),
    ('SpectralMethodsMeetEM/trec', 'trec'),
    ('SpectralMethodsMeetEM/web', 'web'),
]

In [3]:
from ebcc import ebcc_vb

In [8]:
%%time
# Benchmark ebcc_vb on every configured dataset.
#
# For each dataset the inference is restarted NUM_RESTARTS times with a freshly
# drawn seed; the restart whose returned `elbo` value is largest (presumably the
# evidence lower bound -- confirm against ebcc) is kept and scored against the
# ground truth.  The winning seed is stored with each record so an individual
# result can be reproduced without repeating the sweep.
#
# NOTE: seeds come from NumPy's global RNG, which this cell does not seed, so
# unless it is seeded elsewhere each execution explores a different set of seeds.
NUM_RESTARTS = 40          # restarts per dataset
NUM_GROUPS = 10            # forwarded to ebcc_vb as num_groups
SEED_UPPER_BOUND = 10**8   # seeds are drawn from [0, SEED_UPPER_BOUND)

records = []
for dataset, abbrev in datasets:
    df_label = pd.read_csv(data_path + dataset + '/label.csv')
    # Drop exact duplicate rows, keeping the first occurrence of each.
    df_label = df_label.drop_duplicates(keep='first')

    elbos = []
    results = []
    for _ in range(NUM_RESTARTS):
        seed = np.random.randint(SEED_UPPER_BOUND)
        prediction, elbo = ebcc_vb(df_label.values, num_groups=NUM_GROUPS, seed=seed, empirical_prior=True)
        elbos.append(elbo)
        results.append((prediction, seed, elbo))

    # Model selection: keep the restart with the largest elbo (first one on ties).
    prediction_ik, seed, elbo = results[np.argmax(elbos)]

    df_truth = pd.read_csv(data_path + dataset + '/truth.csv')
    records.append((abbrev, get_acc(prediction_ik, df_truth), seed, elbo))
    print('%-10s %10g %10d %10g'%records[-1])

# One row per dataset, then the unweighted mean accuracy across datasets.
df = pd.DataFrame.from_records(records, columns=['dataset', 'accuracy', 'seed', 'elbo'])
print(df['accuracy'].mean())

CF           0.883333    1542552    -157474
CF_amt       0.856667   15552534   -42861.5
MS           0.787143   28234435     -86977
SP           0.915183   75268953   -23261.1
SP_amt          0.944   25086090   -7144.35
ZC_all       0.862255   39386717   -13649.4
ZC_in        0.780882   24558634   -8246.29
ZC_us        0.912255   42924186   -8651.96
product      0.934937   83777558     -17345
senti_1k        0.961   92224614   -12174.8
face         0.635274   38241575   -10186.9
adult        0.747748   71066617    -210660
bird         0.861111   45725321   -3134.56
dog          0.840149   68786906   -27934.8
rte           0.93125   72551958   -9075.35
trec         0.703736   42364931   -70293.2
web          0.768564   34424996   -83622.6
0.8426756821015219
Wall time: 18min 5s
