In [1]:
import numpy as np
import pandas as pd

def get_acc(predictions, df_truth):
    """Return the tie-aware accuracy of ``predictions`` against ``df_truth``.

    For every row of ``predictions`` the column(s) holding the row maximum
    share one unit of credit equally, so a k-way tie earns 1/k per tied
    column. The result is the credit found at the (item, truth) positions
    listed in ``df_truth``, divided by the number of rows in ``df_truth``.

    Parameters
    ----------
    predictions : numpy.ndarray
        2-D array of scores, indexed as ``[item, label]``.
    df_truth : pandas.DataFrame
        Must contain the columns ``item`` and ``truth``; both are used as
        positional indices into ``predictions`` (row and column respectively).

    Returns
    -------
    float
        Summed credit divided by ``df_truth.shape[0]``.
    """
    # np.float64 instead of the `np.float` alias, which was removed in NumPy 1.24.
    score = (predictions == predictions.max(axis=1, keepdims=True)).astype(np.float64)
    # Normalise each row so tied maxima split a single point between them.
    score /= score.sum(axis=1, keepdims=True)
    # Bracket lookup so the column names can never be shadowed by DataFrame attributes.
    return score[df_truth['item'].values, df_truth['truth'].values].sum() / df_truth.shape[0]

In [2]:
# Root directory that every dataset directory below is relative to.
data_path = '../data/'

# Benchmarks to evaluate, as (directory relative to data_path, short label) pairs.
# Entries that are commented out are disabled and kept only for reference.
datasets = [
    # crowdscale2013
    ('crowdscale2013/sentiment', 'senti'),
    ('crowdscale2013/fact_eval', 'fact'),

    # active-crowd-toolkit
    ('active-crowd-toolkit/CF', 'CF'),
    ('active-crowd-toolkit/CF_amt', 'CF_amt'),
    ('active-crowd-toolkit/MS', 'MS'),
    ('active-crowd-toolkit/SP', 'SP'),
    ('active-crowd-toolkit/SP_amt', 'SP_amt'),
    ('active-crowd-toolkit/ZenCrowd_all', 'ZC_all'),
    ('active-crowd-toolkit/ZenCrowd_in', 'ZC_in'),
    ('active-crowd-toolkit/ZenCrowd_us', 'ZC_us'),

    # crowd_truth_inference
    # ('crowd_truth_inference/d_Duck Identification', 'duck'),
    ('crowd_truth_inference/d_jn-product', 'product'),
    ('crowd_truth_inference/d_sentiment', 'senti_1k'),
    # ('crowd_truth_inference/s4_Dog data', 'dog'),  # label would duplicate 'dog' below
    ('crowd_truth_inference/s4_Face Sentiment Identification', 'face'),
    # ('crowd_truth_inference/s4_Relevance', 'rel'),
    ('crowd_truth_inference/s5_AdultContent', 'adult'),

    # SpectralMethodsMeetEM
    ('SpectralMethodsMeetEM/bluebird', 'bird'),
    ('SpectralMethodsMeetEM/dog', 'dog'),
    ('SpectralMethodsMeetEM/rte', 'rte'),
    ('SpectralMethodsMeetEM/trec', 'trec'),
    ('SpectralMethodsMeetEM/web', 'web'),
]

In [3]:
%%time
from bwa import bwa

records = []
for dataset, abbrev in datasets:
    df_label = pd.read_csv(data_path + dataset + '/label.csv')
    df_label = df_label.drop_duplicates(keep='first')
    prediction_ik = bwa(df_label.values)

    df_truth = pd.read_csv(data_path + dataset + '/truth.csv')
    records.append((abbrev, get_acc(prediction_ik, df_truth)))    
    print('%-10s %g'%records[-1])

senti      0.89
fact       0.887153
CF         0.893333
CF_amt     0.86
MS         0.785714
SP         0.916983
SP_amt     0.946
ZC_all     0.834804
ZC_in      0.763725
ZC_us      0.910784
product    0.919423
senti_1k   0.956
face       0.618151
adult      0.741742
bird       0.759259
dog        0.831475
rte        0.9275
trec       0.604396
web        0.822465
Wall time: 10.7 s


In [4]:
# Collect the per-dataset results into a two-column table.
df = pd.DataFrame.from_records(records, columns=['dataset', 'accuracy'])

# Average the numeric columns only: 'dataset' holds strings, and pandas >= 2.0
# raises a TypeError instead of silently skipping non-numeric columns in mean().
print(df.mean(numeric_only=True))
display(df)

accuracy    0.835206
dtype: float64


Unnamed: 0,dataset,accuracy
0,senti,0.89
1,fact,0.887153
2,CF,0.893333
3,CF_amt,0.86
4,MS,0.785714
5,SP,0.916983
6,SP_amt,0.946
7,ZC_all,0.834804
8,ZC_in,0.763725
9,ZC_us,0.910784
