In [1]:
import numpy as np
import pandas as pd

def get_acc(predictions, df_truth):
    """Tie-aware accuracy of per-item class scores against the ground truth.

    In every row of ``predictions`` the classes tied for the row maximum share
    one unit of credit equally (``1 / number_of_tied_classes`` each); all other
    classes get zero. The accuracy is the credit given to the true class of
    each row of ``df_truth``, summed and divided by the number of truth rows.

    Parameters
    ----------
    predictions : numpy.ndarray
        2-D array indexed as ``[item, class]``.
    df_truth : pandas.DataFrame
        Must contain the columns ``item`` and ``truth``; their values are used
        as integer row and column indices into ``predictions``.

    Returns
    -------
    float
        Total credit on the true classes divided by ``df_truth.shape[0]``.
    """
    # `np.float` was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # `float` selects the same float64 dtype on every NumPy version.
    is_row_max = predictions == predictions.max(axis=1, keepdims=True)
    score = is_row_max.astype(float)

    # Split the unit of credit evenly among the classes tied for the maximum.
    score /= score.sum(axis=1, keepdims=True)

    # Bracket access keeps the column lookup independent of any DataFrame
    # attribute that might share a column's name.
    item_idx = df_truth['item'].values
    truth_idx = df_truth['truth'].values
    return score[item_idx, truth_idx].sum() / df_truth.shape[0]

In [2]:
# Root directory that every dataset path below is appended to.
data_path = '../data/'

# Benchmarks to evaluate, as (path relative to `data_path`, short name) pairs.
# The short name is what gets reported in the results.
# Entries that are commented out are skipped by the evaluation loop.
datasets = [
    # --- crowdscale2013 ---
    ('crowdscale2013/sentiment', 'senti'),
    ('crowdscale2013/fact_eval', 'fact'),

    # --- active-crowd-toolkit ---
    ('active-crowd-toolkit/CF', 'CF'),
    ('active-crowd-toolkit/CF_amt', 'CF_amt'),
    ('active-crowd-toolkit/MS', 'MS'),
    ('active-crowd-toolkit/SP', 'SP'),
    ('active-crowd-toolkit/SP_amt', 'SP_amt'),
    ('active-crowd-toolkit/ZenCrowd_all', 'ZC_all'),
    ('active-crowd-toolkit/ZenCrowd_in', 'ZC_in'),
    ('active-crowd-toolkit/ZenCrowd_us', 'ZC_us'),

    # --- crowd_truth_inference ---
    # ('crowd_truth_inference/d_Duck Identification', 'duck'),
    ('crowd_truth_inference/d_jn-product', 'product'),
    ('crowd_truth_inference/d_sentiment', 'senti_1k'),
    # ('crowd_truth_inference/s4_Dog data', 'dog'),
    ('crowd_truth_inference/s4_Face Sentiment Identification', 'face'),
    # ('crowd_truth_inference/s4_Relevance', 'rel'),
    ('crowd_truth_inference/s5_AdultContent', 'adult'),

    # --- SpectralMethodsMeetEM ---
    ('SpectralMethodsMeetEM/bluebird', 'bird'),
    ('SpectralMethodsMeetEM/dog', 'dog'),
    ('SpectralMethodsMeetEM/rte', 'rte'),
    ('SpectralMethodsMeetEM/trec', 'trec'),
    ('SpectralMethodsMeetEM/web', 'web'),
]

In [3]:
%%time
from ibcc import ibcc

# Evaluate IBCC on each benchmark, collecting one (short name, accuracy)
# pair per dataset and echoing it as soon as it is available.
records = []
for dataset, abbrev in datasets:
    dataset_dir = data_path + dataset

    # Exact duplicate label rows are collapsed (first occurrence kept)
    # before the raw value matrix is handed to ibcc.
    df_label = pd.read_csv(dataset_dir + '/label.csv').drop_duplicates(keep='first')
    # NOTE(review): ibcc's output is presumably a per-item class score matrix,
    # since get_acc indexes it as [item, truth] - confirm against ibcc.
    prediction_ik = ibcc(df_label.values)

    df_truth = pd.read_csv(dataset_dir + '/truth.csv')
    accuracy = get_acc(prediction_ik, df_truth)
    records.append((abbrev, accuracy))
    print('%-10s %g' % (abbrev, accuracy))

senti      0.831
fact       0.876736
CF         0.883333
CF_amt     0.856667
MS         0.79
SP         0.914983
SP_amt     0.944
ZC_all     0.795098
ZC_in      0.769608
ZC_us      0.826961
product    0.938304
senti_1k   0.96
face       0.640411
adult      0.744745
bird       0.888889
dog        0.83891
rte        0.9275
trec       0.705495
web        0.750848
Wall time: 16.3 s


In [4]:
# Gather the per-dataset results into a two-column table.
df = pd.DataFrame.from_records(records, columns=['dataset', 'accuracy'])

# Average only the numeric column. `dataset` holds strings, and pandas >= 2.0
# raises TypeError from a bare df.mean() instead of silently dropping
# non-numeric columns as older versions did.
print(df.mean(numeric_only=True))
display(df)

accuracy    0.835973
dtype: float64


Unnamed: 0,dataset,accuracy
0,senti,0.831
1,fact,0.876736
2,CF,0.883333
3,CF_amt,0.856667
4,MS,0.79
5,SP,0.914983
6,SP_amt,0.944
7,ZC_all,0.795098
8,ZC_in,0.769608
9,ZC_us,0.826961
