In [271]:
import pandas as pd
import numpy as np
from models import utils
from models import mace, pooled_multinomial, latent_annotator_clustering, dawid_skene
from sklearn.metrics.cluster import homogeneity_score, completeness_score, v_measure_score



In [272]:
num_classes = 3
num_annotators = 34
num_datapoints = 177
num_clusters = 2


## Pooled Multinomial

In [181]:
data_pm, gold_A_pm, gold_B_pm = pooled_multinomial.data_generator(num_classes, num_annotators, num_datapoints)

In [183]:
A_pm, B_pm, elbos_pm = pooled_multinomial.evaluate(data_pm, num_classes, num_iters = 100, num_restarts = 10, logspace=False)

In [None]:
elbos_pm

[-1081.9734837151007,
 -928.4369957994512,
 -899.9815021852912,
 -876.8694918605561,
 -855.3840889609024,
 -833.6420175830445,
 -811.9693658760352,
 -789.897366118541,
 -767.5299954372505,
 -746.007264903801,
 -726.8637547075595,
 -711.7066964725808,
 -700.9338163743432,
 -693.5876190090319,
 -688.6145941628431,
 -685.254992450317,
 -682.9877407037559,
 -681.4154221655762,
 -680.2317016977302,
 -679.2297462281504,
 -678.2946278991308,
 -677.3752908511675,
 -676.457222856825,
 -675.5506646685894,
 -674.6869809769794,
 -673.9019573904442,
 -673.2109289289169,
 -672.6049980401555,
 -672.0662063147718,
 -671.579031547008,
 -671.1331812258142,
 -670.7226114154948,
 -670.343780465589,
 -669.9939517371604,
 -669.6695338206736,
 -669.3642846471687,
 -669.0674904200467,
 -668.7632508027955,
 -668.434154974099,
 -668.0736321067621,
 -667.701949836577,
 -667.3617040445042,
 -667.0840148102665,
 -666.8682867477913,
 -666.6978625069447,
 -666.55785953786,
 -666.4378075642571,
 -666.3283623453813,
 

In [None]:
100 * np.mean(gold_A_pm.argmax(axis=1) == A_pm.argmax(axis=1))

18.5

In [None]:
A_pm_ls, B_pm_ls, elbos_pm_ls = pooled_multinomial.evaluate(data_pm, num_classes, 100, logspace=True)


In [None]:
elbos_pm_ls

[-1184.2179159096138,
 -930.5891674082584,
 -900.2983149436913,
 -868.5008512491958,
 -835.5014224392586,
 -801.9824523315797,
 -768.2932019452076,
 -738.0584519092963,
 -715.5741077950477,
 -701.0492393192901,
 -692.1153194992144,
 -686.622119863887,
 -683.1634696736256,
 -680.8148724913516,
 -679.0168268695381,
 -677.5168310652442,
 -676.2195750903603,
 -675.0808185752387,
 -674.0705427781579,
 -673.1649585751534,
 -672.3457357787366,
 -671.6004001970181,
 -670.9211790333712,
 -670.3025431950564,
 -669.7393833351074,
 -669.2265819078079,
 -668.7589921739411,
 -668.3312329019719,
 -667.9382636559964,
 -667.576730441108,
 -667.2437717300805,
 -666.932505029295,
 -666.6307732554396,
 -666.3264278248012,
 -666.0142786438043,
 -665.6997489507052,
 -665.3957932443728,
 -665.1136803213278,
 -664.8568875773773,
 -664.6237659072585,
 -664.4121497142537,
 -664.2206640605787,
 -664.048293008154,
 -663.8940016235247,
 -663.7566396776275,
 -663.6349601853321,
 -663.5276625681478,
 -663.4334347107

In [None]:
100 * np.mean(gold_A_pm.argmax(axis=1) == A_pm_ls.argmax(axis=1))

41.5

## Dawid-Skene

In [None]:
data_ds, gold_A_ds, gold_B_ds = dawid_skene.data_generator(num_classes, num_annotators, num_datapoints)

In [None]:
A_ds, B_ds, elbos_ds = dawid_skene.evaluate(data_ds, num_classes, num_annotators, num_iters=100, logspace=False)

In [None]:
elbos_ds

[-1108.4578934522638,
 -911.828611470443,
 -877.4462992400894,
 -846.1091736903059,
 -816.8879354115284,
 -791.2826919900352,
 -770.2274519861986,
 -753.9350606527033,
 -741.9239448238651,
 -733.2778858882115,
 -726.8680467310317,
 -721.6495827103346,
 -717.007613708705,
 -712.6762752246266,
 -708.640559172335,
 -705.0831856268939,
 -702.1740705533558,
 -699.869996067637,
 -697.9974395077272,
 -696.4083371255944,
 -695.0077561833617,
 -693.7327914888843,
 -692.5374577735327,
 -691.4012634474971,
 -690.3462087430759,
 -689.407731691923,
 -688.586628902919,
 -687.855952236947,
 -687.1930329610216,
 -686.5912461647338,
 -686.0538203595497,
 -685.5817233626169,
 -685.1660424906286,
 -684.7914716010682,
 -684.4447880321767,
 -684.1182313740535,
 -683.8062883190421,
 -683.5023498814583,
 -683.1984752977435,
 -682.887356179993,
 -682.5737731379003,
 -682.2773793575128,
 -682.0052616814518,
 -681.7457198564869,
 -681.4859749765548,
 -681.2183580384655,
 -680.9403053086148,
 -680.6549605027764,

In [None]:
100 * np.mean(gold_A_ds.argmax(axis=1) == A_ds.argmax(axis=1))

35.5

In [None]:
A_ds_ls, B_ds_ls, elbos_ds_ls = dawid_skene.evaluate(data_ds, num_classes, num_annotators, 100, logspace=True)

In [None]:
elbos_ds_ls

[-1113.533375926138,
 -912.8543165885959,
 -880.6980401894722,
 -848.3792349903136,
 -815.9122681700005,
 -787.3809551300886,
 -764.9886680087268,
 -748.1629136672728,
 -735.4024135445612,
 -725.4700498821821,
 -717.4428149418102,
 -710.9823382203812,
 -705.9475735526648,
 -702.0721273493593,
 -699.0532598860278,
 -696.6168519706131,
 -694.5092614252417,
 -692.5173013417208,
 -690.674218690657,
 -689.147915497843,
 -687.9203043842347,
 -686.9339168242924,
 -686.14976039259,
 -685.5338304662157,
 -685.0526131432855,
 -684.675256991604,
 -684.3764473905339,
 -684.1371404083596,
 -683.9435477542003,
 -683.7855236502785,
 -683.655221259084,
 -683.5463524697743,
 -683.4538934466909,
 -683.3739335215188,
 -683.3035053337948,
 -683.2403866187135,
 -683.1829120846647,
 -683.1298198044847,
 -683.0801356743176,
 -683.0330894153602,
 -682.9880537382503,
 -682.9444996012033,
 -682.9019623000662,
 -682.8600145876294,
 -682.818244031098,
 -682.7762324710716,
 -682.7335358424507,
 -682.6896628303035,

In [None]:
100 * np.mean(gold_A_ds.argmax(axis=1) == A_ds_ls.argmax(axis=1))


22.5

## MACE (no priors)

In [None]:
data_mace, gold_A_mace, gold_B_mace, gold_T_mace = mace.data_generator(num_classes, num_annotators, num_datapoints)

In [None]:
A_mace, B_mace, T_mace, elbos_mace = mace.evaluate(data_mace, num_classes, num_annotators, 100, logspace=False)

  B /= B.sum(axis=1, keepdims=True)
  T = (eta_1 / eta_2) / (eta_1 / eta_2 + 1 )
  T = (eta_1 / eta_2) / (eta_1 / eta_2 + 1 )


In [None]:
elbos_mace

In [None]:
100 * np.mean(gold_A_mace.argmax(axis=1) == A_mace.argmax(axis=1))

AttributeError: 'NoneType' object has no attribute 'argmax'

In [None]:
A_mace_ls, B_mace_ls, T_mace_ls, elbos_mace_ls = mace.evaluate(data_mace, num_classes, num_annotators, 100, logspace=True)

In [None]:
elbos_mace_ls

[-1068.283181084394,
 -790.6909702799018,
 -706.3305865562194,
 -662.5097433628235,
 -635.9969054747585,
 -618.1964271262252,
 -606.0683388510812,
 -597.7516661983938,
 -591.8991984684662,
 -587.7177916164583,
 -584.706623357015,
 -582.5217266079981,
 -580.9177993405491,
 -579.7191834842941,
 -578.8008027854311,
 -578.0739256979788,
 -577.4755378788323,
 -576.9606922034195,
 -576.4972122710413,
 -576.0621666686058,
 -575.6396321275224,
 -575.2193648681116,
 -574.7960643861437,
 -574.3689361070839,
 -573.9412591820454,
 -573.5196888894383,
 -573.1131243321549,
 -572.7311722234543,
 -572.3824757468532,
 -572.0733203165254,
 -571.8068677918594,
 -571.5831334033263,
 -571.3995553145345,
 -571.2518611773276,
 -571.1349459354813,
 -571.0435793514583,
 -570.9728785099761,
 -570.9185619006248,
 -570.877038989131,
 -570.8453946916442,
 -570.821317705146,
 -570.8030066949021,
 -570.7890748738247,
 -570.7784635852719,
 -570.7703691092315,
 -570.7641832591781,
 -570.759446526575,
 -570.75581183148

In [None]:
100 * np.mean(gold_A_mace.argmax(axis=1) == A_mace_ls.argmax(axis=1))

78.0

## Latent Annotator Clustering

In [284]:
data_lac, gold_A_lac, gold_B_lac, gold_C_lac = latent_annotator_clustering.data_generator(num_classes, num_annotators, num_datapoints, num_clusters)

In [285]:
A_lac, B_lac, C_lac, elbos_lac = latent_annotator_clustering.evaluate(data_lac, num_classes, num_annotators, num_clusters, num_iters = 100, smoothing=False, logspace=True)

In [286]:
elbos_lac

[-6788.520039219978,
 -6611.111661036342,
 -6543.265755403254,
 -6493.176870708811,
 -6445.341456077233,
 -6395.31433267645,
 -6342.598077696534,
 -6288.5866436660735,
 -6235.529538911479,
 -6185.769149398953,
 -6141.140280724735,
 -6102.599921885982,
 -6070.153310174196,
 -6043.0521710949615,
 -6020.140749279466,
 -6000.192071344548,
 -5982.13336913393,
 -5965.144729189151,
 -5948.666809307204,
 -5932.36184181259,
 -5916.060464029353,
 -5899.716400771546,
 -5883.381047173562,
 -5867.194360972636,
 -5851.387042143792,
 -5836.291059455094,
 -5822.311086838352,
 -5809.807816360293,
 -5798.964145557513,
 -5789.7535955821495,
 -5782.012942776898,
 -5775.532203984928,
 -5770.107544423906,
 -5765.559099133238,
 -5761.732715862732,
 -5758.497432716291,
 -5755.7425721471445,
 -5753.375196009404,
 -5751.318082645666,
 -5749.50845606243,
 -5747.897581092132,
 -5746.450684884278,
 -5745.145833511776,
 -5743.970625394286,
 -5742.917348589856,
 -5741.978733211271,
 -5741.145845005,
 -5740.408012574

In [287]:
print("A:")
print(100 * homogeneity_score(gold_A_lac.argmax(axis=1), A_lac.argmax(axis=1)))
print(100 * completeness_score(gold_A_lac.argmax(axis=1), A_lac.argmax(axis=1)))
print(100 * v_measure_score(gold_A_lac.argmax(axis=1), A_lac.argmax(axis=1)))
print("B:")
print(100 * homogeneity_score(gold_B_lac.argmax(axis=1), B_lac.argmax(axis=1)))
print(100 * completeness_score(gold_B_lac.argmax(axis=1), B_lac.argmax(axis=1)))
print(100 * v_measure_score(gold_B_lac.argmax(axis=1), B_lac.argmax(axis=1)))
print("C1:")
print(100 * homogeneity_score(gold_C_lac[0].argmax(axis=1), C_lac[0].argmax(axis=1)))
print(100 * completeness_score(gold_C_lac[0].argmax(axis=1), C_lac[0].argmax(axis=1)))
print(100 * v_measure_score(gold_C_lac[0].argmax(axis=1), C_lac[0].argmax(axis=1)))
print("C2:")
print(100 * homogeneity_score(gold_C_lac[1].argmax(axis=1), C_lac[1].argmax(axis=1)))
print(100 * completeness_score(gold_C_lac[1].argmax(axis=1), C_lac[1].argmax(axis=1)))
print(100 * v_measure_score(gold_C_lac[1].argmax(axis=1), C_lac[1].argmax(axis=1)))

A:
70.18197862065422
71.0246094083182
70.60077987313406
B:
83.6124066920416
83.82174761998222
83.71694628777809
C1:
27.40175421212811
27.40175421212811
27.40175421212811
C2:
100.0
57.93801642856952
73.3680436651211


In [288]:
gold_B_lac.argmax(axis=1), B_lac.argmax(axis=1)


(array([1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1]),
 array([1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1]))

In [289]:
predicted_B = B_lac.argmax(axis=1)

In [279]:
# for i, x in enumerate(predicted_B):
#     if x == 0:
#         predicted_B[i] = 1
#     elif x == 1:
#         predicted_B[i] = 0
    

In [290]:
accuracy = 100 * np.mean(gold_B_lac.argmax(axis=1) == predicted_B)
accuracy

97.05882352941177

In [291]:
gold_A_lac.argmax(axis=1), A_lac.argmax(axis=1)

(array([1, 0, 0, 2, 1, 1, 2, 2, 1, 1, 1, 0, 2, 1, 1, 1, 2, 2, 0, 2, 2, 2,
        1, 1, 2, 2, 1, 2, 1, 0, 0, 0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 2, 2, 2,
        1, 1, 2, 1, 1, 2, 0, 0, 1, 0, 0, 2, 1, 2, 0, 2, 0, 0, 0, 1, 2, 0,
        0, 2, 2, 2, 0, 0, 1, 1, 0, 2, 2, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
        2, 2, 2, 0, 0, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 0, 1, 1, 1, 2, 2, 2,
        0, 1, 0, 2, 1, 2, 0, 1, 1, 0, 2, 1, 1, 2, 0, 2, 2, 0, 0, 1, 1, 2,
        0, 2, 1, 2, 0, 2, 2, 2, 0, 0, 1, 2, 1, 1, 0, 0, 2, 1, 2, 0, 2, 0,
        1, 0, 2, 2, 1, 2, 2, 0, 1, 0, 0, 0, 2, 0, 2, 2, 0, 1, 1, 1, 2, 1,
        0]),
 array([0, 2, 2, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 2, 2, 1, 1,
        0, 0, 1, 1, 0, 1, 0, 2, 1, 2, 0, 1, 0, 1, 2, 0, 0, 2, 2, 1, 1, 1,
        0, 0, 1, 0, 0, 1, 1, 2, 0, 2, 2, 1, 0, 1, 2, 1, 1, 1, 2, 0, 1, 2,
        2, 1, 1, 0, 2, 2, 0, 0, 2, 1, 1, 2, 0, 1, 2, 0, 2, 0, 2, 2, 0, 2,
        1, 1, 1, 1, 2, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 0, 0, 1, 1, 1, 1,
        2, 0, 2, 1, 0, 1,

In [292]:
predicted = A_lac.argmax(axis=1)

In [293]:
for i, x in enumerate(predicted):
    if x == 0:
        predicted[i] = 1
    elif x == 1:
        predicted[i] = 2
    elif x == 2:
        predicted[i] = 0
    

In [294]:
accuracy = 100 * np.mean(gold_A_lac.argmax(axis=1) == predicted)
accuracy

90.3954802259887

In [297]:
gold_C_lac[0].argmax(axis=1), C_lac[0].argmax(axis=1), gold_C_lac[1].argmax(axis=1), C_lac[1].argmax(axis=1)

(array([1, 2, 1]), array([2, 1, 1]), array([2, 2, 0]), array([2, 0, 1]))