## Read data, clean, and merge

In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.metrics import f1_score, confusion_matrix, classification_report

In [2]:
# Evaluation set; after merging it supplies the query/listing columns and the
# Labelbox annotations (labelbox_majority_label, label_annotator_1..3).
df = pd.read_excel("./data/gsl_eval_v0.xlsx")

# BERT predictions come from this notebook
# https://gist.github.com/yqngzh/cc38e60d8fcfa37059bf9271a07cc312
# which was run on a GPU workbench.
bert_preds_df = pd.read_excel("./data/gsl_eval_v0_bert_preds.xlsx")

# One label sheet per internal (Etsy) labeller; each is cleaned in its own cell below.
yuqing_df = pd.read_excel("./data/yuqing_labels.xlsx")
argie_df = pd.read_excel("./data/argie_labels.xlsx")
congzhe_df = pd.read_excel("./data/congzhe_labels.xlsx")
willy_df = pd.read_excel("./data/willy_labels.xlsx")
emy_df = pd.read_excel("./data/emy_labels.xlsx")

In [3]:
# Keep only the join keys plus the BERT predicted label and its three score columns.
bert_pred_columns = [
    "listingId",
    "etsyUUID",
    "v2_bert_pred_labels",
    "v2_bert_score_not_relevant",
    "v2_bert_score_prob_partial",
    "v2_bert_score_relevant",
]
bert_preds_df = bert_preds_df[bert_pred_columns]

In [4]:
# Keep the join keys plus Yuqing's label/notes, renamed so the columns stay
# distinguishable once every labeller's sheet is merged into one frame.
yuqing_df = yuqing_df[["listingId", "etsyUUID", "etsy_label", "notes"]].rename(
    columns={"etsy_label": "yuqing_label", "notes": "yuqing_notes"}
)
yuqing_df["yuqing_label"].value_counts()

partial         288
relevant        267
not_relevant    139
not_sure         26
Name: yuqing_label, dtype: int64

In [5]:
# Keep the join keys plus Argie's label/notes, renamed so the columns stay
# distinguishable after merging. Rows left unlabelled are treated as "not_sure".
# Built as a chain (no inplace edits) so the label column is never assigned
# into a column-sliced frame.
argie_df = (
    argie_df[["listingId", "etsyUUID", "etsy_label", "notes"]]
    .rename(columns={"etsy_label": "argie_label", "notes": "argie_notes"})
    .fillna({"argie_label": "not_sure"})
)

# Lower-case BEFORE substituting: with the opposite order a capitalised
# "Not sure" would skip the replacement and survive as the stray label "not sure".
argie_df["argie_label"] = argie_df["argie_label"].apply(
    lambda raw_label: raw_label.lower().replace("not sure", "not_sure")
)
argie_df.argie_label.value_counts()

relevant        400
partial         173
not_relevant    128
not_sure         19
Name: argie_label, dtype: int64

In [6]:
# Keep the join keys plus Willy's label/notes under labeller-specific names;
# rows left unlabelled are treated as "not_sure".
willy_df = (
    willy_df[["listingId", "etsyUUID", "etsy_label", "notes"]]
    .rename(columns={"etsy_label": "willy_label", "notes": "willy_notes"})
    .fillna({"willy_label": "not_sure"})
)

# Map Willy's free-form wording onto the shared label vocabulary
# (relevant / partial / not_relevant / not_sure). The substitutions run in the order listed.
willy_df["willy_label"] = willy_df["willy_label"].apply(
    lambda raw_label: (
        raw_label.replace("irrelevant", "not_relevant")
        .replace("partially relevant", "partial")
        .replace("not sure", "not_sure")
        .replace("shop query", "not_sure")
        .replace("navigation query", "not_sure")
    )
)
willy_df["willy_label"].value_counts()

relevant        363
not_relevant    171
partial         158
not_sure         28
Name: willy_label, dtype: int64

In [7]:
# Keep the join keys plus Congzhe's label/notes under labeller-specific names,
# then rewrite the space-separated spellings to the underscore form used elsewhere.
congzhe_df = (
    congzhe_df[["listingId", "etsyUUID", "etsy_label", "notes"]]
    .rename(columns={"etsy_label": "congzhe_label", "notes": "congzhe_notes"})
    .assign(
        congzhe_label=lambda frame: frame["congzhe_label"].apply(
            lambda raw_label: raw_label.replace("not relevant", "not_relevant").replace(
                "not sure", "not_sure"
            )
        )
    )
)
congzhe_df["congzhe_label"].value_counts()

relevant        368
not_relevant    182
partial         155
not_sure         15
Name: congzhe_label, dtype: int64

In [8]:
# Keep the join keys plus Emy's label/notes, renamed so the columns stay
# distinguishable once every labeller's sheet is merged into one frame.
emy_df = emy_df[["listingId", "etsyUUID", "etsy_label", "notes"]].rename(
    columns={"etsy_label": "emy_label", "notes": "emy_notes"}
)
emy_df["emy_label"].value_counts()

relevant        266
partial         262
not_relevant    160
not_sure         32
Name: emy_label, dtype: int64

In [9]:
# Left-join the BERT predictions and every labeller's sheet onto the evaluation
# set, keyed on (etsyUUID, listingId). The join order fixes the column order of
# merged_df: BERT columns first, then yuqing, argie, willy, congzhe, emy.
#
# validate="many_to_one" makes pandas raise a MergeError if a right-hand frame
# contains the same key twice; without it such duplicates would silently
# multiply rows of the evaluation set and skew every count computed below.
merged_df = df
for right_df in [bert_preds_df, yuqing_df, argie_df, willy_df, congzhe_df, emy_df]:
    merged_df = pd.merge(
        merged_df,
        right_df,
        on=["etsyUUID", "listingId"],
        how="left",
        validate="many_to_one",
    )

In [10]:
merged_df[["yuqing_label", "argie_label", "willy_label", "congzhe_label", "emy_label"]].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1200 entries, 0 to 1199
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   yuqing_label   720 non-null    object
 1   argie_label    720 non-null    object
 2   willy_label    720 non-null    object
 3   congzhe_label  720 non-null    object
 4   emy_label      720 non-null    object
dtypes: object(5)
memory usage: 56.2+ KB


In [11]:
# Labellers in the fixed order used to fill the label_etsy_* / etsy_person_* slots.
# Each name maps to the "<name>_label" and "<name>_notes" columns of merged_df.
etsy_annotators = ["yuqing", "argie", "willy", "congzhe", "emy"]

label_etsy_1, label_etsy_2, label_etsy_3 = [], [], []
etsy_person_1, etsy_person_2, etsy_person_3 = [], [], []
merged_notes = []

for row_idx, curr_row in merged_df.iterrows():
    curr_persons = []
    curr_labels = []
    curr_notes = []

    # Gather the labels (and optional notes) of whichever labellers rated this row.
    for person in etsy_annotators:
        person_label = curr_row[f"{person}_label"]
        if pd.isnull(person_label):
            continue  # this labeller did not rate the row
        curr_persons.append(person)
        curr_labels.append(person_label)

        person_note = curr_row[f"{person}_notes"]
        if not pd.isnull(person_note):
            curr_notes.append(f"consideration for <{person_label}> label: {person_note}")

    # The 3-slot layout below requires exactly three internal labels per row.
    # curr_persons and curr_labels grow together, so one check covers both.
    assert len(curr_persons) == 3, (
        f"row {row_idx}: expected 3 Etsy labels, found {len(curr_persons)} ({curr_persons})"
    )

    etsy_person_1.append(curr_persons[0])
    etsy_person_2.append(curr_persons[1])
    etsy_person_3.append(curr_persons[2])

    label_etsy_1.append(curr_labels[0])
    label_etsy_2.append(curr_labels[1])
    label_etsy_3.append(curr_labels[2])

    # All notes for the row are joined into one "|"-separated string ("" when there are none).
    merged_notes.append("|".join(curr_notes))

In [12]:
# Attach the per-row Etsy labels, the labellers who gave them, and the combined notes.
# Keyword order determines the order in which the new columns are appended.
merged_df = merged_df.assign(
    label_etsy_1=label_etsy_1,
    label_etsy_2=label_etsy_2,
    label_etsy_3=label_etsy_3,
    etsy_person_1=etsy_person_1,
    etsy_person_2=etsy_person_2,
    etsy_person_3=etsy_person_3,
    etsy_notes=merged_notes,
)

In [13]:
merged_df.columns

Index(['query', 'queryEn', 'listingId', 'etsy_url', 'atlas_url',
       'titleEn_vertica', 'etsyUUID', 'platform', 'userLanguage',
       'anno_data_source', 'is_test', 'labelbox_majority_label',
       'label_annotator_1', 'label_annotator_2', 'label_annotator_3',
       'is_gsl_v0_eval', 'v2_bert_pred_labels', 'v2_bert_score_not_relevant',
       'v2_bert_score_prob_partial', 'v2_bert_score_relevant', 'yuqing_label',
       'yuqing_notes', 'argie_label', 'argie_notes', 'willy_label',
       'willy_notes', 'congzhe_label', 'congzhe_notes', 'emy_label',
       'emy_notes', 'label_etsy_1', 'label_etsy_2', 'label_etsy_3',
       'etsy_person_1', 'etsy_person_2', 'etsy_person_3', 'etsy_notes'],
      dtype='object')

In [14]:
merged_df.label_annotator_3.value_counts()

relevant        444
partial         389
not_relevant    364
not_sure          3
Name: label_annotator_3, dtype: int64

## Aggregate labels

In [15]:
def checkIsGiftQuery(query_str):
    """Return True when the query text looks like a gift search.

    A query counts as a gift query when, ignoring case, it contains a word
    starting with "gift", or the word "for " followed by a recipient term
    (him, her, mom, dad, ..., friend, best friend).

    Args:
        query_str: the query text. Anything that is not a ``str`` (for example
            ``None`` or the NaN pandas uses for an empty cell) is treated as
            "not a gift query" instead of raising ``TypeError`` in ``re.search``.

    Returns:
        bool: True if the pattern matches anywhere in ``query_str``.
    """
    if not isinstance(query_str, str):
        return False

    # NOTE(review): the recipient alternatives are anchored only on the left, so
    # they also match longer words ("for sony", "for herbs"). That looks intended
    # for "grand" (grandma, grandpa) but may over-match for the short terms -- confirm.
    pattern = r"(?i)\bgift|\bfor (\bhim|\bher|\bmom|\bdad|\bmother|\bfather|\bdaughter|\bson|\bwife|\bhusband|\bpartner|\baunt|\buncle|\bniece|\bnephew|\bfiance|\bcousin|\bin law|\bboyfriend|\bgirlfriend|\bgrand|\bfriend|\bbest friend)"
    return re.search(pattern, query_str) is not None

def checkIsGiftInDF(row):
    """Row-wise wrapper around checkIsGiftQuery for use with ``DataFrame.apply(axis=1)``.

    Rows whose ``anno_data_source`` starts with "us" are tested on their
    ``query`` field; every other row is tested on ``queryEn``.
    """
    query_field = "query" if row["anno_data_source"].startswith("us") else "queryEn"
    return checkIsGiftQuery(row[query_field])

In [16]:
merged_df["queryIsGift"] = merged_df.apply(checkIsGiftInDF, axis=1)

In [17]:
merged_df.labelbox_majority_label.value_counts()

partial         400
not_relevant    400
relevant        400
Name: labelbox_majority_label, dtype: int64

In [18]:
def round_to_partial(label_vec):
    """Collapse a vector of labels (normally three) into one "round to partial" label.

    Rules, applied in order:
      1. any "not_sure" vote      -> "not_sure"
      2. every vote "not_relevant" -> "not_relevant"
      3. every vote "relevant"     -> "relevant"
      4. anything else (a disagreement, or all "partial") -> "partial"
    """
    if any(vote == "not_sure" for vote in label_vec):
        return "not_sure"
    for unanimous_label in ("not_relevant", "relevant"):
        if all(vote == unanimous_label for vote in label_vec):
            return unanimous_label
    return "partial"

In [19]:
etsy_majority, etsy_round, labelbox_round = [], [], []
etsy_unanimous, labelbox_unanimous = [], []

etsy_label_cols = ["label_etsy_1", "label_etsy_2", "label_etsy_3"]
labelbox_label_cols = ["label_annotator_1", "label_annotator_2", "label_annotator_3"]

for _, curr_row in merged_df.iterrows():
    etsy_labels = [curr_row[col] for col in etsy_label_cols]
    labelbox_labels = [curr_row[col] for col in labelbox_label_cols]

    # Most frequent label and its vote count, per labelling source.
    etsy_counter = Counter(etsy_labels)
    etsy_top_label, etsy_top_votes = etsy_counter.most_common()[0]

    labelbox_counter = Counter(labelbox_labels)
    labelbox_top_label, labelbox_top_votes = labelbox_counter.most_common()[0]
    # Sanity check: the stored Labelbox majority must be reproducible from the three votes.
    assert labelbox_top_votes >= 2
    assert labelbox_top_label == curr_row["labelbox_majority_label"]

    # Unanimous = all three votes identical and that vote is a real label, not "not_sure".
    etsy_unanimous.append(bool(len(etsy_counter) == 1 and etsy_top_label != "not_sure"))
    labelbox_unanimous.append(
        bool(len(labelbox_counter) == 1 and labelbox_top_label != "not_sure")
    )

    # Etsy majority label: needs at least two matching votes that are not "not_sure";
    # a three-way split or a "not_sure" majority falls back to "not_sure".
    etsy_has_majority = etsy_top_votes >= 2 and etsy_top_label != "not_sure"
    etsy_majority.append(etsy_top_label if etsy_has_majority else "not_sure")

    # Etsy round-to-partial label
    etsy_round.append(round_to_partial(etsy_labels))

    # Labelbox round-to-partial label
    labelbox_round.append(round_to_partial(labelbox_labels))

In [20]:
# Attach the aggregated labels and unanimity flags computed in the loop above.
# Keyword order determines the order in which the new columns are appended.
merged_df = merged_df.assign(
    labelbox_round_label=labelbox_round,
    etsy_majority_label=etsy_majority,
    etsy_round_label=etsy_round,
    etsy_unanimous=etsy_unanimous,
    labelbox_unanimous=labelbox_unanimous,
)

In [21]:
merged_df.etsy_unanimous.value_counts()

False    606
True     594
Name: etsy_unanimous, dtype: int64

In [22]:
merged_df.labelbox_unanimous.value_counts()

True     696
False    504
Name: labelbox_unanimous, dtype: int64

In [23]:
merged_df[merged_df.queryIsGift].etsy_unanimous.value_counts()

True     68
False    39
Name: etsy_unanimous, dtype: int64

In [24]:
merged_df[merged_df.queryIsGift].labelbox_unanimous.value_counts()

True     64
False    43
Name: labelbox_unanimous, dtype: int64

In [25]:
merged_df.labelbox_round_label.value_counts()

partial         672
relevant        301
not_relevant    217
not_sure         10
Name: labelbox_round_label, dtype: int64

In [26]:
merged_df.etsy_majority_label.value_counts()

relevant        539
partial         312
not_relevant    247
not_sure        102
Name: etsy_majority_label, dtype: int64

In [27]:
merged_df.etsy_round_label.value_counts()

partial         620
relevant        361
not_relevant    114
not_sure        105
Name: etsy_round_label, dtype: int64

## Confusion matrices

In [28]:
# Rows = Labelbox majority label, columns = Etsy majority label, both ordered as in `labels`.
# The Labelbox majority is never "not_sure", hence the all-zero last row.
confusion_matrix(merged_df.labelbox_majority_label, merged_df.etsy_majority_label, labels=["relevant", "partial", "not_relevant", "not_sure"])

array([[358,  25,   6,  11],
       [128, 199,  36,  37],
       [ 53,  88, 205,  54],
       [  0,   0,   0,   0]])

In [29]:
# Rows = Labelbox round-to-partial label, columns = Etsy round-to-partial label,
# both ordered as in `labels`.
confusion_matrix(merged_df.labelbox_round_label, merged_df.etsy_round_label, labels=["relevant", "partial", "not_relevant", "not_sure"])

array([[240,  47,   1,  13],
       [112, 468,  37,  55],
       [  9, 102,  74,  32],
       [  0,   3,   2,   5]])

## Individual VS etsy majority

In [30]:
# One subset per internal labeller: only the rows that labeller actually rated.
yuqing_sub_df = merged_df[merged_df["yuqing_label"].notna()]
argie_sub_df = merged_df[merged_df["argie_label"].notna()]
willy_sub_df = merged_df[merged_df["willy_label"].notna()]
congzhe_sub_df = merged_df[merged_df["congzhe_label"].notna()]
emy_sub_df = merged_df[merged_df["emy_label"].notna()]

In [31]:
def compute_fpr(y_true, y_pred):
    """Compute the false-positive rate FP / (FP + TN) for binary 0/1 labels.

    Args:
        y_true: reference labels, compared element-wise against 0.
        y_pred: predicted labels, compared element-wise against 0 and 1.

    Returns:
        The share of reference negatives (``y_true == 0``) that were predicted
        as 1. NaN when there are no reference negatives, because the rate is
        undefined; it is returned explicitly instead of dividing 0 by 0, which
        would emit a numpy RuntimeWarning.
    """
    is_negative = y_true == 0
    tn = np.sum(is_negative & (y_pred == 0))
    fp = np.sum(is_negative & (y_pred == 1))

    n_negative = fp + tn
    if n_negative == 0:
        return float("nan")
    return fp / n_negative


def compare_individual_to_majority(df, label_field, majority_field="etsy_majority_label"):
    """Print agreement metrics between one label column and a reference label column.

    Rows where either column equals "not_sure" are excluded. The reference
    column (``majority_field``) is passed to the metrics as the ground truth
    and ``label_field`` as the prediction.

    Prints, in order: macro F1, the false-positive rate of the binarised task
    "not_relevant = 0, everything else = 1", the false-positive rate of the
    binarised task "relevant = 1, everything else = 0", the classification
    report and the confusion matrix. Returns None.
    """
    keep_reference = df[majority_field] != "not_sure"
    keep_label = df[label_field] != "not_sure"
    df_sub = df[keep_reference & keep_label]

    reference = df_sub[majority_field]
    candidate = df_sub[label_field]

    f1 = f1_score(reference, candidate, average="macro")
    print(f"{f1=}")

    # Binary view 1: "not_relevant" is the negative class (0), anything else is positive (1).
    irrel_fpr = compute_fpr(
        (reference != "not_relevant").astype(int),
        (candidate != "not_relevant").astype(int),
    )
    print(f"{irrel_fpr=}")

    # Binary view 2: "relevant" is the positive class (1), anything else is negative (0).
    rel_fpr = compute_fpr(
        (reference == "relevant").astype(int),
        (candidate == "relevant").astype(int),
    )
    print(f"{rel_fpr=}")

    cls_report = classification_report(reference, candidate, digits=4)
    print(cls_report)

    conf_mat = confusion_matrix(reference, candidate)
    print(conf_mat)

In [32]:
compare_individual_to_majority(yuqing_sub_df, "yuqing_label")

f1=0.8817506522602002
irrel_fpr=0.14893617021276595
rel_fpr=0.020527859237536656
              precision    recall  f1-score   support

not_relevant     0.9449    0.8511    0.8955       141
     partial     0.7559    0.9600    0.8458       200
    relevant     0.9732    0.8439    0.9039       301

    accuracy                         0.8816       642
   macro avg     0.8913    0.8850    0.8818       642
weighted avg     0.8993    0.8816    0.8840       642

[[120  20   1]
 [  2 192   6]
 [  5  42 254]]


In [33]:
compare_individual_to_majority(argie_sub_df, "argie_label")

f1=0.8271816103513158
irrel_fpr=0.23404255319148937
rel_fpr=0.15705128205128205
              precision    recall  f1-score   support

not_relevant     0.9076    0.7660    0.8308       141
     partial     0.7654    0.7251    0.7447       171
    relevant     0.8683    0.9472    0.9060       341

    accuracy                         0.8499       653
   macro avg     0.8471    0.8128    0.8272       653
weighted avg     0.8498    0.8499    0.8475       653

[[108  25   8]
 [  6 124  41]
 [  5  13 323]]


In [34]:
compare_individual_to_majority(willy_sub_df, "willy_label")

f1=0.8294841495776439
irrel_fpr=0.11643835616438356
rel_fpr=0.11838006230529595
              precision    recall  f1-score   support

not_relevant     0.8269    0.8836    0.8543       146
     partial     0.7867    0.6743    0.7262       175
    relevant     0.8879    0.9290    0.9080       324

    accuracy                         0.8496       645
   macro avg     0.8338    0.8290    0.8295       645
weighted avg     0.8466    0.8496    0.8465       645

[[129  11   6]
 [ 25 118  32]
 [  2  21 301]]


In [35]:
compare_individual_to_majority(congzhe_sub_df, "congzhe_label")

f1=0.8261547266403779
irrel_fpr=0.1342281879194631
rel_fpr=0.1476923076923077
              precision    recall  f1-score   support

not_relevant     0.8431    0.8658    0.8543       149
     partial     0.7987    0.6761    0.7323       176
    relevant     0.8625    0.9233    0.8919       326

    accuracy                         0.8433       651
   macro avg     0.8348    0.8217    0.8262       651
weighted avg     0.8408    0.8433    0.8401       651

[[129  12   8]
 [ 17 119  40]
 [  7  18 301]]


In [36]:
compare_individual_to_majority(emy_sub_df, "emy_label")

f1=0.8762269925874889
irrel_fpr=0.11594202898550725
rel_fpr=0.011560693641618497
              precision    recall  f1-score   support

not_relevant     0.8472    0.8841    0.8652       138
     partial     0.7884    0.9135    0.8463       208
    relevant     0.9848    0.8581    0.9171       303

    accuracy                         0.8814       649
   macro avg     0.8735    0.8852    0.8762       649
weighted avg     0.8926    0.8814    0.8834       649

[[122  14   2]
 [ 16 190   2]
 [  6  37 260]]


In [37]:
compare_individual_to_majority(merged_df, "label_annotator_1")

f1=0.6639790791594181
irrel_fpr=0.25102880658436216
rel_fpr=0.11711711711711711
              precision    recall  f1-score   support

not_relevant     0.5796    0.7490    0.6535       243
     partial     0.5552    0.6282    0.5895       312
    relevant     0.8474    0.6710    0.7490       538

    accuracy                         0.6761      1093
   macro avg     0.6608    0.6827    0.6640      1093
weighted avg     0.7045    0.6761    0.6822      1093

[[182  38  23]
 [ 74 196  42]
 [ 58 119 361]]


In [38]:
compare_individual_to_majority(merged_df, "label_annotator_2")

f1=0.664723692176609
irrel_fpr=0.2145748987854251
rel_fpr=0.0913978494623656
              precision    recall  f1-score   support

not_relevant     0.5861    0.7854    0.6713       247
     partial     0.5284    0.5981    0.5611       311
    relevant     0.8768    0.6735    0.7618       539

    accuracy                         0.6773      1097
   macro avg     0.6638    0.6857    0.6647      1097
weighted avg     0.7126    0.6773    0.6845      1097

[[194  42  11]
 [ 85 186  40]
 [ 52 124 363]]


In [39]:
compare_individual_to_majority(merged_df, "label_annotator_3")

f1=0.669414095304614
irrel_fpr=0.21721311475409835
rel_fpr=0.1079136690647482
              precision    recall  f1-score   support

not_relevant     0.6044    0.7828    0.6821       244
     partial     0.5341    0.6026    0.5663       312
    relevant     0.8595    0.6809    0.7598       539

    accuracy                         0.6813      1095
   macro avg     0.6660    0.6887    0.6694      1095
weighted avg     0.7099    0.6813    0.6874      1095

[[191  40  13]
 [ 77 188  47]
 [ 48 124 367]]


## Individual VS etsy RTP (round-to-partial label)

In [32]:
compare_individual_to_majority(yuqing_sub_df, "yuqing_label", majority_field="etsy_round_label")

f1=0.8310787449180985
irrel_fpr=0.0
rel_fpr=0.10738255033557047
              precision    recall  f1-score   support

not_relevant     0.5917    1.0000    0.7435        71
     partial     1.0000    0.7420    0.8519       376
    relevant     0.8147    1.0000    0.8979       211

    accuracy                         0.8526       658
   macro avg     0.8021    0.9140    0.8311       658
weighted avg     0.8965    0.8526    0.8549       658

[[ 71   0   0]
 [ 49 279  48]
 [  0   0 211]]


In [34]:
compare_individual_to_majority(argie_sub_df, "argie_label", majority_field="etsy_round_label")

f1=0.6987451946732622
irrel_fpr=0.0
rel_fpr=0.36425339366515835
              precision    recall  f1-score   support

not_relevant     0.5983    1.0000    0.7487        70
     partial     1.0000    0.4409    0.6119       372
    relevant     0.5818    1.0000    0.7356       224

    accuracy                         0.6877       666
   macro avg     0.7267    0.8136    0.6987       666
weighted avg     0.8171    0.6877    0.6679       666

[[ 70   0   0]
 [ 47 164 161]
 [  0   0 224]]


In [35]:
compare_individual_to_majority(willy_sub_df, "willy_label", majority_field="etsy_round_label")

f1=0.6534880069366434
irrel_fpr=0.0
rel_fpr=0.28703703703703703
              precision    recall  f1-score   support

not_relevant     0.4268    1.0000    0.5982        67
     partial     1.0000    0.4137    0.5853       365
    relevant     0.6353    1.0000    0.7770       216

    accuracy                         0.6698       648
   macro avg     0.6873    0.8046    0.6535       648
weighted avg     0.8192    0.6698    0.6505       648

[[ 67   0   0]
 [ 90 151 124]
 [  0   0 216]]


In [36]:
compare_individual_to_majority(congzhe_sub_df, "congzhe_label", majority_field="etsy_round_label")

f1=0.6521984539778134
irrel_fpr=0.0
rel_fpr=0.2974828375286041
              precision    recall  f1-score   support

not_relevant     0.4359    1.0000    0.6071        68
     partial     1.0000    0.4092    0.5808       369
    relevant     0.6243    1.0000    0.7687       216

    accuracy                         0.6662       653
   macro avg     0.6867    0.8031    0.6522       653
weighted avg     0.8170    0.6662    0.6457       653

[[ 68   0   0]
 [ 88 151 130]
 [  0   0 216]]


In [38]:
compare_individual_to_majority(emy_sub_df, "emy_label", majority_field="etsy_round_label")

f1=0.7722554880773852
irrel_fpr=0.0
rel_fpr=0.1036036036036036
              precision    recall  f1-score   support

not_relevant     0.4459    1.0000    0.6168        66
     partial     1.0000    0.6614    0.7962       378
    relevant     0.8244    1.0000    0.9038       216

    accuracy                         0.8061       660
   macro avg     0.7568    0.8871    0.7723       660
weighted avg     0.8871    0.8061    0.8135       660

[[ 66   0   0]
 [ 82 250  46]
 [  0   0 216]]


In [39]:
compare_individual_to_majority(merged_df, "label_annotator_1", majority_field="etsy_round_label")

f1=0.5922826778810552
irrel_fpr=0.18584070796460178
rel_fpr=0.18442622950819673
              precision    recall  f1-score   support

not_relevant     0.2997    0.8142    0.4381       113
     partial     0.8202    0.4863    0.6105       619
    relevant     0.6778    0.7867    0.7282       361

    accuracy                         0.6194      1093
   macro avg     0.5992    0.6957    0.5923      1093
weighted avg     0.7193    0.6194    0.6316      1093

[[ 92  12   9]
 [192 301 126]
 [ 23  54 284]]


In [40]:
compare_individual_to_majority(merged_df, "label_annotator_2", majority_field="etsy_round_label")

f1=0.6044612663301915
irrel_fpr=0.11403508771929824
rel_fpr=0.16371077762619374
              precision    recall  f1-score   support

not_relevant     0.3079    0.8860    0.4570       114
     partial     0.8310    0.4847    0.6122       619
    relevant     0.7037    0.7895    0.7441       361

    accuracy                         0.6271      1094
   macro avg     0.6142    0.7200    0.6045      1094
weighted avg     0.7345    0.6271    0.6396      1094

[[101   9   4]
 [203 300 116]
 [ 24  52 285]]


In [41]:
compare_individual_to_majority(merged_df, "label_annotator_3", majority_field="etsy_round_label")

f1=0.5943064564327023
irrel_fpr=0.1504424778761062
rel_fpr=0.17486338797814208
              precision    recall  f1-score   support

not_relevant     0.3009    0.8496    0.4444       113
     partial     0.8172    0.4766    0.6020       619
    relevant     0.6901    0.7895    0.7364       361

    accuracy                         0.6185      1093
   macro avg     0.6027    0.7052    0.5943      1093
weighted avg     0.7218    0.6185    0.6301      1093

[[ 96  13   4]
 [200 295 124]
 [ 23  53 285]]


### Write out data with aggregated labels

In [40]:
# Drop the URL columns, is_test and the per-labeller label/notes columns; the
# labeller columns were already folded into label_etsy_*, etsy_person_* and etsy_notes.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell re-runnable: a second execution is a no-op rather than a KeyError.
merged_df = merged_df.drop(
    columns=[
        "etsy_url", "atlas_url", "is_test", "yuqing_label", "yuqing_notes",
        "argie_label", "argie_notes", "willy_label", "willy_notes",
        "congzhe_label", "congzhe_notes", "emy_label", "emy_notes",
    ],
    errors="ignore",
)

In [41]:
# Persist the merged frame (aggregated labels + BERT predictions) without the row index.
merged_df.to_excel("./data/gsl_eval_v0_internal_label_bert_preds.xlsx", index=False)