In [1]:
import pandas as pd
import numpy as np
import json
from collections import Counter

from google.cloud import bigquery

In [2]:
bq_client = bigquery.Client(project="etsy-search-ml-dev")



## US Annotation V2

In [3]:
us_v2_query = """SELECT 
    query,
    '' as queryEn,
    listingId,
    etsyUUID,
    requestSource as platform,
    userLanguage,
    listingTitle as titleEn_vertica,
    source as anno_data_source,
    gold_label,
    label_annotator_1,
    label_annotator_2,
    label_annotator_3,
    is_test
FROM `etsy-sr-etl-prod.yzhang.sem_rel_human_annotation_v2`"""

query_job = bq_client.query(us_v2_query)
rows = query_job.result()
us_v2 = rows.to_dataframe()

In [4]:
us_v2["anno_data_source"] = us_v2["anno_data_source"].apply(lambda x: f"us_v2-{x}")
us_v2["anno_data_source"].value_counts()

us_v2-direct_specified      10500
us_v2-direct_unspecified    10490
us_v2-broad                  6000
us_v2-gift                   3000
Name: anno_data_source, dtype: int64

In [5]:
us_v2.label_annotator_3.value_counts()

relevant        16285
partial          8889
not_relevant     4663
not_sure          153
Name: label_annotator_3, dtype: int64

In [6]:
us_v2.columns

Index(['query', 'queryEn', 'listingId', 'etsyUUID', 'platform', 'userLanguage',
       'titleEn_vertica', 'anno_data_source', 'gold_label',
       'label_annotator_1', 'label_annotator_2', 'label_annotator_3',
       'is_test'],
      dtype='object')

In [7]:
test_queries = set(us_v2[us_v2.is_test]["query"].unique())
train_queries = set(us_v2[~us_v2.is_test]["query"].unique())
len(test_queries)#.intersection(train_queries))
# test split 2316 queries, overlap with train 289

2316

## US Annotation V1

4-point scale, downloaded the excel file from repo: https://github.com/etsy/semantic-relevance/blob/main/notebooks/annotation_dataset_v1.xlsx

In [8]:
us_v1 = pd.read_excel("./annotation_dataset_v1.xlsx")
us_v1.columns

# us_v1[np.logical_and(us_v1.label != us_v1.majorityLabel, ~us_v1.majorityLabel.isna())][["label", "labels", "majorityLabel"]].head()
# label - first annotator label

Index(['Unnamed: 0', 'etsy_uuid', 'listingId', 'query', 'platform', 'bin',
       'taxonomy', 'title', 'bordaRank', 'finalRank', 'description', 'label',
       'taxonomyPath', 'tags', 'title_1',
       'verticaListingTranslations_primaryLanguageTitle', 'labels',
       'agreedLabel', 'majorityLabel'],
      dtype='object')

In [9]:
us_v1 = us_v1[["query", "listingId", "etsy_uuid", "platform", "title", "majorityLabel", "labels"]]

In [10]:
# format individual labeler annotations
us_v1["label_annotator_1"] = us_v1.labels.apply(lambda labels_str: json.loads(labels_str)[0])
us_v1["label_annotator_2"] = us_v1.labels.apply(lambda labels_str: json.loads(labels_str)[1])
us_v1["label_annotator_3"] = us_v1.labels.apply(lambda labels_str: json.loads(labels_str)[2])

In [11]:
us_v1.iloc[44:47,:]

Unnamed: 0,query,listingId,etsy_uuid,platform,title,majorityLabel,labels,label_annotator_1,label_annotator_2,label_annotator_3
44,inkpotuk,927788528,Eu-6dheDKNLuUWTysd-WIktZMgfe,web,A5 (ish) Hardback Notebook Lined or Dotted Pap...,1.0,"[0, 1, 1]",0,1,1
45,inkpotuk,938988086,Eu-6dheDKNLuUWTysd-WIktZMgfe,web,A6 Notebook Made of PU Leather Hardback Notebo...,1.0,"[1, 1, 4]",1,1,4
46,inkpotuk,967367257,Eu-6dheDKNLuUWTysd-WIktZMgfe,web,"Personalised notebook A6, personalised notepad...",1.0,"[1, 1, 0]",1,1,0


In [12]:
# map 4-point scale to 3-point scale
map_dict = {0: "not_sure", 1: "not_relevant", 2: "partial", 3: "partial", 4: "relevant"}

us_v1["majorityLabel"] = us_v1["majorityLabel"].map(map_dict)

In [13]:
us_v1.iloc[44:47,:]

Unnamed: 0,query,listingId,etsy_uuid,platform,title,majorityLabel,labels,label_annotator_1,label_annotator_2,label_annotator_3
44,inkpotuk,927788528,Eu-6dheDKNLuUWTysd-WIktZMgfe,web,A5 (ish) Hardback Notebook Lined or Dotted Pap...,not_relevant,"[0, 1, 1]",0,1,1
45,inkpotuk,938988086,Eu-6dheDKNLuUWTysd-WIktZMgfe,web,A6 Notebook Made of PU Leather Hardback Notebo...,not_relevant,"[1, 1, 4]",1,1,4
46,inkpotuk,967367257,Eu-6dheDKNLuUWTysd-WIktZMgfe,web,"Personalised notebook A6, personalised notepad...",not_relevant,"[1, 1, 0]",1,1,0


In [14]:
us_v1["label_annotator_1"] = us_v1["label_annotator_1"].map(map_dict)
us_v1["label_annotator_2"] = us_v1["label_annotator_2"].map(map_dict)
us_v1["label_annotator_3"] = us_v1["label_annotator_3"].map(map_dict)

In [15]:
us_v1.label_annotator_3.value_counts()

relevant        19305
partial          4502
not_relevant     3981
not_sure          237
Name: label_annotator_3, dtype: int64

In [16]:
# format columns to match with V2
us_v1.drop('labels', axis=1, inplace=True) 
us_v1.rename(columns={"majorityLabel": "gold_label", "etsy_uuid": "etsyUUID", "title": "titleEn_vertica"}, inplace=True)
us_v1["userLanguage"] = ""
us_v1["anno_data_source"] = "us_v1"
us_v1["queryEn"] = ""
us_v1["is_test"] = False

In [17]:
us_v1.iloc[10:15,:]

Unnamed: 0,query,listingId,etsyUUID,platform,titleEn_vertica,gold_label,label_annotator_1,label_annotator_2,label_annotator_3,userLanguage,anno_data_source,queryEn,is_test
10,seedling cups,1117447411,Eu-2lSwjH_T8GmLOol51dyXuWk87,web,Southern Red Credar Seedling,not_relevant,not_relevant,not_relevant,not_relevant,,us_v1,,False
11,seedling cups,1146306150,Eu-2lSwjH_T8GmLOol51dyXuWk87,web,Nursery Flower Pots 3 Inch Square Heavy Duty 1...,relevant,relevant,relevant,partial,,us_v1,,False
12,seedling cups,1221549646,Eu-2lSwjH_T8GmLOol51dyXuWk87,web,Seed Starter | Seedling Tray | Genius Plant Hack,relevant,not_relevant,relevant,relevant,,us_v1,,False
13,seedling cups,1258868503,Eu-2lSwjH_T8GmLOol51dyXuWk87,web,Interior Live Oak | Medium Tree Seedling | The...,,not_relevant,relevant,partial,,us_v1,,False
14,seedling cups,1405409965,Eu-2lSwjH_T8GmLOol51dyXuWk87,web,Coast Redwood tree seedling - Sequoia sempervi...,partial,partial,not_relevant,partial,,us_v1,,False


In [18]:
us_v1.columns

Index(['query', 'listingId', 'etsyUUID', 'platform', 'titleEn_vertica',
       'gold_label', 'label_annotator_1', 'label_annotator_2',
       'label_annotator_3', 'userLanguage', 'anno_data_source', 'queryEn',
       'is_test'],
      dtype='object')

## International

In [19]:
intl_query = """SELECT 
    query, 
    queryEn,
    listingId, 
    etsyUUID, 
    requestSource as platform,
    userLanguage,
    listingTitleEn as titleEn_vertica,
    source as anno_data_source,
    gold_label,
    label_annotator_1,
    label_annotator_2,
    label_annotator_3
FROM `etsy-search-ml-dev.aclapp.semrel_human_annotation_v1`"""

query_job = bq_client.query(intl_query)
rows = query_job.result()
intl_df = rows.to_dataframe()

In [20]:
intl_df.shape

(26803, 12)

In [21]:
intl_df.label_annotator_2.value_counts()

relevant        16367
partial          5353
not_relevant     5019
not_sure           64
Name: label_annotator_2, dtype: int64

In [22]:
intl_df["anno_data_source"] = intl_df["anno_data_source"].apply(lambda x: f"intl-{x}")
intl_df.anno_data_source.value_counts()

intl-de    5698
intl-fr    5687
intl-es    5097
intl-it    3944
intl-nl    3375
intl-pt     836
intl-pl     808
intl-ru     681
intl-ja     677
Name: anno_data_source, dtype: int64

Create a test split in INTL dataset

In [23]:
import binascii
import hashlib

In [24]:
def uuid_is_test(uuid: str):
    uuid_bytes = uuid.encode('utf-8')
    uuid_hash = binascii.crc32(uuid_bytes)
    # Sample 10%
    return uuid_hash % 10 == 0

In [25]:
intl_df['is_test'] = (intl_df.etsyUUID.apply(uuid_is_test)) & (intl_df.gold_label.notnull())
intl_df.is_test.value_counts()

False    24159
True      2644
Name: is_test, dtype: int64

## Combine datasets

In [26]:
joint_columns = [
    'query', 
    'queryEn', 
    'listingId',
    'etsyUUID', 
    'platform', 
    'userLanguage',
    'titleEn_vertica', 
    'anno_data_source', 
    'gold_label',
    'label_annotator_1', 
    'label_annotator_2', 
    'label_annotator_3',
    "is_test"
]

In [27]:
us_v1 = us_v1[joint_columns]
us_v2 = us_v2[joint_columns]
intl_df = intl_df[joint_columns]

In [28]:
combined_df = pd.concat([us_v1, us_v2, intl_df])
combined_df = combined_df.reset_index(drop=True)

In [29]:
combined_df.shape

(84818, 13)

In [30]:
len(combined_df["query"].unique())

25550

In [31]:
# compute gold label again because we're merging 2 and 3 in US V1 data, which may lead to majority agreeing on partial
# if no consensus, use no_consensus

updated_gold_label = []

for i in range(combined_df.shape[0]):
    anno1 = combined_df["label_annotator_1"][i]
    anno2 = combined_df["label_annotator_2"][i]
    anno3 = combined_df["label_annotator_3"][i]
    ori_gold_label = combined_df["gold_label"][i]
    ds_source = combined_df["anno_data_source"][i]

    individual_annos = [anno1, anno2, anno3]
    individual_counter = Counter(individual_annos)
    most_common_element = individual_counter.most_common()[0]
    
    if most_common_element[1] == 1:
        # no majority
        if not pd.isnull(ori_gold_label):
            assert ori_gold_label == "no_consensus"
        updated_gold_label.append("no_consensus")
    else:
        # there is a majority
        majority_vote = most_common_element[0]
        if ori_gold_label != majority_vote:
            assert ds_source == "us_v1"
        updated_gold_label.append(majority_vote)

In [32]:
test_df = pd.DataFrame({"old": combined_df.gold_label, "new": updated_gold_label})
print(sum(test_df.old != test_df.new))
print(sum(test_df.old.isna()))

1862
1862


In [33]:
test_df[test_df.old != test_df.new]["new"].value_counts()

no_consensus    1737
partial          124
not_sure           1
Name: new, dtype: int64

In [34]:
combined_df["gold_label"] = updated_gold_label

## Deal with duplicated (query, listingId) pairs

In [35]:
# keep distinct occurrences of query, listingId, gold_label
combined_df = combined_df.drop_duplicates(subset=['query', 'listingId', 'gold_label'])

In [36]:
# among them, if there is duplicated query, listingId, remove these pairs completely
ql_pairs = combined_df.apply(lambda row: (row["query"], row["listingId"]), axis=1).values
ql_counter = Counter(ql_pairs)
{k: v for k,v in ql_counter.items() if v > 1}

{('unique gifts for her', 1275026920): 2,
 ('gifts for women', 1215794797): 2,
 ('gifts for women', 1017317699): 2}

In [37]:
case1_inds = combined_df[np.logical_and(combined_df["query"]=="unique gifts for her", combined_df.listingId==1275026920)].index
case2_inds = combined_df[np.logical_and(combined_df["query"]=="gifts for women", combined_df.listingId==1215794797)].index
case3_inds = combined_df[np.logical_and(combined_df["query"]=="gifts for women", combined_df.listingId==1017317699)].index

In [38]:
combined_df.loc[case1_inds, :]

Unnamed: 0,query,queryEn,listingId,etsyUUID,platform,userLanguage,titleEn_vertica,anno_data_source,gold_label,label_annotator_1,label_annotator_2,label_annotator_3,is_test
28182,unique gifts for her,,1275026920,EuNf_gJvbdc7yhiYFWqg7O9roi4d,boe,en-GB,"Personalized Wooden Guitar Picks with Case, Cu...",us_v2-broad,no_consensus,partial,not_relevant,relevant,False
78862,unique gifts for her,unique gifts for her,1275026920,Eunac1kaM7zXdS8O4stHQ33JDYe8,boe,fr,"Personalized Wooden Guitar Picks with Case, Cu...",intl-fr,relevant,relevant,partial,relevant,False


In [39]:
combined_df.drop(list(case1_inds) + list(case2_inds) + list(case3_inds), inplace=True)

In [40]:
combined_df.shape

(84453, 13)

In [41]:
combined_df[["query", "listingId"]].drop_duplicates().shape

(84453, 2)

In [42]:
combined_df.gold_label.value_counts()

relevant        52730
partial         16695
not_relevant    12279
no_consensus     2598
not_sure          151
Name: gold_label, dtype: int64

In [43]:
len(combined_df["query"].unique())

25550

## Add in listing link columns, Make Eval split, Save to Excel

In [44]:
combined_df["etsy_url"] = combined_df["listingId"].apply(lambda x: f'=HYPERLINK("https://www.etsy.com/listing/{x}", "link")')
combined_df["atlas_url"] = combined_df["listingId"].apply(lambda x: f'=HYPERLINK("https://atlas.etsycorp.com/listing/{x}/lookup", "link")')

In [45]:
combined_df = combined_df[[
    'query', 'queryEn', 'listingId', 'etsy_url', 'atlas_url', 'titleEn_vertica', 'etsyUUID', 'platform', 'userLanguage',
    'anno_data_source', 'is_test', 'gold_label', 'label_annotator_1', 'label_annotator_2', 'label_annotator_3',
]]

In [46]:
combined_df.rename(columns={"gold_label": "labelbox_majority_label"}, inplace=True)

In [47]:
combined_df.is_test.value_counts()

False    78866
True      5587
Name: is_test, dtype: int64

In [48]:
combined_df[combined_df.is_test].labelbox_majority_label.value_counts()

relevant        3387
partial         1286
not_relevant     823
no_consensus      85
not_sure           6
Name: labelbox_majority_label, dtype: int64

In [49]:
# Train split irrelevant class
print(combined_df[np.logical_and(~combined_df.is_test, combined_df.labelbox_majority_label == "not_relevant")].shape)

(12279, 15)


In [50]:
combined_df.reset_index(drop=True, inplace=True)

In [51]:
test_not_relevant = combined_df[np.logical_and(combined_df.is_test, combined_df.labelbox_majority_label == "not_relevant")]
sample_not_relevant = test_not_relevant.sample(n=400, replace=False, random_state=123)
not_relevant_index = list(sample_not_relevant.index)

test_partial = combined_df[np.logical_and(combined_df.is_test, combined_df.labelbox_majority_label == "partial")]
sample_partial = test_partial.sample(n=400, replace=False, random_state=456)
partial_index = list(sample_partial.index)

test_relevant = combined_df[np.logical_and(combined_df.is_test, combined_df.labelbox_majority_label == "relevant")]
sample_relevant = test_relevant.sample(n=400, replace=False, random_state=789)
relevant_index = list(sample_relevant.index)

In [52]:
print(len(not_relevant_index))
print(len(partial_index))
print(len(relevant_index))

anno_index = not_relevant_index + partial_index + relevant_index
print(len(anno_index))
print(len(set(anno_index)))

400
400
400
1200
1200


In [53]:
combined_df["is_gsl_v0_eval"] = False
combined_df["is_gsl_v0_eval"][anno_index] = True

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df["is_gsl_v0_eval"][anno_index] = True


In [54]:
combined_df[combined_df["is_gsl_v0_eval"]].labelbox_majority_label.value_counts()

partial         400
relevant        400
not_relevant    400
Name: labelbox_majority_label, dtype: int64

In [55]:
combined_df[combined_df["is_gsl_v0_eval"]].anno_data_source.value_counts()

us_v2-direct_specified      235
us_v2-direct_unspecified    223
us_v2-broad                 138
intl-fr                     126
intl-es                     121
intl-de                     103
intl-it                      71
intl-nl                      67
us_v2-gift                   60
intl-pt                      25
intl-pl                      13
intl-ja                      12
intl-ru                       6
Name: anno_data_source, dtype: int64

In [56]:
# get all not_relevant, and sampled relevant & partial pairs from eval split
# for internal annotation
# df_eval = combined_df[combined_df.is_test]

# df_eval_irr = df_eval[df_eval.labelbox_majority_label == "not_relevant"]
# print(df_eval_irr.shape)

# df_eval_partial = df_eval[df_eval.labelbox_majority_label == "partial"]
# df_anno_partial = df_eval_partial.sample(n=500, replace=False, random_state=123)

# df_eval_relevant = df_eval[df_eval.labelbox_majority_label == "relevant"]
# df_anno_relevant = df_eval_relevant.sample(n=500, replace=False, random_state=456)

# df_anno = pd.concat([df_eval_irr, df_anno_partial, df_anno_relevant])
# df_anno = df_anno.reset_index(drop=True)
# df_anno = df_anno.sample(frac=1)

In [57]:
# len(df_eval["query"].unique())

In [58]:
df_anno = combined_df[combined_df.is_gsl_v0_eval]
# shuffle
df_anno = df_anno.sample(frac=1)
print(df_anno.shape)

(1200, 16)


In [59]:
# write out datasets
combined_df.to_excel("./semrel_merged_human_annotation_v0.xlsx", index=False)
df_anno.to_excel("./semrel_eval_internal_label_v0.xlsx", index=False)

## Analysis

### N listings per query

In [60]:
n_listings_per_query = combined_df["query"].value_counts()
n_listings_per_query.describe()

count    25550.000000
mean         3.305401
std          8.988862
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max        599.000000
Name: query, dtype: float64

In [61]:
print(np.quantile(n_listings_per_query, 0.9))
print(np.quantile(n_listings_per_query, 0.95))
print(np.quantile(n_listings_per_query, 0.99))

8.0
20.0
20.0


In [62]:
n_listings_per_query.head(n=50)

Find shirts                     599
gifts for women                 340
linen                           319
personalized school supplies    297
personalized gift               279
personalized gifts for women    275
unique gifts for her            248
birthflower                     228
travel accessories              222
couple gift                     195
Custom womens clothing          185
handmade gift                   173
anniversary gifts for women     168
Custom kids clothing            166
ohrringe                        164
personalized party decor        159
personalized gift for him       150
vintage style                   147
best gifts for him              139
anniversary gifts for him       126
back to college gift            114
home gift                       113
geburtstagsgeschenk frau        110
personalized jewelry            109
Womens spring dresses           107
valentines day decor            103
gifts for kids                  101
college school supplies     

### N queries per listing

In [63]:
n_queries_per_listing = combined_df["listingId"].value_counts()
n_queries_per_listing.describe()

count     82148.0
mean     1.028059
std       0.23644
min           1.0
25%           1.0
50%           1.0
75%           1.0
max          14.0
Name: listingId, dtype: Float64

In [64]:
print(np.quantile(n_queries_per_listing, 0.9))
print(np.quantile(n_queries_per_listing, 0.95))
print(np.quantile(n_queries_per_listing, 0.99))

1.0
1.0
2.0


In [65]:
n_queries_per_listing.head(n=50)

1267332404    14
644199956     12
1174657223    11
1189955105    11
1348928799    10
1601632705     9
1347295099     8
1010384443     8
1601633065     8
1426958759     8
1236331178     7
1029698212     7
1055846134     7
1384459783     7
1414472417     7
965245133      7
898008338      7
1532612369     7
1023099765     6
694614053      6
1559697371     6
1035684640     6
1384930469     6
1338704568     6
823311406      6
1468526006     6
1228586816     6
932218890      6
956119765      6
1514410368     6
861154494      5
1060789662     5
1213617116     5
865636015      5
1470954332     5
1519654979     5
1232974294     5
1204433276     5
1308248418     5
1606106909     5
1153444246     5
1368416197     5
1099780776     5
1319609800     5
1208309840     5
1554919480     5
1528412179     5
1092252036     5
624493147      5
784553483      5
Name: listingId, dtype: Int64