In [1]:
import pandas as pd
import numpy as np
import json
from collections import Counter

from google.cloud import bigquery

In [2]:
bq_client = bigquery.Client(project="etsy-search-ml-dev")



## US Annotation V2

In [3]:
us_v2_query = """SELECT 
    query, 
    '' as queryEn,
    listingId, 
    etsyUUID, 
    requestSource as platform,
    userLanguage,
    listingTitle as titleEn_vertica,
    source as anno_data_source,
    gold_label,
    label_annotator_1,
    label_annotator_2,
    label_annotator_3
FROM `etsy-sr-etl-prod.yzhang.sem_rel_human_annotation_v2`"""

query_job = bq_client.query(us_v2_query)
rows = query_job.result()
us_v2 = rows.to_dataframe()

In [4]:
us_v2["anno_data_source"] = us_v2["anno_data_source"].apply(lambda x: f"us_v2-{x}")
us_v2["anno_data_source"].value_counts()

us_v2-direct_specified      10500
us_v2-direct_unspecified    10490
us_v2-broad                  6000
us_v2-gift                   3000
Name: anno_data_source, dtype: int64

In [5]:
us_v2.label_annotator_3.value_counts()

relevant        16285
partial          8889
not_relevant     4663
not_sure          153
Name: label_annotator_3, dtype: int64

In [6]:
us_v2.columns

Index(['query', 'queryEn', 'listingId', 'etsyUUID', 'platform', 'userLanguage',
       'titleEn_vertica', 'anno_data_source', 'gold_label',
       'label_annotator_1', 'label_annotator_2', 'label_annotator_3'],
      dtype='object')

## US Annotation V1

4-point scale, downloaded the excel file from repo: https://github.com/etsy/semantic-relevance/blob/main/notebooks/annotation_dataset_v1.xlsx

In [7]:
us_v1 = pd.read_excel("./annotation_dataset_v1.xlsx")
us_v1.columns

# us_v1[np.logical_and(us_v1.label != us_v1.majorityLabel, ~us_v1.majorityLabel.isna())][["label", "labels", "majorityLabel"]].head()
# label - first annotator label

Index(['Unnamed: 0', 'etsy_uuid', 'listingId', 'query', 'platform', 'bin',
       'taxonomy', 'title', 'bordaRank', 'finalRank', 'description', 'label',
       'taxonomyPath', 'tags', 'title_1',
       'verticaListingTranslations_primaryLanguageTitle', 'labels',
       'agreedLabel', 'majorityLabel'],
      dtype='object')

In [8]:
us_v1 = us_v1[["query", "listingId", "etsy_uuid", "platform", "title", "majorityLabel", "labels"]]

In [9]:
# format individual labeler annotations
us_v1["label_annotator_1"] = us_v1.labels.apply(lambda labels_str: json.loads(labels_str)[0])
us_v1["label_annotator_2"] = us_v1.labels.apply(lambda labels_str: json.loads(labels_str)[1])
us_v1["label_annotator_3"] = us_v1.labels.apply(lambda labels_str: json.loads(labels_str)[2])

In [10]:
us_v1.iloc[24:27,:]

Unnamed: 0,query,listingId,etsy_uuid,platform,title,majorityLabel,labels,label_annotator_1,label_annotator_2,label_annotator_3
24,maple seed necklace,631014318,Eu-2uUd5aNMLBJGRg_cQUKYTzl5f,web,Mini Samaras - Sterling Silver Seed Dangle Ear...,1.0,"[1, 4, 1]",1,4,1
25,maple seed necklace,687908755,Eu-2uUd5aNMLBJGRg_cQUKYTzl5f,web,Mini Samara Bracelet - Maple Seed Helicopter -...,1.0,"[1, 4, 1]",1,4,1
26,maple seed necklace,743987072,Eu-2uUd5aNMLBJGRg_cQUKYTzl5f,web,Mountain necklace - Sterling silver - Wanderlu...,0.0,"[0, 0, 3]",0,0,3


In [11]:
# map 4-point scale to 3-point scale
map_dict = {0: "not_sure", 1: "not_relevant", 2: "partial", 3: "partial", 4: "relevant"}

us_v1["majorityLabel"] = us_v1["majorityLabel"].map(map_dict)

In [12]:
us_v1.iloc[24:27,:]

Unnamed: 0,query,listingId,etsy_uuid,platform,title,majorityLabel,labels,label_annotator_1,label_annotator_2,label_annotator_3
24,maple seed necklace,631014318,Eu-2uUd5aNMLBJGRg_cQUKYTzl5f,web,Mini Samaras - Sterling Silver Seed Dangle Ear...,not_relevant,"[1, 4, 1]",1,4,1
25,maple seed necklace,687908755,Eu-2uUd5aNMLBJGRg_cQUKYTzl5f,web,Mini Samara Bracelet - Maple Seed Helicopter -...,not_relevant,"[1, 4, 1]",1,4,1
26,maple seed necklace,743987072,Eu-2uUd5aNMLBJGRg_cQUKYTzl5f,web,Mountain necklace - Sterling silver - Wanderlu...,not_sure,"[0, 0, 3]",0,0,3


In [13]:
us_v1["label_annotator_1"] = us_v1["label_annotator_1"].map(map_dict)
us_v1["label_annotator_2"] = us_v1["label_annotator_2"].map(map_dict)
us_v1["label_annotator_3"] = us_v1["label_annotator_3"].map(map_dict)

In [14]:
us_v1.label_annotator_3.value_counts()

relevant        19305
partial          4502
not_relevant     3981
not_sure          237
Name: label_annotator_3, dtype: int64

In [15]:
# format columns to match with V2
us_v1.drop('labels', axis=1, inplace=True) 
us_v1.rename(columns={"majorityLabel": "gold_label", "etsy_uuid": "etsyUUID", "title": "titleEn_vertica"}, inplace=True)
us_v1["userLanguage"] = ""
us_v1["anno_data_source"] = "us_v1"
us_v1["queryEn"] = ""

In [16]:
us_v1.iloc[10:15,:]

Unnamed: 0,query,listingId,etsyUUID,platform,titleEn_vertica,gold_label,label_annotator_1,label_annotator_2,label_annotator_3,userLanguage,anno_data_source,queryEn
10,seedling cups,1117447411,Eu-2lSwjH_T8GmLOol51dyXuWk87,web,Southern Red Credar Seedling,not_relevant,not_relevant,not_relevant,not_relevant,,us_v1,
11,seedling cups,1146306150,Eu-2lSwjH_T8GmLOol51dyXuWk87,web,Nursery Flower Pots 3 Inch Square Heavy Duty 1...,relevant,relevant,relevant,partial,,us_v1,
12,seedling cups,1221549646,Eu-2lSwjH_T8GmLOol51dyXuWk87,web,Seed Starter | Seedling Tray | Genius Plant Hack,relevant,not_relevant,relevant,relevant,,us_v1,
13,seedling cups,1258868503,Eu-2lSwjH_T8GmLOol51dyXuWk87,web,Interior Live Oak | Medium Tree Seedling | The...,,not_relevant,relevant,partial,,us_v1,
14,seedling cups,1405409965,Eu-2lSwjH_T8GmLOol51dyXuWk87,web,Coast Redwood tree seedling - Sequoia sempervi...,partial,partial,not_relevant,partial,,us_v1,


In [17]:
us_v1.columns

Index(['query', 'listingId', 'etsyUUID', 'platform', 'titleEn_vertica',
       'gold_label', 'label_annotator_1', 'label_annotator_2',
       'label_annotator_3', 'userLanguage', 'anno_data_source', 'queryEn'],
      dtype='object')

## International

In [18]:
intl_query = """SELECT 
    query, 
    queryEn,
    listingId, 
    etsyUUID, 
    requestSource as platform,
    userLanguage,
    listingTitleEn as titleEn_vertica,
    source as anno_data_source,
    gold_label,
    label_annotator_1,
    label_annotator_2,
    label_annotator_3
FROM `etsy-search-ml-dev.aclapp.semrel_human_annotation_v1`"""

query_job = bq_client.query(intl_query)
rows = query_job.result()
intl_df = rows.to_dataframe()

In [19]:
intl_df.shape

(26803, 12)

In [20]:
intl_df.label_annotator_2.value_counts()

relevant        16367
partial          5353
not_relevant     5019
not_sure           64
Name: label_annotator_2, dtype: int64

In [21]:
intl_df["anno_data_source"] = intl_df["anno_data_source"].apply(lambda x: f"intl-{x}")
intl_df.anno_data_source.value_counts()

intl-de    5698
intl-fr    5687
intl-es    5097
intl-it    3944
intl-nl    3375
intl-pt     836
intl-pl     808
intl-ru     681
intl-ja     677
Name: anno_data_source, dtype: int64

## Combine datasets

In [22]:
joint_columns = [
    'query', 
    'queryEn', 
    'listingId',
    'etsyUUID', 
    'platform', 
    'userLanguage',
    'titleEn_vertica', 
    'anno_data_source', 
    'gold_label',
    'label_annotator_1', 
    'label_annotator_2', 
    'label_annotator_3'
]

In [23]:
us_v1 = us_v1[joint_columns]
us_v2 = us_v2[joint_columns]
intl_df = intl_df[joint_columns]

In [24]:
combined_df = pd.concat([us_v1, us_v2, intl_df])
combined_df = combined_df.reset_index(drop=True)

In [25]:
combined_df.shape

(84818, 12)

In [26]:
len(combined_df["query"].unique())

25550

In [27]:
# compute gold label again because we're merging 2 and 3 in US V1 data, which may lead to majority agreeing on partial
# if no consensus, use no_consensus

updated_gold_label = []

for i in range(combined_df.shape[0]):
    anno1 = combined_df["label_annotator_1"][i]
    anno2 = combined_df["label_annotator_2"][i]
    anno3 = combined_df["label_annotator_3"][i]
    ori_gold_label = combined_df["gold_label"][i]
    ds_source = combined_df["anno_data_source"][i]

    individual_annos = [anno1, anno2, anno3]
    individual_counter = Counter(individual_annos)
    most_common_element = individual_counter.most_common()[0]
    
    if most_common_element[1] == 1:
        # no majority
        if not pd.isnull(ori_gold_label):
            assert ori_gold_label == "no_consensus"
        updated_gold_label.append("no_consensus")
    else:
        # there is a majority
        majority_vote = most_common_element[0]
        if ori_gold_label != majority_vote:
            assert ds_source == "us_v1"
        updated_gold_label.append(majority_vote)

In [28]:
test_df = pd.DataFrame({"old": combined_df.gold_label, "new": updated_gold_label})
print(sum(test_df.old != test_df.new))
print(sum(test_df.old.isna()))

1862
1862


In [29]:
test_df[test_df.old != test_df.new]["new"].value_counts()

no_consensus    1737
partial          124
not_sure           1
Name: new, dtype: int64

In [30]:
combined_df["gold_label"] = updated_gold_label
combined_df.gold_label.value_counts()

relevant        53062
partial         16707
not_relevant    12291
no_consensus     2607
not_sure          151
Name: gold_label, dtype: int64

## Deal with duplicated (query, listingId) pairs

In [31]:
# keep distinct occurrences of query, listingId, gold_label
combined_df = combined_df.drop_duplicates(subset=['query', 'listingId', 'gold_label'])

In [32]:
# among them, if there is duplicated query, listingId, remove these pairs completely
ql_pairs = combined_df.apply(lambda row: (row["query"], row["listingId"]), axis=1).values
ql_counter = Counter(ql_pairs)
{k: v for k,v in ql_counter.items() if v > 1}

{('unique gifts for her', 1275026920): 2,
 ('gifts for women', 1215794797): 2,
 ('gifts for women', 1017317699): 2}

In [33]:
case1_inds = combined_df[np.logical_and(combined_df["query"]=="unique gifts for her", combined_df.listingId==1275026920)].index
case2_inds = combined_df[np.logical_and(combined_df["query"]=="gifts for women", combined_df.listingId==1215794797)].index
case3_inds = combined_df[np.logical_and(combined_df["query"]=="gifts for women", combined_df.listingId==1017317699)].index

In [34]:
combined_df.iloc[case1_inds, :]

Unnamed: 0,query,queryEn,listingId,etsyUUID,platform,userLanguage,titleEn_vertica,anno_data_source,gold_label,label_annotator_1,label_annotator_2,label_annotator_3
28525,leather photo album,,1590258982,Euu6-2aQVOizToOdJOEohJILcz81,boe,en-US,"Personalized Vintage Genuine Leather Journal, ...",us_v2-direct_specified,no_consensus,partial,relevant,not_relevant
79219,fiori giganti organza,giant organza flowers,1578241206,Euf_O1xMthWWDB417Y3HDxCz8mec,web,it,Giant Flowers for Gorgeous Backdrop Wedding Ev...,intl-it,relevant,relevant,relevant,relevant


In [35]:
combined_df.drop(list(case1_inds) + list(case2_inds) + list(case3_inds), inplace=True)

In [36]:
combined_df.shape

(84453, 12)

In [37]:
combined_df[["query", "listingId"]].drop_duplicates().shape

(84453, 2)

## Set up train and evaluation split, separate by query

In [38]:
unique_queries = combined_df["query"].unique()
n = len(unique_queries)
print(n)

np.random.seed(1234)
test_queries = np.random.choice(unique_queries, size = int(n*0.1), replace=False)
print(len(test_queries))

25550
2555


In [39]:
combined_df["is_eval_split"] = combined_df["query"].isin(test_queries)
combined_df["is_eval_split"].value_counts()

False    75753
True      8700
Name: is_eval_split, dtype: int64

In [40]:
combined_df.columns

Index(['query', 'queryEn', 'listingId', 'etsyUUID', 'platform', 'userLanguage',
       'titleEn_vertica', 'anno_data_source', 'gold_label',
       'label_annotator_1', 'label_annotator_2', 'label_annotator_3',
       'is_eval_split'],
      dtype='object')

## Add in listing link columns, save to Excel

In [41]:
combined_df["etsy_url"] = combined_df["listingId"].apply(lambda x: f"'https://www.etsy.com/listing/{x}")
combined_df["atlas_url"] = combined_df["listingId"].apply(lambda x: f"'https://atlas.etsycorp.com/listing/{x}/lookup")

In [42]:
combined_df = combined_df[[
    'query', 'queryEn', 'listingId', 'etsy_url', 'atlas_url', 'titleEn_vertica', 'etsyUUID', 'platform', 'userLanguage',
    'anno_data_source', 'is_eval_split', 'gold_label', 'label_annotator_1', 'label_annotator_2', 'label_annotator_3',
]]

In [43]:
combined_df.rename(columns={"gold_label": "labelbox_majority_label"}, inplace=True)

In [44]:
# write out full merged dataset
combined_df.to_excel("./semrel_merged_human_annotation.xlsx", index=False)

In [45]:
# get eval split and irrelevant labels in training split
df_eval = combined_df[combined_df.is_eval_split]
df_eval = df_eval.sample(frac=1)  #8700

df_train_irrel = combined_df[np.logical_and(~combined_df.is_eval_split, combined_df.labelbox_majority_label == "not_relevant")]
df_train_irrel = df_train_irrel.sample(frac=1)  #11010

In [46]:
df_eval.to_excel("./semrel_internal_label_eval.xlsx", index=False)
df_train_irrel.to_excel("./semrel_internal_label_train_irrel.xlsx", index=False)

## Analysis

### N listings per query

In [47]:
n_listings_per_query = combined_df["query"].value_counts()
n_listings_per_query.describe()

count    25550.000000
mean         3.305401
std          8.988862
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max        599.000000
Name: query, dtype: float64

In [48]:
print(np.quantile(n_listings_per_query, 0.9))
print(np.quantile(n_listings_per_query, 0.95))
print(np.quantile(n_listings_per_query, 0.99))

8.0
20.0
20.0


In [49]:
n_listings_per_query.head(n=50)

Find shirts                     599
gifts for women                 340
linen                           319
personalized school supplies    297
personalized gift               279
personalized gifts for women    275
unique gifts for her            248
birthflower                     228
travel accessories              222
couple gift                     195
Custom womens clothing          185
handmade gift                   173
anniversary gifts for women     168
Custom kids clothing            166
ohrringe                        164
personalized party decor        159
personalized gift for him       150
vintage style                   147
best gifts for him              139
anniversary gifts for him       126
back to college gift            114
home gift                       113
geburtstagsgeschenk frau        110
personalized jewelry            109
Womens spring dresses           107
valentines day decor            103
gifts for kids                  101
college school supplies     

### N queries per listing

In [50]:
n_queries_per_listing = combined_df["listingId"].value_counts()
n_queries_per_listing.describe()

count     82148.0
mean     1.028059
std       0.23644
min           1.0
25%           1.0
50%           1.0
75%           1.0
max          14.0
Name: listingId, dtype: Float64

In [51]:
print(np.quantile(n_queries_per_listing, 0.9))
print(np.quantile(n_queries_per_listing, 0.95))
print(np.quantile(n_queries_per_listing, 0.99))

1.0
1.0
2.0


In [52]:
n_queries_per_listing.head(n=50)

1267332404    14
644199956     12
1174657223    11
1189955105    11
1348928799    10
1601632705     9
1347295099     8
1010384443     8
1601633065     8
1426958759     8
1236331178     7
1029698212     7
1055846134     7
1384459783     7
1414472417     7
965245133      7
898008338      7
1532612369     7
1023099765     6
694614053      6
1559697371     6
1035684640     6
1384930469     6
1338704568     6
823311406      6
1468526006     6
1228586816     6
932218890      6
956119765      6
1514410368     6
861154494      5
1060789662     5
1213617116     5
865636015      5
1470954332     5
1519654979     5
1232974294     5
1204433276     5
1308248418     5
1606106909     5
1153444246     5
1368416197     5
1099780776     5
1319609800     5
1208309840     5
1554919480     5
1528412179     5
1092252036     5
624493147      5
784553483      5
Name: listingId, dtype: Int64