This notebook analyzes coverage of the v0 eval split

In [1]:
import pandas as pd
import numpy as np
import time
import json
from copy import deepcopy
from google.cloud import bigquery

In [2]:
# Read the v0 eval split from the local Excel workbook into a DataFrame.
df = pd.read_excel(io="./data/gsl_eval_v0.xlsx")

In [3]:
# Number of eval rows per majority label.
df["labelbox_majority_label"].value_counts()

partial         400
not_relevant    400
relevant        400
Name: labelbox_majority_label, dtype: int64

In [4]:
# BigQuery client; jobs are run under the "etsy-search-ml-dev" project.
client = bigquery.Client(project="etsy-search-ml-dev")

# Feature query, one CTE per source:
#   anno_data - distinct (etsyUUID, query, queryEn, listingId) rows from the two
#               annotation tables, stacked with UNION ALL (for the first table
#               `query` is reused as `queryEn`)
#   qlm       - query bin from the query-level-metrics snapshot of 2025-02-07
#   qis/qisv2 - integer class ids / predictions 0-2 mapped to
#               'broad' / 'direct_unspecified' / 'direct_specified'
#   lfb       - listing taxonomy path from the listing feature bank
# The final SELECT left-joins every lookup onto anno_data, so annotation rows
# without a matching feature keep NULLs in that column.
# NOTE(review): UNION ALL does not de-duplicate across the two sources, and no
# uniqueness guarantee for the lookup tables is visible here - confirm that the
# joins on `query` / `listingId` cannot fan out.
query_str = """with anno_data as (
  select distinct etsyUUID, query, query as queryEn, listingId, 
  from `etsy-sr-etl-prod.yzhang.sem_rel_human_annotation_v2`
  union all
  select distinct etsyUUID, query, queryEn, listingId, 
  from `etsy-search-ml-dev.aclapp.isearch_semrel_surveyv1_teacherv2_features_w_primaryLang`
),
qlm AS (
  select query_raw query, bin as queryBin 
  from `etsy-batchjobs-prod.snapshots.query_level_metrics_raw`
  where _date = date('2025-02-07')
),
qis AS (
  SELECT query_raw query,
  CASE 
    WHEN class_id = 0 THEN 'broad' 
    WHEN class_id = 1 THEN 'direct_unspecified'
    WHEN class_id = 2 THEN 'direct_specified' 
  END AS qisClass
  FROM `etsy-search-ml-prod.mission_understanding.qis_scores`
),
qisv2 AS (
  SELECT query,
  CASE 
    WHEN prediction = 0 THEN 'broad' 
    WHEN prediction = 1 THEN 'direct_unspecified'
    WHEN prediction = 2 THEN 'direct_specified' 
  END AS qisClassV2
  FROM `etsy-search-ml-prod.mission_understanding.qis_scores_v2`
),
lfb AS (
  SELECT key listingId, 
    verticaListings_taxonomyPath listingTaxo,
  FROM `etsy-ml-systems-prod.feature_bank_v2.listing_feature_bank_most_recent`
)
select 
  etsyUUID, query, queryEn, listingId, queryBin, qisClass, qisClassV2, listingTaxo
from anno_data
left join qlm using (query)
left join qis using (query)
left join qisv2 using (query)
left join lfb using (listingId)
"""

# Submit the query and load the complete result set into a pandas DataFrame.
query_job = client.query(query_str)
features_df = query_job.result().to_dataframe()



In [5]:
# Attach the BigQuery features to every eval row.  Both frames carry a
# `queryEn` column that is not a join key, so pandas suffixes them as
# `queryEn_x` (eval sheet) and `queryEn_y` (feature query).
merged_df = pd.merge(df, features_df, on=["etsyUUID", "query", "listingId"], how="left")

# A left merge silently duplicates eval rows whenever a key matches more than
# one feature row; fail loudly here instead of skewing every count below.
assert len(merged_df) == len(df), (
    f"merge changed the row count: {len(df)} -> {len(merged_df)}"
)

In [6]:
# Top-level taxonomy node = text before the first "." of the taxonomy path.
# The vectorised .str accessor propagates missing paths as NaN, whereas
# calling x.split() on a NaN float would raise AttributeError.
merged_df["listingTopTaxo"] = merged_df["listingTaxo"].str.split(".").str[0]

In [7]:
# Non-null counts for the key, source and coverage columns.
merged_df.loc[
    :,
    [
        "etsyUUID",
        "query",
        "listingId",
        "anno_data_source",
        "queryBin",
        "qisClass",
        "qisClassV2",
        "listingTopTaxo",
    ],
].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1200 entries, 0 to 1199
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   etsyUUID          1200 non-null   object
 1   query             1200 non-null   object
 2   listingId         1200 non-null   int64 
 3   anno_data_source  1200 non-null   object
 4   queryBin          1170 non-null   object
 5   qisClass          1189 non-null   object
 6   qisClassV2        1188 non-null   object
 7   listingTopTaxo    1200 non-null   object
dtypes: int64(1), object(7)
memory usage: 84.4+ KB


In [8]:
# Rows whose annotation source name starts with "us" (True) vs. all others (False).
merged_df["anno_data_source"].str.startswith("us").value_counts()

True     656
False    544
Name: anno_data_source, dtype: int64

In [9]:
# Number of distinct query strings (a missing value would count as one entry).
merged_df["query"].nunique(dropna=False)

859

In [10]:
# One row per distinct (query, queryBin, qisClassV2) combination.
query_df = merged_df.loc[:, ["query", "queryBin", "qisClassV2"]].drop_duplicates()
query_df.shape

(859, 3)

In [11]:
# Queries per popularity bin; value_counts() leaves out rows with a missing
# bin, so the counts can sum to less than the number of queries.
query_df["queryBin"].value_counts()

head      236
top.01    219
top.1     158
tail      114
torso     108
Name: queryBin, dtype: int64

In [12]:
# Queries per QIS v2 class; value_counts() leaves out rows with a missing
# class, so the counts can sum to less than the number of queries.
query_df["qisClassV2"].value_counts()

direct_specified      365
broad                 271
direct_unspecified    216
Name: qisClassV2, dtype: int64

In [13]:
# One row per distinct (listingId, listingTopTaxo) pair.
listing_df = merged_df.loc[:, ["listingId", "listingTopTaxo"]].drop_duplicates()

In [14]:
# Number of distinct listing ids (a missing value would count as one entry).
listing_df["listingId"].nunique(dropna=False)

1200

In [15]:
# Listings per top-level taxonomy node.
listing_df["listingTopTaxo"].value_counts()

home_and_living                246
clothing                       158
jewelry                        141
art_and_collectibles           137
paper_and_party_supplies       115
craft_supplies_and_tools       107
accessories                     59
toys_and_games                  51
weddings                        45
bags_and_purses                 40
electronics_and_accessories     39
bath_and_beauty                 28
books_movies_and_music          20
pet_supplies                     8
shoes                            6
Name: listingTopTaxo, dtype: int64

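Gift queries: flag queries that express gift intent with a keyword regex, then compare the flag against the annotation data source.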

In [16]:
import re

In [17]:
# Recipient words matched as whole words (optionally plural), so that e.g.
# "for sony" or "for herbs" are not mistaken for "for son" / "for her".
_GIFT_RECIPIENT_WORDS = (
    "him", "her", "mom", "mommy", "dad", "daddy", "mother", "father",
    "daughter", "son", "wife", "husband", "partner", "aunt", "uncle", "niece",
    "nephew", "cousin", "in law", "boyfriend", "girlfriend", "friend",
    "best friend",
)
# Recipient stems kept as prefixes ("grandma", "grandson", "fiancee", ...).
_GIFT_RECIPIENT_PREFIXES = ("grand", "fiance")

# Compiled once at definition time instead of on every call.
_GIFT_QUERY_RE = re.compile(
    r"\bgift|\bfor (?:(?:%s)s?\b|(?:%s))"
    % ("|".join(_GIFT_RECIPIENT_WORDS), "|".join(_GIFT_RECIPIENT_PREFIXES)),
    re.IGNORECASE,
)


def checkIsGiftQuery(query_str):
    """Return True when a query looks like a gift query.

    A query counts as a gift query when, case-insensitively, it contains a
    word starting with "gift" ("gift", "gifts", "gifting", ...) or the phrase
    "for <recipient>", where the recipient is one of a fixed list of
    people-words ("her", "mom", "best friend", ...), optionally plural, or a
    word starting with "grand" / "fiance".

    Args:
        query_str: the query text.  Anything that is not a ``str`` (for
            example ``None`` or a NaN from a missing DataFrame cell) is
            treated as "not a gift query" instead of raising ``TypeError``.

    Returns:
        bool: True if the pattern is found anywhere in the query.
    """
    if not isinstance(query_str, str):
        return False
    return _GIFT_QUERY_RE.search(query_str) is not None

In [18]:
def checkIsGiftInDF(row):
    """Row-wise gift check that picks the query text by annotation source.

    Rows whose ``anno_data_source`` starts with "us" are checked on the
    ``query`` column; every other row is checked on ``queryEn_x``
    (presumably the English translation of the query - confirm against the
    eval sheet).

    Args:
        row: a mapping / Series with "anno_data_source", "query" and
            "queryEn_x" entries.

    Returns:
        Whatever ``checkIsGiftQuery`` returns for the selected text.
    """
    text_column = "query" if row["anno_data_source"].startswith("us") else "queryEn_x"
    return checkIsGiftQuery(row[text_column])

In [19]:
# Evaluate the gift heuristic once per row and store the flag as a new column.
merged_df["queryIsGift"] = merged_df.apply(checkIsGiftInDF, axis="columns")

In [20]:
# Gift-flag distribution within the dedicated "us_v2-gift" source.
merged_df.loc[merged_df["anno_data_source"].eq("us_v2-gift"), "queryIsGift"].value_counts()

True    60
Name: queryIsGift, dtype: int64

In [21]:
# Gift-flag distribution across every source other than "us_v2-gift".
merged_df.loc[merged_df["anno_data_source"].ne("us_v2-gift"), "queryIsGift"].value_counts()

False    1093
True       47
Name: queryIsGift, dtype: int64

In [22]:
# Rows flagged as gift queries that do not come from the "us_v2-gift" source.
merged_df[merged_df["queryIsGift"] & merged_df["anno_data_source"].ne("us_v2-gift")]

Unnamed: 0,query,queryEn_x,listingId,etsy_url,atlas_url,titleEn_vertica,etsyUUID,platform,userLanguage,anno_data_source,...,label_annotator_2,label_annotator_3,is_gsl_v0_eval,queryEn_y,queryBin,qisClass,qisClassV2,listingTaxo,listingTopTaxo,queryIsGift
54,anniversary gifts for women,,801724215,,,"Tiger eye brass Ring, Handmade Ring, Women Rin...",EueMU1ppuQpAY2hp5qEPMdMhkFbd,boe,en-US,us_v2-broad,...,relevant,relevant,True,anniversary gifts for women,top.01,broad,broad,jewelry.rings.statement_rings,jewelry,True
77,unique gifts for her,unique gifts for her,1405418532,,,Vintage zodiac charm pendant - sagittarius cha...,EuoVxlZmrGD-8ul8vpKBu_A1Pw9d,boe,fr,intl-fr,...,relevant,relevant,True,unique gifts for her,top.01,broad,broad,jewelry.necklaces.pendant_necklaces,jewelry,True
82,personalized gift for him,,1393233546,,,Adventure Book Scrapbook Couples Gift for Boyf...,Eu5sFjO5nQZ_vwAr8iAzYrL7nDc1,boe,en-US,us_v2-broad,...,relevant,relevant,True,personalized gift for him,top.01,broad,broad,books_movies_and_music.books.blank_books.journ...,books_movies_and_music,True
115,gift for dog,,1321188874,,,Farmhouse Christmas Ornament Jack Russell Terr...,EuaTiDIgon6rz3CCDTVHm95mc915,boe,en-US,us_v2-broad,...,relevant,relevant,True,gift for dog,top.01,broad,broad,home_and_living.home_decor.seasonal_decor.chri...,home_and_living,True
146,30th wedding anniversary gifts,,1529669642,,,"Personalised 30th Anniversary Gift, Pearl Anni...",EuKFdRLCuzFmYRG3Iuw3CBcfQU12,web,en-GB,us_v2-broad,...,relevant,relevant,True,30th wedding anniversary gifts,top.1,broad,broad,home_and_living.home_decor.picture_frames_and_...,home_and_living,True
170,unique gifts for her,,1670022097,,,"Small brass contemporary ring, Statement Uniqu...",Euao8J8mQlKUea0NbP3Q4a__bu61,boe,en-US,us_v2-broad,...,relevant,relevant,True,unique gifts for her,top.01,broad,broad,jewelry.rings.statement_rings,jewelry,True
209,wedding gift,,1490399459,,,"Personalized Cutting Board With Handle, Custom...",EuJ2-g2LX1-WpK4hwWbiNwka2Mde,web,en-US,us_v2-broad,...,relevant,relevant,True,wedding gift,top.01,broad,broad,home_and_living.kitchen_and_dining.dining_and_...,home_and_living,True
210,teacher appreciation gift,teacher appreciation gift,1377489767,,,A Parents Thank You Printable Nursery Teachers...,EuuIAuFuQTGYA3VrjJEJlAZC_Q91,boe,nl,intl-nl,...,relevant,relevant,True,teacher appreciation gift,top.01,broad,broad,art_and_collectibles.prints.digital_prints,art_and_collectibles,True
216,home gift,,1326183511,,,Round Personalized Charcuterie Board Set/19pcs...,EuqLmuRY0wVH_cs4vC4rJXVEcW4c,boe,en-US,us_v2-broad,...,partial,partial,True,home gift,top.01,broad,broad,home_and_living.home_decor.home_accents,home_and_living,True
231,trauergeschenk,sympathy gift,1737503881,,,Mourning light memorial light lantern farewell...,EuaEY2rK-72xUPB70W2j0VzCiIc8,web,de,intl-de,...,relevant,relevant,True,sympathy gift,top.01,broad,broad,home_and_living.home_decor.candles_and_home_fr...,home_and_living,True


In [23]:
# One row per distinct (query, queryIsGift) pair.
query_gift_df = merged_df.loc[:, ["query", "queryIsGift"]].drop_duplicates()
query_gift_df.shape

(859, 2)

In [24]:
# Number of (query, flag) rows that are / are not flagged as gift queries.
query_gift_df["queryIsGift"].value_counts()

False    781
True      78
Name: queryIsGift, dtype: int64