In [1]:
import ast

import numpy as np
import pandas as pd
from google.cloud import bigquery

import utils

In [2]:
# Load the evaluation spreadsheet; the path is relative to the working directory.
test_df = pd.read_excel("./gsl_eval_v0_all_except_llm.xlsx")

In [3]:
# Drop rows whose ground-truth label is "not_sure", renumber the index and
# replace every missing value with an empty string.
test_df = (
    test_df.loc[test_df[utils.GT_FIELD] != "not_sure"]
    .reset_index(drop=True)
    .fillna("")
)

# ori_title: listingTitle, falling back to listingTitleEn when it is empty.
test_df["ori_title"] = np.where(
    test_df["listingTitle"] == "", test_df["listingTitleEn"], test_df["listingTitle"]
)
# en_title: listingTitleEn, falling back to listingTitle when it is empty.
test_df["en_title"] = np.where(
    test_df["listingTitleEn"] == "", test_df["listingTitle"], test_df["listingTitleEn"]
)

In [4]:
# Number of distinct (query, listingId) pairs.
test_df.loc[:, ["query", "listingId"]].drop_duplicates().shape

(1095, 2)

In [5]:
# Distribution of the ground-truth label.
test_df.loc[:, utils.GT_FIELD].value_counts()

partial         620
relevant        361
not_relevant    114
Name: etsy_round_label, dtype: int64

In [6]:
# Number of distinct queries (dropna=False keeps the count identical to len(unique())).
test_df["query"].nunique(dropna=False)

792

## Get additional query segment features

In [7]:
# Segment features for every annotated query, joined from several sources:
#   qlm   -> queryBin
#   qisv3 -> qisClass (prediction 0/1/2 mapped to a class name)
#   qtcv5 -> queryTaxoFullPath (seller path, falling back to the buyer path)
#   qee   -> query entity columns (fandom, motif, ..., recipient)
# All sources are LEFT JOINed onto the annotated queries, so a query with no
# match keeps NULLs in that source's columns.
#
# qee keeps exactly one row per query. The row used to be chosen with
# ORDER BY RAND(), which returned different entity features on every run. It is
# now ordered by the JSON rendering of the row; since qee_raw is SELECT DISTINCT,
# that order has no ties and the choice is reproducible.
sql_query = """with qlm AS (
      select distinct query_raw as query, bin as queryBin 
      from `etsy-data-warehouse-prod.rollups.query_level_metrics_raw`
    ),
    qisv3 AS (
      select query_raw query,
      CASE 
        WHEN prediction = 0 THEN 'broad' 
        WHEN prediction = 1 THEN 'direct_unspecified'
        WHEN prediction = 2 THEN 'direct_specified' 
      END as qisClass
      from `etsy-search-ml-prod.mission_understanding.qis_scores_v3`
    ),
    qtcv5 as (
      select distinct
       coalesce(s.query, b.query) as query,
       coalesce(s.full_path, b.full_path) as queryTaxoFullPath,
      from `etsy-data-warehouse-prod.mission_understanding.query_taxonomy_seller` s
      full outer join `etsy-data-warehouse-prod.mission_understanding.query_taxonomy_buyer` b
       using(query)
    ),
    qee_raw AS (
      select distinct
        searchQuery as query,
        fandom,
        motif,
        style,
        material,
        color,
        technique,
        tangibleItem,
        size,
        occasion,
        customization,
        age,
        price,
        quantity,
        recipient
      from `etsy-search-ml-prod.mission_understanding.query_entity_features`
    ),
    qee AS (
      select *
      from qee_raw AS r
      QUALIFY ROW_NUMBER() OVER (PARTITION BY query ORDER BY TO_JSON_STRING(r)) = 1
    ),
    anno_queries AS (
      select distinct query, 
      from `etsy-search-ml-dev.aclapp.semrel_human_annotation_v1`
      union all
      select distinct query
      from `etsy-sr-etl-prod.yzhang.sem_rel_human_annotation_v2`
    )
    select distinct
      aq.query,
      queryBin as seg_queryBin,
      qisClass as seg_qisClass,
      queryTaxoFullPath as seg_queryTaxoFullPath,
      SPLIT(queryTaxoFullPath, ".")[OFFSET(0)] as seg_queryTaxoTop,
      fandom,
      motif,
      style,
      material,
      color,
      technique,
      tangibleItem,
      size,
      occasion,
      customization,
      age,
      price,
      quantity,
      recipient
    from anno_queries aq
    left join qlm using (query)
    left join qisv3 using (query)
    left join qtcv5 using (query)
    left join qee using (query)
"""

# TODO: ideally the qee row should be chosen by matching the user's language,
# not by an arbitrary per-query pick.

In [8]:
# Run the segment query with a client bound to the "etsy-search-ml-dev" project.
bq_client = bigquery.Client(project="etsy-search-ml-dev")

query_job = bq_client.query(sql_query)
# result() waits for the query job to finish before returning its rows.
rows = query_job.result()
# Materialise the rows as a pandas DataFrame.
query_segment_df = rows.to_dataframe()



In [9]:
# Attach the per-query segment features to every annotation row.
merged_df = pd.merge(test_df, query_segment_df, on=["query"], how="left")

# A left join must not change the number of annotation rows. If the segment
# table ever holds more than one row for a query, rows would be duplicated
# silently and every count computed from merged_df would be inflated.
assert len(merged_df) == len(test_df), (
    f"merge fan-out: {len(test_df)} rows became {len(merged_df)}; "
    "query_segment_df has more than one row for some query"
)

In [10]:
# (rows, columns) of the frame produced by the left join with the segment features.
merged_df.shape

(1095, 65)

In [11]:
# Give queries with no match in a segment source an explicit category, so they
# appear as their own group in the breakdowns ("novel" for an unmatched bin).
#
# The result is assigned back rather than calling `fillna(..., inplace=True)` on
# a column accessor: that chained in-place call operates on an intermediate
# Series and, under pandas Copy-on-Write, leaves `merged_df` unchanged.
merged_df = merged_df.fillna(
    {
        "seg_queryBin": "novel",
        "seg_qisClass": "missing",
        "seg_queryTaxoTop": "missing",
        "seg_queryTaxoFullPath": "missing",
    }
)

In [12]:
# Save the segment-enriched frame; the DataFrame index is not written.
merged_df.to_excel("./gsl_eval_v0_segments.xlsx", index=False)

## Coverage

In [2]:
# Reload the saved frame from disk.
# NOTE(review): execution counts restart here, so this presumably begins a fresh
# kernel session that skips the BigQuery cell — the imports cell must be run first.
merged_df = pd.read_excel("./gsl_eval_v0_segments.xlsx")

In [3]:
# Row count per query bin, ordered by bin name.
merged_df["seg_queryBin"].value_counts().sort_index()

head      261
novel     111
tail      108
top.01    318
top.1     179
torso     118
Name: seg_queryBin, dtype: int64

In [4]:
# Query bin (rows) against ground-truth label (columns).
pd.crosstab(index=merged_df["seg_queryBin"], columns=merged_df[utils.GT_FIELD])

etsy_round_label,not_relevant,partial,relevant
seg_queryBin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
head,30,160,71
novel,9,82,20
tail,21,72,15
top.01,25,120,173
top.1,17,99,63
torso,12,87,19


In [6]:
# First few rows whose query bin is "novel".
merged_df.loc[merged_df["seg_queryBin"].eq("novel")].head()

Unnamed: 0,query,queryEn,listingId,titleEn_vertica,etsyUUID,platform,userLanguage,anno_data_source,labelbox_majority_label,label_annotator_1,...,color,technique,tangibleItem,size,occasion,customization,age,price,quantity,recipient
13,witcher&#39;s fan gift,,1568718608,"Ice Dragon Sticker, Cool Dragon Decal for Drag...",EuQihQcn91nsBOnr95cwktixyW81,web,en-US,us_v2-gift,not_relevant,not_relevant,...,[],[],[],[],[],[],[],[],[],[]
19,wings of fire hivewing plush,,1014676666,Tamarin the Rainwing,EudHDKudZxJ7LPbFWC3HleACx16d,web,en-US,us_v2-direct_specified,partial,partial,...,[],[],[],[],[],[],[],[],[],[]
57,&quot;sigrid nunez&quot;,,1038525124,Custom Sisters Print - Sisters Christmas Gifts...,Euyp7eJhKDUC67U7xhAuKbnm1o5a,web,en-US,us_v2-broad,not_relevant,not_relevant,...,[],[],[],[],[],[],[],[],[],[]
59,floor vases large blue,,1234484300,"8.5&#39;&#39; Turkish Caftan Decor, Handmade C...",EuznlhUV2X8HEAbFPSU5vkNL1y16,boe,en-GB,us_v2-direct_specified,not_relevant,partial,...,[],[],[],[],[],[],[],[],[],[]
62,laserburn rose,,1644146021,"Laser Burn PNG, 3D Illusion Laser Engraving Fi...",EucGn0fA1UZrZc2ShXEvMx3YnVb9,web,en-US,us_v2-broad,relevant,relevant,...,[],[],[],[],[],[],[],[],[],[]


In [16]:
# Row count per QIS class, ordered by class name.
merged_df["seg_qisClass"].value_counts().sort_index()

broad                 185
direct_specified      492
direct_unspecified    185
missing               233
Name: seg_qisClass, dtype: int64

In [17]:
# QIS class (rows) against ground-truth label (columns).
pd.crosstab(index=merged_df["seg_qisClass"], columns=merged_df[utils.GT_FIELD])

etsy_round_label,not_relevant,partial,relevant
seg_qisClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
broad,25,103,57
direct_specified,49,342,101
direct_unspecified,30,80,75
missing,10,95,128


In [18]:
# QIS class (rows) against query bin (columns).
pd.crosstab(index=merged_df["seg_qisClass"], columns=merged_df["seg_queryBin"])

seg_queryBin,head,novel,tail,top.01,top.1,torso
seg_qisClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
broad,32,13,20,49,44,27
direct_specified,165,74,79,45,53,76
direct_unspecified,57,9,9,45,50,15
missing,7,15,0,179,32,0


In [19]:
# Row count per top-level query taxonomy node, ordered by name.
merged_df["seg_queryTaxoTop"].value_counts().sort_index()

accessories                     30
art_and_collectibles            34
bags_and_purses                 24
bath_and_beauty                 13
books_movies_and_music           5
clothing                        66
craft_supplies_and_tools        39
electronics_and_accessories     12
home_and_living                165
jewelry                         72
missing                        501
paper_and_party_supplies        53
pet_supplies                    12
shoes                            3
toys_and_games                  37
weddings                        29
Name: seg_queryTaxoTop, dtype: int64

In [20]:
# Top-level query taxonomy node (rows) against ground-truth label (columns).
pd.crosstab(index=merged_df["seg_queryTaxoTop"], columns=merged_df[utils.GT_FIELD])

etsy_round_label,not_relevant,partial,relevant
seg_queryTaxoTop,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
accessories,0,16,14
art_and_collectibles,8,17,9
bags_and_purses,1,11,12
bath_and_beauty,1,5,7
books_movies_and_music,1,3,1
clothing,8,36,22
craft_supplies_and_tools,5,24,10
electronics_and_accessories,1,3,8
home_and_living,12,83,70
jewelry,7,36,29


In [21]:
# Coverage of one query-entity column: how many rows have at least one value
# for it, and the label distribution of those rows.
entity = "recipient"

# After the Excel reload the entity cells are list-literal strings (e.g. "[]").
# Parse them with ast.literal_eval, not eval, so spreadsheet content is only
# ever read as a literal and never executed as code. The mask is built once
# and reused for both outputs.
has_entity = merged_df[entity].apply(lambda x: len(ast.literal_eval(x)) > 0)

print(merged_df[has_entity].shape[0])
print(merged_df[has_entity][utils.GT_FIELD].value_counts().sort_index())

102
not_relevant     6
partial         34
relevant        62
Name: etsy_round_label, dtype: int64


In [22]:
# has_QEE: the row has a non-null queryEntities value.
merged_df["has_QEE"] = merged_df["queryEntities"].notna()
# has_queryTaxo: the top-level taxonomy node is not the "missing" placeholder.
merged_df["has_queryTaxo"] = merged_df["seg_queryTaxoTop"].ne("missing")

In [23]:
# QEE availability (rows) against query bin (columns).
pd.crosstab(index=merged_df["has_QEE"], columns=merged_df["seg_queryBin"])

seg_queryBin,head,novel,tail,top.01,top.1,torso
has_QEE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
False,128,111,107,13,7,110
True,133,0,1,305,172,8


In [24]:
# Query-taxonomy availability (rows) against query bin (columns).
pd.crosstab(index=merged_df["has_queryTaxo"], columns=merged_df["seg_queryBin"])

seg_queryBin,head,novel,tail,top.01,top.1,torso
has_queryTaxo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
False,44,111,106,85,55,100
True,217,0,2,233,124,18


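**Confounding:** queries that have QEE or query-taxonomy coverage are much more likely to be top/head queries (almost no novel, tail or torso queries have either), so any comparison by `has_QEE` or `has_queryTaxo` is confounded with query frequency.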