In [1]:
!gcloud config set project etsy-bigquery-adhoc-prod


If you have a compatible Python interpreter installed, you can use it by setting
the CLOUDSDK_PYTHON environment variable to point to it.

Updated property [core/project].


In [2]:
%load_ext google.cloud.bigquery

In [20]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

First batch of features to look at
+ {click, purchase}Level2TaxonomyPaths
+ {click, purchase}Level2TaxonomyCounts
+ {click, purchase}Level2TaxonomyPrices

# Training data

```sql
create or replace table `etsy-search-ml-dev.yzhang.taxo_diversity_raw_train_web_07_03`
as (
    with fl as (
        select
            requestUUID, visitId, position, 
            candidateInfo.docInfo.listingInfo.listingId, 
            candidateInfo.scores,
            candidateInfo.docInfo.listingInfo.activeListingBasics.priceUsd,
            candidateInfo.docInfo.listingInfo.verticaListings.shopId,
            candidateInfo.docInfo.listingInfo.verticaListings.taxonomyPath,
            candidateInfo.docInfo.listingInfo.verticaListings.tags,
            attributions,
            ctx.docInfo.queryInfo.query, 
            ctx.docInfo.queryInfo.queryTaxoDemandFeatures.*,
         from `etsy-ml-systems-prod.attributed_instance.query_pipeline_web_organic_2023_07_03`, unnest(contextualInfo) as ctx
         where ctx.docInfo.queryInfo.query is not null
         order by requestUUID, position
    ),
    query_gms as (
      select query, sum(attributed_gms) as gms
      from `etsy-data-warehouse-prod.search.query_sessions_new`
      where _date = date('2023-07-03')
      group by query
    )
    select fl.*, gms
    from fl
    left join query_gms
    on fl.query = query_gms.query
)
```

```sql
with query_with_taxoes as (
  select *
  from `etsy-search-ml-dev.yzhang.taxo_diversity_raw_train_web_07_03`
  where array_length(purchaseLevel2TaxonomyPaths) > 0
),
query_gms as (
  select distinct query, gms
  from query_with_taxoes
)
select sum(gms) from query_gms
```

queries with purchased taxonomies on one of the days in training data

+ total: 827734 queries, 99038566 GMS
+ have purchase taxos: 290207 queries (35%), 62946653 GMS (64%)
+ have more than 1 purchased taxoes: 75488 queries (9%), 30887075 GMS (31%)

In [4]:
%%bigquery df
select query, attributions, purchaseLevel2TaxonomyPaths, purchaseLevel2TaxonomyCounts
from `etsy-search-ml-dev.yzhang.taxo_diversity_raw_train_web_07_03`
where array_length(purchaseLevel2TaxonomyPaths) > 1

Query complete after 0.02s: 100%|█████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 1053.78query/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████| 21314469/21314469 [01:16<00:00, 280189.96rows/s]


In [5]:
df.head()

Unnamed: 0,query,attributions,purchaseLevel2TaxonomyPaths,purchaseLevel2TaxonomyCounts
0,barbie svg,[no_event],"[craft_supplies_and_tools.canvas_and_surfaces,...","[588, 367, 239, 178, 2, 1, 1, 1, 1]"
1,cribbage board,[no_event],"[toys_and_games.games_and_puzzles, weddings.gi...","[157, 13, 4, 1]"
2,bucket hat,[no_event],"[accessories.hats_and_caps, craft_supplies_and...","[177, 23, 20, 1, 1]"
3,wall art,[no_event],"[art_and_collectibles.prints, home_and_living....","[185, 120, 28, 3, 2, 2]"
4,engagement ring,[no_event],"[jewelry.rings, craft_supplies_and_tools.tools...","[204, 2, 1]"


In [15]:
df.purchaseLevel2TaxonomyPaths[0]

array(['craft_supplies_and_tools.canvas_and_surfaces',
       'art_and_collectibles.drawing_and_illustration',
       'art_and_collectibles.prints',
       'craft_supplies_and_tools.patterns_and_how_to',
       'paper_and_party_supplies.paper',
       'craft_supplies_and_tools.knives_and_cutting_tools',
       'clothing.gender_neutral_adult_clothing',
       'paper_and_party_supplies.party_supplies', ''], dtype=object)

In [13]:
df.purchaseLevel2TaxonomyCounts[0]

array([588, 367, 239, 178,   2,   1,   1,   1,   1])

In [17]:
print(len(df.purchaseLevel2TaxonomyPaths[0]))
print(len(df.purchaseLevel2TaxonomyCounts[0]))
print(df.purchaseLevel2TaxonomyPaths[0][-1])

9
9



Attributions

In [8]:
tmp = list(df.attributions.values)
labels = [item for sublist in tmp for item in sublist]

In [10]:
len(labels)

21407343

In [11]:
Counter(labels)

Counter({'no_event': 20696937,
         'click': 615718,
         'fav': 6399,
         'cart_add': 62463,
         'purchase': 25826})

Distribution of number of taxonomies per query

In [18]:
df['feature_length'] = df.purchaseLevel2TaxonomyPaths.apply(lambda x: len(x))

In [19]:
df.feature_length.plt(kind='hist')

AttributeError: 'Series' object has no attribute 'plt'

In [48]:
df_diverse = df[df.feature_length >= 3]
df_others = df[df.feature_length < 3]

In [44]:
print(len(df['query'].unique()))
print(len(df_diverse['query'].unique()))

367642
28893


In [46]:
print(df.gms.sum())
print(df_diverse.gms.sum())

70549707
17684844


In [50]:
df[df.feature_length >= 2].gms.sum()

28716140

+ 20% queries have at least 2 purchased taxoes, account for 40% GMS
+ 8% queries have at least 3 purchased taxoes, account for 25% GMS
+ queries with more high-demand taxonomies appear to account for larger percent of GMS / query

In [56]:
diverse_query_length = df_diverse["query"].apply(lambda x: len(x.split(" ")))
others_query_length = df_others["query"].apply(lambda x: len(x.split(" ")))

In [59]:
diverse_query_length.value_counts()

query
2    13867
3     8688
1     4607
4     1556
5      163
6        8
8        4
Name: count, dtype: int64

In [60]:
others_query_length.value_counts()

query
2     142453
3     129735
4      32969
1      29180
5       4013
6        356
7         29
9          8
8          4
11         1
10         1
Name: count, dtype: int64

In [62]:
df_others["query"]

0               playboy wallet
1                      playdoh
3            playboy tooth gem
8                  playdoh mat
9                     playdate
                  ...         
367637     3 pin anderson plug
367638    kuromi car accessory
367639        vin number plate
367640               vin plate
367641      chevy black emblem
Name: query, Length: 338749, dtype: object

# Generated attribution scores

In [None]:
path_file = "gs://training-dev-search-data-jtzn/neural_ranking/third_pass/temp/1689139587703/908724743918/nr-third-pass-taxo-diversity-exp4-development-1689139587703/beam-generate_3598403727498674176/examples_output_dir"
path_file = f"{path_file}/train/part-00000-of-15624.tfrecord"
dataset = tf.data.TFRecordDataset(path_file)

for raw_record in train_dataset.take(1000):
    example = tf.train.Example().FromString(raw_record.numpy())
    
    for feature_i in example.features.feature:
        print(feature_i)
        if 'attributions_score' in feature_i or 'TaxonomyPaths' in feature_i:
            print(example.features.feature)
            length = len(example.features.feature[feature_i].feature)
            for k in range(length):
                feature_value = example.features.feature[feature_i].feature[k].float_list.value
                if 'attributions_score' in feature_i:
                    values_score.extend(feature_value)
                if 'purchaseLevel2TaxonomyPaths:overlap' in feature_i:
                    level2_purchase_overlap.extend(feature_value)
                if 'clickLevel2TaxonomyPaths:overlap' in feature_i:
                    level2_click_overlap.extend(feature_value)
                if 'clickLevel2TaxonomyPaths:cosine' in feature_i:
                    level2_click_cosine.extend(feature_value)

In [None]:
values_score = []
top_purchase_overlap = []
top_click_overlap = []
level2_purchase_overlap = []
level2_click_overlap = []
level2_click_cosine = []