In [1]:
# Set the active gcloud project (the core/project property) before running any queries.
!gcloud config set project etsy-bigquery-adhoc-prod


If you have a compatible Python interpreter installed, you can use it by setting
the CLOUDSDK_PYTHON environment variable to point to it.

Updated property [core/project].


In [2]:
# Load the IPython extension from google.cloud.bigquery; the %%bigquery cell further down depends on it.
%load_ext google.cloud.bigquery

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import pandas as pd
import numpy as np

## Coverage in FBV2

```sql
-- Sum of queryLevelMetrics_gms over rows of the 2023-10-01 feature bank snapshot
-- whose purchase top taxonomy list is non-null and has a non-null first entry.
select sum(queryLevelMetrics_gms)
from `etsy-ml-systems-prod.feature_bank_v2.query_feature_bank_2023-10-01`
where queryTaxoDemandFeatures_purchaseTopTaxonomyPaths is not null
and queryTaxoDemandFeatures_purchaseTopTaxonomyPaths.list[0] is not null
```

```sql
-- Sum of gms from query_level_metrics_raw over every key in the 2023-10-01 feature
-- bank snapshot. Keys without a matching query_raw yield NULL gms (left join),
-- which sum() ignores.
select sum(q.gms)
from `etsy-ml-systems-prod.feature_bank_v2.query_feature_bank_2023-10-01` fb
left join `etsy-data-warehouse-prod.rollups.query_level_metrics_raw` q
on fb.key = q.query_raw
```

## Coverage in a day of FL data

```sql
-- Builds a table of FL rows (one per element of contextualInfo that carries a
-- non-null query) with the query's taxonomy demand features and its gms attached.
create or replace table `etsy-sr-etl-prod.yzhang.web_fl_train_1002`
as (
    with fl as (
      select
          requestUUID, visitId, position,
          candidateInfo.docInfo.listingInfo.listingId,
          candidateInfo.docInfo.listingInfo.verticaListings.taxonomyPath,
          ctx.docInfo.queryInfo.query,
          ctx.docInfo.queryInfo.queryTaxoDemandFeatures.*,
          attributions,
      from `etsy-ml-systems-prod.attributed_instance.query_pipeline_web_organic_2023_10_02`,
          unnest(contextualInfo) as ctx
      where ctx.docInfo.queryInfo.query is not null
      -- No ORDER BY here: row order inside a CTE does not carry through the join
      -- below or into the created table, so sorting would only add cost.
    ),
    -- gms per raw query string, used as the join target below.
    query_gms as (
      select query_raw, gms
      from `etsy-data-warehouse-prod.rollups.query_level_metrics_raw`
    )
    select
      fl.*, query_gms.gms
    from fl
    -- Left join keeps FL rows whose query has no gms entry (gms is NULL for those).
    left join query_gms
    on fl.query = query_gms.query_raw
)
```

```sql
-- Number of distinct queries in the FL table that have at least one purchase top taxonomy path.
SELECT count(distinct query)
FROM `etsy-sr-etl-prod.yzhang.web_fl_train_1002`
where array_length(purchaseTopTaxonomyPaths) > 0
```

+ Queries with purchased taxonomies: 331904 / 925234 (36%)
+ Accounting for GMS 101571374484320 / 107290298625912 (94.7%)

+ Queries with clicked taxonomies purchase: 593278 / 925234 (64%)
+ accounting for GMS: 101857085524361 / 107290298625912 (94.9%)

## Coverage in 3 days of RPC data

```sql
-- Sum of gms over the distinct RPC queries whose purchase_top_paths is non-null
-- and has a non-null first entry.
with rpc_query as (
  select distinct query
  from `etsy-sr-etl-prod.yzhang.query_taxo_lastpass_rpc` rpc
  where purchase_top_paths is not null
  and purchase_top_paths.list[0] is not null
),
-- Attach gms to each distinct query; queries without a match keep NULL gms.
qgms as (
  select query, q.gms
  from rpc_query
  left join `etsy-data-warehouse-prod.rollups.query_level_metrics_raw` q
  on rpc_query.query = q.query_raw
)
select sum(gms) from qgms
```

## How positively correlated are the query demand features and queryBuyerTaxoDresden?

+ Number of queries
    + query demand purchase top taxo available: 4861275 / 1994917889 = 0.24%
    + query buyer taxo dresden available: 734605469 / 1994917889 = 37%
    + both available: 4652145 / 1994917889 = 0.23%

+ query level GMS
    + query demand purchase top taxo available: 123699245786 / 243526671508 = 51%
    + query buyer taxo dresden available: 188767010402 / 243526671508 = 78%
    + both available: 123289689477 / 243526671508 = 51%

In [4]:
%%bigquery df
-- Load into `df` every query from the most recent feature bank snapshot that has
-- both a non-empty purchase level-2 demand taxonomy list and a Dresden taxonomy path.
-- NOTE(review): demand_top is selected but not filtered for null/empty here, while
-- the later cleaning cell indexes into it unconditionally — confirm it is always
-- populated whenever the level-2 list is.
select 
    `key` as query_str,
    queryLevelMetrics_gms as query_gms,
    queryTaxoDemandFeatures_purchaseTopTaxonomyPaths as demand_top,
    queryTaxoDemandFeatures_purchaseLevel2TaxonomyPaths as demand_level2,
    queryBuyerTaxoDresden_taxoPath as dresden
from `etsy-ml-systems-prod.feature_bank_v2.query_feature_bank_most_recent`
where queryTaxoDemandFeatures_purchaseLevel2TaxonomyPaths is not null
and array_length(queryTaxoDemandFeatures_purchaseLevel2TaxonomyPaths.list) > 0
and queryBuyerTaxoDresden_taxoPath is not null

Query complete after 0.02s: 100%|█████████████| 3/3 [00:00<00:00, 922.23query/s]
Downloading: 100%|██████████████| 4652145/4652145 [00:11<00:00, 414231.78rows/s]


In [5]:
def process_full_path_top(taxo_path):
    """Return the top-level node of a dot-separated taxonomy path.

    Everything before the first "." is returned; a path containing no "."
    is returned unchanged.
    """
    head, _sep, _rest = taxo_path.partition(".")
    return head
        
def process_full_path_level2(taxo_path):
    """Return the first two levels of a dot-separated taxonomy path.

    A path with fewer than two levels (no "." in it) yields an empty string.
    """
    levels = taxo_path.split(".")
    if len(levels) < 2:
        return ""
    return ".".join(levels[:2])

In [6]:
# The demand columns arrive as {"list": [{"element": <path>}, ...]}; flatten each
# one into a plain list of taxonomy path strings.
df["demand_top_clean"] = df["demand_top"].apply(
    lambda struct: [entry["element"] for entry in struct["list"]]
)
df["demand_level2_clean"] = df["demand_level2"].apply(
    lambda struct: [entry["element"] for entry in struct["list"]]
)
# Truncate the full Dresden path to the same two granularities as the demand features.
df["dresden_top"] = df["dresden"].apply(process_full_path_top)
df["dresden_level2"] = df["dresden"].apply(process_full_path_level2)

In [7]:
# Preview the first rows, including the derived *_clean and dresden_* columns.
df.head()

Unnamed: 0,query_str,query_gms,demand_top,demand_level2,dresden,demand_top_clean,demand_level2_clean,dresden_top,dresden_level2
0,mom lamp,26248.0,{'list': [{'element': 'home_and_living'}]},{'list': [{'element': 'home_and_living.lightin...,home_and_living,[home_and_living],[home_and_living.lighting],home_and_living,
1,mom letter blanket,39626.0,{'list': [{'element': 'home_and_living'}]},{'list': [{'element': 'home_and_living.bedding...,home_and_living.bedding.blankets_and_throws,[home_and_living],[home_and_living.bedding],home_and_living,home_and_living.bedding
2,girl nutcracker dress,96467.0,{'list': [{'element': 'clothing'}]},{'list': [{'element': 'clothing.girls_clothing...,clothing.girls_clothing.dresses,[clothing],[clothing.girls_clothing],clothing,clothing.girls_clothing
3,girl of now sample,,{'list': [{'element': 'bath_and_beauty'}]},{'list': [{'element': 'bath_and_beauty.fragran...,bath_and_beauty.fragrances,[bath_and_beauty],[bath_and_beauty.fragrances],bath_and_beauty,bath_and_beauty.fragrances
4,happy paw tag,9460.0,{'list': [{'element': 'pet_supplies'}]},{'list': [{'element': 'pet_supplies.pet_collar...,pet_supplies.pet_collars_and_leashes.pet_id_tags,[pet_supplies],[pet_supplies.pet_collars_and_leashes],pet_supplies,pet_supplies.pet_collars_and_leashes


In [8]:
def compare_taxo_top(row):
    """Flag whether the first demand top-level taxonomy equals the Dresden one.

    Args:
        row: Object exposing ``demand_top_clean`` (a sequence of taxonomy
            strings) and ``dresden_top`` (a string), e.g. a DataFrame row.

    Returns:
        1.0 when the first entry of ``demand_top_clean`` equals
        ``dresden_top``; 0.0 otherwise, including when ``demand_top_clean``
        is empty.
    """
    demand_paths = row.demand_top_clean
    # An empty list has no first entry; count it as a mismatch rather than
    # raising IndexError part-way through a DataFrame.apply.
    if len(demand_paths) == 0:
        return 0.0
    return float(demand_paths[0] == row.dresden_top)

def compare_taxo_level2(row):
    """Flag whether the first demand level-2 taxonomy equals the Dresden one.

    Args:
        row: Object exposing ``demand_level2_clean`` (a sequence of taxonomy
            strings) and ``dresden_level2`` (a string), e.g. a DataFrame row.

    Returns:
        1.0 when the first entry of ``demand_level2_clean`` equals
        ``dresden_level2``; 0.0 otherwise, including when
        ``demand_level2_clean`` is empty.
    """
    demand_paths = row.demand_level2_clean
    # An empty list has no first entry; count it as a mismatch rather than
    # raising IndexError part-way through a DataFrame.apply.
    if len(demand_paths) == 0:
        return 0.0
    return float(demand_paths[0] == row.dresden_level2)

In [9]:
# Row-wise agreement flags between the demand and Dresden taxonomies
# (1.0 = same, 0.0 = different), one per granularity.
df["top_same"] = df.apply(compare_taxo_top, axis="columns")
df["level2_same"] = df.apply(compare_taxo_level2, axis="columns")

In [10]:
# Spot-check ten rows flagged as agreeing at the top level.
df.loc[df["top_same"] == 1, ["demand_top_clean", "dresden_top"]].head(10)

Unnamed: 0,demand_top_clean,dresden_top
0,[home_and_living],home_and_living
1,[home_and_living],home_and_living
2,[clothing],clothing
3,[bath_and_beauty],bath_and_beauty
4,[pet_supplies],pet_supplies
5,[home_and_living],home_and_living
6,[bags_and_purses],bags_and_purses
7,"[bags_and_purses, weddings]",bags_and_purses
8,[bags_and_purses],bags_and_purses
9,[bags_and_purses],bags_and_purses


In [11]:
# Spot-check ten rows flagged as agreeing at level 2.
df.loc[df["level2_same"] == 1, ["demand_level2_clean", "dresden_level2"]].head(10)

Unnamed: 0,demand_level2_clean,dresden_level2
1,[home_and_living.bedding],home_and_living.bedding
2,[clothing.girls_clothing],clothing.girls_clothing
3,[bath_and_beauty.fragrances],bath_and_beauty.fragrances
4,[pet_supplies.pet_collars_and_leashes],pet_supplies.pet_collars_and_leashes
6,[bags_and_purses.wallets_and_money_clips],bags_and_purses.wallets_and_money_clips
7,"[bags_and_purses.wallets_and_money_clips, wedd...",bags_and_purses.wallets_and_money_clips
8,[bags_and_purses.wallets_and_money_clips],bags_and_purses.wallets_and_money_clips
9,[bags_and_purses.wallets_and_money_clips],bags_and_purses.wallets_and_money_clips
12,[bags_and_purses.wallets_and_money_clips],bags_and_purses.wallets_and_money_clips
14,[electronics_and_accessories.electronics_cases...,electronics_and_accessories.electronics_cases


In [12]:
# Number of queries whose top-level taxonomies agree (the flags are 0.0/1.0).
df["top_same"].sum()

2968801.0

In [13]:
# Number of queries whose level-2 taxonomies agree (the flags are 0.0/1.0).
df["level2_same"].sum()

1857121.0

In [14]:
# Total query_gms over the queries whose top-level taxonomies agree.
df.loc[df["top_same"] == 1, "query_gms"].sum()

90752509244.0

In [15]:
# Total query_gms over the queries whose level-2 taxonomies agree.
df.loc[df["level2_same"] == 1, "query_gms"].sum()

67168836483.0

+ 2968801 / 4652145 = 64% of queries with both features have the same top-level taxonomy (comparing the first-listed demand taxonomy against Dresden's); these account for 90752509244 / 243526671508 = 37% of total query-level GMS
+ 1857121 / 4652145 = 40% of queries with both features have the same level-2 taxonomy (same comparison); these account for 67168836483 / 243526671508 = 28% of total query-level GMS

## Is it possible for a listing to be purchased but not be in the query's top taxonomy? If so, how often?