In [1]:
import copy
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import tensorflow as tf
from google.cloud import bigquery

2024-11-05 10:37:10.467999: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Run configuration.
# DATE_STR selects the feature-bank snapshot and the model directory;
# DATE_LIST holds the result-pair dates that get sampled.
#
# Earlier configuration, kept for reference:
# DATE_STR = "2024-09-16"
# DATE_LIST = [d.strftime('%Y-%m-%d') for d in pd.date_range('2024-09-01','2024-09-10')]

DATE_STR = "2024-10-22"
DATE_LIST = pd.date_range('2024-10-23', '2024-11-03').strftime('%Y-%m-%d').tolist()

In [3]:
# Location of the SavedModel under the DATE_STR model directory
# (the directory name uses underscores instead of dashes).
NIR_PATH = (
    "gs://etldata-prod-search-ranking-data-hkwv8r/data/shared/neural_ir/30d/"
    "large_voc_huge_hidden_assym/models/"
    f"{DATE_STR.replace('-', '_')}"
    "/training_dir/checkpoints/saved_model"
)
nir_model = tf.saved_model.load(NIR_PATH)

In [4]:
def sample_pairs_from_date(date_str):
    """Sample query/listing result pairs for one day, joined with shop names.

    The query unions the web and BOE ``results_pair_fl_*`` tables for
    ``_date = date_str`` and joins them on ``listingId`` to the listing
    feature bank snapshot named by the notebook-level ``DATE_STR``. Only
    listings with a non-empty shop name and rows with a non-null query are
    kept. Rows are then filtered with ``rand() > 0.9`` and capped at 50000,
    so repeated calls return different samples.

    Args:
        date_str: Day to sample, formatted as ``YYYY-MM-DD``.

    Returns:
        DataFrame with columns ``_date``, ``query``, ``requestUUID``,
        ``target``, ``rpSource``, ``listingId`` and ``shopName``.
    """
    sql = f"""with lfb_clean as (
      SELECT 
        key AS listingId,
        IFNULL(verticaSellerBasics_shopName, "") shopName,
      FROM `etsy-ml-systems-prod.feature_bank_v2.listing_feature_bank_{DATE_STR}`
      where verticaSellerBasics_shopName is not null
      and verticaSellerBasics_shopName != ""
    ),
    results_pair as(
        SELECT distinct _date, query, listingId, requestUUID, target, 'web' rpSource 
        FROM `etsy-search-kubeflow-dev.behavioral_data.results_pair_fl_web`
        where _date = date('{date_str}')
        UNION ALL
        SELECT distinct _date, query, listingId, requestUUID, target, 'boe' rpSource 
        FROM `etsy-search-kubeflow-dev.behavioral_data.results_pair_fl_boe`
        where _date = date('{date_str}')
    )
    select 
      _date, query, requestUUID, target, rpSource,
      lfb_clean.*
    from results_pair
    join lfb_clean
    using (listingId)
    where query is not null
    and rand() > 0.9
    limit 50000
    """

    bq_client = bigquery.Client(project="etsy-bigquery-adhoc-prod")
    # Submit the job, block until it finishes, then materialise the rows.
    return bq_client.query(sql).result().to_dataframe()


def compute_nir_cosine_score(df, nir_model, batch_size=256):
    """Add the cosine similarity between query and shop-name embeddings.

    Both the ``query`` and ``shopName`` columns are embedded with the model's
    ``embed_raw_queries`` signature. The embeddings are L2-normalised and
    multiplied row-wise, so each score is the cosine of one query/shop pair.

    Args:
        df: DataFrame with string columns ``query`` and ``shopName``.
        nir_model: Loaded model exposing ``signatures["embed_raw_queries"]``,
            whose result contains an ``"embedding"`` entry.
        batch_size: Number of rows sent to the model per call; must be > 0.

    Returns:
        A copy of ``df`` with an extra ``query_shop_cosine`` column. The input
        frame is left untouched.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    embed = nir_model.signatures["embed_raw_queries"]
    n_rows = df.shape[0]
    batch_scores = []

    # Stepping by batch_size yields exactly ceil(n_rows / batch_size) batches.
    # The previous `n_rows // batch_size + 1` produced a trailing empty batch
    # when n_rows was an exact multiple of batch_size (or zero), and that
    # empty batch was still sent to the model.
    for start_idx in range(0, n_rows, batch_size):
        curr_df = df.iloc[start_idx:start_idx + batch_size, :]

        query_embeddings = embed(
            tf.constant(list(curr_df['query']), tf.string)
        )["embedding"]

        shop_name_embeddings = embed(
            tf.constant(list(curr_df['shopName']), tf.string)
        )["embedding"]

        normalize_query = tf.math.l2_normalize(query_embeddings, axis=1)
        normalize_shop = tf.math.l2_normalize(shop_name_embeddings, axis=1)
        # Dot product of unit vectors == cosine similarity, one value per row.
        cos_score = tf.math.reduce_sum(tf.multiply(normalize_query, normalize_shop), axis=1)
        batch_scores.append(tf.reshape(cos_score, [-1]).numpy())

    # Concatenate once at the end instead of re-copying the running array on
    # every iteration. An empty input gets an empty float column.
    if batch_scores:
        all_cos_scores = np.concatenate(batch_scores, axis=None)
    else:
        all_cos_scores = np.array([], dtype=float)

    new_df = df.copy()
    new_df['query_shop_cosine'] = all_cos_scores
    return new_df


def get_shop_name_queries_examples(date_list, nir_model, threshold=0.8):
    """Collect sampled pairs whose query closely matches the shop name.

    For every date, a sample is pulled with ``sample_pairs_from_date`` and
    scored with ``compute_nir_cosine_score`` (batch size 512). Rows with
    ``query_shop_cosine >= threshold`` are kept.

    Args:
        date_list: Iterable of ``YYYY-MM-DD`` date strings to sample.
        nir_model: Model forwarded to ``compute_nir_cosine_score``.
        threshold: Minimum cosine score for a row to be kept.

    Returns:
        One DataFrame holding the kept rows of all dates with a fresh
        0..n-1 index, or ``None`` when ``date_list`` is empty.
    """
    kept_frames = []

    for date_str in tqdm(date_list):
        df = sample_pairs_from_date(date_str)
        new_df = compute_nir_cosine_score(df, nir_model, batch_size=512)
        kept_frames.append(new_df[new_df.query_shop_cosine >= threshold])

    # Nothing to concatenate: keep the historical "no dates -> None" result
    # (pd.concat raises on an empty list).
    if not kept_frames:
        return None

    # Single concat instead of growing the frame inside the loop; this also
    # resets the index uniformly, including when only one date was requested.
    return pd.concat(kept_frames, ignore_index=True)

In [5]:
# Build the example set over all configured dates (default score threshold).
df = get_shop_name_queries_examples(date_list=DATE_LIST, nir_model=nir_model)

  0%|          | 0/12 [00:00<?, ?it/s]



In [6]:
# (rows, columns) of the examples returned by get_shop_name_queries_examples.
df.shape

(2343, 8)

In [7]:
# Eyeball the first query / shop-name pairs that passed the threshold.
df.loc[:, ["query", "shopName"]].head(20)

Unnamed: 0,query,shopName
0,beckyzee38104,BeckyZee38104
1,afrocosmetics,ShopAfroCosmetics
2,lanufaktur,LaNuFaktur
3,grafikaprintco,GrafikaPrintCo
4,butterflyblues,ButterflyBluesCroche
5,frankieprintco,FrankiePrintCo
6,hsinchuen,hsinchuen
7,elliearmstrongart,EllieArmstrongArt
8,cloudyberryco,cloudyberryco
9,bonpatterns,bonpatterns


In [8]:
# Number of distinct shop names among the examples (a missing value, if any,
# counts as one distinct value, matching len(unique())).
df["shopName"].nunique(dropna=False)

1943

In [9]:
# Distinct listing ids among the examples; reused below to look up listing features.
unique_listing_ids = list(df["listingId"].unique())
len(unique_listing_ids)

2173

In [10]:
# Fetch listing text features (title, description, taxonomy, attributes, tags,
# shop name, description n-grams) from the DATE_STR feature-bank snapshot for
# the sampled listings. Listings with an empty title, description, attributes
# or shop name are excluded by the lfb_clean filter.
#
# The listing ids are passed as an array query parameter instead of being
# spliced into the SQL text: an empty id list no longer produces the invalid
# clause `in ()`, and the query text does not grow with the sample size.
# Assumes listing ids are integers, as the previously inlined literals were.
query_str = f"""with lfb as (
    select
        key AS listingId,
        IFNULL(verticaListings_title, "") listingTitle,
        IFNULL(verticaListings_description, "") listingDescription,
        IFNULL(verticaListings_taxonomyPath, "") listingTaxo,
        (SELECT STRING_AGG(element, ';') FROM UNNEST(kbAttributesV2_sellerAttributesV2.list)) AS listingAttributes,
        IFNULL(verticaListings_tags, "") listingTags,
        IFNULL(verticaSellerBasics_shopName, "") shopName,
        (SELECT STRING_AGG(element, ';') FROM UNNEST(descNgrams_ngrams.list)) AS descNgrams
    FROM `etsy-ml-systems-prod.feature_bank_v2.listing_feature_bank_{DATE_STR}`
),
lfb_clean as (
    select *
    from lfb
    where listingTitle != ""
    and listingDescription != ""
    and listingAttributes != ""
    and shopName != ""
)
select *
from lfb_clean
where listingId in unnest(@listing_ids)
"""

job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter(
            "listing_ids", "INT64", [int(x) for x in unique_listing_ids]
        )
    ]
)

client = bigquery.Client(project="etsy-bigquery-adhoc-prod")
query_job = client.query(query_str, job_config=job_config)
rows = query_job.result()
fb_df = rows.to_dataframe()



In [11]:
# Attach listing features to the examples. Inner join: examples whose listing
# is missing from fb_df are dropped. Both frames carry a shopName column, so
# pandas emits them as shopName_x (from df) and shopName_y (from fb_df).
merged_df = df.merge(fb_df, on="listingId", how="inner")

In [12]:
# (rows, columns) after the inner merge; examples without a match in fb_df are gone.
merged_df.shape

(1519, 15)

In [13]:
# Preview the first rows of the merged examples + listing features.
merged_df.head()

Unnamed: 0,_date,query,requestUUID,target,rpSource,listingId,shopName_x,query_shop_cosine,listingTitle,listingDescription,listingTaxo,listingAttributes,listingTags,shopName_y,descNgrams
0,2024-10-23,afrocosmetics,bd110951-b74e-49f1-a110-4926611fd9d8,fav,boe,1075778478,ShopAfroCosmetics,0.81475,Raw African Black Soap Liquid 100% Pure Natura...,Raw African Black Soap Liquid 100% Pure Natura...,bath_and_beauty.soaps.body_washes_and_liquid_s...,Secondary color#Brown;Primary color#Black,.african black soap.raw black soap.soap.shea b...,ShopAfroCosmetics,raw;african;black;soap;liquid;pure;natural;org...
1,2024-10-23,grafikaprintco,5f9867ae-8f73-4a27-b244-158feb879cd5,fav,boe,1522727158,GrafikaPrintCo,1.0,Emerald Green Matisse Leaf Throw Blanket | Fun...,♥ 𝙚𝙭𝙘𝙚𝙡𝙡𝙚𝙣𝙩 𝙜𝙞𝙛𝙩 𝙞𝙙𝙚𝙖 ♥\n\n• Perfect gift to b...,home_and_living.bedding.blankets_and_throws,Primary color#Green,.danish pastel.girl room decor.housewarming gi...,GrafikaPrintCo,perfect;gift;warmth;kid;girlfriend;lover;frien...
2,2024-10-23,butterflyblues,656e8026-15b0-4caf-b699-86243fd4f76b,fav,boe,1724416250,ButterflyBluesCroche,0.807735,Spider Web Top PATTERN PDF (with video) not th...,[This is a DIGITAL PDF - NOT the physical item...,craft_supplies_and_tools.patterns_and_how_to.p...,Craft type#Crochet;Holiday#Halloween,.crochet.pattern.tutorial.how to make.handmade...,ButterflyBluesCroche,digital;pdf;physical;item;ite;digital pdf;phys...
3,2024-10-23,frankieprintco,f8b04705-fc1e-4a32-8248-5cd75209e71e,fav,boe,1769696902,FrankiePrintCo,1.0,Dinosaur Sweater: Colorful Maximalist Knit Cre...,This cozy crewneck knit sweater features a one...,clothing.womens_clothing.sweaters.pullover_swe...,Neckline#Crew;Hand knit#No;Hooded#No,.Knit Crewneck.Frankie Print Co.Unique Whimsic...,FrankiePrintCo,cozy;crewneck;knit;sweater;kind;design;everyda...
4,2024-10-23,bonpatterns,8dc87472-2f14-4204-b51e-7ceb226de500,click,web,894949439,bonpatterns,1.0,Bunny sewing pattern pdf Instant Download Sewi...,Download this sewing pattern for our sweet lit...,craft_supplies_and_tools.patterns_and_how_to.t...,Craft type#Sewing;Craft type#Doll making;Occas...,.easy sewing pattern.soft toy pattern pdf.soft...,bonpatterns,download;sewing;pattern;sweet;little;bunny;dol...


In [14]:
# Write the merged examples to a CSV in the working directory, named after
# DATE_STR. No index column; backslash is used as the escape character.
merged_df.to_csv(
    path_or_buf=f"shop_name_query_real_{DATE_STR}_dn.csv",
    index=False,
    escapechar='\\',
)