In [1]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
import uuid

In [2]:
# Pull the complete (pre-sampling) request table, ordered so that the listings of
# one request for one query are contiguous and sorted by rank.
query_str = """select *
from `etsy-search-ml-dev.search.yzhang_emqueries_issue_problem_requests_full`
order by query, mmxRequestUUID, listingRank
"""

client = bigquery.Client(project="etsy-search-ml-dev")
full_df = client.query(query_str).result().to_dataframe()


# Final hand-picked list of key queries, one query per line.
with open("./exact_match_key_queries_prototype.txt", "r") as queries_file:
    final_key_queries = queries_file.read().splitlines()


# Keep only the rows whose query is in the final list.
is_key_query = full_df["query"].isin(final_key_queries)
selected_df = full_df[is_key_query]

In [3]:
# Sanity check: (rows, columns) of the table restricted to the final key queries.
selected_df.shape

(4800, 63)

### cleaning

In [4]:
# Columns carried forward from the full request table.
keep_columns = [
    "tableUUID", "query", "queryEn", "queryBin", "qisClass", "userCountry",
    "listingId", "listingRank",
    "llm_final_label", "llm_consensus_type",
    "listingImageUrls",
    "listingTitle", "listingTitleEn",
    "listingDescription", "listingDescriptionEn",
    "listingAttributes",
    "listingVariations",
]

# new column -> (translated source column, original-language fallback column)
translation_fallbacks = {
    "en_query": ("queryEn", "query"),
    "en_title": ("listingTitleEn", "listingTitle"),
    "en_desc": ("listingDescriptionEn", "listingDescription"),
}

# Create a unique ID for each row. Work on an explicit copy so the assignment
# does not write into a slice of `full_df`.
selected_df = selected_df.copy()
selected_df["tableUUID"] = [str(uuid.uuid4()) for _ in range(len(selected_df))]

# Restrict to the needed columns and fill missing values with empty strings.
# `fillna` returns a new frame here (no `inplace`), so the column assignments
# below operate on an independent object.
selected_df = selected_df[keep_columns].fillna("")

# Prefer the English translation; fall back to the original text when the
# translation is empty. Vectorized with `Series.where` instead of a row-wise
# `apply`, which is slow and does not return a Series on an empty frame.
for en_column, (translated_column, original_column) in translation_fallbacks.items():
    translated = selected_df[translated_column]
    selected_df[en_column] = translated.where(translated != "", selected_df[original_column])

# Clean listing attributes to V2 format: '#' becomes ':' and everything is lower-cased.
# `regex=False` makes the literal replacement explicit regardless of pandas version.
selected_df["listingAttributes"] = (
    selected_df["listingAttributes"].str.replace("#", ":", regex=False).str.lower()
)

# Split the ';'-separated image URLs into the hero (first) URL and the remaining ones.
image_url_parts = selected_df["listingImageUrls"].str.split(";")
selected_df["listingHeroImageUrl"] = image_url_parts.str[0]
selected_df["listingOtherImageUrls"] = image_url_parts.str[1:].str.join(";")

### sample listings

In [5]:
# Sample up to 3 listings per query: at most 2 top-ranked listings labelled
# partial / not_relevant, plus 1 lower-ranked listing labelled relevant.
concensus_weight_map = {"5-0": 1, "4-1": 2, "3-2": 2}  # LLM-uncertain rows get sampled with higher prob
TOP_RANK_CUTOFF = 12  # listingRank below this value counts as a "top" listing
SAMPLING_SEED = 42    # fixed seed so Restart & Run All reproduces the same sample
sampling_rng = np.random.RandomState(SAMPLING_SEED)


def _weighted_sample(candidates, n, random_state):
    """Draw up to `n` rows from `candidates`, weighted by LLM consensus type.

    Parameters
    ----------
    candidates : pd.DataFrame
        Rows eligible for sampling; must have an `llm_consensus_type` column.
    n : int
        Maximum number of rows to draw (capped at `len(candidates)`).
    random_state : np.random.RandomState
        Shared random state, so successive calls draw from one seeded stream.

    Returns
    -------
    pd.DataFrame or None
        The sampled rows, or None when `candidates` is empty.

    Raises
    ------
    ValueError
        If a row's consensus type has no entry in `concensus_weight_map`.
    """
    if candidates.empty:
        return None

    weights = candidates["llm_consensus_type"].map(concensus_weight_map)
    missing = weights.isna()
    if missing.any():
        # Fail with an explicit message instead of passing NaN weights to pandas.
        unknown = sorted(candidates.loc[missing, "llm_consensus_type"].astype(str).unique())
        raise ValueError(f"No sampling weight defined for llm_consensus_type values: {unknown}")

    # pandas normalizes the weights to sum to 1; positional values avoid index alignment.
    return candidates.sample(
        n=min(len(candidates), n), weights=weights.values, random_state=random_state
    )


# Collect the per-query samples and concatenate once at the end
# (growing a frame with concat inside the loop is quadratic).
per_query_samples = []

for curr_query in final_key_queries:
    curr_df = selected_df[selected_df["query"] == curr_query]

    # at most 2 top listings that are partial/irrelevant
    top_df = curr_df[
        (curr_df["listingRank"] < TOP_RANK_CUTOFF)
        & curr_df["llm_final_label"].isin(["partial", "not_relevant"])
    ]
    top_sample = _weighted_sample(top_df, n=2, random_state=sampling_rng)

    # 1 bottom listing that is fully relevant
    bottom_df = curr_df[
        (curr_df["listingRank"] >= TOP_RANK_CUTOFF)
        & (curr_df["llm_final_label"] == "relevant")
    ]
    bottom_sample = _weighted_sample(bottom_df, n=1, random_state=sampling_rng)

    curr_samples = [s for s in (top_sample, bottom_sample) if s is not None]
    if not curr_samples:
        # Report queries with no eligible listing. The original printed the
        # undefined name `query`, which raised NameError on this path.
        print(curr_query)
        continue
    per_query_samples.extend(curr_samples)

# Stays None when no query produced a sample, matching the previous behaviour.
output_sample = pd.concat(per_query_samples) if per_query_samples else None

In [6]:
# Sanity check: (rows, columns) of the combined per-query sample.
output_sample.shape

(297, 22)

### final formatting and output

In [7]:
# Final column order for the export; the "en_*" columns are renamed below.
final_columns = [
    "tableUUID", "en_query", "queryBin", "qisClass", "userCountry",
    "listingId", "listingHeroImageUrl", "en_title", "en_desc",
    "listingAttributes", "listingVariations", "listingOtherImageUrls",
    "llm_final_label", "llm_consensus_type",
]
export_names = {
    "en_query": "queryRaw",
    "en_title": "listingTitle",
    "en_desc": "listingDescription",
}
output_sample = output_sample[final_columns].rename(columns=export_names)

# This round has no spell-corrected queries from the rpc logs, so `query` is a
# straight copy of `queryRaw`; going forward `query` should hold the
# spell-corrected text.
output_sample.insert(loc=1, column="query", value=output_sample["queryRaw"].values)

In [8]:
# Build a listing-viewer link for every sampled listing and insert it as
# column 7, i.e. directly after `listingId`.
viewer_url_prefix = "https://atlas.etsycorp.com/recsys/listing_viewer.php?listing_ids="
viewer_url_suffix = "&json_key="
listing_viewer_urls = output_sample["listingId"].map(
    lambda listing_id: viewer_url_prefix + str(listing_id) + viewer_url_suffix
)
output_sample.insert(loc=7, column="listingUrl", value=listing_viewer_urls.values)

In [9]:
# Confirm the final column names and their order before exporting.
output_sample.columns

Index(['tableUUID', 'query', 'queryRaw', 'queryBin', 'qisClass', 'userCountry',
       'listingId', 'listingUrl', 'listingHeroImageUrl', 'listingTitle',
       'listingDescription', 'listingAttributes', 'listingVariations',
       'listingOtherImageUrls', 'llm_final_label', 'llm_consensus_type'],
      dtype='object')

In [10]:
# Local CSV export (row index omitted). At this point the frame still contains
# the llm_final_label / llm_consensus_type columns.
output_sample.to_csv("./exact_match_key_queries_for_v4_2.csv", index=False)

In [11]:
# Remove the LLM label columns from the frame.
llm_columns = ["llm_final_label", "llm_consensus_type"]
output_sample = output_sample.drop(columns=llm_columns)

In [12]:
# Explicit BigQuery schema for the uploaded table, listed in the same order as
# the DataFrame columns. `listingUrl` is declared as well: the uploaded frame
# contains that column, and declaring it keeps its type from depending on
# client-side inference.
bq_schema = [
    bigquery.SchemaField(name="tableUUID", field_type="STRING", mode="REQUIRED"),
    bigquery.SchemaField(name="query", field_type="STRING", mode="REQUIRED"),
    bigquery.SchemaField(name="queryRaw", field_type="STRING", mode="NULLABLE"),
    bigquery.SchemaField(name="queryBin", field_type="STRING", mode="NULLABLE"),
    bigquery.SchemaField(name="qisClass", field_type="STRING", mode="NULLABLE"),
    bigquery.SchemaField(name="userCountry", field_type="STRING", mode="NULLABLE"),
    bigquery.SchemaField(name="listingId", field_type="INT64", mode="REQUIRED"),
    bigquery.SchemaField(name="listingUrl", field_type="STRING", mode="NULLABLE"),
    bigquery.SchemaField(name="listingHeroImageUrl", field_type="STRING", mode="NULLABLE"),
    bigquery.SchemaField(name="listingTitle", field_type="STRING", mode="NULLABLE"),
    bigquery.SchemaField(name="listingDescription", field_type="STRING", mode="NULLABLE"),
    bigquery.SchemaField(name="listingAttributes", field_type="STRING", mode="NULLABLE"),
    bigquery.SchemaField(name="listingVariations", field_type="STRING", mode="NULLABLE"),
    bigquery.SchemaField(name="listingOtherImageUrls", field_type="STRING", mode="NULLABLE"),
]

In [13]:
# Destination table for the sampled key-query listings.
output_table = "etsy-search-ml-dev.search.exact_match_key_queries_for_v4_2"
client = bigquery.Client(project="etsy-search-ml-dev")

# Load with the explicit schema; the write disposition is WRITE_TRUNCATE.
job_config = bigquery.LoadJobConfig(schema=bq_schema, write_disposition="WRITE_TRUNCATE")
upload_job = client.load_table_from_dataframe(output_sample, output_table, job_config=job_config)

# Wait for the load job to finish.
upload_job.result()

LoadJob<project=etsy-search-ml-dev, location=US, id=6daf4a7e-9598-425a-90d6-2660e9ac7902>