In [1]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
import uuid

## Read new problematic query data

In [2]:
# BigQuery client for the GCP project that the queries below are billed to and read from.
BQ_PROJECT = "etsy-search-ml-dev"
client = bigquery.Client(project=BQ_PROJECT)

In [3]:
# Human-annotated (query, listing) pairs. Columns are aliased in SQL so that they
# line up with the names used by the rest of the notebook.
# NOTE: GENERATE_UUID() returns a fresh random id on every execution, so the
# row_id values are not stable across re-runs of this cell.
query_str = """select 
    GENERATE_UUID() as row_id,
    raw_query as query,
    listing_id as listingId,
    final_label as gt_label_5grade
from `etsy-search-ml-dev.search.exact_match_key_queries_for_v4_2_init_labelled`
"""

# Submit the query, wait for it to finish, and load the result into pandas.
query_job = client.query(query_str)
human_anno_df = query_job.result().to_dataframe()



In [4]:
# Full (unsampled) request dataset: query-side features, listing-side features,
# and segment columns (aliased with a seg_ prefix) for every candidate pair.
query_str = """select
    platform,
    userCountry,
    userLanguage,
    query,
    queryEn,
    queryBin as seg_queryBin,
    qisClass as seg_qisClass,
    queryTaxoFullPath as seg_queryTaxoFullPath,
    queryTaxoTop as seg_queryTaxoTop,
    queryEntities_tangibleItem as tangibleItem,
    queryEntities_fandom as fandom,
    queryEntities_motif as motif,
    queryEntities_style as style,
    queryEntities_material as material,
    queryEntities_color as color,
    queryEntities_technique as technique,
    queryEntities_size as size,
    queryEntities_occasion as occasion,
    queryEntities_customization as customization,
    queryEntities_age as age,
    queryEntities_price as price,
    queryEntities_quantity as quantity,
    queryEntities_recipient as recipient,
    queryEntities,
    queryRewrites,
    queryIsGift,
    listingId,
    listingCountry,
    shop_primaryLanguage,
    listingTitle,
    listingTitleEn,
    listingTaxo,
    listingTags,
    listingAttributes,
    listingShopName,
    listingDescription,
    listingDescriptionEn,
    listingDescNgrams,
    listingImageUrls,
    listingHeroImageCaption,
    listingVariations,
    listingReviews
from `etsy-search-ml-dev.search.yzhang_emqueries_issue_problem_requests_full`
"""

# Submit the query, wait for it to finish, and load the result into pandas.
query_job = client.query(query_str)
full_df = query_job.result().to_dataframe()

In [5]:
# Attach the full feature set to each annotated (query, listingId) pair.
# A left join keeps every annotated row; pairs without a match in full_df end up
# with NaN in the feature columns.
merged_df = human_anno_df.merge(full_df, how="left", on=["query", "listingId"])

In [6]:
# Size of the merged annotation set and its 5-grade label distribution.
print(merged_df.shape)
print(merged_df["gt_label_5grade"].value_counts())

(300, 44)
gt_label_5grade
partially_relevant_substitute    101
relevant                          98
partially_relevant_complement     69
partially_relevant_neither        20
irrelevant                         9
not_sure                           3
Name: count, dtype: int64


## Clean new data

In [7]:
# Remove records with an unsure label from the golden eval set.
# The explicit .copy() detaches the filtered frame from the frame it was sliced
# from, so the in-place column assignments made on merged_df in later cells
# operate on an independent object and do not raise SettingWithCopyWarning.
merged_df = merged_df[merged_df.gt_label_5grade != "not_sure"].copy()

In [8]:
# Rename the annotators' "irrelevant" grade to "not_relevant".
is_irrelevant = merged_df["gt_label_5grade"].eq("irrelevant")
merged_df.loc[is_irrelevant, "gt_label_5grade"] = "not_relevant"

In [9]:
# Re-check size and label distribution after the cleaning steps above.
print(merged_df.shape)
print(merged_df["gt_label_5grade"].value_counts())

(297, 44)
gt_label_5grade
partially_relevant_substitute    101
relevant                          98
partially_relevant_complement     69
partially_relevant_neither        20
not_relevant                       9
Name: count, dtype: int64


In [10]:
# Tag where these rows came from, and seed the 3-grade label with a copy of the
# 5-grade label (the partial grades are collapsed in a later cell).
merged_df["anno_data_source"] = "problematic_queries_99"
merged_df["gt_label"] = merged_df["gt_label_5grade"]

# Columns that the final output selects but that neither query produces:
# create them as empty strings.
for missing_col in (
    "etsyUUID",
    "product_type_in_query",
    "product_type_in_listing_if_mismatch",
    "subsitute_complementary",
    "descriptor_mismatch",
):
    merged_df[missing_col] = ""

In [11]:
# 5-grade -> 3-grade conversion.
# Each "partially relevant" 5-grade value maps to the 3-grade label "partial",
# and its flavour is recorded in the subsitute_complementary column.
partial_subtypes = {
    "partially_relevant_substitute": "Substitute",
    "partially_relevant_complement": "Complement",
    "partially_relevant_neither": "Other",
}

is_partial = merged_df["gt_label_5grade"].isin(list(partial_subtypes))
merged_df.loc[is_partial, "gt_label"] = "partial"

for five_grade, subtype in partial_subtypes.items():
    merged_df.loc[
        merged_df["gt_label_5grade"] == five_grade, "subsitute_complementary"
    ] = subtype

In [12]:
# Distribution of the collapsed 3-grade label.
merged_df["gt_label"].value_counts()

gt_label
partial         190
relevant         98
not_relevant      9
Name: count, dtype: int64

## Make 5-grade gt label for partial purchases

In [13]:
# V4.1 golden eval data, stored as JSON Lines (one record per line).
V4_1_EVAL_PATH = "gs://training-dev-search-data-jtzn/semantic_relevance/datasets/v4_eval_golden_data/eval_data_v4-1_majority.jsonl"
old_df = pd.read_json(V4_1_EVAL_PATH, lines=True)

In [14]:
# Every V4.1 row starts with an empty 5-grade label; only rows whose
# anno_data_source is "partial_purchases" are filled in below.
old_df["gt_label_5grade"] = ""

pp_mask = old_df["anno_data_source"].eq("partial_purchases")
gt_rel_mask = old_df["gt_label"].eq("relevant")
gt_par_mask = old_df["gt_label"].eq("partial")
gt_irr_mask = old_df["gt_label"].eq("not_relevant")

# First pass: map each 3-grade label to a 5-grade value. "partial" defaults to
# "partially_relevant_neither" and is refined in the second pass.
for label_mask, five_grade in (
    (gt_rel_mask, "relevant"),
    (gt_par_mask, "partially_relevant_neither"),
    (gt_irr_mask, "not_relevant"),
):
    old_df.loc[pp_mask & label_mask, "gt_label_5grade"] = five_grade

# Second pass: partial rows explicitly marked Substitute / Complement get the
# matching 5-grade value; any other marker keeps the "neither" default.
for subtype, five_grade in (
    ("Substitute", "partially_relevant_substitute"),
    ("Complement", "partially_relevant_complement"),
):
    old_df.loc[
        pp_mask & gt_par_mask & old_df["subsitute_complementary"].eq(subtype),
        "gt_label_5grade",
    ] = five_grade

In [15]:
# Restrict to the partial_purchases rows and compare the 3-grade and 5-grade
# label distributions.
pp_df = old_df.loc[old_df["anno_data_source"] == "partial_purchases"]
for label_col in ("gt_label", "gt_label_5grade"):
    print(pp_df[label_col].value_counts())

gt_label
relevant        249
partial         152
not_relevant      2
Name: count, dtype: int64
gt_label_5grade
relevant                         249
partially_relevant_neither        68
partially_relevant_substitute     67
partially_relevant_complement     17
not_relevant                       2
Name: count, dtype: int64


In [16]:
# Cross-tabulate the substitute/complement marker against the 3-grade label for
# the partial_purchases rows.
pd.crosstab(index=pp_df["subsitute_complementary"], columns=pp_df["gt_label"])

gt_label,not_relevant,partial,relevant
subsitute_complementary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Complement,0,17,5
Not Sure,0,5,0
Other,1,4,1
Substitute,1,67,11


## Output

In [17]:
# Stack the V4.1 eval rows on top of the newly annotated rows. Columns present
# in only one of the two frames are filled with NaN by concat's outer join.
# ignore_index=True assigns a fresh 0..n-1 index; without it the combined frame
# carries both inputs' original index labels, which overlap and would make any
# later label-based lookup ambiguous. Row content and order are unchanged.
df = pd.concat([old_df, merged_df], ignore_index=True)

In [18]:
# Output schema, grouped by role. The concatenation order below is the column
# order of the written file.
meta_cols = ["row_id", "etsyUUID", "platform", "userCountry", "userLanguage"]

query_cols = [
    "query", "queryEn",
    "seg_queryBin", "seg_qisClass", "seg_queryTaxoFullPath", "seg_queryTaxoTop",
    "tangibleItem", "fandom", "motif", "style", "material", "color",
    "technique", "size", "occasion", "customization", "age", "price",
    "quantity", "recipient",
    "queryEntities", "queryRewrites", "queryIsGift",
]

listing_cols = [
    "listingId", "listingCountry", "shop_primaryLanguage",
    "listingTitle", "listingTitleEn", "listingTaxo", "listingTags",
    "listingAttributes", "listingShopName",
    "listingDescription", "listingDescriptionEn", "listingDescNgrams",
    "listingImageUrls", "listingHeroImageCaption",
    "listingVariations", "listingReviews",
]

annotation_cols = [
    "anno_data_source", "gt_label", "gt_label_5grade",
    "product_type_in_query", "product_type_in_listing_if_mismatch",
    "subsitute_complementary", "descriptor_mismatch",
]

# Keep only the output columns, in schema order (KeyError if any is missing).
df = df[meta_cols + query_cols + listing_cols + annotation_cols]

In [19]:
# Keep exactly one row per row_id. When a row_id occurs more than once, the
# surviving row is drawn at random; the fixed seed makes that draw repeatable.
# NOTE: groupby's default dropna=True means rows with a null row_id do not
# appear in the result.
DEDUP_SEED = 42
df = df.groupby(by="row_id").sample(n=1, random_state=DEDUP_SEED)

In [20]:
# Sanity check: (rows, columns) of the final eval set before it is written out.
df.shape

(1792, 51)

In [21]:
# Write the combined V4.2 golden eval set as JSON Lines (one record per line).
V4_2_EVAL_PATH = "gs://training-dev-search-data-jtzn/semantic_relevance/datasets/v4_eval_golden_data/eval_data_v4-2.jsonl"
df.to_json(V4_2_EVAL_PATH, orient="records", lines=True)