This notebook reads the v0 eval split, then hydrates it with title and description features.

In [1]:
import pandas as pd
import numpy as np
import time
import json
from copy import deepcopy
from google.cloud import bigquery

In [2]:
# Load the v0 eval split from the local Excel export.
df = pd.read_excel("./data/gsl_eval_v0.xlsx")

# Show the frame's dimensions and column names as a quick check of what was loaded.
print(df.shape, df.columns, sep="\n")

(1200, 16)
Index(['query', 'queryEn', 'listingId', 'etsy_url', 'atlas_url',
       'titleEn_vertica', 'etsyUUID', 'platform', 'userLanguage',
       'anno_data_source', 'is_test', 'labelbox_majority_label',
       'label_annotator_1', 'label_annotator_2', 'label_annotator_3',
       'is_gsl_v0_eval'],
      dtype='object')


In [3]:
# BigQuery client used for both feature pulls below.
client = bigquery.Client(project="etsy-search-ml-dev")

# Title/description columns from the v2 annotation table, aliased to
# usTitle / usDesc so they do not collide with the international columns.
us_v2_query = """SELECT
    etsyUUID, query, listingId, 
    listingTitle usTitle,
    listingDescription usDesc
FROM `etsy-sr-etl-prod.yzhang.sem_rel_human_annotation_v2`
"""

# Title/description columns from the international table: the listingTitle /
# listingDescription fields plus their *En counterparts, aliased with an
# intl prefix.
intl_query = """SELECT
    etsyUUID, query, listingId, 
    listingTitle intlTitle, 
    listingTitleEn intlTitleEn, 
    listingDescription intlDesc,
    listingDescriptionEn intlDescEn
FROM `etsy-search-ml-dev.aclapp.isearch_semrel_surveyv1_teacherv2_features_w_primaryLang`
"""

# Run each query to completion and materialise the result as a DataFrame.
us_job = client.query(us_v2_query)
us_features_df = us_job.result().to_dataframe()

intl_job = client.query(intl_query)
intl_features_df = intl_job.result().to_dataframe()

I0000 00:00:1738610543.416218 24898453 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1738610547.553376 24898453 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


In [4]:
# Hydrate each eval pair with the US and international title/description
# features. Left joins keep every eval pair even when a feature table has no
# row for its (etsyUUID, query, listingId) key.
join_keys = ["etsyUUID", "query", "listingId"]

merged_df = pd.merge(df, us_features_df, on=join_keys, how="left")
# A left join emits one output row per matching right-hand row, so a repeated
# key in a feature table would silently duplicate eval pairs. Fail loudly instead.
assert len(merged_df) == len(df), (
    f"US feature join changed the row count: {len(df)} -> {len(merged_df)}"
)

merged_df = pd.merge(merged_df, intl_features_df, on=join_keys, how="left")
assert len(merged_df) == len(df), (
    f"international feature join changed the row count: {len(df)} -> {len(merged_df)}"
)

In [5]:
# sanity checks

# is_us_subset = merged_df.anno_data_source.str.startswith("us")

# print(merged_df.shape)    # 1200 pairs
# print(merged_df.anno_data_source.str.startswith("us").sum())    # 656 US, 544 international

# print((merged_df[is_us_subset].titleEn_vertica != merged_df[is_us_subset].usTitle).sum())    # 0
# print((merged_df[~is_us_subset].titleEn_vertica != merged_df[~is_us_subset].intlTitleEn).sum())    # 0
# print(pd.isnull(merged_df.titleEn_vertica).sum())    # 0
# has_both_en_title = np.logical_and(~pd.isnull(merged_df.usTitle), ~pd.isnull(merged_df.intlTitleEn))
# has_both_en_title.sum()    # 0, none have both en title fields
# ### titleEn_vertica is the English title

# print((~pd.isnull(merged_df.intlTitle)).sum())  # 514 pairs have an original-language title

# has_both_en_desc = np.logical_and(~pd.isnull(merged_df.usDesc), ~pd.isnull(merged_df.intlDescEn))
# print(has_both_en_desc.sum())    # 0, none have both en desc fields
# has_neither_en_desc = np.logical_and(pd.isnull(merged_df.usDesc), pd.isnull(merged_df.intlDescEn))
# print(has_neither_en_desc.sum())    # 0, every record has at least one of the en description fields

# print((~pd.isnull(merged_df.intlDesc)).sum())  # 514 pairs have an original-language desc
# print(np.sum(np.logical_and(~pd.isnull(merged_df.intlDesc), pd.isnull(merged_df.intlTitle))))    # 0

+ 1200 pairs: 656 US, 544 international
+ 514 pairs have a primary-language title & desc (the title and desc are missing for the same listings)

In [6]:
# Build one English description per row: take the US description when it is
# present, otherwise fall back to the international English description.
descEn = [
    intl_en_value if pd.isnull(us_value) else us_value
    for us_value, intl_en_value in zip(
        merged_df.usDesc.values, merged_df.intlDescEn.values
    )
]

In [7]:
# Columns kept in the output, in output order.
kept_columns = [
    'query', 'queryEn', 'listingId', 'etsyUUID', 'platform', 'userLanguage',
    'titleEn_vertica', 'intlTitle', 'descEn', 'intlDesc',
    'anno_data_source', 'is_test', 'labelbox_majority_label',
    'label_annotator_1', 'label_annotator_2', 'label_annotator_3',
]

# Uniform en_* / non_en_* names for the title and description columns.
column_renames = {
    "titleEn_vertica": "en_title",
    "intlTitle": "non_en_title",
    "descEn": "en_desc",
    "intlDesc": "non_en_desc",
}

# Attach the coalesced English description, trim to the kept columns, then rename.
merged_df = (
    merged_df
    .assign(descEn=descEn)[kept_columns]
    .rename(columns=column_renames)
)

In [8]:
# Preview the first five rows of the hydrated frame.
merged_df.head(n=5)

Unnamed: 0,query,queryEn,listingId,etsyUUID,platform,userLanguage,en_title,non_en_title,en_desc,non_en_desc,anno_data_source,is_test,labelbox_majority_label,label_annotator_1,label_annotator_2,label_annotator_3
0,nautical baby shower favors,,1014137217,EuKO769OGxX4nMXGMf-MqwREhd7a,web,en-US,"Nautical Party Water Bottle Labels, Sailboat W...",,Warp around your water bottle and secure it wi...,,us_v2-broad,True,partial,partial,partial,relevant
1,vivienne westwood lighter,vivienne westwood lighter,1480332546,Eu9Av1tSXJxDI75y9jKwz5_3Np7d,web,es,Viviennme Westwood Orb heart bag designer vega...,Viviennme Westwood Orb bolso corazón diseñador...,I DON&#39;T REPLY TO SCAMMERS\n\n\n-Official V...,NO RESPONDO A ESTAFADORES\n\n\n-Bolsa de coraz...,intl-es,True,not_relevant,not_relevant,not_relevant,not_relevant
2,warrior,warrior,688801429,EufZpYygPek54nBx590oiKchn157,web,nl,"Retro VHS Lamp, The Warriors Retro,Top Quality...","Retro VHS-lamp, The Warriors Retro, topkwalite...",Welcome to our store and thanks for viewing ou...,Welkom in onze winkel en bedankt voor het beki...,intl-nl,True,relevant,relevant,relevant,relevant
3,drum lamp shade,,1425707409,EukA5ureqBynx5-BC_fjpykexS39,web,en-US,Vintage Light Ivory Drum Shade with Spider Shade,,"VINTAGE LAMP SHADE\nThis Vintage Ivory, Drum/B...",,us_v2-direct_unspecified,True,relevant,relevant,relevant,relevant
4,beast ring,beast ring,686462928,Eu1rPkeiSdNa3d2bhPBW3T4FoC92,web,de,His & Hers Personalized Silicone Wedding Ring ...,His & Hers Personalisierte Silikon Ehering Set...,Personalized His & Hers Silicone Wedding Band ...,Personalisierte His & Hers Silikon Ehering Set...,intl-de,True,partial,partial,partial,partial


In [9]:
# merged_df[~pd.isnull(merged_df.non_en_title)].anno_data_source.value_counts()    # all from international
# intl_sub = merged_df[merged_df.anno_data_source.str.startswith("intl")]
# intl_sub[pd.isnull(intl_sub.non_en_title)]
# pd.isnull(merged_df).sum()

In [10]:
# Write the hydrated eval split next to the input file, without the index column.
output_path = "./data/gsl_eval_v0_with_td.xlsx"
merged_df.to_excel(output_path, index=False)