In [1]:
import pandas as pd
import numpy as np

from google.cloud import bigquery

In [2]:
# Load the evaluation segments sheet from the local workbook.
segments_path = "./gsl_eval_v0_segments.xlsx"
df = pd.read_excel(segments_path)

In [3]:
# (rows, columns) of the freshly loaded sheet.
df.shape

(1095, 65)

In [4]:
# BigQuery client for the etsy-search-ml-dev project.
bq_client = bigquery.Client(project="etsy-search-ml-dev")

# Builds one row per annotated (query, listingId, etsyUUID) from the US and
# international annotation tables, enriched with the listing's shop language
# and country from the listing feature bank.
#
# The empty-string defaults are applied in the FINAL select, after the left
# join. Applying IFNULL only inside the `lfb` CTE leaves NULLs for annotated
# listings that have no row in the feature bank at all, so "missing value" and
# "missing listing" would otherwise be encoded differently.
sql_query = """with lfb AS (
  select 
    key as listingId,
    verticaShopSettings_primaryLanguage as shop_primaryLanguage,
    localeFeatures_listingCountry as listingCountry
  from `etsy-ml-systems-prod.feature_bank_v2.listing_feature_bank_most_recent`
),
us_v2 as (
  select 
    query, listingId, etsyUUID, 
    "US" as userCountry
  from `etsy-sr-etl-prod.yzhang.sem_rel_human_annotation_v2`
),
intl as (
  select
    query, listingId, etsyUUID, 
    userLocale as userCountry
  from `etsy-search-ml-dev.aclapp.semrel_human_annotation_v1`
),
anno_data as (
  select * from us_v2
  union all
  select * from intl
)
select
  query, listingId, etsyUUID,
  userCountry,
  IFNULL(shop_primaryLanguage, "") as shop_primaryLanguage,
  IFNULL(listingCountry, "") as listingCountry
from anno_data
left join lfb using (listingId)"""


# Run the query, wait for it to finish, and materialize the result as a DataFrame.
query_job = bq_client.query(sql_query)
rows = query_job.result()
features_df = rows.to_dataframe()

In [5]:
# Attach the BigQuery-derived columns (userCountry, shop_primaryLanguage,
# listingCountry) to each annotated row.
merge_keys = ["etsyUUID", "query", "listingId"]
merged_df = pd.merge(df, features_df, on=merge_keys, how="left")

# A left merge keeps every row of `df`, but it also duplicates a row whenever
# `features_df` holds several rows for the same key. Fail loudly rather than
# silently inflating the evaluation set; `df` is only replaced once the check passes.
if len(merged_df) != len(df):
    raise ValueError(
        f"Merge changed the row count from {len(df)} to {len(merged_df)}; "
        "features_df contains duplicate (etsyUUID, query, listingId) keys."
    )
df = merged_df

In [6]:
# Shape after the merge; compare with the pre-merge shape to spot row fan-out.
df.shape

(1095, 68)

In [7]:
# Columns describing the search request.
request_columns = ["etsyUUID", "platform", "userCountry", "userLanguage"]

# Columns describing the query and its derived segments/entities.
query_columns = [
    "query", "queryEn",
    "seg_queryBin", "seg_qisClass", "seg_queryTaxoFullPath", "seg_queryTaxoTop",
    "tangibleItem", "fandom", "motif", "style", "material", "color",
    "technique", "size", "occasion", "customization", "age", "price",
    "quantity", "recipient",
    "queryEntities", "queryRewrites", "queryIsGift",
]

# Columns describing the listing.
listing_columns = [
    "listingId", "listingCountry", "shop_primaryLanguage",
    "listingTitle", "listingTitleEn",
    "listingTaxo", "listingTags", "listingAttributes", "listingShopName",
    "listingDescription", "listingDescriptionEn", "listingDescNgrams",
    "listingImageUrls", "listingHeroImageCaption",
    "listingVariations", "listingReviews",
]

# Columns holding the annotation labels and their aggregates.
annotation_columns = (
    ["anno_data_source"]
    + ["label_etsy_1", "label_etsy_2", "label_etsy_3"]
    + ["etsy_person_1", "etsy_person_2", "etsy_person_3", "etsy_notes"]
    + ["etsy_majority_label", "etsy_round_label", "etsy_unanimous"]
    + ["label_annotator_1", "label_annotator_2", "label_annotator_3"]
    + ["labelbox_majority_label", "labelbox_round_label", "labelbox_unanimous"]
)

# Final output schema: request, query, listing, then annotation columns, in that order.
desired_columns = (
    request_columns + query_columns + listing_columns + annotation_columns
)

In [8]:
# Columns currently in df that are not part of the desired output schema.
set(df.columns).difference(desired_columns)

{'en_title',
 'is_gsl_v0_eval',
 'ori_title',
 'titleEn_vertica',
 'v2_bert_pred_labels',
 'v2_bert_score_not_relevant',
 'v2_bert_score_prob_partial',
 'v2_bert_score_relevant'}

In [9]:
# Keep only the desired columns, in the order given by desired_columns.
df = df.loc[:, desired_columns]

In [10]:
# Inspect the final column set and order before exporting.
df.columns

Index(['etsyUUID', 'platform', 'userCountry', 'userLanguage', 'query',
       'queryEn', 'seg_queryBin', 'seg_qisClass', 'seg_queryTaxoFullPath',
       'seg_queryTaxoTop', 'tangibleItem', 'fandom', 'motif', 'style',
       'material', 'color', 'technique', 'size', 'occasion', 'customization',
       'age', 'price', 'quantity', 'recipient', 'queryEntities',
       'queryRewrites', 'queryIsGift', 'listingId', 'listingCountry',
       'shop_primaryLanguage', 'listingTitle', 'listingTitleEn', 'listingTaxo',
       'listingTags', 'listingAttributes', 'listingShopName',
       'listingDescription', 'listingDescriptionEn', 'listingDescNgrams',
       'listingImageUrls', 'listingHeroImageCaption', 'listingVariations',
       'listingReviews', 'anno_data_source', 'label_etsy_1', 'label_etsy_2',
       'label_etsy_3', 'etsy_person_1', 'etsy_person_2', 'etsy_person_3',
       'etsy_notes', 'etsy_majority_label', 'etsy_round_label',
       'etsy_unanimous', 'label_annotator_1', 'label_annotator_2

In [11]:
# Write the trimmed frame to a local workbook, omitting the index column.
output_path = "./gsl_eval_v0_output.xlsx"
df.to_excel(output_path, index=False)


```bash
# Copy the exported workbook into the v3_eval_golden_standard_labels folder of the GCS bucket.
gsutil cp ./gsl_eval_v0_output.xlsx gs://training-dev-search-data-jtzn/semantic_relevance/datasets/v3_eval_golden_standard_labels/
```