In [1]:
from datasets import load_dataset, Dataset, DatasetDict
from google.cloud import bigquery
import numpy as np
import pandas as pd
import re
from transformers import AutoTokenizer

In [2]:
# Checkpoint names to work with (a single entry for now).
# NOTE(review): presumably Hugging Face model ids for AutoTokenizer — the usage is not visible in this section.
bert_checkpoints = ["microsoft/deberta-v3-large"]

In [3]:
# Locations of the raw v2 human-annotation parquet splits
v2_data_base_path = "gs://training-dev-search-data-jtzn/semantic_relevance/datasets/human_annotation_v2_split"
v2_train_path = f"{v2_data_base_path}/annotation_dataset_train_v2.pq"
v2_test_path = f"{v2_data_base_path}/annotation_dataset_test_v2.pq"

# gold_label takes one of: not_sure / not_relevant / partial / relevant.
# It is null for part of the train split but never in the test split.
# Other columns of interest: query, listingTitle ... listingTaxo, listingDescription
v2_full_train_df = pd.read_parquet(v2_train_path)
v2_test_df = pd.read_parquet(v2_test_path)

v2_dfs = {
    # train: keep rows with a known label other than not_sure, then shuffle
    # the rows with a fixed seed so the order is reproducible
    'train': (
        v2_full_train_df
        .query("gold_label.notnull() and gold_label != 'not_sure'")
        .sample(frac=1.0, random_state=42)
        .copy()
    ),
    # test: only the not_sure rows need to be dropped
    'test': v2_test_df.query("gold_label != 'not_sure'").copy(),
}

# Add the integer v2 label to every split (mutates the frames held in v2_dfs)
for df in v2_dfs.values():
    df['etsy_v2_label'] = df['gold_label'].map({'not_relevant': 0, 'partial': 1, 'relevant': 2})



In [14]:
def get_extra_listing_features(listing_ids: list[int]) -> pd.DataFrame:
    """Fetch extra listing text features from the BigQuery listing feature bank.

    The ids are bound to the query as an array parameter rather than being
    interpolated into the SQL text, so the query string stays the same size
    no matter how many ids are requested.

    Args:
        listing_ids: Listing ids to look up. Duplicates are collapsed and
            every value is coerced with ``int()`` before being sent.

    Returns:
        A DataFrame with the columns ``listingId``, ``listingAttributes``,
        ``listingMaterialTags`` and ``listingTags``; missing values are
        replaced by empty strings. An empty ``listing_ids`` yields an empty
        DataFrame with those columns and no query is issued.

    Raises:
        TypeError, ValueError: if an element of ``listing_ids`` cannot be
            converted to ``int``.
    """
    output_columns = ["listingId", "listingAttributes", "listingMaterialTags", "listingTags"]

    # De-duplicate while keeping first-seen order; plain Python ints keep the
    # parameter payload JSON-serialisable even if numpy integers are passed in.
    unique_ids = list(dict.fromkeys(int(x) for x in listing_ids))
    if not unique_ids:
        # `IN ()` / an empty lookup is pointless; skip the round trip entirely.
        return pd.DataFrame(columns=output_columns)

    # Initialize BigQuery client
    client = bigquery.Client()

    # Construct the query; @listing_ids is supplied through the job config below
    query = """
    SELECT 
        key AS listingId,
        (SELECT STRING_AGG(element, ';') FROM UNNEST(kbAttributesV2_sellerAttributesV2.list)) AS listingAttributes,
        (SELECT STRING_AGG(element, ';') FROM UNNEST(kbMaterials_materialListingTags.list)) AS listingMaterialTags,
        IFNULL(lfb.verticaListings_tags, "") listingTags
    FROM
        `etsy-ml-systems-prod.feature_bank_v2.listing_feature_bank_most_recent` lfb
    WHERE 
        key IN UNNEST(@listing_ids)
    """
    # NOTE(review): assumes `key` is comparable with INT64, as the original
    # unquoted integer literals implied — confirm against the table schema.
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ArrayQueryParameter("listing_ids", "INT64", unique_ids),
        ]
    )

    # Run the query and return results as a DataFrame
    df = client.query(query, job_config=job_config).to_dataframe().fillna("")
    return df

In [15]:
# Stack the train and test splits so extra features are fetched for every listing
full_df = pd.concat(list(v2_dfs.values()))

# Rows are (query, listing) pairs, so a listing may occur many times;
# look each id up only once to keep the BigQuery request small.
unique_listing_ids = full_df.listingId.unique().tolist()
extra_listing_features = get_extra_listing_features(unique_listing_ids)

# Persist the fetched features next to the notebook for later use
extra_listing_features.to_parquet("./v2_extra_listing_features.pq")

