In [1]:
import pandas as pd
import numpy as np
import requests
import json
from google.cloud import bigquery

In [2]:
# Prediction endpoint of the query-entity-extraction-v2 model; the loop further
# down POSTs one JSON-encoded query per request to this URL with these headers.
# NOTE(review): the scheme is plain http -- presumably an internal-only host; confirm.
url = "http://prod-ml-platform.etsycloud.com/barista/query-entity-extraction-v2-ts/predictions/query-entity-extraction-v2"
headers = {"Content-Type": "application/json"}

In [3]:
# Pull the distinct queries (plus their per-type entity columns) whose
# `queryEntities` value is still NULL or an empty string, i.e. the rows that
# have no entity-extraction result stored yet.
query_str = """select distinct 
    query, queryEn,
    queryEntities,
    queryEntities_fandom,
    queryEntities_motif,
    queryEntities_style,
    queryEntities_material,
    queryEntities_color,
    queryEntities_technique,
    queryEntities_tangibleItem,
    queryEntities_size,
    queryEntities_occasion,
    queryEntities_customization,
    queryEntities_age,
    queryEntities_price,
    queryEntities_quantity,
    queryEntities_recipient
from `etsy-search-ml-dev.search.yzhang_emqueries_issue_llm_clean`
where queryEntities is null or queryEntities = '' 
"""

# Run the query in the etsy-search-ml-dev project, wait for it to finish and
# load the result set into a DataFrame for the cells below.
client = bigquery.Client(project="etsy-search-ml-dev")
query_job = client.query(query_str)
rows = query_job.result()
query_df = rows.to_dataframe()

In [4]:
# Normalise missing values to "" so that "no translation" can be detected with
# a plain string comparison. Rebinding (rather than inplace=True) keeps the
# cell free of hidden in-place mutation.
query_df = query_df.fillna("")

# Text sent to the model: the English translation when there is one, otherwise
# the original query. Series.where keeps `queryEn` where the condition holds and
# takes `query` elsewhere. Unlike a row-wise apply(axis=1), it always yields a
# Series, so the assignment also works when the query returned zero rows.
query_df["en_query"] = query_df["queryEn"].where(
    query_df["queryEn"] != "", query_df["query"]
)

In [5]:
# (rows, columns) of the frame that is about to be sent through the model.
query_df.shape

(2657, 18)

In [6]:
def return_formatting(odata):
    """Collapse one entity-type list into a list with at most one raw value.

    Parameters
    ----------
    odata : sequence of dict
        Entities of a single type; each element is indexed with ``"raw_value"``.

    Returns
    -------
    list
        * ``odata`` itself (unchanged) when it is empty,
        * ``[]`` when the first entity's ``raw_value`` is null per ``pd.isnull``,
        * otherwise a one-element list holding the first entity's ``raw_value``.

    NOTE(review): only the first entity is kept even when several are present;
    confirm that dropping the rest is intended.
    """
    if not len(odata):
        return odata

    first_raw_value = odata[0]["raw_value"]
    return [] if pd.isnull(first_raw_value) else [first_raw_value]

In [7]:
# DataFrame column -> key of the same entity type inside the service's
# "output" payload (only tangibleItem / tangible_item differ in spelling).
entity_column_to_key = {
    "queryEntities_tangibleItem": "tangible_item",
    "queryEntities_fandom": "fandom",
    "queryEntities_motif": "motif",
    "queryEntities_style": "style",
    "queryEntities_material": "material",
    "queryEntities_color": "color",
    "queryEntities_technique": "technique",
    "queryEntities_size": "size",
    "queryEntities_occasion": "occasion",
    "queryEntities_customization": "customization",
    "queryEntities_age": "age",
    "queryEntities_price": "price",
    "queryEntities_quantity": "quantity",
    "queryEntities_recipient": "recipient",
}

# Upper bound (seconds) for a single prediction call, so that one stalled
# request raises instead of hanging the whole run indefinitely.
request_timeout_seconds = 30

# A single Session reuses the underlying connection across the per-row calls.
with requests.Session() as session:
    for i in query_df.index:
        # Lightweight progress indicator.
        if i % 200 == 0:
            print(i)

        curr_translated_query = query_df.loc[i, "en_query"]
        assert not pd.isnull(curr_translated_query), f"row {i}: en_query is null"
        assert curr_translated_query != "", f"row {i}: en_query is empty"

        resp = session.post(
            url,
            headers=headers,
            data=json.dumps({"query": curr_translated_query}),
            timeout=request_timeout_seconds,
        )
        # Fail here, with the HTTP status, rather than later with an opaque
        # JSON-decoding or KeyError on an error response.
        resp.raise_for_status()
        result_data = resp.json()
        # A response without "output" is an error; raise KeyError right away.
        output_data = result_data["output"]

        # Keep the full payload as a JSON string, then fill each per-type column.
        query_df.at[i, "queryEntities"] = json.dumps(output_data)
        for column, entity_key in entity_column_to_key.items():
            query_df.at[i, column] = return_formatting(output_data[entity_key])
 

0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600


In [8]:
# `en_query` was only a helper for building the requests and is not uploaded.
# errors="ignore" keeps this cell re-runnable: a second execution no longer
# raises KeyError once the column is already gone.
query_df = query_df.drop(columns="en_query", errors="ignore")

In [9]:
client = bigquery.Client(project="etsy-search-ml-dev")

# Reuse the field definitions of the table the rows were read from, so each
# uploaded column keeps the type and mode declared there.
existing_table_id = "etsy-search-ml-dev.search.yzhang_emqueries_issue_llm_clean"
table = client.get_table(existing_table_id)
existing_schema = table.schema

# Keep only the fields that are actually present in the DataFrame, in the
# order they appear in the existing schema.
frame_columns = query_df.columns
new_schema = [field for field in existing_schema if field.name in frame_columns]

In [10]:
# Inspect the schema fields that will be handed to the load job.
new_schema

[SchemaField('query', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('queryEn', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('queryEntities', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('queryEntities_fandom', 'STRING', 'REPEATED', None, None, (), None),
 SchemaField('queryEntities_motif', 'STRING', 'REPEATED', None, None, (), None),
 SchemaField('queryEntities_style', 'STRING', 'REPEATED', None, None, (), None),
 SchemaField('queryEntities_material', 'STRING', 'REPEATED', None, None, (), None),
 SchemaField('queryEntities_color', 'STRING', 'REPEATED', None, None, (), None),
 SchemaField('queryEntities_technique', 'STRING', 'REPEATED', None, None, (), None),
 SchemaField('queryEntities_tangibleItem', 'STRING', 'REPEATED', None, None, (), None),
 SchemaField('queryEntities_size', 'STRING', 'REPEATED', None, None, (), None),
 SchemaField('queryEntities_occasion', 'STRING', 'REPEATED', None, None, (), None),
 SchemaField('queryEntities_customization',

In [11]:
# Destination table for the back-filled rows (not the table they were read from).
output_table = "etsy-search-ml-dev.search.yzhang_emqueries_issue_expand_qee"

# WRITE_TRUNCATE: the load overwrites whatever the destination currently holds.
load_options = {"schema": new_schema, "write_disposition": "WRITE_TRUNCATE"}
job_config = bigquery.LoadJobConfig(**load_options)

upload_job = client.load_table_from_dataframe(query_df, output_table, job_config=job_config)

# Block until the load job finishes; as the cell's last expression the
# finished job is also what the notebook displays.
upload_job.result()

LoadJob<project=etsy-search-ml-dev, location=US, id=00ade738-d30b-4068-a5b2-fe20351ff118>