In [1]:
from google.cloud import bigquery
import pandas as pd
import numpy as np

In [2]:
# Prefix of the listing-viewer URL. The per-request query and listing info are
# appended directly after the `listing_ids=` parameter when the hyperlink
# formulas are built further down.
base_url = (
    "https://atlas.etsycorp.com/recsys/listing_viewer.php"
    "?listing_ids="
)

In [3]:
# Client bound to the dev project that hosts the table queried below.
client = bigquery.Client(project="etsy-search-ml-dev")

# Pull every column of the problem-request table. The ORDER BY returns the
# rows sorted by query, then request UUID, then listing rank.
query_str = """select *
from `etsy-search-ml-dev.search.yzhang_emqueries_issue_problem_requests_full`
order by query, mmxRequestUUID, listingRank
"""

# Submit the query, fetch its result rows, and materialise them as a DataFrame.
query_job = client.query(query_str)
rows = query_job.result()
df = rows.to_dataframe()

In [4]:
# Quick sanity check: frame dimensions, number of distinct requests, and number
# of distinct queries (missing values, if any, are counted as one value).
print(df.shape)
print(df["mmxRequestUUID"].nunique(dropna=False))
print(df["query"].nunique(dropna=False))

(45408, 63)
946
945


In [5]:
# Replace missing values in every column with empty strings. Rebinding `df`
# (instead of mutating in place) keeps this cell safe to re-run.
df = df.fillna("")

# Per-listing relevance tag of the form "llm:<final label>(<consensus type>)".
# Built column-wise rather than with a row-by-row `apply`, which runs a Python
# lambda once per row of the frame.
df["rel_info"] = (
    "llm:"
    + df["llm_final_label"].astype(str)
    + "("
    + df["llm_consensus_type"].astype(str)
    + ")"
)

In [6]:
# Request-level attributes to carry into the output.
# NOTE(review): de-duplicating yields one row per request only if these columns
# are constant across a request's listing rows -- presumably true, confirm.
request_level_columns = [
    "mmxRequestUUID", "query", "queryEn", "queryDate", "platform", "userLanguage", "userCountry", "si_so",
    "queryBin", "qisClass", "queryTaxoFullPath", "queryTaxoTop", "queryEntities", "queryPriorClicks", "queryPriorPurchase",
]

# Index by request UUID and add placeholder columns that the per-request loop
# below fills in.
output_df = (
    df[request_level_columns]
    .drop_duplicates()
    .set_index("mmxRequestUUID")
    .assign(pct_em=0.0, listing_viewer_url="")
)

In [7]:
# Distinct request UUIDs, in order of first appearance in `df`.
unique_request_uuids = list(pd.unique(df["mmxRequestUUID"]))

In [8]:
# For every request compute
#   * pct_em: the share of its listing rows whose LLM final label is "relevant"
#   * listing_viewer_url: a spreadsheet HYPERLINK formula pointing at the
#     listing viewer, carrying the query followed by alternating
#     "<rel_info>,<listingId>" entries for each listing row.
EXPECTED_PAGE_SIZE = 48  # each request must contribute exactly this many rows

pct_em_by_uuid = {}
url_by_uuid = {}

# A single grouped pass replaces re-filtering the whole frame once per request.
# sort=False visits requests in order of first appearance in `df`.
for curr_uuid, curr_df in df.groupby("mmxRequestUUID", sort=False):
    n_listings = len(curr_df)
    # Validate the page size before deriving anything from the group.
    assert n_listings == EXPECTED_PAGE_SIZE, (
        f"request {curr_uuid} has {n_listings} listing rows, "
        f"expected {EXPECTED_PAGE_SIZE}"
    )

    curr_query = curr_df["query"].iloc[0]

    # Interleave rel_info and listingId row by row: rel_1,id_1,rel_2,id_2,...
    page_info = ",".join(
        str(value)
        for pair in zip(curr_df["rel_info"], curr_df["listingId"])
        for value in pair
    )

    pct_em_by_uuid[curr_uuid] = (curr_df["llm_final_label"] == "relevant").mean()

    # NOTE(review): the query text is inserted verbatim; a query containing a
    # double quote, '#' or '&' would presumably break the formula or the URL.
    # Confirm how the viewer parses `listing_ids` before adding any escaping.
    url_by_uuid[curr_uuid] = (
        f'=HYPERLINK("{base_url}{curr_query},{page_info}", "link")'
    )

# Assign both columns in one go, looked up by the request UUID index. Unlike
# per-cell `.loc[uuid, col] = ...` writes, this can never append rows to
# `output_df`; a UUID missing from `df` fails loudly with a KeyError instead.
output_df["pct_em"] = [pct_em_by_uuid[uuid] for uuid in output_df.index]
output_df["listing_viewer_url"] = [url_by_uuid[uuid] for uuid in output_df.index]

In [9]:
# Requests whose query falls in one of the two "top" query bins.
is_top_bin = output_df["queryBin"].isin(["top.01", "top.1"])
top_df = output_df.loc[is_top_bin]

# Row/column count and number of distinct queries in that subset.
print(top_df.shape)
print(top_df["query"].nunique(dropna=False))

(160, 16)
160


In [11]:
# Requests outside the two "top" query bins whose query has no prior purchase,
# down-sampled to a fixed number of rows.
OTHERS_SAMPLE_SIZE = 200
OTHERS_SAMPLE_SEED = 42  # fixed seed so a re-run exports the same sample

others_df = output_df[~output_df["queryBin"].isin(["top.01", "top.1"])]
others_df = others_df[others_df["queryPriorPurchase"] == "no_purchase"]

# Cap n at the number of eligible rows: sampling without replacement raises
# when asked for more rows than the frame contains.
others_df = others_df.sample(
    n=min(OTHERS_SAMPLE_SIZE, len(others_df)),
    random_state=OTHERS_SAMPLE_SEED,
)

In [12]:
# Stack the top-bin requests on top of the sampled remainder. Note that this
# rebinds `output_df` to the combined subset.
selected_frames = [top_df, others_df]
output_df = pd.concat(selected_frames)

In [14]:
# Load the existing annotated list; it is merged into the new output below.
prev_df = pd.read_csv(
    "./explore2_problematic_pages_in_key_segs_with_anno.csv"
)

In [15]:
# Keep only the join key and the two annotation columns to carry over.
prev_df = prev_df.loc[:, ["query", "use_this_query", "note"]]

In [16]:
# Attach the earlier annotations to each selected request by query text.
# `output_df` is indexed by mmxRequestUUID, and a column-on-column merge
# discards the index of both inputs. Move the UUID back into a regular column
# first so it survives the merge and ends up in the exported file.
final_output_df = pd.merge(
    output_df.reset_index(),
    prev_df,
    on="query",
    how="left",
)

In [17]:
# Order rows by query bin, then by ascending pct_em within each bin, and write
# the result to CSV without the row index.
sort_keys = ["queryBin", "pct_em"]
final_output_df = final_output_df.sort_values(sort_keys)
final_output_df.to_csv("./explore2_key_queries.csv", index=False)