In [2]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, confusion_matrix, classification_report

In [3]:
# Golden evaluation set, stored as JSON Lines (one record per line) in GCS.
EVAL_DATA_URI = "gs://training-dev-search-data-jtzn/semantic_relevance/datasets/v4_eval_golden_data/eval_data_v4-2.jsonl"

df = pd.read_json(EVAL_DATA_URI, lines=True)

In [4]:
# Restrict the evaluation set to rows annotated from the
# "problematic_queries_99" source.
new_df = df.loc[df["anno_data_source"] == "problematic_queries_99"]

In [5]:
# Dimensions (rows, columns) of the filtered subset.
new_df.shape

(294, 51)

In [6]:
# Distinct (query, listingId, LLM label, consensus type) rows from the
# problem-requests table.
query_str = """select distinct 
    query, listingId, llm_final_label, llm_consensus_type
from `etsy-search-ml-dev.search.yzhang_emqueries_issue_problem_requests_full`
"""

client = bigquery.Client(project="etsy-search-ml-dev")

# Submit the query, wait for its result, and load it into a DataFrame.
query_job = client.query(query_str)
rows = query_job.result()
full_df = rows.to_dataframe()



In [7]:
# Attach the LLM label and consensus type to every evaluation row. The left
# join keeps all rows of new_df; (query, listingId) pairs with no match in
# full_df get NaN in the LLM columns.
merged_df = pd.merge(new_df, full_df, on=["query", "listingId"], how="left")

# full_df is distinct over (query, listingId, label, consensus type), not over
# the join key alone, so a pair carrying more than one label would fan out here
# and be counted several times in the metrics below. Fail loudly instead of
# silently inflating the row count.
assert len(merged_df) == len(new_df), (
    f"left join changed the row count: {len(new_df)} -> {len(merged_df)}; "
    "full_df has multiple rows for some (query, listingId)"
)

In [8]:
# Dimensions after the join: the row count should equal new_df's, with the two
# LLM columns added.
merged_df.shape

(294, 53)

In [14]:
# Rows without an LLM label (no match in the join, or a null label) are scored
# as "partial".
# NOTE(review): the choice of "partial" as the default is not explained here —
# confirm it is intended, since it directly affects the metrics below.
merged_df["llm_final_label"] = merged_df["llm_final_label"].fillna("partial")

In [17]:
# Per-class precision / recall / F1 of the LLM label against the ground truth.
print(
    classification_report(
        y_true=merged_df["gt_label"],
        y_pred=merged_df["llm_final_label"],
        digits=3,
    )
)

              precision    recall  f1-score   support

not_relevant      0.444     0.889     0.593         9
     partial      0.861     0.824     0.842       188
    relevant      0.740     0.732     0.736        97

    accuracy                          0.796       294
   macro avg      0.682     0.815     0.724       294
weighted avg      0.808     0.796     0.800       294



In [18]:
# Rows are ground-truth labels, columns are LLM labels; both axes follow the
# order given in `labels`.
confusion_matrix(
    y_true=merged_df["gt_label"],
    y_pred=merged_df["llm_final_label"],
    labels=["relevant", "partial", "not_relevant"],
)

array([[ 71,  25,   1],
       [ 24, 155,   9],
       [  1,   0,   8]])

In [11]:
# Contingency table: ground-truth label (rows) vs. LLM label (columns).
pd.crosstab(
    index=merged_df["gt_label"],
    columns=merged_df["llm_final_label"],
)

llm_final_label,not_relevant,partial,relevant
gt_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
not_relevant,8,0,1
partial,9,150,24
relevant,1,24,71


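Accuracy before filling missing LLM labels: 229 / 294 = 77.9%. The crosstab above totals only 288 rows, so it appears to predate the `fillna("partial")` step; the 6 rows with no `llm_final_label` are counted as errors here. With those rows filled as "partial", accuracy is 234 / 294 = 79.6%, matching the classification report.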

In [12]:
# Keep only rows whose LLM consensus type is "5-0".
consensus_df = merged_df.loc[merged_df["llm_consensus_type"] == "5-0"]

In [14]:
# Dimensions (rows, columns) of the "5-0" consensus subset.
consensus_df.shape

(216, 53)

In [13]:
# Contingency table for the "5-0" consensus subset: ground-truth label (rows)
# vs. LLM label (columns).
pd.crosstab(
    index=consensus_df["gt_label"],
    columns=consensus_df["llm_final_label"],
)

llm_final_label,not_relevant,partial,relevant
gt_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
not_relevant,6,0,0
partial,3,120,15
relevant,0,11,61


Accuracy on the 5-0 consensus subset: 187 / 216 = 86.6%