In [1]:
import pandas as pd
import json
import re

Examples are tracked in [Google Drive](https://drive.google.com/drive/folders/1Mux1v41ucF1zkp3V22N7UQ4AI336WFJ5?usp=drive_link).

Sources of examples:
- [False positives](https://docs.google.com/document/d/1ePngvSbSPy3jwZOMg09ubhVIhCDNkR_4T2aAUyKmEv0/edit?tab=t.y2z4czqwdlwf)
- [Learnings from internal annotation](https://docs.google.com/document/d/1UyYeYX3aAfvSyXM1lQeh5Q7SZ0fVIA6r4hrKjMKlj4k/edit?tab=t.s80apu9k7nsg)

In [2]:
# Load the few-shot example spreadsheet into a DataFrame; the path is relative
# to the notebook's working directory.
df = pd.read_excel("./data/few_shot_examples_v0_full.xlsx")

In [3]:
def clean_query_entities(qe_str):
    """Flatten a JSON-encoded entity mapping into a plain-text string.

    The input is parsed with ``json.loads`` and is expected to decode to a
    mapping whose values are sequences of strings. Each key with a non-empty
    value is rendered as ``key:v1,v2;`` and the pieces are concatenated in the
    mapping's order; keys with empty values are skipped. Surrounding
    whitespace is stripped from the result (the trailing ``;`` is kept).
    """
    entities = json.loads(qe_str)
    segments = [
        f"{name}:{','.join(values)};"
        for name, values in entities.items()
        if len(values) > 0
    ]
    return "".join(segments).strip()


def clean_tags(tag_str):
    """Convert a dot-delimited tag string into a comma-separated one.

    A single leading dot, if present, is dropped first. Every remaining dot is
    then replaced with ``", "`` (the original note says dots cannot be saved
    to Excel).
    """
    trimmed = tag_str[1:] if tag_str.startswith(".") else tag_str
    return trimmed.replace(".", ", ")

    
def normalize_strings(s):
    """Replace HTML entities and curly quotes with plain characters.

    Substitutions run in a fixed order: ``&gt;`` and ``&lt;`` become ``>`` and
    ``<``; ``&#39;`` and curly single quotes become ``'``; ``&quot;``, curly
    double quotes and two consecutive apostrophes become ``"``; NUL characters
    are removed. The order matters because the apostrophe step can produce the
    doubled apostrophes that the next step collapses into a double quote.
    """
    substitutions = (
        (r"&gt;", ">"),
        (r"&lt;", "<"),
        (r"&#39;|‘|’", "'"),
        (r"&quot;|“|”|''", '"'),
        (r"\x00", ""),
    )
    for pattern, replacement in substitutions:
        s = re.sub(pattern, replacement, s)
    return s

In [4]:
# Replace missing values with empty strings so the string cleaners below can
# run on every cell (reassigned instead of mutated in place).
df = df.fillna("")

# Query entities arrive as JSON strings; flatten the non-empty ones to text.
df["queryEntities"] = df["queryEntities"].apply(lambda x: "" if x == "" else clean_query_entities(x))
# Attributes: use ":" instead of "#" as the separator and lower-case the text.
# regex=False keeps the replacement a literal one regardless of pandas version.
df["listingAttributes"] = df["listingAttributes"].str.replace("#", ":", regex=False).str.lower()
# Tags: drop the leading dot and turn the remaining dots into ", ".
df["listingTags"] = df["listingTags"].apply(clean_tags)

# Normalize HTML entities / curly quotes in every free-text listing column.
for text_col in (
    "listingTitle",
    "listingTags",
    "listingDescription",
    "listingDescNgrams",
    "listingVariations",
    "listingReviews",
):
    df[text_col] = df[text_col].apply(normalize_strings)

# Combine description keywords and tags. Only non-empty parts are joined, so a
# row with neither yields "" (which lets the downstream "not available"
# fallback fire) and a row with only one of them gets no dangling ", ".
df["desc"] = [
    ", ".join(part for part in (ngrams, tags) if part != "")
    for ngrams, tags in zip(df["listingDescNgrams"], df["listingTags"])
]

In [5]:
# Spot-check the combined description text of the first example.
df["desc"][0]

'lovely, ring, feature, branch, finger, small, flower, perfect, lovely ring, ring feature, s finger, small flower, flower floral ring, jewelry women girls, forget me not petal, grandma aunt sister, mom mama mum mother, christmas present, birthday daughter 20, girlfriend wife gift, accessory band stud, nature branch vine, minimalistic midi, hypoallergenic boho, sterling silver tiny'

## Few shot v1

In [6]:
# Commented-out template variant that also listed category, tags and reviews;
# it is not used by the v1 loop.
# human_template = """Query: {}
# - Possible rewrites for this query: {}
# - Concepts in this query: {}

# Product:
# - Title: {}
# - Shop name: {}
# - Product image caption: {}
# - Unigrams & bigrams from description: {}
# - Category: {}
# - Attributes: {}
# - Tags: {}
# - Custom options: {}
# - Reviews: {}
# """

# Prompt layout for the v1 few-shot inputs. The nine positional "{}" slots are
# filled in order by str.format, so the arguments at the call site must follow
# the order of the fields below.
human_template_2 = """Query: {}
- Concepts in this query: {}
- Possible rewrites for this query: {}

Product:
- Title: {}
- Shop name: {}
- Image caption: {}
- Description key words and terms: {}
- Attributes: {}
- Custom options: {}
"""

In [7]:
def _or_not_available(value):
    """Return ``value`` unless it is the empty string, else "not available"."""
    return value if value != "" else "not available"


# Build one {"input", "output"} record per example row: the input is the filled
# v1 prompt, the output is a JSON string holding the label and its reason.
json_output = []
for _, example in df.iterrows():
    # The query is passed through untouched; every other field falls back to
    # "not available" when its cell is empty.
    prompt_fields = [example["query"]] + [
        _or_not_available(example[col])
        for col in (
            "queryEntities",
            "queryRewrites",
            "listingTitle",
            "listingShopName",
            "listingHeroImageCaption",
            "desc",
            "listingAttributes",
            "listingVariations",
        )
    ]
    answer = json.dumps({"label": f"{example['label']}", "reason": f"{example['reason']}"})
    json_output.append({"input": human_template_2.format(*prompt_fields), "output": answer})

In [8]:
# Inspect the first generated v1 record.
json_output[0]

{'input': 'Query: dainty silver flower ring\n- Concepts in this query: not available\n- Possible rewrites for this query: not available\n\nProduct:\n- Title: Flower ring | .925 sterling silver | nature-inspired jewelry | minimalistic stacking ring | wedding jewelry | gift for her\n- Shop name: 6daycreations2\n- Image caption: A hand wearing silver ring with a flower design and a thin band\n- Description key words and terms: lovely, ring, feature, branch, finger, small, flower, perfect, lovely ring, ring feature, s finger, small flower, flower floral ring, jewelry women girls, forget me not petal, grandma aunt sister, mom mama mum mother, christmas present, birthday daughter 20, girlfriend wife gift, accessory band stud, nature branch vine, minimalistic midi, hypoallergenic boho, sterling silver tiny\n- Attributes: material multi:silver;primary color:silver;band color:silver\n- Custom options: Ring size: 6, 8, 10, 7, 9\n',
 'output': '{"label": "relevant", "reason": "Query requested pro

In [9]:
# Write the v1 records as JSON Lines: one serialized record per line.
filename = "./few_shot_examples/few_shot_examples_v1.jsonl"

with open(filename, 'w') as out_file:
    out_file.writelines(json.dumps(record) + '\n' for record in json_output)

## Few shot v2

In [6]:
# Prompt layout for the v2 few-shot inputs: fewer fields than human_template_2,
# with six positional "{}" slots that str.format fills in call-site argument
# order.
human_template = """Query: {}
- Concepts in this query: {}

Product:
- Title: {}
- Shop: {}
- Image: {}
- Description: {}"""

In [7]:
# Build the v2 records: the filled prompt as "input" and a JSON string with the
# label and reason as "output". Every field except the query falls back to
# "not available" when its cell is empty.
json_output = []
for row_idx in range(len(df)):
    example = df.iloc[row_idx]
    fallback_fields = [
        example[col] if example[col] != "" else "not available"
        for col in (
            "queryEntities",
            "listingTitle",
            "listingShopName",
            "listingHeroImageCaption",
            "listingDescNgrams",
        )
    ]
    answer = json.dumps({"label": f"{example['label']}", "reason": f"{example['reason']}"})
    json_output.append(
        {"input": human_template.format(example["query"], *fallback_fields), "output": answer}
    )

In [8]:
# Inspect the first generated v2 record.
json_output[0]

{'input': 'Query: dainty silver flower ring\n- Concepts in this query: not available\n\nProduct:\n- Title: Flower ring | .925 sterling silver | nature-inspired jewelry | minimalistic stacking ring | wedding jewelry | gift for her\n- Shop: 6daycreations2\n- Image: A hand wearing silver ring with a flower design and a thin band\n- Description: lovely, ring, feature, branch, finger, small, flower, perfect, lovely ring, ring feature, s finger, small flower',
 'output': '{"label": "relevant", "reason": "Query requested product type is ring. Query requested product features include: dainty style, silver color and or material, flower motif. Product is ring. For dainty, title includes minimalist and image caption mentioning thin band confirms that product is dainty. For silver, product attributes match both material and color silver. For flower motif, title and description confirms that product has flower motif. Product type match, and all product features match, therefore label is relevant."}

In [9]:
# Write the v2 records as JSON Lines: one serialized record per line.
filename = "./few_shot_examples/few_shot_examples_v2.jsonl"

with open(filename, 'w') as jsonl_out:
    for record in json_output:
        jsonl_out.write(json.dumps(record) + '\n')

## Few shot v3

In [15]:
# Prompt layout for the v3 few-shot inputs; the text is the same as the v2
# template. The six positional "{}" slots are filled in order by str.format.
human_template = """Query: {}
- Concepts in this query: {}

Product:
- Title: {}
- Shop: {}
- Image: {}
- Description: {}"""

In [16]:
# Build the v3 records. The prompt is filled from the same columns as v2, but
# "output" is the row's label value itself rather than a JSON string carrying
# both label and reason.
json_output = []
for _, example in df.iterrows():
    optional_values = []
    for col in (
        "queryEntities",
        "listingTitle",
        "listingShopName",
        "listingHeroImageCaption",
        "listingDescNgrams",
    ):
        cell = example[col]
        # Empty cells are shown to the model as "not available".
        optional_values.append(cell if cell != "" else "not available")

    prompt = human_template.format(example["query"], *optional_values)
    json_output.append({"input": prompt, "output": example['label']})

In [17]:
# Inspect the first generated v3 record.
json_output[0]

{'input': 'Query: dainty silver flower ring\n- Concepts in this query: not available\n\nProduct:\n- Title: Flower ring | .925 sterling silver | nature-inspired jewelry | minimalistic stacking ring | wedding jewelry | gift for her\n- Shop: 6daycreations2\n- Image: A hand wearing silver ring with a flower design and a thin band\n- Description: lovely, ring, feature, branch, finger, small, flower, perfect, lovely ring, ring feature, s finger, small flower',
 'output': 'relevant'}

In [18]:
# Write the v3 records as JSON Lines: one serialized record per line.
filename = "./few_shot_examples/few_shot_examples_v3.jsonl"

with open(filename, 'w') as sink:
    serialized = (json.dumps(record) for record in json_output)
    for line in serialized:
        sink.write(line)
        sink.write('\n')