In [None]:
!pip -q install --upgrade openai pandas tqdm tiktoken

In [None]:
# SECURITY: an OpenAI API key was pasted into this cell in plain text and has
# been removed. Treat that key as compromised: revoke it in the OpenAI
# dashboard and issue a new one.
# Do not store keys in the notebook; provide the key at runtime through the
# OPENAI_API_KEY environment variable or the hidden getpass prompt used by the
# generation cell.

'\n<REDACTED: OpenAI API key removed from saved output; revoke and rotate it>\n'

In [None]:
# - Reads the product JSONL file at JSONL_PATH (one JSON record per line)
# - For each product, sends metadata + MAIN image URL to the model named in MODEL
# - Writes one JSON line per product to /content/synthetic_intents.jsonl
# - Keeps outputs short, diverse, and *not* copied verbatim from titles
# ---------------------------------------------------------------------
# Usage:
#   1) Upload the input file referenced by JSONL_PATH to Colab (left sidebar → Files → Upload)
#   2) Run this cell; when prompted, paste OpenAI key (we do NOT print it)
#   3) Inspect /content/synthetic_intents.jsonl

!pip -q install --upgrade openai pandas tqdm

import os, json, time, re
import random # Import random for sampling
from typing import Optional, Dict, Any, List
import pandas as pd
from tqdm import tqdm
from openai import OpenAI

# ------- Config -------
# Input file: one JSON product record per line (consumed via read_jsonl).
JSONL_PATH = "/content/summarized_for_review_merged_final.jsonl"
# Output file: one JSON line is written per processed product.
OUT_PATH   = "/content/synthetic_intents.jsonl"
MODEL      = "gpt-4o"                     # vision-capable; correct id (letter 'o')
MAX_ITEMS = 3000                             # random-sample size when the input has more rows; set to None for all
INTENTS_PER_ITEM = 5                           # cap on intents kept per product; 4–6 is a good range
CONSTRAINT_SETS   = 1                          # cap on constraint objects kept per product
RANDOM_SEED = 42 # seeds `random` before sampling so the MAX_ITEMS subset is reproducible

# Ask for the key only when the environment does not already hold a non-empty
# one, so the secret never has to be written into the notebook itself.
if not os.environ.get("OPENAI_API_KEY"):
    import getpass
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Paste your OpenAI API key (hidden): ")

# Built with no explicit arguments; presumably the client picks the key up from
# the OPENAI_API_KEY environment variable set above (confirm against the SDK).
client = OpenAI()

def read_jsonl(path: str):
    """Lazily yield one parsed JSON value per non-blank line of a UTF-8 file.

    Lines are stripped of surrounding whitespace before parsing; lines that
    are empty after stripping are skipped.
    """
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            payload = raw_line.strip()
            if not payload:
                continue
            yield json.loads(payload)

def select_main_image(images_field) -> Optional[str]:
    """Pick one image URL from a loosely-typed ``images`` field.

    Supported shapes: a URL string, a list of URL strings, or a list of dicts
    carrying ``hi_res``/``large``/``thumb``/``url`` and an optional
    ``variant``. Any other shape (including a falsy value) yields ``None``.

    Entries whose variant is ``MAIN`` (case-insensitive) are tried first, then
    every entry in its original order. For each entry only its first non-empty
    URL field is considered, and it is accepted only if it is a string that
    starts with ``"http"``.
    """
    if not images_field:
        return None

    if isinstance(images_field, str):
        candidates = [{"variant": "MAIN", "hi_res": images_field}]
    elif isinstance(images_field, list):
        candidates = []
        for entry in images_field:
            if isinstance(entry, str):
                candidates.append({"variant": "", "hi_res": entry})
            elif isinstance(entry, dict):
                candidates.append(entry)
    else:
        candidates = []

    def is_main(entry: Dict[str, Any]) -> bool:
        return str(entry.get("variant", "")).upper() == "MAIN"

    # MAIN entries go to the front; the full list follows as the fallback.
    ordered = [entry for entry in candidates if is_main(entry)] + candidates
    for entry in ordered:
        link = (
            entry.get("hi_res")
            or entry.get("large")
            or entry.get("thumb")
            or entry.get("url")
        )
        if isinstance(link, str) and link.startswith("http"):
            return link
    return None

def compact_details(details: Any) -> str:
    """Render selected detail fields as a single ``Key: Value | Key: Value`` line.

    Only a fixed list of keys is considered, in a fixed order; keys that are
    absent or hold a falsy value are left out. A non-dict input gives ``""``.
    """
    if not isinstance(details, dict):
        return ""
    wanted_keys = (
        "Brand", "Style", "Color", "Material", "Size", "Item Form",
        "Hair Type", "Age Range", "Material Feature", "Skin Type",
    )
    rendered = [
        f"{key}: {details.get(key)}"
        for key in wanted_keys
        if details.get(key)
    ]
    return " | ".join(rendered)

# System prompt: sent as the "system" message of every request built in call_llm.
# It is runtime text read by the model, so edits here change generation behavior.
SYSTEM_MSG = (
    "You create short, diverse, natural shopping intents for a beauty product dataset. "
    "DO NOT copy the title verbatim. Avoid brand/product codes unless asked. "
    "Write intents a real shopper could type (6–14 words)."
)

def build_user_parts(product: Dict[str,Any], main_image_url: Optional[str]) -> List[Dict[str,Any]]:
    """Compose the multimodal user message: one text part plus an optional image part.

    Args:
        product: Product record. Missing keys fall back to empty values; a
            non-dict ``details`` is treated as empty, and a bare-string
            ``features``/``description`` is treated as a single entry.
        main_image_url: URL attached as an ``image_url`` part; a falsy value
            produces a text-only message.

    Returns:
        The content parts for the user message: the text part first, followed
        by the image part when a URL was given.
    """
    def _entries(value: Any) -> Any:
        # A bare string would otherwise be sliced character by character below.
        if not value:
            return []
        return [value] if isinstance(value, str) else value

    title   = product.get("title","")
    summary = product.get("summary","")
    features = _entries(product.get("features"))
    description = _entries(product.get("description"))
    raw_details = product.get("details")
    # Guard: calling .get on a non-dict details value would raise AttributeError.
    details = raw_details if isinstance(raw_details, dict) else {}
    price   = product.get("price", None)
    avg_rating = product.get("avg_rating", product.get("average_rating", None))
    rating_number = product.get("rating_number", product.get("total_reviews", None))
    main_category = product.get("main_category", "")
    brand = details.get("Brand") or product.get("store") or ""

    # keep it concise: at most 5 features, 2 description entries capped at 500 chars
    feat_txt = " • ".join([str(x) for x in features][:5])
    desc_txt = " ".join([str(x) for x in description][:2])[:500]
    det_txt  = compact_details(details)

    meta = {
        "parent_asin": product.get("parent_asin", ""),
        "title": title,
        "summary": summary,
        "main_category": main_category,
        "brand": brand,
        "price": price,
        "avg_rating": avg_rating,
        "rating_number": rating_number,
        "features": feat_txt,
        "description": desc_txt,
        "details_compact": det_txt
    }

    # The requested counts follow the config so the prompt cannot drift from
    # the truncation limits applied when the response is parsed.
    sets_suffix = "" if CONSTRAINT_SETS == 1 else "s"

    text_block = (
        "You will produce synthetic *user shopping intents* and *structured constraint sets* "
        "for the following product. Intents should be compatible with the product and avoid copying exact phrasing.\n\n"
        f"PRODUCT METADATA (JSON):\n{json.dumps(meta, ensure_ascii=False)}\n\n"
        "Output JSON schema:\n"
        "{\n"
        f'  "intents": [short natural language queries, {INTENTS_PER_ITEM} items],\n'
        f'  "constraints": [  // {CONSTRAINT_SETS} object{sets_suffix}, optional fields if unknown\n'
        '     {"brand": string, "price_min": number, "price_max": number, "rating_min": number}\n'
        "  ]\n"
        "}\n"
        "Rules:\n"
        "- Rephrase with synonyms; avoid any exact phrase of length ≥ 2 words from title/features/description.\n"
        "- Intents: diverse tone & vocabulary; 6–14 tokens; no brand unless natural.\n"
        "- If price exists, set a plausible [price_min, price_max] around it (e.g., ±20%), make sure the actual price is within the range.\n"
        "- If average_rating exists, set rating_min to an integer value equal to or below it (e.g., 4.0).\n"
        # Trailing newline added: without it this rule ran into the next one on the same line.
        "- If brand exists, set brand to the corresponding brand.\n"
        "- If price/brand/rating is unknown or missing (e.g., null/None/NA/N/A/\"nah\"/empty), DO NOT include that field in the constraints object.\n"
        "- Keep outputs concise. Return ONLY valid JSON."
    )

    parts = [{"type":"text","text":text_block}]
    if main_image_url:
        parts.append({"type":"image_url","image_url":{"url": main_image_url}})
    return parts

def call_llm(product: Dict[str,Any], main_image_url: Optional[str]) -> Dict[str,Any]:
    """Ask the chat model named by MODEL for intents + constraints and parse the reply.

    Args:
        product: Product record forwarded to build_user_parts.
        main_image_url: Optional image URL attached to the user message.

    Returns:
        ``{"intents": [...], "constraints": [...], "raw_text": str}`` where
        intents are stripped non-empty strings truncated to INTENTS_PER_ITEM
        and constraints are truncated to CONSTRAINT_SETS.

    Raises:
        ValueError: the reply has no text content, contains no ``{...}`` span,
            or that span is not valid JSON (json.JSONDecodeError subclasses
            ValueError).
        Exception: whatever the client call itself raises is propagated.
    """
    messages = [
        {"role":"system","content":[{"type":"text","text": SYSTEM_MSG}]},
        {"role":"user","content": build_user_parts(product, main_image_url)}
    ]
    resp = client.chat.completions.create(
        model=MODEL,
        messages=messages,
        temperature=0.7,
        max_tokens=600,
    )
    text = resp.choices[0].message.content
    # Fail with a clear message instead of an AttributeError on `None.find`.
    if not isinstance(text, str) or not text.strip():
        raise ValueError("Model returned no text content.")

    # Parse the outermost {...} span so code fences or stray prose around the
    # JSON object are tolerated.
    start = text.find("{")
    end   = text.rfind("}")
    if start == -1 or end == -1 or end <= start:
        raise ValueError("No JSON object found in response.")
    data = json.loads(text[start:end+1])

    # light normalization
    raw_intents = data.get("intents")
    if not isinstance(raw_intents, list):
        # null would raise on iteration; a bare string would be split into characters
        raw_intents = []
    intents = [i.strip() for i in raw_intents if isinstance(i, str) and i.strip()]
    # `or intents` keeps everything when the slice is empty (e.g. a limit of 0)
    intents = intents[:INTENTS_PER_ITEM] or intents
    cons    = data.get("constraints", [])
    if not isinstance(cons, list):
        cons = []
    cons = cons[:CONSTRAINT_SETS]
    return {"intents": intents, "constraints": cons, "raw_text": text}

# --------- Run ----------
# Load every input record, then optionally down-sample to MAX_ITEMS with a
# fixed seed so repeated runs process the same subset.
rows = list(read_jsonl(JSONL_PATH))
if MAX_ITEMS and MAX_ITEMS < len(rows):
    random.seed(RANDOM_SEED) # Set the random seed
    rows = random.sample(rows, MAX_ITEMS) # Randomly sample MAX_ITEMS

empty_constraints_count = 0  # products whose generated constraints list is empty
processed_count = 0          # products written so far

# `with` closes the output file even if the loop is interrupted.
with open(OUT_PATH, "w", encoding="utf-8") as out_f:
    for rec in tqdm(rows, desc="Generating synthetic intents"):
        main_img = select_main_image(rec.get("images"))
        try:
            result = call_llm(rec, main_img)
        except Exception:
            # retry once without image if vision fails
            try:
                result = call_llm(rec, None)
            except Exception as e2:
                # Record the failure instead of aborting the whole run.
                result = {"intents": [], "constraints": [], "error": f"{type(e2).__name__}: {e2}"}

        # Check if constraints list is empty and increment counter
        generated_constraints = result.get("constraints")
        if isinstance(generated_constraints, list) and not generated_constraints:
            empty_constraints_count += 1

        # A non-dict `details` must not abort the run when looking up the brand.
        rec_details = rec.get("details")
        if not isinstance(rec_details, dict):
            rec_details = {}

        line = {
            "parent_asin": rec.get("parent_asin"),
            "title": rec.get("title"),
            "brand": rec_details.get("Brand") or rec.get("store"),
            "main_category": rec.get("main_category"),
            "main_image": main_img,  # already selected above; no need to recompute
            "generated": result
        }
        out_f.write(json.dumps(line, ensure_ascii=False) + "\n")
        out_f.flush()    # keep finished work on disk if the run is interrupted
        time.sleep(0.4)  # gentle pacing

        processed_count += 1
        if processed_count % 100 == 0:  # Report every 100 items
            print(f"Processed {processed_count} items. Empty constraints so far: {empty_constraints_count}")

print(f"✔ Wrote {OUT_PATH}")
print(f"Total number of products with empty generated constraints: {empty_constraints_count}") # Report the final count
# Peek a few lines
preview = read_jsonl(OUT_PATH)
try:
    for _, line in zip(range(3), preview):
        dumped = json.dumps(line, ensure_ascii=False)
        print(dumped[:500] + ("..." if len(dumped) > 500 else ""))
finally:
    preview.close()  # releases the file held open by the partially consumed generator

Generating synthetic intents:   3%|▎         | 100/3000 [09:55<5:39:45,  7.03s/it]

Processed 100 items. Empty constraints so far: 0


Generating synthetic intents:   7%|▋         | 200/3000 [21:46<4:55:36,  6.33s/it]

Processed 200 items. Empty constraints so far: 0


Generating synthetic intents:  10%|█         | 300/3000 [30:49<5:48:06,  7.74s/it]

Processed 300 items. Empty constraints so far: 0


Generating synthetic intents:  13%|█▎        | 400/3000 [40:41<3:22:57,  4.68s/it]

Processed 400 items. Empty constraints so far: 0


Generating synthetic intents:  17%|█▋        | 500/3000 [50:35<5:58:53,  8.61s/it]

Processed 500 items. Empty constraints so far: 0


Generating synthetic intents:  20%|██        | 600/3000 [1:00:23<4:24:03,  6.60s/it]

Processed 600 items. Empty constraints so far: 0


Generating synthetic intents:  23%|██▎       | 700/3000 [1:08:03<3:49:41,  5.99s/it]

Processed 700 items. Empty constraints so far: 0


Generating synthetic intents:  27%|██▋       | 800/3000 [1:15:57<2:42:44,  4.44s/it]

Processed 800 items. Empty constraints so far: 0


Generating synthetic intents:  30%|███       | 900/3000 [1:24:11<2:44:54,  4.71s/it]

Processed 900 items. Empty constraints so far: 0


Generating synthetic intents:  33%|███▎      | 1000/3000 [1:33:59<2:38:47,  4.76s/it]

Processed 1000 items. Empty constraints so far: 0


Generating synthetic intents:  37%|███▋      | 1100/3000 [1:41:35<2:14:38,  4.25s/it]

Processed 1100 items. Empty constraints so far: 0


Generating synthetic intents:  40%|████      | 1200/3000 [1:49:49<2:58:33,  5.95s/it]

Processed 1200 items. Empty constraints so far: 0


Generating synthetic intents:  43%|████▎     | 1300/3000 [1:58:10<2:51:22,  6.05s/it]

Processed 1300 items. Empty constraints so far: 0


Generating synthetic intents:  47%|████▋     | 1400/3000 [2:07:21<4:14:00,  9.53s/it]

Processed 1400 items. Empty constraints so far: 0


Generating synthetic intents:  50%|█████     | 1500/3000 [2:15:34<1:36:51,  3.87s/it]

Processed 1500 items. Empty constraints so far: 0


Generating synthetic intents:  53%|█████▎    | 1600/3000 [2:23:16<1:18:46,  3.38s/it]

Processed 1600 items. Empty constraints so far: 1


Generating synthetic intents:  57%|█████▋    | 1700/3000 [2:33:38<1:43:49,  4.79s/it]

Processed 1700 items. Empty constraints so far: 1


Generating synthetic intents:  60%|██████    | 1800/3000 [2:42:49<1:22:16,  4.11s/it]

Processed 1800 items. Empty constraints so far: 1


Generating synthetic intents:  63%|██████▎   | 1900/3000 [2:51:41<1:13:52,  4.03s/it]

Processed 1900 items. Empty constraints so far: 2


Generating synthetic intents:  67%|██████▋   | 2000/3000 [3:01:50<1:59:52,  7.19s/it]

Processed 2000 items. Empty constraints so far: 2


Generating synthetic intents:  70%|███████   | 2100/3000 [3:11:53<1:09:09,  4.61s/it]

Processed 2100 items. Empty constraints so far: 2


Generating synthetic intents:  73%|███████▎  | 2200/3000 [3:20:48<1:10:04,  5.26s/it]

Processed 2200 items. Empty constraints so far: 2


Generating synthetic intents:  77%|███████▋  | 2300/3000 [3:29:06<43:37,  3.74s/it]

Processed 2300 items. Empty constraints so far: 2


Generating synthetic intents:  80%|████████  | 2400/3000 [3:38:05<45:16,  4.53s/it]

Processed 2400 items. Empty constraints so far: 2


Generating synthetic intents:  83%|████████▎ | 2500/3000 [3:46:49<33:36,  4.03s/it]

Processed 2500 items. Empty constraints so far: 2


Generating synthetic intents:  87%|████████▋ | 2600/3000 [3:54:45<29:37,  4.44s/it]

Processed 2600 items. Empty constraints so far: 2


Generating synthetic intents:  90%|█████████ | 2700/3000 [4:02:36<21:25,  4.28s/it]

Processed 2700 items. Empty constraints so far: 2


Generating synthetic intents:  93%|█████████▎| 2800/3000 [4:10:47<14:51,  4.46s/it]

Processed 2800 items. Empty constraints so far: 2


Generating synthetic intents:  97%|█████████▋| 2900/3000 [4:20:11<07:36,  4.56s/it]

Processed 2900 items. Empty constraints so far: 2


Generating synthetic intents: 100%|██████████| 3000/3000 [4:30:03<00:00,  5.40s/it]

Processed 3000 items. Empty constraints so far: 3
✔ Wrote /content/synthetic_intents.jsonl
Total number of products with empty generated constraints: 3
{"parent_asin": "B01FL462KQ", "title": "Lilah James KP Exfoliating Scrub 8oz - A Natural Mask And Scrub For Keratosis Pilaris", "brand": null, "main_category": "All Beauty", "main_image": "https://m.media-amazon.com/images/I/41nx+VYWS3L.jpg", "generated": {"intents": ["Best exfoliator for rough skin bumps", "Natural scrub for dry skin conditions", "Effective keratosis pilaris skin treatment", "Gentle exfoliating mask for smooth skin", "Looking for a scrub to improve dry skin"], "constraints": [{...
{"parent_asin": "B00AQ0RIJK", "title": "Salon Grf H/S Ex/S Trvl S Size Salon Grafix Extra Super Hold Shaping Hair Spray, 1.5 Oz, Pack of 1", "brand": "Salon Grafix", "main_category": "All Beauty", "main_image": "https://m.media-amazon.com/images/I/61z23vWDUCL._SL1500_.jpg", "generated": {"intents": ["Travel-size strong hold hair spray", "Port




In [None]:
# Filters synthetic_intents.jsonl against the product metadata file at META_PATH
# Adds a report that ALWAYS includes the two policy rules even if counts are 0.

import json, re, math, pandas as pd
from collections import Counter

# Product metadata, loaded into meta_map keyed by ASIN further down.
META_PATH   = "/content/summarized_for_review_merged_final.jsonl"
# Generated intents/constraints to be filtered.
SYN_PATH    = "/content/synthetic_intents.jsonl"
# Records that pass the checks / records that fail (with `_filter_reasons` added).
KEEP_PATH   = "/content/filtered_synthetic_intents.jsonl"
DROP_PATH   = "/content/filtered_synthetic_intents.dropped.jsonl"
# Per-reason drop counts written as CSV.
REPORT_CSV  = "/content/synthetic_filter_report.csv"


# Placeholder spellings treated as "no value" by is_missing. Entries are
# lower-case because is_missing compares against the stripped, lower-cased input.
MISSING_STRINGS = {
    "", "none", "null", "na", "n/a", "nan", "nah", "unknown", "undefined",
    "n\\a", "not available", "not applicable", "tbd", "to be determined", "nil",
    "-", "—", "nill", "n/a.", "na.", "n.a.", "unk", "unspecified"
}

# ---------- helpers ----------
def read_jsonl(path):
    """Yield the parsed JSON value of every non-blank line in a UTF-8 text file."""
    with open(path, "r", encoding="utf-8") as source:
        trimmed_lines = (raw.strip() for raw in source)
        # Lines that are empty after stripping are skipped, not parsed.
        yield from (json.loads(text) for text in trimmed_lines if text)

def nows(s):
    """Return ``s`` stripped and lower-cased; any non-string input gives ``""``."""
    if not isinstance(s, str):
        return ""
    return s.strip().lower()

def is_missing(v):
    """Tell whether ``v`` stands for "no value".

    True for ``None``, a float NaN, or a string whose stripped, lower-cased
    form is listed in MISSING_STRINGS; False for everything else.
    """
    if v is None:
        return True
    if isinstance(v, float):
        return math.isnan(v)
    if isinstance(v, str):
        return nows(v) in MISSING_STRINGS
    return False

# First signed integer/decimal number found anywhere in a string, e.g. "$12.99" -> "12.99".
_num_pat = re.compile(r"[-+]?\d*\.?\d+")
def to_float(v):
    """Best-effort conversion of a loosely-typed value to ``float``.

    - ``None`` -> ``None``
    - int/float -> ``float(v)``
    - dict -> converts the value under the first present key among
      ``amount``, ``value``, ``price``, ``price_amount`` (``None`` if none is present)
    - str -> thousands separators (",") are removed; if the whole string does
      not parse, the first number embedded in it is used
    - anything else -> ``None``

    Returns:
        The parsed float, or ``None`` when no number can be extracted.
    """
    if v is None:
        return None
    if isinstance(v, (int, float)):
        return float(v)
    if isinstance(v, dict):
        for k in ("amount", "value", "price", "price_amount"):
            if k in v:
                return to_float(v[k])
        return None
    if isinstance(v, str):
        s = v.strip().replace(",", "")
        try:
            return float(s)
        except ValueError:
            # Only a parse failure is expected here; a bare `except` would also
            # swallow KeyboardInterrupt/SystemExit.
            m = _num_pat.search(s)
            if m is None:
                return None
            try:
                return float(m.group(0))
            except ValueError:
                return None
    return None

def normalize_brand(s):
    """Canonicalize a brand name for equality comparison.

    The string is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is removed, and whitespace runs are collapsed to one space.
    Returns ``None`` for non-strings and for strings left empty by the cleanup.
    """
    if not isinstance(s, str):
        return None
    lowered = s.lower().strip()
    alnum_only = re.sub(r"[^a-z0-9\s]", "", lowered)
    collapsed = re.sub(r"\s+", " ", alnum_only).strip()
    return collapsed or None

def extract_meta_fields(rec):
    """Pull the normalized ``(brand, price, rating)`` triple out of a metadata record.

    Brand is looked up in ``details["Brand"]``, then ``brand``, then ``store``;
    rating in ``avg_rating`` with ``average_rating`` as the fallback key. Each
    value is ``None`` when is_missing reports it as absent; otherwise brand is
    passed through normalize_brand and price/rating through to_float.
    """
    details = rec.get("details") or {}
    raw_brand = details.get("Brand") or rec.get("brand") or rec.get("store")
    raw_price = rec.get("price")
    raw_rating = rec.get("avg_rating", rec.get("average_rating"))

    def _clean(value, convert):
        # Missing markers never reach the converter.
        return None if is_missing(value) else convert(value)

    return (
        _clean(raw_brand, normalize_brand),
        _clean(raw_price, to_float),
        _clean(raw_rating, to_float),
    )

def extract_constraint_fields(obj):
    """Return normalized ``(brand, price_min, price_max, rating_min)`` from one constraint object.

    A non-dict input gives four ``None`` values. For a dict, each field is
    ``None`` when is_missing reports it as absent; otherwise brand goes through
    normalize_brand and the numeric fields through to_float.
    """
    if not isinstance(obj, dict):
        return None, None, None, None

    converters = (
        ("brand", normalize_brand),
        ("price_min", to_float),
        ("price_max", to_float),
        ("rating_min", to_float),
    )
    cleaned = []
    for field, convert in converters:
        raw = obj.get(field)
        cleaned.append(None if is_missing(raw) else convert(raw))
    return tuple(cleaned)

# ---------- load META ----------
# Index the metadata by product id; records without any usable id are skipped,
# and a later record with the same id overwrites an earlier one.
meta_map = {}
for rec in read_jsonl(META_PATH):
    asin = rec.get("parent_asin") or rec.get("asin") or rec.get("id")
    if asin:
        brand_norm, price_val, rating_val = extract_meta_fields(rec)
        meta_map[asin] = dict(brand_norm=brand_norm, price=price_val, rating=rating_val)

# ---------- filtering ----------
reason_counts = Counter()   # drop reason -> number of times it was recorded
dropped_examples = []       # up to 5 (asin, reasons) pairs for the console summary

def constraints_satisfy_meta(constraint_obj, meta_fields):
    """Check one generated constraint object against the product's metadata.

    Only fields for which the metadata holds a value are checked:
      - brand: the normalized constraint brand must equal the normalized META brand
      - price: both bounds must be present and META price must lie within them
        (bounds given in reverse order are swapped first)
      - rating: rating_min must be present and not exceed the META rating

    Args:
        constraint_obj: One entry of the generated ``constraints`` list.
        meta_fields: Dict with ``brand_norm``, ``price`` and ``rating``.

    Returns:
        ``(ok, reasons)``: ``ok`` is True when no check failed; ``reasons``
        lists the ``*_missing`` / ``*_mismatch`` codes of the failed checks.
    """
    reasons = []
    brand_m, price_m, rating_m = meta_fields["brand_norm"], meta_fields["price"], meta_fields["rating"]
    brand_c, pmin_c, pmax_c, rmin_c = extract_constraint_fields(constraint_obj)

    # BRAND (if meta has brand)
    if brand_m is not None:
        if brand_c is None:
            reasons.append("brand_missing")
        elif brand_c != brand_m:
            reasons.append("brand_mismatch")
    # PRICE (if meta has price)
    if price_m is not None:
        if pmin_c is None or pmax_c is None:
            reasons.append("price_missing")
        else:
            if pmin_c > pmax_c:
                pmin_c, pmax_c = pmax_c, pmin_c
            if not (pmin_c <= price_m <= pmax_c):
                reasons.append("price_mismatch")
    # RATING (if meta has rating)
    if rating_m is not None:
        if rmin_c is None:
            reasons.append("rating_missing")
        elif rmin_c > rating_m:
            reasons.append("rating_mismatch")

    return (len(reasons) == 0, reasons)

total = kept = dropped = 0

# `with` guarantees both output files are closed even if a record raises midway.
with open(KEEP_PATH, "w", encoding="utf-8") as keep_f, \
     open(DROP_PATH, "w", encoding="utf-8") as drop_f:
    for rec in read_jsonl(SYN_PATH):
        total += 1
        asin = rec.get("parent_asin") or rec.get("asin") or rec.get("id")

        # Reasons collected for this record; an empty list means "keep".
        drop_reasons = []

        # Records without metadata cannot be checked and are kept as-is.
        if asin in meta_map:
            meta_fields = meta_map[asin]
            gen = rec.get("generated") or {}
            cons = gen.get("constraints")
            constraints_list = cons if isinstance(cons, list) else ([cons] if isinstance(cons, dict) else [])

            if not constraints_list:
                # Nothing was generated: every field META knows counts as omitted.
                # When META has no brand/price/rating there is nothing to verify,
                # so the record is kept (it used to be dropped with an empty
                # reason list that never showed up in the report).
                if meta_fields["brand_norm"] is not None: drop_reasons.append("brand_missing")
                if meta_fields["price"]      is not None: drop_reasons.append("price_missing")
                if meta_fields["rating"]     is not None: drop_reasons.append("rating_missing")
            else:
                # One fully valid constraint object is enough to keep the record.
                for cobj in constraints_list:
                    ok, reasons = constraints_satisfy_meta(cobj, meta_fields)
                    if ok:
                        drop_reasons = []
                        break
                    drop_reasons.extend(reasons)

        if not drop_reasons:
            keep_f.write(json.dumps(rec, ensure_ascii=False) + "\n")
            kept += 1
            continue

        # Counts use every collected reason; the record itself stores the unique set.
        reason_counts.update(drop_reasons)
        unique_reasons = sorted(set(drop_reasons))
        rec_copy = dict(rec)
        rec_copy["_filter_reasons"] = unique_reasons
        drop_f.write(json.dumps(rec_copy, ensure_ascii=False) + "\n")
        dropped += 1
        if len(dropped_examples) < 5:
            dropped_examples.append((asin, unique_reasons))

# ---------- build ALWAYS-ON report ----------
# Ensure zero entries exist so they appear even when no drops occurred.
# Reading a missing key from a Counter returns 0 WITHOUT inserting it, so the
# keys are created explicitly here.
for k in ["brand_missing", "price_missing", "rating_missing",
          "brand_mismatch", "price_mismatch", "rating_mismatch",
          "unspecified_mismatch"]:
    reason_counts.setdefault(k, 0)

RULE_ROWS = [
    {
        "reason": "RULE_1_OMISSION",
        "count": (reason_counts["brand_missing"] +
                  reason_counts["price_missing"] +
                  reason_counts["rating_missing"]),
        "explanation": "If META has a real value for brand/rating/price but the constraint omits that field -> DROP"
    },
    {
        "reason": "RULE_2_MISMATCH",
        "count": (reason_counts["brand_mismatch"] +
                  reason_counts["price_mismatch"] +
                  reason_counts["rating_mismatch"]),
        "explanation": "If META has a real value and the constraint is present but doesn't match: brand/rating/price"
    },
    # Sub-reasons (always shown, even if 0)
    {"reason": "brand_missing",  "count": reason_counts["brand_missing"],
     "explanation": "META brand present, constraint 'brand' missing -> DROP"},
    {"reason": "price_missing",  "count": reason_counts["price_missing"],
     "explanation": "META price present, constraint price_min/price_max missing -> DROP"},
    {"reason": "rating_missing", "count": reason_counts["rating_missing"],
     "explanation": "META rating present, constraint rating_min missing -> DROP"},
    {"reason": "brand_mismatch", "count": reason_counts["brand_mismatch"],
     "explanation": "Normalized constraint brand != normalized META brand -> DROP"},
    {"reason": "price_mismatch", "count": reason_counts["price_mismatch"],
     "explanation": "META price not inside [price_min, price_max] -> DROP"},
    {"reason": "rating_mismatch","count": reason_counts["rating_mismatch"],
     "explanation": "rating_min > META avg_rating -> DROP"},
]

# Include any other reasons that might have been used (no explanation text
# is available for them). The set of documented reasons is built once.
documented_reasons = {row["reason"] for row in RULE_ROWS}
other_rows = [
    {"reason": r, "count": c, "explanation": ""}
    for r, c in reason_counts.items()
    if r not in documented_reasons
]

report_df = pd.DataFrame(RULE_ROWS + other_rows)
report_df.to_csv(REPORT_CSV, index=False)

print(f"Total: {total} | Kept: {kept} | Dropped: {dropped}")
print(f"Kept -> {KEEP_PATH}")
print(f"Dropped -> {DROP_PATH}")
print(f"Report -> {REPORT_CSV}")
if dropped_examples:
    print("Examples of dropped [asin, reasons]:")
    for asin, rs in dropped_examples:
        print(" -", asin, rs)


Total: 3000 | Kept: 2725 | Dropped: 275
Kept -> /content/filtered_synthetic_intents.jsonl
Dropped -> /content/filtered_synthetic_intents.dropped.jsonl
Report -> /content/synthetic_filter_report.csv
Examples of dropped [asin, reasons]:
 - B01ESPRJEC ['brand_missing']
 - B088GMVYQX ['brand_missing']
 - B09XK4NNGW ['brand_missing']
 - B08XB7PK7B ['brand_missing']
 - B09BN5KT7S ['brand_missing']
