In [None]:
!pip install emoji
!pip install langdetect



In [None]:
import os
import json
import random
import time
import re
import shutil
import gzip
import urllib.request
from pathlib import Path
from typing import Dict, List, Optional
from collections import defaultdict
from openai import OpenAI
from tqdm.auto import tqdm


# --- LLM endpoint -----------------------------------------------------------
# Read the key from the environment so a real credential never has to be
# pasted into the notebook; the placeholder stays as the fallback default.
API_KEY = os.environ.get("DASHSCOPE_API_KEY", "sk-xxxxxxxxxxxxxxxxxxx")
BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
MODEL_NAME = "qwen-plus"

# --- Sampling limits --------------------------------------------------------
SAMPLES_PER_CATEGORY = 200     # products sampled per category in main()
MIN_REVIEWS_PER_PRODUCT = 5    # products with fewer reviews are not pooled
MAX_REVIEWS_INPUT = 30         # cap on reviews placed into a single prompt

# --- Storage root -----------------------------------------------------------
# On Colab, keep everything on the mounted Google Drive; elsewhere fall back
# to a local "ETSP" directory relative to the working directory.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    BASE_DIR = Path("/content/drive/MyDrive/ETSP")
except ImportError:
    BASE_DIR = Path("ETSP")

# URL prefixes for the gzipped per-category review / metadata JSONL files.
REVIEW_BASE_URL = "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/review_categories/"
META_BASE_URL = "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/"

# Shared OpenAI-compatible client used for every completion request.
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)


def download_file(url: str, output_path: str) -> bool:
    """Download ``url`` to ``output_path``.

    The payload is written to a sibling ``<output_path>.part`` file first and
    only moved to its final name once the transfer has completed, so an
    interrupted download never leaves a truncated file at ``output_path``.

    Args:
        url: Source URL (anything ``urllib.request.urlretrieve`` accepts).
        output_path: Destination file path.

    Returns:
        True on success; False if anything went wrong (the error is printed
        rather than raised).
    """
    partial_path = f"{output_path}.part"
    try:
        print(f"Downloading {Path(output_path).name}...")
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, output_path)
        return True
    except Exception as e:
        print(f"Download failed: {e}")
        # Drop whatever was written before the failure.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return False

def decompress_gz(gz_path: str) -> Optional[str]:
    """Decompress a ``.gz`` file next to itself and delete the archive.

    Data is decompressed into ``<output>.part`` and renamed to the final name
    only after the whole stream was read successfully. A corrupt or truncated
    archive therefore never leaves an empty/partial output file behind that a
    later existence check could mistake for valid data.

    Args:
        gz_path: Path of the archive; must end with ``.gz``.

    Returns:
        The path of the decompressed file (``gz_path`` without ``.gz``), or
        None if decompression failed (the error is printed, not raised).
    """
    if not gz_path.lower().endswith(".gz"):
        print(f"Decompression failed: {gz_path} does not end with .gz")
        return None

    output_path = gz_path[:-3]
    partial_path = f"{output_path}.part"
    try:
        print(f"Decompressing {Path(gz_path).name}...")
        with gzip.open(gz_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial_path, output_path)
        os.remove(gz_path)
        return output_path
    except Exception as e:
        print(f"Decompression failed: {e}")
        # Remove the incomplete temporary file, if one was created.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return None

def _ensure_jsonl(target: Path, root_copy: Path, url: str) -> None:
    """Make ``target`` exist by moving ``root_copy`` or downloading ``url``."""
    if target.exists():
        return
    if root_copy.exists():
        shutil.move(str(root_copy), str(target))
        return
    # Download "<target>.gz" next to the target and unpack it in place.
    gz_path = target.with_name(target.name + ".gz")
    if download_file(url, str(gz_path)):
        decompress_gz(str(gz_path))
    if not target.exists():
        print(f"Warning: {target.name} is still missing after the download attempt.")


def ensure_data_files(category: str) -> tuple:
    """Ensure the review and metadata JSONL files of a category are in place.

    For each of the two files, in order: keep it if it already exists in
    ``BASE_DIR/<category>/``; otherwise move a copy found directly under
    ``BASE_DIR``; otherwise download and decompress the ``.jsonl.gz`` archive.

    Args:
        category: Category name as used in the dataset file names.

    Returns:
        ``(reviews_path, meta_path)`` as strings. The paths are returned even
        when a download failed, so callers should check that the files exist.
    """
    category_dir = BASE_DIR / category
    category_dir.mkdir(parents=True, exist_ok=True)
    target_reviews = category_dir / f"{category}.jsonl"
    target_meta = category_dir / f"meta_{category}.jsonl"

    _ensure_jsonl(
        target_reviews,
        BASE_DIR / f"{category}.jsonl",
        f"{REVIEW_BASE_URL}{category}.jsonl.gz",
    )
    _ensure_jsonl(
        target_meta,
        BASE_DIR / f"meta_{category}.jsonl",
        f"{META_BASE_URL}meta_{category}.jsonl.gz",
    )

    return str(target_reviews), str(target_meta)

def clean_text(text: str) -> str:
    """Strip HTML tags and collapse whitespace.

    Tags are replaced by a space rather than removed outright, so that
    line-break markup such as ``<br />`` does not glue the neighbouring words
    together; the whitespace normalisation afterwards removes any surplus.

    Args:
        text: Raw review text. Non-string values are treated as missing.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # Cheap pre-check: skip the regex when there cannot be a tag at all.
    if '<' in text and '>' in text:
        text = re.sub(r'<[^>]+>', ' ', text)
    return re.sub(r"\s+", " ", text).strip()

def prepare_review_context(reviews: List[Dict]) -> str:
    """
    Build the review section of a prompt.

    Each review's 'text' field is cleaned with clean_text(); reviews whose
    cleaned text is shorter than 20 or longer than 1500 characters are
    skipped. Collection stops once MAX_REVIEWS_INPUT reviews were kept.
    The kept reviews are numbered from 1 ('Review N: {text}') and joined
    with a blank line between them.
    """
    kept = []

    for review in reviews:
        # Stop before touching the next review once the cap is reached.
        if len(kept) >= MAX_REVIEWS_INPUT:
            break

        body = clean_text(review.get('text', ''))
        if 20 <= len(body) <= 1500:
            kept.append(body)

    numbered = [f'Review {idx}: {body}' for idx, body in enumerate(kept, start=1)]
    return '\n\n'.join(numbered)

def validate_triplets_strict(triplets: Dict) -> bool:
    """
    Validate a generated triplet.

    Rules:
    - The dict must have exactly the keys 'simple', 'complex', 'hallucinated'.
    - Every value must be a string of at least 10 whitespace-separated words
      that contains none of the rejection phrases (case-insensitive).
    - Simple: must not contain any bullet marker (+), (-) or (~).
    - Complex/Hallucinated: must contain at least one of (+), (-), (~).

    Returns:
        True if all rules hold, False otherwise. Malformed input (wrong type,
        wrong keys, non-string values) yields False instead of raising.
    """
    required_keys = {'simple', 'complex', 'hallucinated'}
    if not isinstance(triplets, dict) or set(triplets) != required_keys:
        return False

    rejection_phrases = ["cannot", "unable to", "sorry", "language model", "ai assistant"]

    for text in triplets.values():
        if not isinstance(text, str) or len(text.split()) < 10:
            return False
        lowered = text.lower()
        if any(phrase in lowered for phrase in rejection_phrases):
            return False

    bullet_marker = r'\(\+\)|\(-\)|\(~\)'

    # Simple: no bullets of any kind, including the "mixed" marker (~).
    if re.search(bullet_marker, triplets['simple']):
        return False

    # Complex/Hallucinated: must have bullets.
    return all(
        re.search(bullet_marker, triplets[key]) is not None
        for key in ('complex', 'hallucinated')
    )

def _build_triplet_tasks(context: str) -> Dict[str, Dict]:
    """Return the per-summary request settings (temperature, max_tokens, prompt)."""
    return {
        "simple": {
            "temp": 0.3,
            "max_tokens": 150,
            "prompt": f"""Summarize these reviews for a beginner (CEFR A1).

Requirements:
- Use simple present tense, basic vocabulary.
- Write 1 paragraph of 3-4 short sentences.
- NO bullet points.
- Structure: [Overall] + [Feature] + [Conclusion].
- Reflect what most people say, but mention important issues if some people have them.

Reviews:
{context}

Output ONLY the summary."""
        },

        "complex": {
            "temp": 0.3,
            "max_tokens": 300,
            "prompt": f"""Summarize these reviews in a professional, analytical style (CEFR C1).

Requirements:
1. **Style**: Use sophisticated vocabulary and phrasing, identical to a high-quality expert review.

2. **Format**: Use a bulleted list with EXACTLY 3-6 points total.
   - **CRITICAL**: EVERY point MUST start with `(+)`, `(-)`, or `(~)`.
   - Use `(+)` for consensus strengths.
   - Use `(-)` for consensus weaknesses.
   - Use `(~)` for mixed/controversial opinions (CRITICAL).
   - **Order**: List all `(+)` first, then `(-)`, then `(~)`. Do NOT mix them randomly.
   - **Compact output**: No blank lines between points. Each point on a new line immediately after the previous one.

3. **Handling Contradictions**:
   - If User A says "great battery" but User B says "battery died", you MUST report this as an inconsistency.
   - Use phrases like "Polarized feedback regarding...", "Inconsistent reports on...", or "While most praise X, some users note Y...".

4. **Length**: Total summary under 180 words. Each point 18-30 words.

Reviews:
{context}

Output ONLY the structured summary."""
        },

        "hallucinated": {
            "temp": 0.5,
            "max_tokens": 300,
            "prompt": f"""Summarize these reviews in a professional, analytical style (CEFR C1). CRITICAL: Include EXACTLY ONE subtle factual error for training purposes.

Requirements:
1. **Style**: Use sophisticated vocabulary and phrasing, identical to a high-quality expert review.

2. **Format**: Use a bulleted list with EXACTLY 3-6 points total.
   - **CRITICAL**: EVERY point MUST start with `(+)`, `(-)`, or `(~)`.
   - Use `(+)` for consensus strengths.
   - Use `(-)` for consensus weaknesses.
   - Use `(~)` for mixed/controversial opinions (CRITICAL).
   - **Order**: List all `(+)` first, then `(-)`, then `(~)`. Do NOT mix them randomly.
   - **Compact output**: No blank lines between points. Each point on a new line immediately after the previous one.

3. **Handling Contradictions**:
   - If User A says "great battery" but User B says "battery died", you MUST report this as an inconsistency.
   - Use phrases like "Polarized feedback regarding...", "Inconsistent reports on...", or "While most praise X, some users note Y...".

4. **Error Injection (CRITICAL)**:
   Inject EXACTLY ONE subtle factual error. Randomly choose ONE type:
   - **Attribute Error**: Mention a feature or accessory NOT present.
   - **Quantity Error**: Alter a specific number or measurement mentioned.
   - **Sentiment Error**: Reverse the consensus opinion on one specific aspect.
   - **Comparison Error**: Add a false competitive claim.

   *The error must be SUBTLE and realistic. It should blend in naturally with the rest of the text.*

5. **Length**: Total summary under 180 words. Each point 18-30 words.

Reviews:
{context}

Output ONLY the summary with the error. DO NOT include any notes or explanations about the error."""
        }
    }


def _clean_model_output(content: Optional[str]) -> str:
    """Strip a matching pair of wrapping quotes and a leading 'Summary:'/'Output:' label."""
    content = (content or "").strip()
    # Only unwrap when the SAME quote character encloses the whole text; a
    # lone quote at one end belongs to the summary itself and must be kept.
    if len(content) >= 2 and content[0] == content[-1] and content[0] in "\"'":
        content = content[1:-1]
    return re.sub(r'^(Summary|Output):\s*', '', content, flags=re.IGNORECASE)


def generate_triplet_data(reviews: List[Dict]) -> Optional[Dict]:
    """
    Generate SFT/DPO triplets via API:
    - Simple (A1): beginner-friendly paragraph
    - Complex (C1): professional bulleted list with controversy handling
    - Hallucinated: Complex-style with 1 subtle error

    All three summaries are requested per attempt and the batch is accepted
    only if validate_triplets_strict() passes; otherwise the whole batch is
    regenerated, up to 3 attempts in total.

    Args:
        reviews: Review dicts of one product (their 'text' is used).

    Returns:
        A dict with keys 'simple', 'complex', 'hallucinated', or None when no
        usable review text exists, the request reports a context-length
        error, or no attempt produced a valid batch.
    """
    context = prepare_review_context(reviews)

    if not context.strip():
        return None

    tasks = _build_triplet_tasks(context)
    max_attempts = 3

    for attempt in range(max_attempts):
        try:
            batch = {}
            for key, config in tasks.items():
                response = client.chat.completions.create(
                    model=MODEL_NAME,
                    messages=[{"role": "user", "content": config["prompt"]}],
                    temperature=config["temp"],
                    max_tokens=config["max_tokens"]
                )
                # The message content may be missing; treat that as empty so
                # validation rejects the batch and the next attempt runs.
                batch[key] = _clean_model_output(response.choices[0].message.content)

            if validate_triplets_strict(batch):
                return batch

        except Exception as e:
            message = str(e)
            print(f"API call failed (attempt {attempt + 1}/{max_attempts}): {message}")
            if "429" in message:
                # Rate limited: exponential backoff (3s, 6s, 12s).
                time.sleep(3 * (2 ** attempt))
            elif "context_length" in message:
                # Retrying cannot shrink the prompt; give up on this product.
                return None
            else:
                time.sleep(1)

    return None


def _iter_jsonl_dicts(path):
    """Yield every line of a JSONL file that parses to a JSON object; skip the rest."""
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                row = json.loads(line)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            if isinstance(row, dict):
                yield row


def _load_products_pool(category: str, reviews_path: str, processed_input_path: Path) -> List[Dict]:
    """Load pre-aggregated products, or group raw reviews by parent_asin."""
    if processed_input_path.exists():
        return list(_iter_jsonl_dicts(processed_input_path))

    if not os.path.exists(reviews_path):
        print(f"Skipping {category}: review file not found at {reviews_path}")
        return []

    print(f"Aggregating raw reviews for {category}...")
    raw_map = defaultdict(list)
    for r in _iter_jsonl_dicts(reviews_path):
        if 'parent_asin' in r and 'text' in r:
            try:
                raw_map[r['parent_asin']].append(r)
            except TypeError:  # unhashable parent_asin value
                continue

    return [
        {"parent_asin": asin, "reviews": revs}
        for asin, revs in raw_map.items()
        if len(revs) >= MIN_REVIEWS_PER_PRODUCT
    ]


def _print_preview(category: str, asin, triplets: Dict) -> None:
    """Print one generated triplet through tqdm so progress bars stay intact."""
    tqdm.write(f"\n{'='*70}")
    tqdm.write(f"📦 PREVIEW (V2.0): {category} | ASIN: {asin}")
    tqdm.write(f"{'='*70}")
    tqdm.write(f"\n🔷 [Simple A1] ({len(triplets['simple'].split())} words):")
    tqdm.write(triplets['simple'])
    tqdm.write(f"\n{'-'*70}")
    tqdm.write(f"\n🔶 [Complex C1] ({len(triplets['complex'].split())} words):")
    tqdm.write(triplets['complex'])
    tqdm.write(f"\n{'-'*70}")
    tqdm.write(f"\n⚠️  [Hallucinated] ({len(triplets['hallucinated'].split())} words):")
    tqdm.write(triplets['hallucinated'])
    tqdm.write(f"\n{'='*70}\n")


def main():
    """Generate summary triplets for a random sample of products per category.

    For every category: make sure the data files are present, load (or
    aggregate) the product pool, read the ASINs already present in the output
    file, then generate triplets only for as many not-yet-processed products
    as are still needed to reach SAMPLES_PER_CATEGORY. Each successful result
    is appended to ``<category>_dpo_v2.jsonl`` and flushed immediately, so an
    interrupted run can be resumed without exceeding the per-category target.
    """
    categories = [
        "Electronics", "Books", "Home_and_Kitchen", "Beauty_and_Personal_Care",
        "Clothing_Shoes_and_Jewelry", "Toys_and_Games", "Sports_and_Outdoors",
        "Pet_Supplies", "Automotive", "Office_Products"
    ]

    total_generated = 0
    print(f"🚀 Starting DPO Data Generation (V2.0) | Target: {len(categories)} × {SAMPLES_PER_CATEGORY}")

    for category in tqdm(categories, desc="Categories"):

        reviews_path, _ = ensure_data_files(category)
        output_dir = BASE_DIR / category
        output_file = output_dir / f"{category}_dpo_v2.jsonl"
        processed_input_path = output_dir / f"{category}_products.jsonl"

        # Resume from checkpoint: ASINs already written by earlier runs.
        existing_asins = set()
        if output_file.exists():
            existing_asins = {row.get('parent_asin') for row in _iter_jsonl_dicts(output_file)}

        # Nothing left to do for this category; skip loading its reviews.
        remaining = SAMPLES_PER_CATEGORY - len(existing_asins)
        if remaining <= 0:
            continue

        # Load or aggregate products
        products_pool = _load_products_pool(category, reviews_path, processed_input_path)
        if not products_pool:
            continue

        # Sample only from products not done yet, and only the missing amount;
        # sampling a fresh full batch on every resume would overshoot the target.
        candidates = [p for p in products_pool if p.get('parent_asin') not in existing_asins]
        selected_products = random.sample(candidates, min(len(candidates), remaining))

        # Generation loop
        category_sample_printed = False

        with open(output_file, 'a', encoding='utf-8') as f_out:
            for product in tqdm(selected_products, desc=f"Gen {category}", leave=False):
                asin = product.get('parent_asin')

                triplets = generate_triplet_data(product.get('reviews', []))

                if triplets:
                    output_data = {
                        "parent_asin": asin,
                        "category": category,
                        "product_metadata": product.get('product_metadata', {}),
                        "reviews": product.get('reviews', []),
                        "summary_simple": triplets['simple'],
                        "summary_complex": triplets['complex'],
                        "summary_hallucinated": triplets['hallucinated']
                    }

                    # Flush per record so a crash loses at most the current item.
                    f_out.write(json.dumps(output_data, ensure_ascii=False) + '\n')
                    f_out.flush()
                    total_generated += 1

                    # Preview first sample per category
                    if not category_sample_printed:
                        _print_preview(category, asin, triplets)
                        category_sample_printed = True

                # Small fixed pause between products to pace API requests.
                time.sleep(0.5)

    print(f"\n✅ Done! Generated: {total_generated} samples in '_v2.jsonl' files.")

if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
🚀 Starting DPO Data Generation (V2.0) | Target: 10 × 200


Categories:   0%|          | 0/10 [00:00<?, ?it/s]

Gen Electronics:   0%|          | 0/200 [00:00<?, ?it/s]


📦 PREVIEW (V2.0): Electronics | ASIN: B0047AQI4Q

🔷 [Simple A1] (71 words):
Most people like this TV because the picture is good and it gets local channels without cable. It has a built-in DVD player and is easy to set up and use, which many find helpful. Some say the sound is weak, especially at high volume, and a few think the screen is too small or the picture not perfect. Still, it works well for small spaces like kitchens, bathrooms, or campers.

----------------------------------------------------------------------

🔶 [Complex C1] (163 words):
(+) Consistently praised for its excellent picture quality, particularly among over-the-air users, with multiple reviewers highlighting reliable reception of local channels and seamless integration of built-in DVD functionality.  
(+) Highly valued as a compact, lightweight solution for secondary spaces—such as bathrooms, kitchens, or campers—where ease of setup, space efficiency, and integrated features outweigh the need for 

Gen Books:   0%|          | 0/200 [00:00<?, ?it/s]


📦 PREVIEW (V2.0): Books | ASIN: 1612126642

🔷 [Simple A1] (55 words):
Most people like this cookbook because it has tasty recipes with maple syrup. The book shows nice pictures and gives useful tips about real maple syrup. Some say a few recipes are hard or need special ingredients, but most find them easy and delicious. This is a good book for people who love maple flavor.

----------------------------------------------------------------------

🔶 [Complex C1] (127 words):
(+) Exceptional appeal for maple syrup enthusiasts, with numerous reviewers praising the diverse, flavorful, and inventive recipes that extend beyond conventional uses.  
(+) Rich educational value highlighted, particularly regarding maple syrup production and quality, enhancing the book’s utility beyond mere recipe instruction.  
(+) Lavishly praised for its aesthetic presentation, including beautiful photography and engaging design, contributing to its perceived craftsmanship and desirability.  
(-) Som

Gen Home_and_Kitchen:   0%|          | 0/200 [00:00<?, ?it/s]


📦 PREVIEW (V2.0): Home_and_Kitchen | ASIN: B00QIQZ21M

🔷 [Simple A1] (66 words):
Most people like these lights because they are bright and have many color options. The remote control is easy to use and the lights work well for holidays and rooms. Some say the string is shorter or thinner than expected and a few had problems after a month. Overall, it is a good buy for the price but keep the remote and check the size first.

----------------------------------------------------------------------

🔶 [Complex C1] (155 words):
(+) AC-powered design ensures consistent operation without battery dependency, while durable rubber-jacketed construction enhances longevity and safety in both indoor and outdoor settings.  
(+) Remote functionality is widely praised for intuitive control over diverse lighting modes, color options, and brightness levels, significantly enhancing user experience and decorative versatility.  
(+) Offers exceptional value with vibrant, bright LEDs that effective

Gen Beauty_and_Personal_Care:   0%|          | 0/200 [00:00<?, ?it/s]


📦 PREVIEW (V2.0): Beauty_and_Personal_Care | ASIN: B09GF6K8B5

🔷 [Simple A1] (56 words):
Overall, these perm rods are cheap and work well for curling hair. Some people say the rubber ends break or snap off easily, and they can hurt if they hit your skin. Many find them poorly made, with stiff plastic and weak bands. They are okay to use if needed, but not very strong or long-lasting.

----------------------------------------------------------------------

🔶 [Complex C1] (135 words):
(+) Offer exceptional value for money, with users consistently noting the high quantity provided at a low cost compared to retail alternatives.  
(+) Effectively curl hair, with multiple reviewers confirming strong performance in achieving desired wave and curl patterns when rods remain secured.  
(-) Suffer from significant durability issues—numerous reports cite broken or poorly knotted rubber ends, with elastics snapping during application, under heat, or while rinsing.  
(-) Frequent complai

Gen Clothing_Shoes_and_Jewelry:   0%|          | 0/200 [00:00<?, ?it/s]


📦 PREVIEW (V2.0): Clothing_Shoes_and_Jewelry | ASIN: B0074GKXZK

🔷 [Simple A1] (45 words):
The bag is cute and many people like it. It is small and only fits things like keys or chapstick, not a big phone. Some say it is smaller or lighter in color than expected. Most are happy with it but check the size first.

----------------------------------------------------------------------

🔶 [Complex C1] (117 words):
(+) Widely praised for its charming, compact design, with multiple users highlighting its aesthetic appeal and suitability as a stylish, minimalist accessory.  
(+) Appreciated for its balanced size—neither overly bulky nor impractically tiny—making it ideal for carrying small essentials with ease and elegance.  
(-) Frequently criticized for being significantly smaller than expected, particularly in relation to the price, which some consider disproportionate given its limited capacity.  
(-) Several users reported discrepancies in color accuracy, noting that the ac

Gen Toys_and_Games:   0%|          | 0/200 [00:00<?, ?it/s]


📦 PREVIEW (V2.0): Toys_and_Games | ASIN: B07Y1JC859

🔷 [Simple A1] (44 words):
Most people say these balloons are great for party decoration and good value. They are bright, big, and many like the Fortnite design. Some balloons had holes and lost air, so they did not last. But most are happy and would buy them again.

----------------------------------------------------------------------

🔶 [Complex C1] (122 words):
(+) Exceptional value for money, with multiple reviewers emphasizing high quality relative to price and abundant quantity per pack.  
(+) Ideal for Fortnite-themed events, consistently praised for vibrant colors, appealing design, and effectiveness in enhancing party decor.  
(+) Praised for fast shipping and versatility—users creatively repurposed deflated balloons as makeshift volleyballs, adding unexpected entertainment value.  
(-) Significant durability issues reported: several balloons developed leaks immediately upon inflation, rendering a portion of the 

Gen Sports_and_Outdoors:   0%|          | 0/200 [00:00<?, ?it/s]


📦 PREVIEW (V2.0): Sports_and_Outdoors | ASIN: B07HCV3R3X

🔷 [Simple A1] (62 words):
This golf mat is good for practicing at home and helps stop damage to the grass. It comes with tees and balls and works on different surfaces like grass or concrete. Some people say it moves when hit or does not last long. Overall, many are happy with it for the price, but a few find it too light or poor quality.

----------------------------------------------------------------------

🔶 [Complex C1] (143 words):
(+) Effectively prevents turf damage and enables convenient home practice, particularly valued for backyard, garage, and indoor use with foam or practice balls.  
(+) Includes useful accessories such as tees, practice balls, and rubber inserts, enhancing versatility for chipping, driving, and swing training across varied surfaces.  
(+) Offers good initial value for the price, with many users reporting satisfaction in short-term use and ease of setup on concrete, grass, and tile.  
(-)

Gen Pet_Supplies:   0%|          | 0/200 [00:00<?, ?it/s]


📦 PREVIEW (V2.0): Pet_Supplies | ASIN: B096FL718B

🔷 [Simple A1] (70 words):
Most people like this hideout because it is strong, easy to clean, and a good size for small hamsters. It stays cool and helps pets feel safe, and many say their hamsters sleep in it. Some gerbils can get stuck if the hole faces down, and it is too small for big hamsters like Syrians. Overall, it is a nice home for dwarf or robo hamsters and chew-proof for gerbils.

----------------------------------------------------------------------

🔶 [Complex C1] (140 words):
(+) Exceptional durability and chew resistance, particularly valued by owners of destructive rodents; ceramic construction ensures longevity and ease of cleaning.  
(+) Ideal thermal regulation and odor control, with multiple users noting its effectiveness in maintaining comfortable temperatures and hygiene for dwarf and Syrian hamsters.  
(+) Spacious interior accommodates bedding and allows natural nesting behaviors, widely praised as coz

Gen Automotive:   0%|          | 0/200 [00:00<?, ?it/s]


📦 PREVIEW (V2.0): Automotive | ASIN: B004QL6UOO

🔷 [Simple A1] (46 words):
Most people like these tires. They fit well on trucks and SUVs and give a smooth, quiet ride. Some say they feel bouncy at first but get better after a few hundred miles. The tires are strong and last long, even in rough or rocky conditions.

----------------------------------------------------------------------

🔶 [Complex C1] (114 words):
(+) Delivers a smooth, quiet ride across varied terrain, with multiple users highlighting comfort and immediate performance post-installation.  
(+) Constructed with durable, heavy-duty materials; praised for long tread life and resilience in off-road conditions, including rock-laden paths.  
(+) Offers excellent value, consistently noted for competitive pricing compared to alternatives, enhancing cost-effectiveness for bulk purchases.  
(-) Requires extended break-in period—approximately 100 miles—during which handling may be unstable, particularly on lifted tr

Gen Office_Products:   0%|          | 0/200 [00:00<?, ?it/s]


📦 PREVIEW (V2.0): Office_Products | ASIN: B07WCQQMXQ

🔷 [Simple A1] (60 words):
Most people say this sharpener works well and is easy to use. It sharpens pencils by itself and pushes them out when done, which many like. It is good for regular and colored pencils but some say it can chew them up or take too much off. A few people had problems with it not working or hurting their pencils.

----------------------------------------------------------------------

🔶 [Complex C1] (125 words):
(+) Effortless automatic operation praised for hands-free sharpening, with smooth performance and self-ejecting mechanism ideal for classrooms and office use.  
(+) Sturdy build quality and large shaving canister noted as durable and practical for frequent use, especially with standard pencils.  
(+) Delivers precise, long-point sharpening particularly effective for colored pencils, enhancing usability for artistic and educational purposes.  
(-) Multiple users report destructive malfunctioning