In [None]:
!pip -q install --upgrade transformers==4.44.2 accelerate==0.34.2 torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121
!pip -q install --upgrade pandas tqdm

import os, json, math, gc
import pandas as pd
from typing import Any, Dict, List, Optional
from tqdm.auto import tqdm

import torch
from transformers import BartTokenizer, BartForConditionalGeneration

# =============================
# Pipeline configuration
# =============================

# Review input (one JSON object per line) and the per-product output file.
REVIEWS_JSONL = "/content/All_Beauty.jsonl"
OUT_JSONL = "/content/summarized_for_review.jsonl"

# Hugging Face checkpoint loaded for summarization.
MODEL_NAME = "facebook/bart-large-cnn"

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Summarization limits (the prompt size is still capped for safety).
SAMPLE_K = 10             # max sampled reviews per heavy ASIN
MAX_INPUT_LENGTH = 1024   # tokenizer max_length; longer prompts are truncated
MAX_OUTPUT_LENGTH = 200   # max_length handed to model.generate


[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.
cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.3 which is incompatible.
dask-cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.3 which is incompatible.[0m[31m
[0m

In [None]:
# Products with MORE than this many reviews are summarized by the model;
# products at or below it have their reviews concatenated verbatim.
HEAVY_REVIEW_THRESHOLD = 10

print("Loading reviews…")
df = pd.read_json(REVIEWS_JSONL, lines=True)

# Raise instead of assert: asserts are stripped under `python -O`, and the metadata
# merge step of this notebook already reports the same problem with ValueError.
if "parent_asin" not in df.columns:
    raise ValueError("Input must have 'parent_asin' per review.")

# Ensure expected columns exist (only wholly missing columns are filled in here).
defaults = {
    "title": "",
    "text": "",
    "rating": None,
    "helpful_vote": 0,
}
for col, default in defaults.items():
    if col not in df.columns:
        df[col] = default

# Coerce rating to numeric; unparseable values become NaN.
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")

# helpful_vote is fed to log1p when reviews are weighted, so make it numeric too;
# missing or unparseable vote counts are treated as zero votes.
df["helpful_vote"] = pd.to_numeric(df["helpful_vote"], errors="coerce").fillna(0)

# Count reviews per ASIN
asin_counts = df.groupby("parent_asin").size().reset_index(name="review_count")

# Split ASINs into heavy / light around the threshold (computed once, used for both sides).
is_heavy = asin_counts["review_count"] > HEAVY_REVIEW_THRESHOLD
heavy_asins = asin_counts.loc[is_heavy, "parent_asin"].tolist()
light_asins = asin_counts.loc[~is_heavy, "parent_asin"].tolist()

heavy_df = df[df["parent_asin"].isin(heavy_asins)].copy()
light_df = df[df["parent_asin"].isin(light_asins)].copy()

print(f"Total ASINs: {asin_counts.shape[0]}")
print(f"  >{HEAVY_REVIEW_THRESHOLD} reviews (summarize): {len(heavy_asins)}")
print(f"  ≤{HEAVY_REVIEW_THRESHOLD} reviews (concat):    {len(light_asins)}")


Loading reviews…
Total ASINs: 112565
  >10 reviews (summarize): 11981
  ≤10 reviews (concat):    100584


In [None]:
import numpy as np

def safe_len_words(s: Any) -> int:
    """Return the number of whitespace-separated words in ``s``.

    Missing values (``None`` or a float NaN) count as zero words; without this
    guard they would be stringified to ``"None"`` / ``"nan"`` and counted as
    one word. Any other value is converted with ``str()`` before splitting.
    If the conversion raises, 0 is returned.
    """
    # NaN is the only float that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return 0
    try:
        return len(str(s).split())
    except Exception:
        return 0

def compute_weight(row: pd.Series) -> float:
    """
    Score one review for sampling; larger is better.

    The score is the sum of:
      * ``log1p(helpful_vote)`` - a missing, non-numeric, NaN or negative vote
        count is treated as 0 so the weight is always a finite-or-inf number,
        never NaN;
      * +1.5 when ``title`` is a non-blank string;
      * +2.0 when ``text`` has 10..200 words, +1.0 when it has more than 200;
      * +1.0 when ``rating`` equals 1, 2 or 5.

    Args:
        row: one review; read via ``row.get`` for the keys ``helpful_vote``,
            ``title``, ``text`` and ``rating``.

    Returns:
        The weight as a Python float.
    """
    # `nan or 0` evaluates to nan (NaN is truthy), so the vote count is validated
    # explicitly instead of relying on `or` alone.
    try:
        hv = float(row.get("helpful_vote", 0) or 0)
    except (TypeError, ValueError):
        hv = 0.0
    if hv != hv or hv < 0:  # NaN, or a negative count that log1p cannot score sensibly
        hv = 0.0
    weight = float(np.log1p(hv))

    title = row.get("title", "")
    if isinstance(title, str) and title.strip():
        weight += 1.5

    text_len = safe_len_words(row.get("text", ""))
    if 10 <= text_len <= 200:
        weight += 2.0
    elif text_len > 200:
        weight += 1.0

    # Float ratings such as 5.0 compare equal to the ints below; NaN/None match nothing.
    rating = row.get("rating", None)
    if rating in [1, 2, 5]:
        weight += 1.0

    return float(weight)

def combine_title_text(row: pd.Series, limit_chars: Optional[int] = None) -> str:
    """Join a review's title and text into ``"<title>. <text>"``.

    Each field is read with ``row.get`` and normalised before joining:
      * a list/tuple is joined with single spaces;
      * ``None``, ``pd.NA`` and float NaN become ``""`` (NaN would otherwise
        be stringified to the literal ``"nan"``);
      * any other falsy value becomes ``""``; everything else goes through ``str()``.

    When the title is empty only the text is returned. If ``limit_chars`` is
    given, the result is cut to at most that many characters.

    NOTE: this notebook defines a function with the same name again in a later
    cell; once that cell has run, the later definition is the one in effect.
    """
    cleaned = []
    for key in ("title", "text"):
        value = row.get(key)
        if isinstance(value, (list, tuple)):
            value = " ".join(str(item) for item in value)
        elif value is None or value is pd.NA or (isinstance(value, float) and value != value):
            value = ""
        cleaned.append(str(value or "").strip())
    title, text = cleaned

    s = (f"{title}. {text}" if title else text).strip()
    if limit_chars is not None and len(s) > limit_chars:
        s = s[:limit_chars]
    return s

def build_structured_input(
    sampled_df: pd.DataFrame,
    max_positive: int = 4,
    max_negative: int = 4,
    max_mixed: int = 2,
) -> str:
    """
    Build the text handed to the summarizer from a sample of reviews.

    Every row is reduced to a snippet (title + text, at most 300 characters)
    and bucketed by its ``rating``: >= 4 is positive, <= 2 is negative, and
    everything else (including a missing rating) is mixed. Up to ``max_*``
    snippets per bucket are joined with `` | `` under a labelled section, and
    the sections follow a fixed instruction sentence.

    Mixed snippets are included whenever there are any. They were previously
    dropped unless there were more than two, which left an all-neutral sample
    with no review text in the prompt at all.

    Args:
        sampled_df: reviews to include; ``title``, ``text`` and ``rating`` are read.
        max_positive: cap on positive snippets (default 4).
        max_negative: cap on negative snippets (default 4).
        max_mixed: cap on mixed snippets (default 2).

    Returns:
        The prompt string.
    """
    # NOTE(review): the instruction sentence is presumably seen by the model as ordinary
    # input text rather than as a command — confirm it actually improves the summaries.
    positive, negative, mixed = [], [], []

    for _, row in sampled_df.iterrows():
        review_text = combine_title_text(row, limit_chars=300)
        if not review_text:
            # An empty snippet would only contribute a dangling " | " separator.
            continue
        r = row.get("rating", None)
        if pd.isna(r):
            mixed.append(review_text)
        elif r >= 4:
            positive.append(review_text)
        elif r <= 2:
            negative.append(review_text)
        else:
            mixed.append(review_text)

    sections = []
    for label, snippets, cap in (
        ("Positive feedback: ", positive, max_positive),
        ("Negative feedback: ", negative, max_negative),
        ("Mixed reviews: ", mixed, max_mixed),
    ):
        chosen = snippets[:max(cap, 0)]
        if chosen:
            sections.append(label + " | ".join(chosen))

    prompt = (
        "Summarize the key advantages and disadvantages of this product based on customer reviews. "
        "Focus on specific features mentioned. "
        + " ".join(sections)
    )
    return prompt

@torch.no_grad()
def summarize_group_bart(text_prompt: str, tokenizer: BartTokenizer, model: BartForConditionalGeneration) -> str:
    """Generate a summary of ``text_prompt`` with the given tokenizer and model.

    The prompt is tokenized with truncation at ``MAX_INPUT_LENGTH`` tokens, moved
    to ``DEVICE``, and decoded from a 4-beam search bounded by ``min_length=50``
    and ``MAX_OUTPUT_LENGTH``. Gradients are disabled for the whole call.

    Returns:
        The decoded summary with special tokens removed. If anything inside the
        call raises, the exception is not propagated; the string
        ``"Error generating summary: <exception>"`` is returned instead.
    """
    try:
        encoded = tokenizer(
            text_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_INPUT_LENGTH,
        )
        prompt_ids = encoded.input_ids.to(DEVICE)

        generation_options = dict(
            max_length=MAX_OUTPUT_LENGTH,
            min_length=50,               # ensure some detail
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True,
            no_repeat_ngram_size=3,
        )
        generated = model.generate(prompt_ids, **generation_options)

        # Only the first returned sequence is decoded.
        best_sequence = generated[0]
        return tokenizer.decode(best_sequence, skip_special_tokens=True)
    except Exception as e:
        return f"Error generating summary: {e}"

def rating_distribution(series: pd.Series) -> Dict[int, int]:
    """Count how many ratings fall on each star value from 1 to 5.

    Missing ratings are dropped, the rest are cast to ``int``, and the counts
    are returned as a dict ordered by ascending star value. Values outside
    1..5 are left out.
    """
    star_counts = series.dropna().astype(int).value_counts().to_dict()
    return {star: star_counts[star] for star in sorted(star_counts) if 1 <= star <= 5}


In [None]:
def combine_title_text(row: pd.Series, limit_chars: Optional[int] = None) -> str:
    """Return ``"<title>. <text>"`` for one review, optionally truncated.

    List/tuple fields are joined with spaces BEFORE being stringified. The
    previous version called ``str()`` first, so its ``isinstance(..., list)``
    checks could never be true and a list was rendered as its Python repr.
    Missing values (``None``, ``pd.NA``, float NaN) become an empty string
    rather than text such as ``"nan"``.

    Args:
        row: review record; ``title`` and ``text`` are read with ``row.get``.
        limit_chars: when given, the result is cut to at most this many characters.

    Returns:
        The combined string; just the text when the title is empty.
    """
    def _as_clean_text(value) -> str:
        # Normalise one field to a stripped string.
        if isinstance(value, (list, tuple)):
            return " ".join(str(t) for t in value).strip()
        if value is None or value is pd.NA or (isinstance(value, float) and value != value):
            return ""
        return str(value or "").strip()

    title = _as_clean_text(row.get("title"))
    text = _as_clean_text(row.get("text"))

    s = (f"{title}. {text}" if title else text).strip()
    if limit_chars is not None and len(s) > limit_chars:
        s = s[:limit_chars]
    return s

# One output record per product. Created unconditionally so the light-group loop and
# the final DataFrame still work when there are no heavy ASINs.
summaries = []

# ---- Summarize heavy groups (>10 reviews) ----
if len(heavy_asins) > 0:
    print("Loading BART model…")
    tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)
    model = BartForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)
    model.eval()

    for asin, group in tqdm(heavy_df.groupby("parent_asin"), desc="Summarizing heavy ASINs"):
        g = group.copy()
        # compute weights over ALL reviews (no top cap)
        g["weight"] = g.apply(compute_weight, axis=1)

        # stratified picks by rating: up to 3 per star value, each from the highest weights
        picked = []
        for r in [5, 1, 4, 2, 3]:
            sub = g[g["rating"] == r]
            if len(sub) > 0:
                n_take = min(3, len(sub))
                picked.append(sub.nlargest(n_take, "weight"))

        if picked:
            # text/title/images are left out of the duplicate key
            sampled = pd.concat(picked).drop_duplicates(
                subset=[col for col in g.columns if col not in ["text", "title", "images"]]
            )
        else:
            # No review carries a whole-star rating of 1..5 (e.g. every rating is missing).
            # pd.concat([]) would raise, so fall back to the top reviews by weight.
            sampled = g.nlargest(SAMPLE_K, "weight")

        # If we still have too many, trim to SAMPLE_K by global weight
        if sampled.shape[0] > SAMPLE_K:
            sampled = sampled.nlargest(SAMPLE_K, "weight")

        # structured prompt and summarization
        prompt = build_structured_input(sampled)
        summary_text = summarize_group_bart(prompt, tokenizer, model)

        # stats on FULL group
        rdist = rating_distribution(g["rating"])
        avg = float(g["rating"].dropna().mean()) if g["rating"].notna().any() else None
        combined_reviews_text = " | ".join(combine_title_text(r, limit_chars=300) for _, r in sampled.iterrows())

        summaries.append({
            "parent_asin": asin,
            "combined_text": combined_reviews_text,     # the sampled reviews, joined
            "summary": summary_text,                    # BART summary
            "avg_rating": round(avg, 2) if avg is not None else None,
            "total_reviews": int(g.shape[0]),
            "rating_distribution": rdist,
            "sample_size": int(sampled.shape[0]),
            "mode": "summarized"
        })

    # Release the model before the remaining CPU-only work.
    del tokenizer, model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# ---- Concat light groups (≤10 reviews) ----
for asin, group in tqdm(light_df.groupby("parent_asin"), desc="Concatenating light ASINs"):
    g = group.copy()

    # concat all reviews (title + text)
    all_concat = " | ".join(combine_title_text(r) for _, r in g.iterrows())

    rdist = rating_distribution(g["rating"])
    avg = float(g["rating"].dropna().mean()) if g["rating"].notna().any() else None

    summaries.append({
        "parent_asin": asin,
        "combined_text": all_concat,
        "summary": all_concat,                # per spec: summary is concat for light groups
        "avg_rating": round(avg, 2) if avg is not None else None,
        "total_reviews": int(g.shape[0]),
        "rating_distribution": rdist,
        "sample_size": int(g.shape[0]),
        "mode": "concatenated"
    })

# ---- Build final DF & save ----
summarized_df = pd.DataFrame(summaries).sort_values("parent_asin").reset_index(drop=True)

with open(OUT_JSONL, "w", encoding="utf-8") as f:
    for rec in summarized_df.to_dict(orient="records"):
        # In a float column pandas stores a missing average as NaN, which json.dumps would
        # write as the non-standard token NaN; write JSON null instead.
        if pd.isna(rec["avg_rating"]):
            rec["avg_rating"] = None
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print(f"✔ Wrote {OUT_JSONL}  ({len(summarized_df)} products)")
display(summarized_df.head(10))

Loading BART model…




Summarizing heavy ASINs:   0%|          | 0/11981 [00:00<?, ?it/s]

Concatenating light ASINs:   0%|          | 0/100584 [00:00<?, ?it/s]

✔ Wrote /content/summarized_for_review.jsonl  (112565 products)


Unnamed: 0,parent_asin,combined_text,summary,avg_rating,total_reviews,rating_distribution,sample_size,mode
0,0005946468,Five Stars. great,Five Stars. great,5.0,1,{5: 1},1,concatenated
1,0123034892,Five Stars. Good product,Five Stars. Good product,5.0,1,{5: 1},1,concatenated
2,0124784577,Product delivers. Makes my hair look healthy |...,Product delivers. Makes my hair look healthy |...,4.33,3,"{3: 1, 5: 2}",3,concatenated
3,0515059560,Barbara's best work is in her Regency period. ...,Barbara's best work is in her Regency period. ...,4.0,1,{4: 1},1,concatenated
4,0692508988,Delightful and Profound. This is a wonderful t...,Delightful and Profound. This is a wonderful t...,5.0,1,{5: 1},1,concatenated
5,069267599X,I am calling it! This is my all time favorite ...,"The print quality is top notch, and the art is...",4.78,41,"{2: 1, 3: 2, 4: 2, 5: 36}",8,summarized
6,0764490117,Amazing VBS kit!. We just completed our week o...,Amazing VBS kit!. We just completed our week o...,5.0,2,{5: 2},2,concatenated
7,0816091846,Three for the price of 1. I 've seen 2 of the ...,"Packaged well,quick delivery. She's one of our...",4.64,14,"{1: 1, 4: 1, 5: 12}",5,summarized
8,0963416391,Eyes Open.... Every once in a while a book com...,Eyes Open.... Every once in a while a book com...,5.0,1,{5: 1},1,concatenated
9,0966068432,Its a Great book to learn the human body Funct...,Its a Great book to learn the human body Funct...,5.0,2,{5: 2},2,concatenated


In [None]:
# Sanity checks on the per-product table: mode split, review-count spread, sample rows.
print("Rows by mode:")
mode_counts = summarized_df["mode"].value_counts(dropna=False)
print(mode_counts)

print("\nBasic stats on total_reviews:")
review_count_stats = summarized_df["total_reviews"].describe()
print(review_count_stats)

# Columns shown in both peeks below.
peek_columns = ["parent_asin", "total_reviews", "summary"]

print("\nPeek a couple of concatenated examples:")
is_concatenated = summarized_df["mode"] == "concatenated"
display(summarized_df.loc[is_concatenated].head(3)[peek_columns])

print("\nPeek a couple of summarized examples:")
is_summarized = summarized_df["mode"] == "summarized"
display(summarized_df.loc[is_summarized].head(3)[peek_columns])


Rows by mode:
mode
concatenated    100584
summarized       11981
Name: count, dtype: int64

Basic stats on total_reviews:
count    112565.000000
mean          6.232204
std          25.189840
min           1.000000
25%           1.000000
50%           2.000000
75%           4.000000
max        1962.000000
Name: total_reviews, dtype: float64

Peek a couple of concatenated examples:


Unnamed: 0,parent_asin,total_reviews,summary
0,5946468,1,Five Stars. great
1,123034892,1,Five Stars. Good product
2,124784577,3,Product delivers. Makes my hair look healthy |...



Peek a couple of summarized examples:


Unnamed: 0,parent_asin,total_reviews,summary
5,069267599X,41,"The print quality is top notch, and the art is..."
7,0816091846,14,"Packaged well,quick delivery. She's one of our..."
16,0985393300,30,The guide is packed full of really useful info...


In [None]:
import pandas as pd, json

META_JSONL = "/content/meta_All_Beauty.jsonl"          # adjust if your filename differs
SUMM_JSONL = "/content/summarized_for_review.jsonl"    # produced in previous step
OUT_MERGED = "/content/summarized_for_review_merged.jsonl"   # not written by this cell
OUT_MERGED_FINAL = "/content/summarized_for_review_merged_final.jsonl"


def _nan_to_none(value):
    """Map a float NaN to None so json.dumps writes null instead of the non-standard NaN token."""
    return None if isinstance(value, float) and value != value else value


# Load meta (product-level) and summaries (asin-level)
meta_df = pd.read_json(META_JSONL, lines=True)
summ_df = pd.read_json(SUMM_JSONL, lines=True)

# Ensure unique product rows in meta
if "parent_asin" not in meta_df.columns:
    raise ValueError("meta file must contain 'parent_asin'")
meta_df = meta_df.drop_duplicates(subset=["parent_asin"])

# Inner-join: keep only products we summarized.
# validate= makes pandas raise if either side has a repeated parent_asin, which would
# silently multiply rows.
merged_df = pd.merge(meta_df, summ_df, on="parent_asin", how="inner", validate="one_to_one")

# (Optional) Reorder columns for convenience: product fields, then summary fields, then the rest.
front_cols = [
    "parent_asin", "main_category", "title", "price", "average_rating", "rating_number",
    "store", "details", "images", "videos"
]
summary_cols = ["summary", "combined_text", "avg_rating", "total_reviews", "rating_distribution", "sample_size", "mode"]
known_cols = set(front_cols + summary_cols)
cols = [c for c in front_cols if c in merged_df.columns] + [c for c in summary_cols if c in merged_df.columns] + \
       [c for c in merged_df.columns if c not in known_cols]
merged_df = merged_df[cols]

# Drop the 'mode' column
merged_df_final = merged_df.drop(columns=["mode"])

# Save JSONL. Missing scalars (for example a product without a price) are NaN in the frame;
# they are written as JSON null so the file stays parseable by strict JSON readers.
with open(OUT_MERGED_FINAL, "w", encoding="utf-8") as f:
    for rec in merged_df_final.to_dict(orient="records"):
        clean_rec = {key: _nan_to_none(value) for key, value in rec.items()}
        f.write(json.dumps(clean_rec, ensure_ascii=False) + "\n")

print(f"✔ Wrote {OUT_MERGED_FINAL}  (rows: {len(merged_df_final)})")
display(merged_df_final.head(5))

✔ Wrote /content/summarized_for_review_merged_final.jsonl  (rows: 112565)


Unnamed: 0,parent_asin,main_category,title,price,average_rating,rating_number,store,details,images,videos,summary,combined_text,avg_rating,total_reviews,rating_distribution,sample_size,features,description,categories,bought_together
0,B01CUPMQZE,All Beauty,"Howard LC0008 Leather Conditioner, 8-Ounce (4-...",,4.8,10,Howard Products,{'Package Dimensions': '7.1 x 5.5 x 3 inches; ...,[{'thumb': 'https://m.media-amazon.com/images/...,[],Absolutely fabulous - I will never use anythin...,Absolutely fabulous - I will never use anythin...,4.6,5,"{'4': 2, '5': 3}",5,[],[],[],
1,B076WQZGPM,All Beauty,Yes to Tomatoes Detoxifying Charcoal Cleanser ...,,4.5,3,Yes To,"{'Item Form': 'Powder', 'Skin Type': 'Acne Pro...",[{'thumb': 'https://m.media-amazon.com/images/...,[],Five Stars. Grandson has had very good results...,Five Stars. Grandson has had very good results...,5.0,1,{'5': 1},1,[],[],[],
2,B000B658RI,All Beauty,Eye Patch Black Adult with Tie Band (6 Per Pack),,4.4,26,Levine Health Products,{'Manufacturer': 'Levine Health Products'},[{'thumb': 'https://m.media-amazon.com/images/...,[],Ties are great and much more comfortable than ...,Oh Happy Mistake!. I ordered the tie on eye pa...,4.36,14,"{'3': 4, '4': 1, '5': 9}",7,[],[],[],
3,B088FKY3VD,All Beauty,"Tattoo Eyebrow Stickers, Waterproof Eyebrow, 4...",,3.1,102,Cherioll,"{'Brand': 'Cherioll', 'Item Form': 'Powder', '...",[{'thumb': 'https://m.media-amazon.com/images/...,[],Customer reviews: Amazingly Realistic. Easy to...,Amazingly Realistic. I have been drawing on my...,2.91,11,"{'1': 4, '2': 2, '4': 1, '5': 4}",9,[],[],[],
4,B07NGFDN6G,All Beauty,Precision Plunger Bars for Cartridge Grips – 9...,,4.3,7,Precision,{'UPC': '644287689178'},[{'thumb': 'https://m.media-amazon.com/images/...,[],Missing Review.......... It’s lost at Amazon S...,Missing Review.......... It’s lost at Amazon S...,5.0,1,{'5': 1},1,"[Material: 304 Stainless Steel; Brass tip, Len...",[The Precision Plunger Bars are designed to wo...,[],
