# BONUSES

### First, let's set up the environment!

In [32]:
from transformers import pipeline
import pandas as pd
import tensorflow as tf
from collections import defaultdict
import plotly.express as px

### For each of the top product categories, we create a summary of the reviews for each star rating:

In [5]:
# Keep only the columns this analysis needs, discard rows with a missing value
# in any of them, and store the star rating as an integer.
required_columns = ['primaryCategories', 'reviews.rating', 'reviews.text']
df = (
    df[required_columns]
    .dropna()
    .astype({'reviews.rating': int})
)

# Restrict the data to the (at most) ten categories with the most reviews.
category_sizes = df['primaryCategories'].value_counts()
top_categories = category_sizes.head(10).index
df_top = df[df['primaryCategories'].isin(top_categories)]

# Hugging Face summarization pipeline backed by the facebook/bart-large-cnn model.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_reviews(group):
    """Summarize the review texts of one group with the module-level `summarizer`.

    Args:
        group: Object whose 'reviews.text' entry exposes `.tolist()` and yields
            the review strings (a DataFrame group when used with `groupby().apply`).

    Returns:
        The 'summary_text' of the first result returned by `summarizer`.
    """
    joined_reviews = " ".join(group['reviews.text'].tolist())
    # Only the first 1024 characters are sent, to keep the model input short.
    model_input = joined_reviews[:1024]
    results = summarizer(model_input, max_length=10, min_length=1, do_sample=False)
    return results[0]['summary_text']
    
# Build one summary per (category, rating) pair.
# The review text must NOT be a grouping key: grouping on it produces one group
# per distinct review (so nothing is aggregated) and a fourth column after
# reset_index(), which makes the three-name column assignment below fail.
# Selecting [['reviews.text']] hands each group to summarize_reviews as a
# DataFrame holding only the column it reads.
summary_df = (
    df_top.groupby(['primaryCategories', 'reviews.rating'])[['reviews.text']]
    .apply(summarize_reviews)
    .reset_index()
)
summary_df.columns = ['Category', 'Rating', 'Summary']
print(summary_df.head())

      Category  Rating                                            Summary
0  Electronics       1  I was looking for a kindle whitepaper. I saw o...
1  Electronics       2  The screen is too dark, and cannot adjust the ...
2  Electronics       3  Kindle Black was not a good fit for me. Would ...
3  Electronics       4  The Amazon Kindle is light weight and easy to ...
4  Electronics       5  This kindle is light and easy to use especiall...


### Here we use Generative AI to summarize reviews, broken down by review score:

In [None]:
top_k = 10  # how many of the most-reviewed categories to keep
category_counts = df["primaryCategories"].value_counts()
top_categories = category_counts.nlargest(top_k).index

# Keep only the rows that belong to one of those categories
filtered_df = df[df["primaryCategories"].isin(top_categories)]

# For every (category, rating) pair, concatenate its review texts into a single
# space-separated string
grouped_reviews = (
    filtered_df
    .groupby(["primaryCategories", "reviews.rating"])["reviews.text"]
    .apply(lambda texts: " ".join(texts))
)

# Flatten the grouped index back into ordinary columns for further processing
grouped_reviews_df = grouped_reviews.reset_index()


# Prompt builder for review summarization (placeholder for a Generative AI call)
def summarize_reviews(text):
    """Return the summarization prompt for a block of review text.

    Only the first 1000 characters of `text` are embedded in the prompt. The
    prompt itself is returned as-is; no model is called here.
    """
    snippet = text[:1000]  # slicing leaves texts of 1000 characters or fewer unchanged
    return f"Summarize the following product reviews concisely:\n{snippet}"

# Produce one summary entry per grouped row, in row order
grouped_reviews_df["summary"] = [
    summarize_reviews(review_text)
    for review_text in grouped_reviews_df["reviews.text"]
]

### Finally, we create the dynamic Plotly chart.

In [30]:
# NOTE: this cell needs `px` and `pd` from the imports cell at the top of the
# notebook; run that cell first (Restart Kernel -> Run All).

# Legend / bar order for the star ratings.
rating_order = ["1", "2", "3", "4", "5"]

# String labels make Plotly treat the rating as a discrete colour. They are
# built as a separate Series so the shared `df` is NOT modified: other cells
# keep seeing the rating dtype they expect and this cell can be re-run.
# Assumes the ratings are whole numbers (cast to int by an earlier cell) - TODO confirm.
rating_labels = df["reviews.rating"].astype(str)

# Number of reviews per (category, rating) pair; the grouping Series keeps the
# name "reviews.rating", so the resulting column is named accordingly.
review_counts = (
    df.groupby(["primaryCategories", rating_labels])
    .size()
    .reset_index(name="count")
)

review_counts["reviews.rating"] = pd.Categorical(review_counts["reviews.rating"], categories=rating_order, ordered=True)

fig = px.bar(
    review_counts,
    x="primaryCategories",
    y="count",
    color="reviews.rating",
    title="Review Distribution by Product Category and Rating",
    labels={"primaryCategories": "Product Category", "count": "Number of Reviews", "reviews.rating": "Review Rating"},
    # Without this, Plotly Express orders the ratings by first appearance in the data.
    category_orders={"reviews.rating": rating_order},
    barmode="group"
)
fig

NameError: name 'px' is not defined