#### Load the packages

In [None]:
import fasttext.util
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

import os
import sys

import asyncio
import json
import re
import textwrap
import time
from collections import defaultdict
from uuid import uuid4

from asynciolimiter import Limiter, StrictLimiter
from google.cloud import aiplatform
from langchain_google_vertexai import VertexAI
from tqdm.asyncio import tqdm_asyncio

import random
# Fix the RNG seed so the summary opening phrase drawn with random.randrange
# in prepare_data is reproducible across runs.
random.seed(42)

import pickle
import pandas as pd

# Google Cloud project and region passed to the Vertex AI SDK initialisation.
# PROJECT_ID is a placeholder and must be replaced with a real project id.
PROJECT_ID = "YOUR-GCP-PROJECT-ID"
LOCATION = "us-central1" # Project location
aiplatform.init(project=PROJECT_ID, location=LOCATION)

#### Load the LLM

In [None]:
# Name of the Vertex AI model used for every summary in this notebook.
LLM_NAME = 'gemini-1.5-flash'

# Shared model client; executer() calls llm1.ainvoke(prompt) on it.
# temperature=0 is set alongside top_p / top_k sampling parameters.
llm1 = VertexAI(
    model_name=LLM_NAME,
    max_output_tokens=2048,
    temperature=0,
    top_p=0.8,
    top_k=40,
    verbose=True,
)

In [None]:
# Client-side throttle for LLM calls: at most `requests_per_minute` requests
# per `time_window` seconds.
requests_per_minute = 50
time_window = 60  # seconds
# The limiter takes its rate in calls per second, so derive it from the two
# constants above instead of repeating the window length as a literal.
rate_limiter = StrictLimiter(requests_per_minute / time_window)

#### Load the distilled features

In [None]:
# Load the distilled features; prepare_data() indexes this object by product
# family id and expects a list of 6- or 8-field feature rows per family.
# NOTE(review): pickle.load can execute arbitrary code from the file - only
# load pickles from a trusted source.
with open('../data/results-topic-modeling-revieweaver.pkl', 'rb') as f:
    distilled_features = pickle.load(f)

In [None]:
# Load the primary review details. Later cells read the columns
# "product_family_id", "brand", "short_name" and "review_text" from this frame.
review_data = pd.read_pickle('../data/product_reviews.pkl')
# Preview the first rows (notebook cell output).
review_data.head()

In [None]:
# Group the reviews by product family; prepare_data() pulls one group per
# family via get_group().
review_data_grouped = review_data.groupby("product_family_id")

In [None]:
# Prompt template for summarising from the distilled PROS/CONS.
# Placeholders filled by plain str.replace elsewhere in this notebook:
#   '#STATEMENT#'   -> a phrase from SUMMARY_PREFIXES (in prepare_data)
#   'PROS_AND_CONS' -> str() of the per-family content dict
#                      (in generate_summary_with_rate_limit)
# NOTE(review): the trailing '{SUMMARY}' is never substituted in this notebook,
# so it is sent to the model literally - confirm that this is intended.
prompt_distill = """You are a helpful assistant and you are tasked with writing a summary from some given information about a product. We have a list of PROS and CONS of the product, number of times they were mentioned, and a list of representative quotes speaking about the PROS or CONS.

- Write a short and concise summary with no more than four sentences and no less than three sentences on how customers are speaking about different pros and cons.
- Use the statement '#STATEMENT#' to begin the summary.
- Skip reporting how many times a pro/con was mentioned.
- The summary should only highlight pros and cons that are mentioned frequently.
- The summary should use a short name of the product.
- Avoid or rephrase customer mentioned terms that are derogatory, disrespectful, harmful, sexually explicit, hate speech, or harassment.


The PROS and CONS are listed below:

====================================
PROS_AND_CONS
====================================

{SUMMARY}
"""

In [None]:
# Prompt template for summarising directly from the raw review texts.
# Placeholders filled by plain str.replace elsewhere in this notebook:
#   '#STATEMENT#' -> a phrase from SUMMARY_PREFIXES (in prepare_data)
#   'ALL_REVIEWS' -> the family's reviews joined with "\n----\n"
#                    (in generate_summary_with_rate_limit, use_all=True)
# NOTE(review): the trailing '{SUMMARY}' is never substituted in this notebook,
# so it is sent to the model literally - confirm that this is intended.
prompt_llm = """You are a helpful assistant and you are tasked with writing a summary from a list of customer reviews.

- Write a short and concise summary with no more than four sentences and no less than three sentences on how customers are speaking about different pros and cons.
- Use the statement '#STATEMENT#' to begin the summary.
- Skip reporting how many times a pro/con was mentioned.
- The summary should only highlight pros and cons that are mentioned frequently.
- The summary should use a short name of the product.
- Avoid or rephrase customer mentioned terms that are derogatory, disrespectful, harmful, sexually explicit, hate speech, or harassment.


The reviews are listed below:

====================================
ALL_REVIEWS
====================================

{SUMMARY}
"""

In [None]:
# Opening phrases for a summary. prepare_data() picks one at random per product
# family and substitutes it for '#STATEMENT#' in both prompt templates, so the
# two summaries of a family start with the same phrase.
SUMMARY_PREFIXES = [
    "Customers appreciate",
    "Customers value", 
    "Customers highly value", 
    "Customers are impressed with", 
    "Customers praise", 
    "Customers are positive/negative about",
    "Customers admire",
    "Customers frequently mention",
    "Customers commend",
    "Customers are satisfied with",
    "Customers often highlight",
    "Customers consistently note",
    "Customers find value in",
    "Customers enjoy",
    "Customers are enthusiastic about",
    "Customers are pleased with",
    "Customers recognize",
    "Customers express satisfaction with",
    "Customers love",
    "Customers regard",
    "Customers have good things to say about",
    "Customers are delighted by",
]

In [None]:
import heapq

def find_priority(feature_name, repr_sentence):
    """Rank a representative quote; a smaller tuple means a higher priority.

    Args:
        feature_name: Name of the feature the quote is meant to illustrate.
        repr_sentence: The quote text.

    Returns:
        A ``(bucket, is_feature_present, num_words)`` tuple where

        * ``bucket`` orders quotes by length class: 5-11 words -> 0,
          fewer than 5 words -> 1, 12-19 words -> 2, 20 or more words -> 3;
        * ``is_feature_present`` is 0 when ``feature_name`` occurs in the
          quote (case-insensitive substring match) and 1 otherwise;
        * ``num_words`` is the number of whitespace-separated words.
    """
    # split() without an argument collapses runs of whitespace and also splits
    # on tabs/newlines, so double spaces or line breaks inside a quote do not
    # inflate or deflate the word count (split(" ") would).
    num_words = len(repr_sentence.split())
    is_feature_present = 0 if feature_name.lower() in repr_sentence.lower() else 1

    if num_words < 5:
        bucket = 1
    elif num_words < 12:
        bucket = 0
    elif num_words < 20:
        bucket = 2
    else:
        bucket = 3

    return bucket, is_feature_present, num_words

In [None]:
def prepare_data(distilled_features):
    """Build the per-family inputs for both summarization pipelines.

    Reads the module-level ``review_data`` / ``review_data_grouped`` frames,
    the ``prompt_distill`` / ``prompt_llm`` templates and ``SUMMARY_PREFIXES``.
    Families without any distilled features are skipped.

    Args:
        distilled_features: Mapping from product family id to a list of
            feature rows. Each row has 6 fields (feature_name, count,
            review_ids, other_names, feature_id, quotes) or 8 fields (the same
            plus val and embedding, which are ignored). The family id is looked
            up as-is first and then as ``str(family)``.

    Returns:
        A ``(content_for_summarization, content_for_summarization_llm)`` pair,
        both keyed by product family id. Every entry holds
        ``product_family_id``, ``metadata``, ``content``, an empty ``summary``,
        an empty ``safety_info`` dict and the ``prompt``. In the first mapping
        ``content`` is a dict with PRODUCT_NAME and up to 10 PROS and 10 CONS
        (most mentioned first, each with up to 10 representative quotes); in
        the second it is the array of the family's raw review texts.
    """
    # Kept as (factory-less) defaultdicts so the returned/pickled type is
    # unchanged; without a default factory they behave like plain dicts.
    content_for_summarization = defaultdict()
    content_for_summarization_llm = defaultdict()

    print("Started preparing the content for review summary generation...")

    feature_columns = ["feature_name", "count", "review_ids", "other_names", "feature_id", "quotes"]
    extra_columns = ["val", "embedding"]

    for family in review_data['product_family_id'].unique():
        # unique() may yield values that have no group of their own
        # (presumably missing ids); skip those.
        if family not in review_data_grouped.groups:
            continue

        # Membership test and lookup must use the same key. Try the id as it
        # appears in the review data, then its string form.
        if family in distilled_features:
            all_features = distilled_features[family]
        elif str(family) in distilled_features:
            all_features = distilled_features[str(family)]
        else:
            all_features = []

        if len(all_features) == 0:
            continue

        family_reviews = review_data_grouped.get_group(family)
        product_info = family_reviews.iloc[0]

        if len(all_features[0]) == 8:
            distilled_features_family = pd.DataFrame(
                all_features, columns=feature_columns + extra_columns
            ).drop(columns=extra_columns)
        else:
            distilled_features_family = pd.DataFrame(all_features, columns=feature_columns)

        pros, cons = [], []
        for feature_name, count, feature_id, quotes in zip(
            distilled_features_family["feature_name"],
            distilled_features_family["count"],
            distilled_features_family["feature_id"],
            distilled_features_family["quotes"],
        ):
            mentions = int(count)
            if mentions <= 0:
                continue

            # Ten best quotes in priority order; the quote itself is the final
            # tie-breaker, matching a heap of (bucket, present, words, quote).
            top_quotes = heapq.nsmallest(
                10,
                quotes,
                key=lambda quote: (*find_priority(feature_name, quote), quote),
            )

            feature_entry = {
                "feature": feature_name,
                "mentions": mentions,
                "comments": top_quotes,
            }
            # Sentiment is encoded in the feature id: ids containing
            # "Positive" are pros, everything else counts as a con.
            if "Positive" in feature_id:
                pros.append(feature_entry)
            else:
                cons.append(feature_entry)

        # Key order matters: str() of this dict is pasted into the prompt.
        ddd = {
            "PRODUCT_NAME": product_info["short_name"],
            # Keep at most the 10 most mentioned pros and cons.
            "PROS": sorted(pros, key=lambda d: d['mentions'], reverse=True)[:10],
            "CONS": sorted(cons, key=lambda d: d['mentions'], reverse=True)[:10],
        }

        metadata = {"brand": product_info["brand"]}

        # One opening phrase per family, shared by both prompts so the two
        # summaries are comparable.
        prefix = SUMMARY_PREFIXES[random.randrange(len(SUMMARY_PREFIXES))]

        content_for_summarization[family] = {
            "product_family_id": family,
            "metadata": dict(metadata),  # separate copy: no aliasing between outputs
            "content": ddd,
            "summary": "",
            "safety_info": dict(),
            "prompt": prompt_distill.replace("#STATEMENT#", prefix),
        }

        content_for_summarization_llm[family] = {
            "product_family_id": family,
            "metadata": dict(metadata),
            "content": family_reviews["review_text"].values,
            "summary": "",
            "safety_info": dict(),
            "prompt": prompt_llm.replace("#STATEMENT#", prefix),
        }

    print("Finished preparing the content for review summary generation...")

    return content_for_summarization, content_for_summarization_llm

# Full run over every family with distilled features:
# content_for_summarization, content_for_summarization_llm = prepare_data(distilled_features)

# We can test it out with few products: only the first 10 keys of
# distilled_features (in the mapping's iteration order) are passed on.
content_for_summarization, content_for_summarization_llm = prepare_data({k: distilled_features[k] for k in list(distilled_features)[:10]})

In [None]:
async def executer(family_n_prompt, content_for_summarization):
    """Generate the summary for one product family and store it in place.

    Args:
        family_n_prompt: ``(family, prompt)`` pair; ``family`` is a key of
            ``content_for_summarization`` and ``prompt`` is the final prompt
            text sent to the model.
        content_for_summarization: Mapping whose ``[family]["summary"]`` slot
            receives the model response on success.

    Returns:
        None. After ``max_tries`` failed attempts the family is skipped and
        its summary is left untouched.
    """
    max_tries = 5

    family = family_n_prompt[0]
    new_prompt = family_n_prompt[1]

    for attempt in range(1, max_tries + 1):
        # Throttle every attempt, not only the first one, so that retries
        # cannot push the request rate above the configured limit.
        await rate_limiter.wait()
        try:
            r = await llm1.ainvoke(new_prompt)
        except Exception as e:
            # Best effort: report the failure and retry after a short pause.
            print(f"{e}")
            if attempt < max_tries:
                await asyncio.sleep(3)
        else:
            content_for_summarization[family]["summary"] = r
            return
    print(f"Skipping {family}")

In [None]:
async def generate_summary_with_rate_limit(content_for_summarization, use_all=False):
    """Fill in the prompts and run one executer task per product family.

    Args:
        content_for_summarization: Mapping from family id to an entry holding
            at least ``prompt`` and ``content``; summaries are written back
            into it by ``executer``.
        use_all: When True, ``content`` is treated as a sequence of review
            strings that is joined with "\\n----\\n" and substituted for
            ``ALL_REVIEWS``. Otherwise ``content`` is a dict with PROS/CONS
            lists whose ``str()`` replaces ``PROS_AND_CONS``; families with
            neither pros nor cons are not submitted.
    """
    print(f"Started generating review summaries for {len(content_for_summarization)} families")

    jobs = []
    for family, entry in content_for_summarization.items():
        template = entry['prompt']
        payload = entry["content"]

        if use_all:
            reviews_block = "\n----\n".join(payload)
            jobs.append((family, template.replace("ALL_REVIEWS", reviews_block)))
            continue

        # Nothing to summarise when the family has neither pros nor cons.
        if not (len(payload["PROS"]) > 0 or len(payload["CONS"]) > 0):
            continue
        jobs.append((family, template.replace("PROS_AND_CONS", str(payload))))

    tasks = [executer(job, content_for_summarization) for job in jobs]
    await tqdm_asyncio.gather(*tasks)

    print("Finished generating review summaries")

In [None]:
# Summarise from the distilled PROS/CONS (use_all defaults to False); the
# results are written into content_for_summarization[family]["summary"].
await generate_summary_with_rate_limit(content_for_summarization)

In [None]:
# Summarise from the raw review texts: use_all=True substitutes the joined
# reviews for the ALL_REVIEWS placeholder of the prompt.
await generate_summary_with_rate_limit(content_for_summarization_llm, use_all=True)

In [None]:
# Print both summaries of every product family side by side.
# content_for_summarization holds the summaries generated from the distilled
# pros/cons (ReviewWeaver); content_for_summarization_llm holds the ones
# generated from the raw reviews (LLM). Each label must print its own dict.
for key, value in content_for_summarization.items():
    print(f"Product family: {key}")
    print(f"Summary ReviewWeaver: {value['summary']}\nSummary LLM: {content_for_summarization_llm[key]['summary']}")
    print("-"*50)

#### Uncomment the following section to save the files. Note that the files containing the summaries are already present in the directory.

In [1]:
# import pickle

# # Save the defaultdict to a file
# with open('../data/summaries-revieweaver.pkl', 'wb') as f:
#     pickle.dump(content_for_summarization, f)

# with open('../data/summaries-llm.pkl', 'wb') as f:
#     pickle.dump(content_for_summarization_llm, f)