In [None]:
import pandas as pd
from transformers import pipeline, BartTokenizer
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import nltk
from collections import defaultdict
from bs4 import BeautifulSoup
import re

# Download NLTK's 'punkt' package up front; nltk.sent_tokenize is called on
# every review further down in this cell and presumably relies on this data.
nltk.download('punkt')

def clean_text(text):
    """Strip markup and unusual characters from a single review.

    The text is parsed with BeautifulSoup's ``html.parser`` and reduced to
    its plain text, every character other than ASCII letters, digits,
    whitespace and ``. , ! ?`` is removed, and runs of whitespace are
    collapsed to a single space.

    Args:
        text: Raw review text. ``None`` and float NaN are treated as an
            empty review; any other non-string value is converted with
            ``str()`` before cleaning.

    Returns:
        The cleaned text without leading or trailing whitespace, or ``''``
        when the input is missing.
    """
    # A missing cell (e.g. an empty CSV field, which pandas typically reads
    # as float NaN) cannot be parsed by BeautifulSoup, so short-circuit it.
    # ``text != text`` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'[^a-zA-Z0-9\s\.,!?]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# --- Data -------------------------------------------------------------------
# Tab-separated review table; rows are bucketed per product for the loop below.
df = pd.read_csv('preprocessed_combined.tsv', sep='\t')
grouped = df.groupby('product_id')

# --- Models -----------------------------------------------------------------
# The summarizer and its tokenizer are loaded from the same checkpoint, so the
# checkpoint name is spelled out once.
bart_checkpoint = 'facebook/bart-large-cnn'

# Summarization runs on CUDA device 0.
summarizer = pipeline(
    'summarization',
    model=bart_checkpoint,
    device=0,
)

# Sentiment classification is placed on CUDA device 1.
sentiment_pipeline = pipeline(
    'sentiment-analysis',
    model='cardiffnlp/twitter-roberta-base-sentiment',
    device=1,
)

# Sentence embedder handed to BERTopic as its embedding model.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda:0')
topic_model = BERTopic(embedding_model=sentence_model)

# Tokenizer used to clip the summarizer input to a fixed token budget.
tokenizer = BartTokenizer.from_pretrained(bart_checkpoint)

# Readable names for the raw class ids returned by the sentiment pipeline.
# Defined once, before the loop, so it always exists when the key-excerpt step
# reads it (it used to be created only inside the per-topic branch).
label_map = {'LABEL_0': 'negative', 'LABEL_1': 'neutral', 'LABEL_2': 'positive'}


def _summarize_reviews(prompt, review_text, max_length, min_length):
    """Return the summarizer's output for ``prompt + review_text``.

    The combined text is clipped to 1024 tokens with the BART tokenizer and
    decoded back to a string before it is handed to the summarization
    pipeline.

    Args:
        prompt: Text placed in front of the reviews.
        review_text: All cleaned reviews of one product joined into a string.
        max_length: ``max_length`` forwarded to the summarizer.
        min_length: ``min_length`` forwarded to the summarizer.

    Returns:
        The ``summary_text`` of the first (only) pipeline result.
    """
    encoded = tokenizer(
        prompt + review_text,
        return_tensors='pt',
        truncation=True,
        max_length=1024,
    )
    clipped_text = tokenizer.decode(encoded['input_ids'][0], skip_special_tokens=True)
    # truncation=True is a safety net in case re-tokenizing the decoded text
    # inside the pipeline yields a few more tokens than the clip above.
    result = summarizer(
        clipped_text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        truncation=True,
    )
    return result[0]['summary_text']


for product_id, group in grouped:
    # Missing review bodies are skipped instead of being passed to clean_text.
    reviews = group['review_body'].dropna().tolist()

    # Cleaning reviews
    cleaned_reviews = [clean_text(review) for review in reviews]
    concatenated_reviews = ' '.join(cleaned_reviews)

    # Customer Summarization
    customer_summary = _summarize_reviews(
        "Summarize the customer reviews: ",
        concatenated_reviews,
        max_length=100,
        min_length=30,
    )

    # Manufacturer Summarization
    manufacturer_summary = _summarize_reviews(
        "Provide a detailed summary of the customer reviews, focusing on specific feedback and insights: ",
        concatenated_reviews,
        max_length=200,
        min_length=50,
    )

    # Manufacturer Insights: work on individual, non-empty sentences.
    sentences = [
        stripped
        for review in cleaned_reviews
        for stripped in (s.strip() for s in nltk.sent_tokenize(review))
        if stripped
    ]

    aspect_sentiments = {}
    key_excerpts = []

    if sentences:
        # Classify every sentence exactly once; the per-topic tallies and the
        # key excerpts below both reuse these labels.  Over-long inputs are
        # truncated rather than allowed to exceed the model's input size.
        sentiment_results = sentiment_pipeline(sentences, truncation=True, max_length=512)
        sentence_labels = [label_map[result['label']] for result in sentiment_results]

        try:
            topics, _ = topic_model.fit_transform(sentences)
        except (ValueError, TypeError) as err:
            # NOTE(review): topic fitting appears to fail with these error
            # types when a product has only a handful of sentences - confirm
            # against the installed BERTopic/UMAP versions.  Such a product
            # is reported without aspects instead of aborting the whole run.
            print(f"Topic modelling skipped for {product_id}: {err}")
            topics = []

        for topic in set(topics):
            if topic == -1:
                # Skipped exactly as before; presumably -1 is the outlier bucket.
                continue
            # top 3 words as aspect name
            aspect_name = ' '.join([word for word, _ in topic_model.get_topic(topic)[:3]])

            # Key order doubles as the tie-break order for max() below.
            sentiment_counts = {'positive': 0, 'negative': 0, 'neutral': 0}
            for assigned_topic, label in zip(topics, sentence_labels):
                if assigned_topic == topic:
                    sentiment_counts[label] += 1
            aspect_sentiments[aspect_name] = max(sentiment_counts, key=sentiment_counts.get)

        #  key excerpts: up to two sentences per sentiment, five in total
        sentiment_to_sentences = defaultdict(list)
        for sentence, label in zip(sentences, sentence_labels):
            sentiment_to_sentences[label].append(sentence)

        for sentiment in ['positive', 'negative', 'neutral']:
            key_excerpts.extend(sentiment_to_sentences[sentiment][:2])
            if len(key_excerpts) >= 5:
                break
        key_excerpts = key_excerpts[:5]

    print(f"Product ID: {product_id}")
    print("Customer Summary:")
    print(customer_summary)
    print("\nManufacturer Summary:")
    print(manufacturer_summary)
    print("\nKey Excerpts:")
    for excerpt in key_excerpts:
        print(f"- {excerpt}")
    print("\nAspect Sentiments:")
    for aspect, sentiment in aspect_sentiments.items():
        print(f"{aspect}: {sentiment.capitalize()}")
    print("\n")

[nltk_data] Downloading package punkt to
[nltk_data]     /usr4/cs640/yash0512/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Device set to use cuda:0


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cuda:1


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Product ID: 0060193395
Customer Summary:
Bill Phillips will guide you through the world of weight training, cardio training and Myoplex. You can find all of the information on the body for life website! ... You can still read the inspiring stories, get LOADS of recipes literally hundreds if not thousands.

Manufacturer Summary:
Bill Phillips will guide you through the world of weight training, cardio training and Myoplex. You can find all of the information on the body for life website. Bill Phillips might do a decent job at getting you excited, but if you already know in your heart that this is what you want to do, then I suggest you dig through the website first.

Key Excerpts:
- In the beginning you will read about some inspiring stories, and then begin to learn about healthy dieting and proper exercise.
- Yes, he does push the Myoplex gig on you quite a bit, but thats okay, afterall, it is pretty good stuff albiet a bit expensive.
- But there is a major setback in my opinion.
- Thi

KeyboardInterrupt: 

In [None]:
import pandas as pd
from transformers import pipeline, BartTokenizer
from bs4 import BeautifulSoup
import re
import nltk
from collections import defaultdict

# Make sure NLTK's 'punkt' package is available; nltk.sent_tokenize is called
# on every review further down in this cell and presumably relies on it.
nltk.download('punkt')

def clean_text(text):
    """Strip markup and unusual characters from a single review.

    The text is parsed with BeautifulSoup's ``html.parser`` and reduced to
    its plain text, every character other than ASCII letters, digits,
    whitespace and ``. , ! ?`` is removed, and runs of whitespace are
    collapsed to a single space.

    Args:
        text: Raw review text. ``None`` and float NaN are treated as an
            empty review; any other non-string value is converted with
            ``str()`` before cleaning.

    Returns:
        The cleaned text without leading or trailing whitespace, or ``''``
        when the input is missing.
    """
    # A missing cell (e.g. an empty CSV field, which pandas typically reads
    # as float NaN) cannot be parsed by BeautifulSoup, so short-circuit it.
    # ``text != text`` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'[^a-zA-Z0-9\s\.,!?]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

def generalize_pronouns(text):
    """Rewrite first-person wording so a review reads as being about customers.

    Every stand-alone ``I`` (any case) becomes ``Customers`` and every
    stand-alone ``my`` (any case) becomes ``their``.  When the matched
    ``my`` starts with a capital letter the replacement is ``Their``, so a
    sentence that began with "My ..." still begins with a capital.

    Args:
        text: Review text to rewrite.

    Returns:
        The text with the replacements applied; all other words, including
        ones that merely contain "i" or "my", are left untouched.
    """
    def _their(match):
        # Mirror the capitalisation of the word being replaced.
        return 'Their' if match.group(0)[0].isupper() else 'their'

    text = re.sub(r'\bI\b', 'Customers', text, flags=re.IGNORECASE)
    return re.sub(r'\bmy\b', _their, text, flags=re.IGNORECASE)

def star_rating_visualization(rating, max_stars=5):
    """Render a numeric rating as a fixed-width bar of filled and empty stars.

    The rating is truncated toward zero with ``int()`` to get the number of
    filled stars, then clamped to ``0..max_stars`` so the bar is always
    exactly ``max_stars`` symbols wide.

    Args:
        rating: Numeric rating, e.g. the mean star rating of a product.  A
            float NaN is rendered as zero filled stars.
        max_stars: Total number of symbols in the bar (default 5).

    Returns:
        A string of ``★`` followed by ``☆``, ``max_stars`` characters long.
    """
    # int() raises on NaN; ``rating != rating`` is only true for NaN.
    if isinstance(rating, float) and rating != rating:
        filled = 0
    else:
        filled = int(rating)
    # Out-of-range ratings would otherwise produce a bar of the wrong width.
    filled = min(max(filled, 0), max_stars)
    return '★' * filled + '☆' * (max_stars - filled)

# --- Data -------------------------------------------------------------------
# Tab-separated review table; rows are bucketed per product for the loop below.
df = pd.read_csv('preprocessed_combined.tsv', sep='\t')
grouped = df.groupby('product_id')

# --- Models -----------------------------------------------------------------
# The summarizer and its tokenizer are loaded from the same checkpoint, so the
# checkpoint name is spelled out once.
bart_checkpoint = 'facebook/bart-large-cnn'

# Both pipelines are placed on CUDA device 0 in this cell.
summarizer = pipeline(
    'summarization',
    model=bart_checkpoint,
    device=0,
)
sentiment_pipeline = pipeline(
    'sentiment-analysis',
    model='cardiffnlp/twitter-roberta-base-sentiment',
    device=0,
)

# Tokenizer used to clip the summarizer input to a fixed token budget.
tokenizer = BartTokenizer.from_pretrained(bart_checkpoint)

# Readable names for the raw class ids returned by the sentiment pipeline,
# and the prefix used when a sentence of that sentiment is listed as a key
# point (neutral sentences are counted but never listed).
label_map = {'LABEL_0': 'negative', 'LABEL_1': 'neutral', 'LABEL_2': 'positive'}
key_point_prefix = {'positive': 'Praise', 'negative': 'Complaint'}

for product_id, group in grouped:
    # Missing review bodies are skipped instead of being passed to clean_text.
    reviews = group['review_body'].dropna().tolist()
    star_rating = group['star_rating'].mean()

    cleaned_reviews = [clean_text(review) for review in reviews]
    generalized_reviews = [generalize_pronouns(review) for review in cleaned_reviews]
    concatenated_reviews = ' '.join(generalized_reviews)

    # Customer Summary: clip the input to 1024 tokens, decode it back to text
    # and summarize that text.
    prompt_customer = "Provide a concise general overview of customer opinions about the product based on their reviews: "
    tokens = tokenizer(
        prompt_customer + concatenated_reviews,
        return_tensors='pt',
        truncation=True,
        max_length=1024,
    )
    input_text = tokenizer.decode(tokens['input_ids'][0], skip_special_tokens=True)
    # truncation=True is a safety net in case re-tokenizing the decoded text
    # inside the pipeline yields a few more tokens than the clip above.
    customer_summary = summarizer(
        input_text,
        max_length=100,
        min_length=30,
        do_sample=False,
        truncation=True,
    )[0]['summary_text']

    # Manufacturer Insights: work on individual, non-empty sentences.
    sentences = [
        stripped
        for review in generalized_reviews
        for stripped in (s.strip() for s in nltk.sent_tokenize(review))
        if stripped
    ]

    # Reset for every product, *before* the emptiness check: the summary line
    # below reads these counts even when a product has no sentences, and must
    # not fall back to the previous product's numbers.
    sentiment_counts = defaultdict(int)
    key_points = []

    if sentences:
        # Over-long inputs are truncated rather than allowed to exceed the
        # model's input size.
        sentiments = sentiment_pipeline(sentences, truncation=True, max_length=512)
        for sentence, result in zip(sentences, sentiments):
            label = label_map[result['label']]
            sentiment_counts[label] += 1
            # Only the first five praise/complaint sentences are reported.
            if len(key_points) < 5 and label in key_point_prefix:
                key_points.append(f"{key_point_prefix[label]}: {sentence}")

    sentiment_summary = f"Sentiment: {sentiment_counts['positive']} positive, {sentiment_counts['negative']} negative, {sentiment_counts['neutral']} neutral sentences."

    print(f"Product: {group['product_title'].iloc[0]}")
    print("### Customer Summary:")
    print(customer_summary)
    print("\n### Manufacturer Insights:")
    for point in key_points:
        print(f"- {point}")
    print(f"- {sentiment_summary}")
    print(f"\n### Star Rating: {star_rating_visualization(star_rating)} ({star_rating:.1f}/5)")
    print("\n### Call to Action:")
    print("- Customers: Discover this product today!")
    print("- Manufacturers: Leverage the feedback to enhance your product.")
    print("\n### Feedback Prompt:")
    print("How can we make this summary better? Share your thoughts!")
    print("\n")

[nltk_data] Downloading package punkt to
[nltk_data]     /usr4/cs640/yash0512/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use cuda:0
Device set to use cuda:0
  text = BeautifulSoup(text, "html.parser").get_text()


Product: Body for Life: 12 Weeks to Mental and Physical Strength
### Customer Summary:
Bill Phillips will guide you through the world of weight training, cardio training and Myoplex. You can find all of the information on the body for life website! ... You can still read the inspiring stories, get LOADS of recipes literally hundreds if not thousands.

### Manufacturer Insights:
- Praise: In the beginning you will read about some inspiring stories, and then begin to learn about healthy dieting and proper exercise.
- Praise: Yes, he does push the Myoplex gig on you quite a bit, but thats okay, afterall, it is pretty good stuff albiet a bit expensive.
- Complaint: But there is a major setback in their opinion.
- Praise: You can find all of the information on the body for life website!
- Praise: ... You can still read the inspiring stories, get LOADS of recipes literally hundreds if not thousands, your training journal is ready to download.
- Sentiment: 2835 positive, 1324 negative, 2910 n

KeyboardInterrupt: 