### 1. Importing Modules

In [7]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import LdaModel
from gensim import corpora
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import plotly.express as px
from dash import Dash, html, dcc
import plotly.graph_objs as go
from dash.dependencies import Input, Output


### 2a. Webscraping (for Safari)

In [2]:
# Initialize SafariDriver
driver = webdriver.Safari()

# Load the CSV with Amazon product links and product info
file_path = '/Users/valerielim/Documents/DSA3101/E-commerce-Performance-Analysis-and-Optimization/LLM/data/amazon.csv'
amazon_data = pd.read_csv(file_path)

# Assuming the columns are named 'product_link', 'product_id', 'product_name', 'product_category'
urls = amazon_data['product_link'] # Process the first 3 links
product_ids = amazon_data['product_id']
product_names = amazon_data['product_name']
product_categories = amazon_data['category']

# Create a DataFrame to store all the reviews from all products
all_reviews = []

# Loop through each URL and its corresponding product data in the CSV file
for index, url in enumerate(urls):
    print(f"Scraping URL: {url}")
    driver.get(url)
    
    # Wait for the page to load fully (adjust the delay as needed)
    time.sleep(10)  # Wait for 10 seconds to ensure the page fully loads
    
    # Scroll to load the reviews section
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    
    # Find the reviews, ratings, and dates
    reviews = driver.find_elements(By.CLASS_NAME, 'review-text-content')
    ratings = driver.find_elements(By.CSS_SELECTOR, '.a-icon-alt')  # More specific selector
    dates = driver.find_elements(By.CLASS_NAME, 'review-date')
    
    # Print counts for debugging
    review_count = len(reviews)
    rating_count = len(ratings)
    date_count = len(dates)
    print(f"Found {review_count} reviews, {rating_count} ratings, and {date_count} dates.")
    
    # Summary for each URL
    summary = f"Found {review_count} reviews, {rating_count} ratings, and {date_count} dates."
    
    # Combine the reviews, ratings, and dates into single strings
    reviews_combined = " | ".join([review.text.strip() if review.text.strip() else "N/A" for review in reviews])
    ratings_combined = " | ".join([rating.get_attribute("innerHTML").strip() if rating.get_attribute("innerHTML").strip() else "N/A" for rating in ratings])
    dates_combined = " | ".join([date.text.strip() if date.text.strip() else "N/A" for date in dates])

    # Append the data for this URL along with product details
    all_reviews.append({
        'url': url,
        'product_id': product_ids[index],          # Add product_id
        'product_name': product_names[index],      # Add product_name
        'product_category': product_categories[index],  # Add product_category
        'summary': summary,
        'reviews': reviews_combined,
        'ratings': ratings_combined,
        'dates': dates_combined
    })

# Close the WebDriver
driver.quit()

Scraping URL: https://www.amazon.in/Wayona-Braided-WN3LG1-Syncing-Charging/dp/B07JW9H4J1/ref=sr_1_1?qid=1672909124&s=electronics&sr=1-1
Found 9 reviews, 41 ratings, and 9 dates.
Scraping URL: https://www.amazon.in/Ambrane-Unbreakable-Charging-Braided-Cable/dp/B098NS6PVG/ref=sr_1_2?qid=1672909124&s=electronics&sr=1-2
Found 8 reviews, 39 ratings, and 8 dates.
Scraping URL: https://www.amazon.in/Sounce-iPhone-Charging-Compatible-Devices/dp/B096MSW6CT/ref=sr_1_3?qid=1672909124&s=electronics&sr=1-3
Found 8 reviews, 50 ratings, and 8 dates.
Scraping URL: https://www.amazon.in/Deuce-300-Resistant-Tangle-Free-Transmission/dp/B08HDJ86NZ/ref=sr_1_4?qid=1672909124&s=electronics&sr=1-4
Found 8 reviews, 41 ratings, and 8 dates.
Scraping URL: https://www.amazon.in/Portronics-Konnect-POR-1080-Charging-Function/dp/B08CF3B7N1/ref=sr_1_5?qid=1672909124&s=electronics&sr=1-5
Found 9 reviews, 42 ratings, and 9 dates.
Scraping URL: https://www.amazon.in/Solero-TB301-Charging-480Mbps-1-5-Meter/dp/B08Y1TFSP6/

In [3]:
# Convert the list of dictionaries into a DataFrame
df = pd.DataFrame(all_reviews)

# Define the output directory and ensure it exists
output_dir = '/Users/valerielim/Documents/DSA3101/E-commerce-Performance-Analysis-and-Optimization/SubgroupB/webscraping/'
os.makedirs(output_dir, exist_ok=True)

# Save the reviews from the links into a CSV file
output_file = os.path.join(output_dir, 'all_amazon_reviews_with_summary_safari.csv')
df.to_csv(output_file, index=False)

print(f"Scraped reviews from {len(urls)} URLs.")

Scraped reviews from 1465 URLs.


### 2b. Webscraping (for Chrome)

In [None]:
# Path to your ChromeDriver executable
chrome_driver_path = '/Users/valerielim/Downloads/chromedriver-mac-arm64/chromedriver'  # Adjust the path to your actual chromedriver file

# Initialize ChromeDriver with binary and service path
chrome_options = Options()
service = Service(executable_path=chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Load the CSV with Amazon product links and product info
file_path = '/Users/valerielim/Documents/DSA3101/E-commerce-Performance-Analysis-and-Optimization/LLM/data/amazon.csv'
amazon_data = pd.read_csv(file_path)

# Assuming the columns are named 'product_link', 'product_id', 'product_name', 'product_category'
urls = amazon_data['product_link'][:3]  # Process the first 3 links
product_ids = amazon_data['product_id'][:3]
product_names = amazon_data['product_name'][:3]
product_categories = amazon_data['category'][:3]

# Create a DataFrame to store all the reviews from all products
all_reviews = []

# Loop through each URL and its corresponding product data in the CSV file
for index, url in enumerate(urls):
    print(f"Scraping URL: {url}")
    driver.get(url)
    
    # Wait for the page to load fully (adjust the delay as needed)
    time.sleep(10)  # Wait for 10 seconds to ensure the page fully loads
    
    # Scroll to load the reviews section
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    
    # Find the reviews, ratings, and dates
    reviews = driver.find_elements(By.CLASS_NAME, 'review-text-content')
    ratings = driver.find_elements(By.CSS_SELECTOR, '.a-icon-alt')  # More specific selector
    dates = driver.find_elements(By.CLASS_NAME, 'review-date')
    
    # Print counts for debugging
    review_count = len(reviews)
    rating_count = len(ratings)
    date_count = len(dates)
    print(f"Found {review_count} reviews, {rating_count} ratings, and {date_count} dates.")
    
    # Summary for each URL
    summary = f"Found {review_count} reviews, {rating_count} ratings, and {date_count} dates."
    
    # Combine the reviews, ratings, and dates into single strings
    reviews_combined = " | ".join([review.text.strip() if review.text.strip() else "N/A" for review in reviews])
    ratings_combined = " | ".join([rating.get_attribute("innerHTML").strip() if rating.get_attribute("innerHTML").strip() else "N/A" for rating in ratings])
    dates_combined = " | ".join([date.text.strip() if date.text.strip() else "N/A" for date in dates])

    # Append the data for this URL along with product details
    all_reviews.append({
        'url': url,
        'product_id': product_ids[index],          # Add product_id
        'product_name': product_names[index],      # Add product_name
        'product_category': product_categories[index],  # Add product_category
        'summary': summary,
        'reviews': reviews_combined,
        'ratings': ratings_combined,
        'dates': dates_combined
    })

# Close the WebDriver
driver.quit()

In [None]:
# Convert the list of dictionaries into a DataFrame
df = pd.DataFrame(all_reviews)

# Define the output directory and ensure it exists
output_dir = '/Users/valerielim/Documents/DSA3101/E-commerce-Performance-Analysis-and-Optimization/SubgroupB/webscraping/'
os.makedirs(output_dir, exist_ok=True)

# Save the reviews from the links into a CSV file
output_file = os.path.join(output_dir, 'all_amazon_reviews_with_summary_chrome.csv')
df.to_csv(output_file, index=False)

print(f"Scraped reviews from {len(urls)} URLs.")

### 3. To reset if needed

In [21]:
driver.quit() 

### 4. Sentiment Analysis

Between the two options:

1. Sentiment analysis first, then group by categories to find the mean: This approach allows you to capture the sentiment for each individual review or product before aggregating the results. It gives more granular insights, and then grouping by category will provide a mean sentiment score that reflects the overall feeling of buyers within that category. This method retains more detailed information and might provide a better understanding of how individual products within a category contribute to the overall sentiment.
2. Group by categories first, find the mean, then do sentiment analysis: If you aggregate the data before performing sentiment analysis, you might lose specific details related to each review. Sentiment analysis on a pre-aggregated summary of reviews might not be as precise because it will analyze the combined text from multiple reviews, which could mix positive and negative feedback, making the sentiment score less accurate.

Chosen method: Option 1 (sentiment analysis first, then group by categories) is typically better because it allows for more precise sentiment analysis on individual reviews before aggregating. This way, you can capture sentiment variations and avoid diluting insights when multiple reviews are combined.

#### 4a. Perform Sentiment Analysis

In [11]:
# Load the CSV file
file_path = '/Users/valerielim/Documents/DSA3101-project/E-commerce-Performance-Analysis-and-Optimization/SubgroupB/webscraping/all_amazon_reviews_with_summary_safari.csv'
data = pd.read_csv(file_path)

# Initialize VADER sentiment analyzer
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Perform sentiment analysis on the 'reviews' column
data['sentiment_scores'] = data['reviews'].apply(lambda review: sid.polarity_scores(str(review)))

# Extract compound sentiment scores and sentiment labels
data['compound'] = data['sentiment_scores'].apply(lambda score_dict: score_dict['compound'])
data['sentiment'] = data['compound'].apply(lambda c: 'positive' if c >= 0.05 else ('negative' if c <= -0.05 else 'neutral'))

# Save the sentiment results back to a CSV file
output_file = '/Users/valerielim/Documents/DSA3101-project/E-commerce-Performance-Analysis-and-Optimization/SubgroupB/webscraping/sentiment_analysis_results.csv'
data.to_csv(output_file, index=False)

[nltk_data] Error loading vader_lexicon: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1006)>


#### 4b. Finding the average compound sentiment scores and sentiment labels by category

In [12]:
# Load the sentiment analysis results CSV
file_path = '/Users/valerielim/Documents/DSA3101-project/E-commerce-Performance-Analysis-and-Optimization/SubgroupB/webscraping/sentiment_analysis_results.csv'
data = pd.read_csv(file_path)

# Function to count the number of concatenated items (reviews/ratings)
def count_concatenated_items(column):
    return column.apply(lambda x: len(x.split(" | ")) if pd.notnull(x) and x.strip() != "" else 0)

# Group by product category
grouped_data = data.groupby('product_category')

# Calculate the mean of compound scores for each group
mean_compound = grouped_data['compound'].mean()

# Find the most common sentiment label for each group
most_common_sentiment = grouped_data['sentiment'].agg(lambda x: x.value_counts().index[0])

# Count the number of products in each category
product_count = grouped_data['product_id'].nunique()

# Accurately count the total number of reviews and ratings
total_reviews = grouped_data['reviews'].apply(lambda x: count_concatenated_items(x).sum())
total_ratings = grouped_data['ratings'].apply(lambda x: count_concatenated_items(x).sum())

# Combine all the metrics into a single DataFrame
summary = pd.DataFrame({
    'mean_compound': mean_compound,
    'most_common_sentiment': most_common_sentiment,
    'number_of_products': product_count,
    'total_reviews': total_reviews,
    'total_ratings': total_ratings
})

# Save the summary to a CSV file
output_file = '/Users/valerielim/Documents/DSA3101-project/E-commerce-Performance-Analysis-and-Optimization/SubgroupB/webscraping/sentiment_summary_by_category_with_counts.csv'
summary.to_csv(output_file, index=True)

print("\nSentiment Scores and Labels by Product Category:\n")
for category, row in summary.iterrows():
    print(f"Category: {category}")
    print(f"  Mean Compound Score: {row['mean_compound']}")
    print(f"  Most Common Sentiment: {row['most_common_sentiment']}")
    print(f"  Number of Products: {row['number_of_products']}")
    print(f"  Total Reviews: {row['total_reviews']}")
    print(f"  Total Ratings: {row['total_ratings']}")
    print("\n")



Sentiment Scores and Labels by Product Category:

Category: Car&Motorbike|CarAccessories|InteriorAccessories|AirPurifiers&Ionizers
  Mean Compound Score: 0.999
  Most Common Sentiment: positive
  Number of Products: 1
  Total Reviews: 13
  Total Ratings: 42


Category: Computers&Accessories|Accessories&Peripherals|Adapters|USBtoUSBAdapters
  Mean Compound Score: 0.96975
  Most Common Sentiment: positive
  Number of Products: 2
  Total Reviews: 16
  Total Ratings: 89


Category: Computers&Accessories|Accessories&Peripherals|Audio&VideoAccessories|PCHeadsets
  Mean Compound Score: 0.9923
  Most Common Sentiment: positive
  Number of Products: 1
  Total Reviews: 8
  Total Ratings: 33


Category: Computers&Accessories|Accessories&Peripherals|Audio&VideoAccessories|PCMicrophones
  Mean Compound Score: 0.9863999999999999
  Most Common Sentiment: positive
  Number of Products: 2
  Total Reviews: 21
  Total Ratings: 79


Category: Computers&Accessories|Accessories&Peripherals|Audio&VideoAcces

### 5. Topic Modelling

In [2]:
# Load the reviews from the CSV file
df = pd.read_csv('all_amazon_reviews_with_summary_safari.csv')

# Handle missing values by filling NaNs in the 'reviews' column with empty strings
df['reviews'] = df['reviews'].fillna('')

# Preprocessing: we focus on terms likely to relate to supplier performance
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.95,  # Ignore terms that appear in more than 95% of documents
    min_df=2,     # Ignore terms that appear in less than 2 documents
    ngram_range=(1, 2)  # Include bigrams (two-word combinations) for phrases like "delivery time"
)

# Transform the reviews into the TF-IDF matrix
X = tfidf_vectorizer.fit_transform(df['reviews'])

# Set up LDA to extract a few specific topics (for instance, 5 topics)
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(X)

# Function to display the topics with top words
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" | ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

# Display the top words in each topic
feature_names = tfidf_vectorizer.get_feature_names_out()
display_topics(lda_model, feature_names, 10)

# Assign topics to each review
def assign_topics_to_reviews(lda_model, tfidf_matrix):
    topic_assignments = lda_model.transform(tfidf_matrix)
    return topic_assignments.argmax(axis=1)

df['topic'] = assign_topics_to_reviews(lda_model, X)

# Save the data with topics assigned to each review
output_file = 'reviews_with_supplier_topics.csv'
df.to_csv(output_file, index=False)

print(f"Topic modeling completed and saved to: {output_file}")


Topic 0:
m33 | chal rhi | rhi | chal | charging output | boat micro | usb 55 | galaxy m33 | m33 5g | 4a fast
Topic 1:
good | product | cable | quality | charging | use | tv | price | working | phone
Topic 2:
cable protectors | watch faces | faces | soooo | ui | 10a | redmi 10a | longer cable | protectors | m13
Topic 3:
boat 350 | printer laptop | visionary | ehs64 | sd cards | 350 type | memory card | 4k video | boltt visionary | ninja
Topic 4:
smartwatch | ninja pro | boltt ninja | ninja | tracking features | phoenix | m04 | galaxy m04 | health tracking | unbreakable
Topic modeling completed and saved to: reviews_with_supplier_topics.csv


Topic 0:
- Top Words: m33 | chal rhi | rhi | charging output | boat micro | usb 55 | galaxy m33 | m33 5g | 4a fast
- Interpretation: This topic seems to focus on technical product details, specifically related to mobile devices (e.g., Galaxy M33) and charging outputs. The mention of "USB" and "fast charging" could indicate discussions about compatibility, speed, and output issues. This topic might reflect customer feedback on charging speeds and whether the products meet expectations.

Topic 1:
- Top Words: good | product | cable | quality | charging | use | tv | price | working | phone
- Interpretation: This is a more general topic about product quality, where users talk about the quality of cables, the price, and whether the product is working as intended. There may be discussions about the functionality of products related to charging or general use with phones or TVs. This topic reflects overall satisfaction with product quality and value for money.

Topic 2:
- Top Words: cable protectors | watch faces | faces | soooo | ui | 10a | redmi 10a | longer cable | protectors | m13
- Interpretation: This topic seems to focus on accessories, such as cable protectors and watch faces, as well as references to specific mobile devices like Redmi 10a and M13. The mention of "longer cable" might indicate feedback on cable lengths. This topic could involve feedback about the durability or design of accessories and additional protective items.

Topic 3:
- Top Words: boat 350 | printer laptop | visionary | ehs64 | sd cards | 350 type | memory card | 4k video | boltt visionary | ninja
- Interpretation: This topic includes a variety of tech gadgets, such as printers, SD cards, and memory cards, as well as mentions of 4K video. It's likely related to storage, memory, and possibly connectivity with laptops or devices. Customers might be discussing performance of these products, especially when handling high-quality video or storage needs.

Topic 4:
- Top Words: smartwatch | ninja pro | boltt ninja | ninja | tracking features | phoenix | m04 | galaxy m04 | health tracking | unbreakable
- Interpretation: This topic focuses on smartwatches and fitness-related features like health tracking. Words like unbreakable and mentions of specific smartwatch models (e.g., Boltt Ninja, Galaxy M04) suggest discussions around durability and functionality of wearables, particularly in terms of tracking fitness activities.

Summary of Insights:
- Supplier Performance: Several topics seem to address customer feedback on product quality and functionality (e.g., charging speeds, cable durability, health tracking features). There may be insights related to compatibility with specific devices (e.g., Galaxy M33, Redmi 10a).
- Potential Bottlenecks: Topic 0 and Topic 1 could provide insights into possible performance issues, such as charging speeds or product working as expected, which may be related to supplier performance.
- Durability and Design: Topics 1, 2, and 4 mention features like cable protectors, longer cables, and unbreakable features, indicating that durability and product design are key concerns for customers.
- Tech Gadgets: Topic 3 suggests feedback about more niche products like memory cards, SD cards, and printers, which may help identify any performance issues or variability in delivery times for these items.

### 6. Topic Modelling on Neutral/Negative Reviews

In [6]:
# Load the CSV with sentiment analysis results
file_path = 'sentiment_analysis_results.csv'  # Replace with your actual file path
df = pd.read_csv(file_path)

# Filter out neutral and negative sentiment reviews
filtered_reviews = df[(df['sentiment'] == 'neutral') | (df['sentiment'] == 'negative')]

# Drop rows with NaN values in the 'reviews' column
filtered_reviews = filtered_reviews.dropna(subset=['reviews'])

# Check how many reviews we have after filtering and dropping NaN values
print(f"Filtered reviews count: {len(filtered_reviews)}")

# Preprocessing: we'll use TF-IDF for topic modeling
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.95,  # Ignore terms that appear in more than 95% of documents
    min_df=2,     # Ignore terms that appear in less than 2 documents
    ngram_range=(1, 2)  # Consider bigrams to capture phrases like "delivery time"
)

# Transform the filtered reviews into the TF-IDF matrix
X = tfidf_vectorizer.fit_transform(filtered_reviews['reviews'])

# Apply LDA to extract 5 topics (adjust number of topics as needed)
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(X)

# Display the topics with top words
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" | ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

# Extract the feature names from the TF-IDF vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

# Display the top 10 words for each topic
display_topics(lda_model, feature_names, 10)

# Assign topics to each review and add it to the DataFrame
def assign_topics_to_reviews(lda_model, tfidf_matrix):
    topic_assignments = lda_model.transform(tfidf_matrix)
    return topic_assignments.argmax(axis=1)

filtered_reviews['topic'] = assign_topics_to_reviews(lda_model, X)

# Save the filtered reviews with topics to a new CSV file
output_file = 'filtered_reviews_with_topics.csv'
filtered_reviews.to_csv(output_file, index=False)

print(f"Topic modeling completed and saved to: {output_file}")

Filtered reviews count: 39
Topic 0:
product | cable | waste | quality | hair | buy | working | wire | health | original
Topic 1:
remote | tv | good | product | bajaj | dust | room | paper | cleaning | job
Topic 2:
water | product | quality | filter | hp | working | use | service | plastic | cartridge
Topic 3:
cable | room | charging | button | sound | volume | hot | blade | product | type
Topic 4:
product | good | batteries | charger | warrenty | working | quality | charging | saw | amazon
Topic modeling completed and saved to: filtered_reviews_with_topics.csv


### 7. Analyze Supplier Performance and Identify Bottlenecks

Topic 0: 
- Keywords like product, cable, waste, quality, working, wire, and health suggest issues with cable products, potentially related to poor quality or product waste. There might be concerns about health or environmental impact related to the materials used in these products.
- Bottleneck: Quality control issues with cable products, possibly leading to higher return rates or dissatisfaction.

Topic 1: 
- The keywords include remote, tv, good, product, dust, and cleaning. This could indicate that certain remote control or TV products require frequent cleaning or have issues with dust buildup.
- Bottleneck: The design of these products may not be conducive to long-term cleanliness, affecting performance and user satisfaction.

Topic 2: 
- Words like water, filter, working, service, plastic, and cartridge indicate issues with water filter products, possibly related to product durability or replacement issues (e.g., cartridges).
- Bottleneck: The frequent need for cartridge replacement or service could be a supply chain challenge, leading to delays or higher costs for maintaining these products.

Topic 3:
- Keywords like charging, button, sound, volume, and blade suggest issues related to electronics or devices with buttons, sound control, and charging functionality.
- Bottleneck: Charging problems and malfunctioning buttons could point to product defects or short lifespan, affecting customer experience and returns.

Topic 4: 
- This topic revolves around batteries, charger, warranty, and amazon. It seems to cover battery and charger-related products, possibly highlighting warranty issues or the need for frequent replacements.
- Bottleneck: Frequent battery or charger replacements due to malfunction or insufficient warranty coverage could be a bottleneck in terms of both product quality and customer service.

### 8. Possible Optimisation (Based on bottleneck identified in topic modelling)

Quality Control for Cable Products (Topic 0)
- Optimization: Implement stricter quality checks during the manufacturing process, focusing on durability tests for cables and wires. Collaborating with suppliers to improve material quality could reduce waste and improve health and safety standards.
- Automation: Use automated testing systems to check for cable integrity before shipping to reduce faulty products reaching customers.

Design Optimization for Remote/TV Products (Topic 1)
- Optimization: Redesign products to minimize dust buildup or improve ease of cleaning. Alternatively, offer accessories or cleaning kits bundled with the products.
- Automation: Automate customer service notifications for cleaning and maintenance advice, or offer subscription services for regular product upkeep.

Inventory and Replacement Strategy for Water Filter Products (Topic 2)
- Optimization: Optimize inventory management to ensure that replacement cartridges are always available when customers need them. Consider offering subscription services for regular cartridge replacement to reduce delivery times and improve customer satisfaction.
- Automation: Automate order tracking and notifications for cartridge replacements, ensuring that users are reminded before they need new cartridges.

Improved Product Durability for Electronic Devices (Topic 3)
- Optimization: Collaborate with manufacturers to improve button durability and charging reliability through better design and testing. Offering extended warranties could also alleviate customer concerns.
- Automation: Automate fault detection using smart sensors in electronic devices, which could notify the supplier of potential failures before they occur, improving service times and reducing product returns.

Battery and Charger Warranty Extension (Topic 4)
- Optimization: Negotiate with suppliers to offer better warranties on batteries and chargers, which could reduce the number of dissatisfied customers. Offering extended warranty packages or providing free replacements within a certain period could enhance customer loyalty.
- Automation: Use predictive analytics to anticipate battery or charger failures based on usage patterns and offer proactive replacements, reducing customer downtime.

### 9. Possible Improvement to Order Fulfilment

To improve order fulfillment and reduce delivery times, the following strategies can be employed:

Demand Forecasting:
- Utilize demand forecasting models to anticipate peak periods for replacement products like cartridges, batteries, and chargers, ensuring sufficient stock is available to avoid delays.

Supplier Performance Analysis:
- Use the sentiment data from your reviews (especially from Topic 0, 2, and 4) to evaluate suppliers' performance based on customer feedback. Terminate contracts with underperforming suppliers or negotiate performance improvements.

Inventory Optimization:
- Implement a Just-In-Time (JIT) inventory system for fast-moving replacement products (cartridges, batteries), ensuring that you don't run out of stock while minimizing holding costs.

Automating Order Fulfillment:
- Integrate automated systems to manage inventory levels, track orders, and optimize delivery routes. Tools like barcode scanning and real-time inventory updates can help ensure accuracy in order fulfillment.

Delivery Time Reduction:
- Consider using localized fulfillment centers for popular products to reduce shipping times. Partner with logistics providers to optimize last-mile delivery, ensuring products reach customers faster.

### 10. Dashboard

In [12]:
# Example data from your project
data = {
    'Category': [
        'Stovetop Espresso Pots', 'Sewing & Embroidery Machines', 'Small Kitchen Appliances', 
        'Air Fryers', 'Digital Kitchen Scales', 'Egg Boilers', 'Hand Blenders'
    ],
    'Mean Compound Score': [0.9965, 0.9955, 0.9997, 0.712, 0.8540, 0.9884, 0.8579],
    'Total Reviews': [13, 32, 8, 45, 77, 90, 148],
    'Total Ratings': [36, 153, 44, 237, 361, 474, 701],
    'Number of Products': [1, 4, 1, 5, 10, 11, 19],
    'Most Common Sentiment': ['positive', 'positive', 'positive', 'positive', 'positive', 'positive', 'positive']
}

# Convert the data to a pandas DataFrame
df = pd.DataFrame(data)

# Create a column for average ratings per product
df['Average Ratings'] = df['Total Ratings'] / df['Number of Products']

# Initialize Dash app
app = Dash(__name__)

# Layout of the dashboard
app.layout = html.Div([
    html.H1("Amazon Reviews Sentiment Analysis Dashboard"),

    # Dropdown filter for category selection
    dcc.Dropdown(
        id='category-filter',
        options=[{'label': cat, 'value': cat} for cat in df['Category']],
        value=df['Category'].tolist(),  # Default to all categories
        multi=True,
        placeholder="Select categories"
    ),

    # Bar chart for mean compound scores
    dcc.Graph(id='sentiment-bar'),

    # Pie chart for total reviews
    dcc.Graph(id='reviews-pie'),

    # Bar chart for total ratings
    dcc.Graph(id='ratings-bar'),

    # Bar chart for average ratings per product
    dcc.Graph(id='avg-ratings-bar'),

    # Summary Table for key metrics
    html.Div(id='summary-table')
])

# Update all graphs when category filter changes
@app.callback(
    [Output('sentiment-bar', 'figure'),
     Output('reviews-pie', 'figure'),
     Output('ratings-bar', 'figure'),
     Output('avg-ratings-bar', 'figure'),
     Output('summary-table', 'children')],
    [Input('category-filter', 'value')]
)
def update_dashboard(selected_categories):
    # Filter data based on selected categories
    filtered_df = df[df['Category'].isin(selected_categories)]

    # Bar chart for mean compound scores
    sentiment_bar = px.bar(
        filtered_df, 
        x='Category', 
        y='Mean Compound Score', 
        color='Most Common Sentiment',
        title="Mean Compound Sentiment Score by Category"
    )

    # Pie chart for total reviews
    reviews_pie = px.pie(
        filtered_df, 
        names='Category', 
        values='Total Reviews', 
        title="Distribution of Total Reviews by Category"
    )

    # Bar chart for total ratings
    ratings_bar = px.bar(
        filtered_df, 
        x='Category', 
        y='Total Ratings', 
        title="Total Ratings by Category"
    )

    # Bar chart for average ratings per product
    avg_ratings_bar = px.bar(
        filtered_df, 
        x='Category', 
        y='Average Ratings',
        title="Average Ratings per Product by Category (out of 50)
        "
    )

    # Summary table for key metrics
    summary_table = html.Table([
        html.Tr([html.Th("Metric"), html.Th("Value")]),
        html.Tr([html.Td("Total Categories"), html.Td(len(filtered_df))]),
        html.Tr([html.Td("Total Products"), html.Td(filtered_df['Number of Products'].sum())]),
        html.Tr([html.Td("Total Reviews"), html.Td(filtered_df['Total Reviews'].sum())]),
        html.Tr([html.Td("Total Ratings"), html.Td(filtered_df['Total Ratings'].sum())]),
        html.Tr([html.Td("Average Sentiment Score"), html.Td(round(filtered_df['Mean Compound Score'].mean(), 4))])
    ])

    return sentiment_bar, reviews_pie, ratings_bar, avg_ratings_bar, summary_table

# Run the Dash app
if __name__ == '__main__':
    app.run_server(debug=True)
