### PHASE 1---------- Load all required libraries ---------- ###

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import os
import pandas as pd
import time
import random
import nltk
from pymongo import MongoClient
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize
from wordcloud import WordCloud
# NLTK data used later in the notebook: the English stopword list, the English
# vocabulary corpus, and the Punkt sentence tokenizer models.
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('words')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from 'punkt_tab' instead of the pickled
# 'punkt' model; fetch both so sent_tokenize/word_tokenize work on either version.
nltk.download('punkt_tab')
from collections import Counter


### PHASE 1---------- Load product links from the saved Best Buy headphone listing page, then scrape their reviews ---------- ###

In [None]:
# ---------- Collect product name, price and review URL from the saved listing page ---------- #
localpath = r"C:/Users/user/Desktop/BI CASE STUDY 3/Marimo/Headphone On Sale _ Best Buy Canada.html"
file_url = localpath
links_xlsx_path = r"C:/Users/user/Desktop/BI CASE STUDY 3/Marimo/Output/Fetched_Links.xlsx"

options = Options()
#options.add_argument("--headless")  # uncomment to run without a visible browser window
service = Service(r"C:/Users/user/Desktop/BI CASE STUDY 3/Marimo/Chrome Driver/chromedriver-win64/chromedriver.exe")

driver = webdriver.Chrome(service=service, options=options)
final_Dat = []
try:
    driver.get(file_url)

    # Every product tile on the listing page is an anchor carrying this exact class string.
    links = driver.find_elements('xpath', '//a[@class="link_3hcyN inline-block h-full w-full"]')
    for link in links:
        print("---------------------------------------")
        linktxt = link.text
        href = link.get_attribute("href")
        print("Link Text: ", linktxt)
        print("URL Text: ", href)

        # The tile text is split on newlines: line 0 is used as the product name and
        # line 2 as the price. Tiles without an href or with fewer than three lines
        # cannot yield a usable record, so skip them instead of crashing the run.
        text_lines = linktxt.split('\n')
        if not href or len(text_lines) < 3:
            print("Skipping link: missing href or unexpected text layout")
            print("")
            continue

        prdnm = text_lines[0]
        prdprc = text_lines[2]
        prdlnk = href + "/review"
        final_Dat.append([prdnm, prdprc, prdlnk])
        print("")

    time.sleep(2) #IN Seconds
finally:
    # Always release the browser, even when scraping raises part-way through.
    driver.quit()

df = pd.DataFrame(final_Dat, columns=["Product_Name", "Price", "Review URL"])

# The Output folder may not exist yet on a fresh machine; create it before writing.
os.makedirs(os.path.dirname(links_xlsx_path), exist_ok=True)
df.to_excel(links_xlsx_path, index=False)
# -------------------------------------------------------------------------------------------
# ---------- Visit every review URL and collect the review texts ---------- #
time.sleep(5)

options = Options()
options.add_argument("--headless")
service = Service(r"C:/Users/user/Desktop/BI CASE STUDY 3/Marimo/Chrome Driver/chromedriver-win64/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)

revlist = []
try:
    for url in df["Review URL"]:
        print("Current URL: ", url)
        driver.get(url)
        # Randomised pause after loading the page, before reading the reviews.
        time.sleep(random.randint(6, 9))
        review = driver.find_elements('xpath', '//div[@class="reviewContent_XCspv"]')
        time.sleep(1)

        if review:
            # One list of review texts per product page.
            revlist.append([rev.text for rev in review])
            time.sleep(1)
        else:
            revlist.append("No reviews found")

        #reviewnm = driver.find_elements('xpath', '//span[@class="author_20vgR"]')
        # Randomised pause between products.
        time.sleep(random.randint(7, 10))
finally:
    # Release the headless browser even if a page load fails.
    driver.quit()

# Attach the scraped reviews to their products; the cleaning cells below read
# df["Review"]. An object-dtype Series keeps each cell as a list (or the
# "No reviews found" marker) instead of letting pandas try to unpack it.
df["Review"] = pd.Series(revlist, index=df.index, dtype=object)


In [None]:
# ---------- Write the scraped table to CSV so it can be inspected outside the notebook ---------- #
out_dir = r"C:/Users/user/Desktop/BI CASE STUDY 3/Marimo/Output"
csv_path = os.path.join(out_dir, "fetched_reviews.csv")

# Create the target folder if needed; exist_ok makes re-running the cell harmless.
os.makedirs(out_dir, exist_ok=True)
df.to_csv(csv_path, index=False)

print(f"[✓] {len(df)} rows written ➜ {csv_path}")

### PHASE 2---------- Data preprocessing and applying NLP Techniques---------- ###

In [None]:
# Notebook display of the scraped frame as-is, before any cleaning or reshaping.
df

In [None]:

# ---------- Reshape to one row per review and drop unusable rows ---------- #

# Make sure every 'Review' cell is a list so it can be exploded uniformly.
df["Review"] = df["Review"].apply(lambda x: x if isinstance(x, list) else [x])

# Strip the promotion disclaimer from each review. Non-string entries (e.g. a
# missing value) are passed through untouched and removed by the filter below.
promo_note = "[This review was collected as part of a promotion.]"
df["Review"] = df["Review"].apply(
    lambda reviews: [
        r.replace(promo_note, "").strip() if isinstance(r, str) else r
        for r in reviews
    ]
)

# Explode 'Review' so each review gets its own row (product columns are repeated).
df_exploded = df.explode("Review").reset_index(drop=True)

# Keep only rows with a real review: not missing, not blank, not the placeholder.
review_col = df_exploded["Review"]
has_review = (
    review_col.notnull()
    & (review_col.str.strip() != "")
    & (review_col != "No reviews found")
)
# .copy() detaches the result from the unfiltered frame, so the columns added in
# later cells are plain assignments and do not raise SettingWithCopyWarning.
df_exploded = df_exploded[has_review].copy()

print(f"[✓] Reshaped DataFrame: {len(df_exploded)} rows")

print(df_exploded.head())


In [None]:
# Notebook display of the reshaped frame: one review per row.
df_exploded

In [None]:
# Sentence segmentation: each review string becomes a list of sentence strings.
df_exploded["Sentences"] = df_exploded["Review"].apply(sent_tokenize)


def tokenize_sentences(sentences):
    """Return one list of word tokens for each sentence in *sentences*."""
    return [word_tokenize(sentence) for sentence in sentences]


# Word tokenization: a list of token lists, parallel to the 'Sentences' column.
df_exploded["Word_Tokens"] = df_exploded["Sentences"].apply(tokenize_sentences)

df_exploded.head(10)

### PHASE 3 ---------- After PHASE 2, we will load reviews in MongoDB for Sentiment Classification ---------- ###

In [None]:
# Load the reshaped reviews into MongoDB: one document per review.
# NOTE: documents are appended, so re-running this cell stores the same reviews again.
review_docs = [
    {
        "product_name": row["Product_Name"],
        "price": row["Price"],
        "review_url": row["Review URL"],
        "review": row["Review"],               # single review text
        "sentences": row["Sentences"],          # list of sentences
        "word_tokens": row["Word_Tokens"]       # list of token lists, one per sentence
    }
    for _, row in df_exploded.iterrows()
]

# The context manager closes the connection even if the insert fails.
with MongoClient("mongodb://localhost:27017/") as client:
    db = client["customer_reviews"]
    collection = db["reviews"]

    # One batched insert instead of a round trip per row. insert_many rejects
    # an empty list, so skip the call when there is nothing to store.
    if review_docs:
        collection.insert_many(review_docs)

print(f"[✓] Inserted {len(review_docs)} documents into MongoDB.")


In [None]:
# Sentiment analysis based on the stored word tokens (VADER compound score).
from pymongo import UpdateOne

# The context manager closes the connection even if scoring or the update fails.
with MongoClient("mongodb://localhost:27017/") as client:
    db = client["customer_reviews"]
    collection = db["reviews"]

    # Set up Sentiment Analyzer
    analyzer = SentimentIntensityAnalyzer()

    # Score each document and queue its update
    updates = []
    print("[...] Starting sentiment analysis based on word_tokens...")
    for i, doc in enumerate(collection.find(), start=1):
        # "or" fallbacks also cover fields that are present but stored as null.
        word_tokens = doc.get("word_tokens") or []
        product = doc.get("product_name", "N/A")
        review_preview = (doc.get("review") or "")[:60]  # show start of the review

        print(f"\n Processing {i}: {product}")
        print(f"    Review: {review_preview}...")

        if word_tokens:
            # Flatten the per-sentence token lists and rebuild one string for VADER.
            flat_tokens = [token for sentence in word_tokens for token in sentence]
            text = " ".join(flat_tokens)

            # VADER sentiment scoring
            score = analyzer.polarity_scores(text)
            compound = score["compound"]

            # Stronger opinion thresholds: anything strictly between -0.2 and 0.2 is Neutral.
            if compound >= 0.2:
                sentiment = "Positive"
            elif compound <= -0.2:
                sentiment = "Negative"
            else:
                sentiment = "Neutral"

            print(f"    Sentiment Score: {score}")
            print(f"    ➜ Classified as: {sentiment}")
        else:
            sentiment = "Unknown"
            print("No tokens found. Marked as 'Unknown'.")

        updates.append(
            UpdateOne({"_id": doc["_id"]}, {"$set": {"sentiment_token_based": sentiment}})
        )

    # Bulk update back into MongoDB: one batched request instead of one per document.
    print("\n Updating documents in MongoDB...")
    if updates:  # bulk_write raises InvalidOperation on an empty request list
        collection.bulk_write(updates, ordered=False)

print(f"\n[✓] Updated {len(updates)} documents with token-based sentiment.")

In [None]:
# Overall sentiment distribution, drawn as a pie chart and as a bar chart.
client = MongoClient("mongodb://localhost:27017/")
db = client["customer_reviews"]
collection = db["reviews"]

# Count documents per sentiment label.
group_by_sentiment = {"$group": {"_id": "$sentiment_token_based", "count": {"$sum": 1}}}
sentiment_counts = collection.aggregate([group_by_sentiment])

sentiment_rows = list(sentiment_counts)
labels = [row["_id"] for row in sentiment_rows]
counts = [row["count"] for row in sentiment_rows]

# Pie chart
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(counts, labels=labels, autopct='%1.1f%%', startangle=140)
ax.set_title('Sentiment Distribution Based on Word Tokens')
ax.axis('equal')  # Equal aspect ratio makes the pie circular
plt.show()

# Bar chart
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(labels, counts)
ax.set_title('Sentiment Distribution Based on Word Tokens')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Number of Reviews')
ax.grid(axis='y')
plt.show()

client.close()

### PHASE 4---------- Visualisations for questions---------- ###

In [None]:
# Share of Positive vs Negative reviews (pie chart); other labels are excluded.
client = MongoClient("mongodb://localhost:27017/")
db = client["customer_reviews"]
collection = db["reviews"]

polar_only = {"$match": {"sentiment_token_based": {"$in": ["Positive", "Negative"]}}}
count_per_label = {"$group": {"_id": "$sentiment_token_based", "count": {"$sum": 1}}}
sentiment_counts = collection.aggregate([polar_only, count_per_label])

labels = []
counts = []
for bucket in sentiment_counts:
    labels += [bucket["_id"]]
    counts += [bucket["count"]]

fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(counts, labels=labels, autopct='%1.1f%%', startangle=140)
ax.set_title('Percentage of Positive and Negative Reviews (Token-Based)')
ax.axis('equal')
plt.show()

client.close()

In [None]:
# Word clouds of the words that distinguish Positive from Negative reviews.
from nltk.corpus import words


def _show_wordcloud(text, title):
    """Draw a word cloud for *text*, or print a notice when it has no usable words."""
    try:
        # generate() raises ValueError when no words are left to plot.
        cloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    except ValueError:
        print(f"[WARN] No words to plot for: {title}")
        return
    plt.figure(figsize=(10, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()


# Prepare Stopwords
stop_words = set(stopwords.words('english'))
# Add custom extra stopwords if you want
extra_stopwords = {"product", "one", "also", "get", "would", "use", "buy"}
stop_words.update(extra_stopwords)

# Collect Positive and Negative Tokens
positive_tokens = []
negative_tokens = []

# The context manager closes the connection even if reading fails part-way.
with MongoClient("mongodb://localhost:27017/") as client:
    db = client["customer_reviews"]
    collection = db["reviews"]

    for doc in collection.find({"sentiment_token_based": {"$in": ["Positive", "Negative"]}}):
        word_tokens = doc.get("word_tokens", [])
        if word_tokens:
            # Flatten the list of lists and lower-case
            flat_tokens = [token.lower() for sentence in word_tokens for token in sentence]
            if doc["sentiment_token_based"] == "Positive":
                positive_tokens.extend(flat_tokens)
            else:
                negative_tokens.extend(flat_tokens)

# Remove stopwords and anything that is not purely alphabetic
positive_tokens_clean = [token for token in positive_tokens if token.isalpha() and token not in stop_words]
negative_tokens_clean = [token for token in negative_tokens if token.isalpha() and token not in stop_words]

# Count tokens per sentiment
positive_counter = Counter(positive_tokens_clean)
negative_counter = Counter(negative_tokens_clean)

# Words that occur in both sentiments, with their combined frequency
common_words = positive_counter.keys() & negative_counter.keys()
combined_counter = positive_counter + negative_counter
common_word_frequencies = {word: combined_counter[word] for word in common_words}

# Sort by frequency (descending). Ties are broken alphabetically so the cut-off
# below selects the same words on every run instead of following set order.
common_words_sorted = sorted(common_word_frequencies.items(), key=lambda item: (-item[1], item[0]))

# Remove only top 50% most frequent common words
half_common = len(common_words_sorted) // 2
words_to_remove = {word for word, _ in common_words_sorted[:half_common]}

print(f"[INFO] Total common words found: {len(common_words)}")
print(f"[INFO] Removing top 50% ({len(words_to_remove)}) common words based on frequency.")

# Keep only tokens that are not among the removed common words
positive_unique = [token for token in positive_tokens_clean if token not in words_to_remove]
negative_unique = [token for token in negative_tokens_clean if token not in words_to_remove]

# Load set of all English words
english_words = set(words.words())

# Keep only tokens found in the English vocabulary (tokens are already lower-cased)
positive_unique = [token for token in positive_unique if token in english_words]
negative_unique = [token for token in negative_unique if token in english_words]

print(f"[INFO] After English filtering: {len(positive_unique)} positive tokens, {len(negative_unique)} negative tokens.")

print(f"[INFO] {len(positive_unique)} unique positive tokens left after partial removal.")
print(f"[INFO] {len(negative_unique)} unique negative tokens left after partial removal.")

# Create WordClouds

# Positive Word Cloud
positive_text = " ".join(positive_unique)
_show_wordcloud(positive_text, 'Unique Positive Words (after stopwords and common words removal)')

# Negative Word Cloud
negative_text = " ".join(negative_unique)
_show_wordcloud(negative_text, 'Unique Negative Words (after stopwords and common words removal)')


In [None]:
# Key products with the most Positive / Negative feedback (horizontal bar charts).
client = MongoClient("mongodb://localhost:27017/")
db = client["customer_reviews"]
collection = db["reviews"]


def _top_products_pipeline(sentiment):
    """Build the aggregation that ranks products by review count for *sentiment*."""
    return [
        {"$match": {"sentiment_token_based": sentiment}},
        {"$group": {"_id": "$product_name", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 5}
    ]


# Run both rankings up front, then plot them one after the other.
top_positive_products = collection.aggregate(_top_products_pipeline("Positive"))
top_negative_products = collection.aggregate(_top_products_pipeline("Negative"))

# Positive products
positive_rows = list(top_positive_products)
products_pos = [entry["_id"] for entry in positive_rows]
counts_pos = [entry["count"] for entry in positive_rows]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(products_pos, counts_pos, color="green")
ax.set_xlabel("Number of Positive Reviews")
ax.set_title("Top 5 Products with Most Positive Feedback")
ax.invert_yaxis()  # Highest at top
ax.grid(axis='x')
plt.show()

# Negative products
negative_rows = list(top_negative_products)
products_neg = [entry["_id"] for entry in negative_rows]
counts_neg = [entry["count"] for entry in negative_rows]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(products_neg, counts_neg, color="red")
ax.set_xlabel("Number of Negative Reviews")
ax.set_title("Top 5 Products with Most Negative Feedback")
ax.invert_yaxis()  # Highest at top
ax.grid(axis='x')
plt.show()

client.close()