# Sentiment-Based Product Recommendation System

## Import Libraries

In [20]:
# -*- coding: utf-8 -*-
# =========================================
# Step 0: Import Libraries
# =========================================
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

import joblib, warnings
warnings.filterwarnings('ignore')

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Task-1: Data Cleaning and Pre-Processing

In [25]:
# =========================================
# Load Dataset
# =========================================
df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/sample30.csv')
print(df.shape)
print(df.info())

(30000, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    30000 non-null  object
 1   brand                 30000 non-null  object
 2   categories            30000 non-null  object
 3   manufacturer          29859 non-null  object
 4   name                  30000 non-null  object
 5   reviews_date          29954 non-null  object
 6   reviews_didPurchase   15932 non-null  object
 7   reviews_doRecommend   27430 non-null  object
 8   reviews_rating        30000 non-null  int64 
 9   reviews_text          30000 non-null  object
 10  reviews_title         29810 non-null  object
 11  reviews_userCity      1929 non-null   object
 12  reviews_userProvince  170 non-null    object
 13  reviews_username      29937 non-null  object
 14  user_sentiment        29999 non-null  object
dtypes: int64(1), object(14)


In [26]:
# =========================================
# Step 1: Data Cleaning & Preprocessing
# =========================================
irrelevant_columns = [
    'reviews_userCity',
    'reviews_userProvince',
    'reviews_date',
    'reviews_didPurchase',
    'reviews_doRecommend'
]
df.drop(columns=irrelevant_columns, inplace=True)

df['reviews_title'] = df['reviews_title'].fillna("")
df['manufacturer'] = df['manufacturer'].fillna("Unknown")
df = df.dropna(subset=['reviews_username'])

df['full_review'] = df['reviews_title'] + " " + df['reviews_text']
df['reviews_rating'] = df['reviews_rating'].astype(int)
df['id'] = df['id'].astype(str)
df['reviews_username'] = df['reviews_username'].astype(str)

def map_rating_to_sentiment(r):
    if r >= 4: return 'Positive'
    elif r <= 2: return 'Negative'
    else: return 'Neutral'

df['user_sentiment'] = df['user_sentiment'].fillna(df['reviews_rating'].apply(map_rating_to_sentiment))
le = LabelEncoder()
df['sentiment_label'] = le.fit_transform(df['user_sentiment'])

# final dataset
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 29937 entries, 0 to 29999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                29937 non-null  object
 1   brand             29937 non-null  object
 2   categories        29937 non-null  object
 3   manufacturer      29937 non-null  object
 4   name              29937 non-null  object
 5   reviews_rating    29937 non-null  int64 
 6   reviews_text      29937 non-null  object
 7   reviews_title     29937 non-null  object
 8   reviews_username  29937 non-null  object
 9   user_sentiment    29937 non-null  object
 10  full_review       29937 non-null  object
 11  sentiment_label   29937 non-null  int64 
dtypes: int64(2), object(10)
memory usage: 3.0+ MB
None


# Task-2: Text Preprocessing

In [27]:
# =========================================
# Step 2: Text Preprocessing
# =========================================
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = re.sub(r'[^a-z\s]', '', text.lower())
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]
    return " ".join(tokens)

df['cleaned_review'] = df['full_review'].apply(preprocess)

# Task-3: Feature Extraction

In [28]:
# =========================================
# Step 3: Feature Extraction
# =========================================
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X = vectorizer.fit_transform(df['cleaned_review'])
y = df['sentiment_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Task-4: Model Building

In [29]:
# =========================================
# Step 4: Model Building
# =========================================
models = {
    'LogisticRegression': LogisticRegression(max_iter=500, class_weight='balanced', random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    'NaiveBayes': MultinomialNB(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

model_performance = {}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    model_performance[name] = report['accuracy']  # Store accuracy for comparison
    print(classification_report(y_test, y_pred))
    print("="*60)

# Select the best model based on overall accuracy
best_model_name = max(model_performance, key=model_performance.get)
best_model = models[best_model_name]
print(f"Selected best model: {best_model_name} with accuracy: {model_performance[best_model_name]:.2f}")

# RandomForest is selected because:
# 1. It achieves the highest overall accuracy (0.93) among all models.
# 2. Weighted F1-score is the best, giving good balance between majority and minority classes.
# 3. Handles class imbalance better in practice, with robust performance on unseen data.


Training LogisticRegression...
              precision    recall  f1-score   support

           0       0.44      0.85      0.58       671
           1       0.98      0.86      0.92      5317

    accuracy                           0.86      5988
   macro avg       0.71      0.86      0.75      5988
weighted avg       0.92      0.86      0.88      5988

Training RandomForest...
              precision    recall  f1-score   support

           0       0.85      0.44      0.58       671
           1       0.93      0.99      0.96      5317

    accuracy                           0.93      5988
   macro avg       0.89      0.71      0.77      5988
weighted avg       0.92      0.93      0.92      5988

Training NaiveBayes...
              precision    recall  f1-score   support

           0       0.51      0.13      0.21       671
           1       0.90      0.98      0.94      5317

    accuracy                           0.89      5988
   macro avg       0.70      0.56      0.58      

# Task-5: Building the Recommendation System

In [30]:
# =========================================
# Step 5: Build Recommendation Systems
# =========================================
rating_matrix = df.pivot_table(index='reviews_username', columns='id', values='reviews_rating', aggfunc='mean')
rating_matrix_filled = rating_matrix.fillna(0)

# --- Item-based ---
item_similarity = cosine_similarity(rating_matrix_filled.T)
item_similarity_df = pd.DataFrame(item_similarity, index=rating_matrix_filled.columns, columns=rating_matrix_filled.columns)

def recommend_products_item(username, rating_matrix, item_similarity_df, top_n=20):
    if username not in rating_matrix.index:
        return []
    user_ratings = rating_matrix.loc[username].fillna(0)
    scores = item_similarity_df.dot(user_ratings)
    scores = scores / item_similarity_df.abs().sum(axis=1).replace(0, 1e-9)
    return scores.sort_values(ascending=False).head(top_n).index.tolist()

# --- User-based ---
user_similarity = cosine_similarity(rating_matrix_filled)
user_similarity_df = pd.DataFrame(user_similarity, index=rating_matrix_filled.index, columns=rating_matrix_filled.index)

def recommend_products_user(username, rating_matrix, user_similarity_df, top_n=20):
    if username not in rating_matrix.index:
        return []
    sim_scores = user_similarity_df[username]
    weighted_ratings = rating_matrix.T.dot(sim_scores) / sim_scores.sum()
    return weighted_ratings.sort_values(ascending=False).head(top_n).index.tolist()


# Task-6: Recommendation of TOP 20 Products

In [31]:
# =========================================
# Step 6: Recommend Top 20 Products for a Specified User
# =========================================

# Pick any sample user from the dataset
sample_username = df['reviews_username'].dropna().iloc[0]
best_system = ''
if best_system == 'Item-based':
    top20_products = recommend_products_item(sample_username, rating_matrix, item_similarity_df, top_n=20)
else:
    top20_products = recommend_products_user(sample_username, rating_matrix, user_similarity_df, top_n=20)

print(f"\nTop 20 products recommended for user '{sample_username}' using the {best_system} system:")
print(top20_products)

# --- Evaluate which system is better ---
def evaluate_system(usernames, top_n=20):
    item_scores, user_scores = [], []
    for u in usernames:
        if u not in rating_matrix_filled.index:
            continue
        top_items = recommend_products_item(u, rating_matrix_filled, item_similarity_df, top_n)
        top_users = recommend_products_user(u, rating_matrix_filled, user_similarity_df, top_n)
        item_scores.append(rating_matrix_filled.loc[u, top_items].mean())
        user_scores.append(rating_matrix_filled.loc[u, top_users].mean())
    avg_item = np.mean(item_scores)
    avg_user = np.mean(user_scores)
    print(f"Avg rating item-based: {avg_item:.2f}, user-based: {avg_user:.2f}")
    return 'Item-based' if avg_item >= avg_user else 'User-based'

best_system = evaluate_system(rating_matrix_filled.index.tolist())
print(f"Best recommendation system: {best_system}")



Top 20 products recommended for user 'joshua' using the  system:
['AV13O1A8GV-KLJ3akUyj', 'AV14LG0R-jtxr-f38QfS', 'AV16khLE-jtxr-f38VFn', 'AV1YGDqsGV-KLJ3adc-O', 'AV1YIch7GV-KLJ3addeG', 'AV1YlENIglJLPUi8IHsX', 'AV1YmBrdGV-KLJ3adewb', 'AV1YmDL9vKc47QAVgr7_', 'AV1Ymf_rglJLPUi8II2v', 'AV1Yn94nvKc47QAVgtst', 'AV1YnUMYglJLPUi8IJpK', 'AV1Ynb3bglJLPUi8IJxJ', 'AV1YneDPglJLPUi8IJyQ', 'AV1Yo6FPglJLPUi8IK3u', 'AV1YpiJvvKc47QAVguxy', 'AV1YqAaMGV-KLJ3adiDj', 'AV1Ys0kTvKc47QAVgx1C', 'AV1YtGjdglJLPUi8IOfJ', 'AV1ZSp2uglJLPUi8IQFy', 'AV1ZT7GLglJLPUi8IQLI']
Avg rating item-based: 0.25, user-based: 0.25
Best recommendation system: Item-based


# Task-7: Fine-tune Top 5 Products with Sentiment

In [32]:
# =========================================
# Step 7: Fine-tune Top 5 Products with Sentiment
# =========================================
def filter_top5_products(username, top_products, df, sentiment_model, vectorizer, le):
    product_sentiment_score = {}
    positive_label = le.transform(['Positive'])[0]
    for product in top_products:
        reviews = df[df['id'] == product]['full_review']
        if reviews.empty:
            continue
        reviews_clean = reviews.apply(preprocess)
        X_reviews = vectorizer.transform(reviews_clean)
        preds = sentiment_model.predict(X_reviews)
        product_sentiment_score[product] = np.mean(preds == positive_label)
    top5 = sorted(product_sentiment_score, key=product_sentiment_score.get, reverse=True)[:5]
    return top5

def recommend(username):
    if username not in rating_matrix_filled.index:
        return ["No recommendations available for this user."]

    if best_system == 'Item-based':
        top_20 = recommend_products_item(username, rating_matrix_filled, item_similarity_df)
    else:
        top_20 = recommend_products_user(username, rating_matrix_filled, user_similarity_df)

    top_5 = filter_top5_products(username, top_20, df, best_model, vectorizer, le)
    return top_5

# Task-8: Flask Deployment

In [33]:
# =========================================
# Step 8: Flask Deployment
# =========================================
import flask
from flask import Flask, request
from pyngrok import ngrok

# ------------------------
# Flask App
# ------------------------
app = Flask(__name__)

@app.route("/", methods=["GET", "POST"])
def home():
    recommendations = None
    username = None
    if request.method == "POST":
        username = request.form["username"]
        recommendations = recommend_products(username)

    # Updated HTML
    html_content = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Welcome to Sentiment-Based Product Recommendation</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 40px; }}
            h2, h3 {{ color: #333; }}
            input {{ padding: 8px; margin-right: 10px; width: 250px; }}
            button {{ padding: 8px 16px; }}
            ul {{ list-style-type: none; padding-left: 0; }}
            li {{ margin: 5px 0; }}
            .no-recommend {{ color: red; }}
        </style>
    </head>
    <body>
        <h1>Welcome to Sentiment-Based Product Recommendation</h1>
        <h2>Enter Username for Recommendations</h2>
        <form method="POST">
            <input type="text" name="username" placeholder="Enter username" required>
            <button type="submit">Submit</button>
        </form>
        {recommendations_html}
    </body>
    </html>
    """

    # Build recommendations HTML
    if recommendations:
        if recommendations[0] == "No recommendations available for this user.":
            recommendations_html = f'<p class="no-recommend">{recommendations[0]}</p>'
        else:
            recommendations_html = f"<h3>Top 5 Product Recommendations for <b>{username}</b>:</h3><ul>"
            for product in recommendations:
                recommendations_html += f"<li>{product}</li>"
            recommendations_html += "</ul>"
    else:
        recommendations_html = ""

    return html_content.format(recommendations_html=recommendations_html)


In [None]:
# ------------------------
# Ngrok setup
# ------------------------
ngrok.set_auth_token("XXXXXXXX") # Put your keys here 
public_url = ngrok.connect(5000)
print("Your app is live at:", public_url)

# ------------------------
# Run Flask App
# ------------------------
app.run(port=5000)

Your app is live at: NgrokTunnel: "https://cursedly-undebilitating-jeanetta.ngrok-free.dev" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [24/Sep/2025 18:14:37] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Sep/2025 18:14:41] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Sep/2025 18:14:44] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Sep/2025 18:14:48] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Sep/2025 18:14:53] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Sep/2025 18:15:00] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Sep/2025 18:15:05] "POST / HTTP/1.1" 200 -
