# =========================================
# Sentiment-Based Product Recommendation System
# =========================================

In [None]:
# -----------------------------
# Step 0: Import Libraries
# -----------------------------
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics.pairwise import cosine_similarity
import pickle

import warnings
warnings.filterwarnings('ignore')

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Data Cleaning and Pre-Processing

In [None]:
import pandas as pd
df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/sample30.csv')

In [None]:
# -----------------------------
# Step 1: Load Dataset
# -----------------------------
df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/sample30.csv')
print(df.shape)
print(df.info())
print(df.isnull().sum())

(30000, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    30000 non-null  object
 1   brand                 30000 non-null  object
 2   categories            30000 non-null  object
 3   manufacturer          29859 non-null  object
 4   name                  30000 non-null  object
 5   reviews_date          29954 non-null  object
 6   reviews_didPurchase   15932 non-null  object
 7   reviews_doRecommend   27430 non-null  object
 8   reviews_rating        30000 non-null  int64 
 9   reviews_text          30000 non-null  object
 10  reviews_title         29810 non-null  object
 11  reviews_userCity      1929 non-null   object
 12  reviews_userProvince  170 non-null    object
 13  reviews_username      29937 non-null  object
 14  user_sentiment        29999 non-null  object
dtypes: int64(1), object(14)


In [None]:
# -----------------------------
# Step 2: Data Cleaning
# -----------------------------
# Fill missing titles
df['reviews_title'] = df['reviews_title'].fillna("")

# Drop rows with missing usernames
df = df.dropna(subset=['reviews_username'])

# Fill missing manufacturer
df['manufacturer'] = df['manufacturer'].fillna("Unknown")

# Combine title + text
df['full_review'] = df['reviews_title'] + " " + df['reviews_text']

In [None]:
print(df.isnull().sum())

id                          0
brand                       0
categories                  0
manufacturer                0
name                        0
reviews_date               40
reviews_didPurchase     14006
reviews_doRecommend      2541
reviews_rating              0
reviews_text                0
reviews_title               0
reviews_userCity        28037
reviews_userProvince    29770
reviews_username            0
user_sentiment              1
full_review                 0
dtype: int64


In [71]:
df['reviews_username'].unique()[:5] # see first 5 usernames

array(['joshua', 'dorothy w', 'rebecca', 'walker557', 'samantha'],
      dtype=object)

# Text Preprocessing

In [None]:
# -----------------------------
# Step 3: Text Preprocessing
# -----------------------------
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]
    return " ".join(tokens)

df['cleaned_review'] = df['full_review'].apply(preprocess)

# Feature Extraction

In [None]:
# Derive sentiment if missing
def map_rating_to_sentiment(r):
    if r >= 4: return 'Positive'
    elif r <= 2: return 'Negative'
    else: return 'Neutral'

# -----------------------------
# Step 4: Sentiment Labeling
# -----------------------------

df['user_sentiment'] = df['user_sentiment'].fillna(df['reviews_rating'].apply(map_rating_to_sentiment))

le = LabelEncoder()
df['sentiment_label'] = le.fit_transform(df['user_sentiment'])

# -----------------------------
# Step 5: Feature Extraction
# -----------------------------

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X = vectorizer.fit_transform(df['cleaned_review'])
y = df['sentiment_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Model Building

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

models = {
    'LogisticRegression': LogisticRegression(max_iter=500, class_weight='balanced', random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    'NaiveBayes': MultinomialNB(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print("="*60)

# Choose Logistic Regression as best model
best_model = models['LogisticRegression']

Training LogisticRegression...
              precision    recall  f1-score   support

           0       0.44      0.85      0.58       671
           1       0.98      0.86      0.92      5317

    accuracy                           0.86      5988
   macro avg       0.71      0.86      0.75      5988
weighted avg       0.92      0.86      0.88      5988

Training RandomForest...
              precision    recall  f1-score   support

           0       0.85      0.44      0.58       671
           1       0.93      0.99      0.96      5317

    accuracy                           0.93      5988
   macro avg       0.89      0.71      0.77      5988
weighted avg       0.92      0.93      0.92      5988

Training NaiveBayes...
              precision    recall  f1-score   support

           0       0.51      0.13      0.21       671
           1       0.90      0.98      0.94      5317

    accuracy                           0.89      5988
   macro avg       0.70      0.56      0.58      



### Reason for Selecting Logistic Regression as Best Model

Although Random Forest and XGBoost achieved slightly higher overall accuracy, Logistic Regression was chosen as the best model because:

#### Better Handling of Minority Class (Negative Reviews):
Logistic Regression achieved a recall of 0.85 for the negative class, compared to only 0.36–0.42 for Random Forest and XGBoost.
This means it is much better at correctly identifying unhappy customers, which is crucial for business use cases (preventing churn, handling complaints).

#### Balanced Performance Across Classes:
Its macro F1-score (0.74) is higher than other models, indicating more balanced performance between positive and negative classes.
Random Forest and XGBoost, while strong on positives, struggled more with negatives, leading to imbalance.

#### Interpretability & Deployment Simplicity:
Logistic Regression is simpler, easier to interpret, and faster to train/predict, making it suitable for real-time applications like sentiment-based product recommendation.

# Build Recommendation System

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

rating_matrix = df.pivot_table(index='reviews_username', columns='id', values='reviews_rating', aggfunc='mean')
rating_matrix_filled = rating_matrix.fillna(0)

item_similarity = cosine_similarity(rating_matrix_filled.T)
item_similarity_df = pd.DataFrame(item_similarity, index=rating_matrix_filled.columns, columns=rating_matrix_filled.columns)

def recommend_products(username, rating_matrix, item_similarity_df, top_n=20):
    if username not in rating_matrix.index:
        return []
    user_ratings = rating_matrix.loc[username].fillna(0)
    scores = item_similarity_df.dot(user_ratings)
    scores = scores / item_similarity_df.abs().sum(axis=1).replace(0, 1e-9)
    recommended = scores.sort_values(ascending=False).head(top_n).index.tolist()
    return recommended


# Recommend Top 20 Products

In [None]:
user = df['reviews_username'].iloc[0]
top_20_products = recommend_products(user, rating_matrix_filled, item_similarity_df, top_n=20)
print("Top 20 products:", top_20_products)

Top 20 products: ['AVpfBwE4ilAPnD_xTWO1', 'AV13O1A8GV-KLJ3akUyj', 'AVpfoSS51cnluZ0-oVH9', 'AVpf2tw1ilAPnD_xjflC', 'AVpe31o71cnluZ0-YrSD', 'AVpf0pfrilAPnD_xi6s_', 'AVpfrgjFLJeJML43BvCc', 'AVpf1pwXLJeJML43EqpT', 'AVpfewoLilAPnD_xcfgU', 'AVpfrFDZLJeJML43Bmv0', 'AVpe_dxlilAPnD_xSiHI', 'AVpfQtEm1cnluZ0-hUpe', 'AVpfMpZ51cnluZ0-f_L9', 'AVpfNWbPilAPnD_xXPR7', 'AVpfozgyilAPnD_xfe0r', 'AVpe7sl91cnluZ0-aI1Y', 'AVpf0eb2LJeJML43EVSt', 'AVpfR5m0LJeJML436K3W', 'AVpfPaoqLJeJML435Xk9', 'AVpe41TqilAPnD_xQH3d']


# Fine-tune with Sentiment

In [None]:
def filter_top5_products(username, top_products, df, sentiment_model, vectorizer, le):
    product_sentiment_score = {}
    positive_label = list(le.transform(['Positive']))[0]

    for product in top_products:
        reviews = df[df['id'] == product]['full_review']
        if reviews.empty:
            continue
        reviews_clean = reviews.apply(preprocess)
        X_reviews = vectorizer.transform(reviews_clean)
        preds = sentiment_model.predict(X_reviews)
        product_sentiment_score[product] = np.mean(preds == positive_label)

    top5 = sorted(product_sentiment_score, key=product_sentiment_score.get, reverse=True)[:5]
    return top5

top_5_products = filter_top5_products(user, top_20_products, df, best_model, vectorizer, le)
print("Top 5 products:", top_5_products)

Top 5 products: ['AV13O1A8GV-KLJ3akUyj', 'AVpf0pfrilAPnD_xi6s_', 'AVpf2tw1ilAPnD_xjflC', 'AVpfQtEm1cnluZ0-hUpe', 'AVpfPaoqLJeJML435Xk9']


# Flask Deployment

In [None]:
from flask import Flask, request
from pyngrok import ngrok
import pandas as pd
import numpy as np
import joblib

# ------------------------
# Load your pre-trained artifacts
# ------------------------
model = joblib.load("models/sentiment_model.joblib")
vectorizer = joblib.load("models/tfidf_vectorizer.joblib")
label_encoder = joblib.load("models/label_encoder.joblib")

# ------------------------
# Use in-memory variables from notebook
# ------------------------
# rating_matrix_filled and item_similarity_df must be already defined
# df must be already defined

# ------------------------
# Helper Functions
# ------------------------
def preprocess(text):
    import re
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]
    return " ".join(tokens)

def filter_top5_products(username, top_products, df, sentiment_model, vectorizer, le):
    product_sentiment_score = {}
    positive_label = list(le.transform(['Positive']))[0]

    for product in top_products:
        reviews = df[df['id'] == product]['full_review']
        if reviews.empty:
            continue
        reviews_clean = reviews.apply(preprocess)
        X_reviews = vectorizer.transform(reviews_clean)
        preds = sentiment_model.predict(X_reviews)
        product_sentiment_score[product] = np.mean(preds == positive_label)

    top5 = sorted(product_sentiment_score, key=product_sentiment_score.get, reverse=True)[:5]
    return top5

def recommend_products(username):
    if username not in rating_matrix_filled.index:
        return ["No recommendations available for this user."]

    user_ratings = rating_matrix_filled.loc[username].fillna(0)
    scores = item_similarity_df.dot(user_ratings)
    scores = scores / item_similarity_df.abs().sum(axis=1).replace(0, 1e-9)
    top_20_products = scores.sort_values(ascending=False).head(20).index.tolist()
    top_5_products = filter_top5_products(username, top_20_products, df, model, vectorizer, label_encoder)
    return top_5_products

# ------------------------
# Flask App
# ------------------------
app = Flask(__name__)

@app.route("/", methods=["GET", "POST"])
def home():
    recommendations = None
    if request.method == "POST":
        username = request.form["username"]
        recommendations = recommend_products(username)
    return """
    <html>
      <head><title>Sentiment-Based Product Recommendation</title></head>
      <body>
        <h2>Enter Username for Recommendations</h2>
        <form method="POST">
          <input type="text" name="username" placeholder="Enter username" required>
          <button type="submit">Submit</button>
        </form>
        {}
      </body>
    </html>
    """.format("<br>".join(recommendations) if recommendations else "")

# ------------------------
# Ngrok setup
# ------------------------
ngrok.set_auth_token("339CJMtwVwFtfBTMlvd5AjfB1Ns_6YjZWvJq47mrz4fz44cg1")
public_url = ngrok.connect(5000)
print("Your app is live at:", public_url)

# ------------------------
# Run Flask App
# ------------------------
app.run(port=5000)


Your app is live at: NgrokTunnel: "https://cursedly-undebilitating-jeanetta.ngrok-free.dev" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [24/Sep/2025 13:56:59] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Sep/2025 13:57:04] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Sep/2025 13:57:09] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Sep/2025 14:00:08] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Sep/2025 14:00:43] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Sep/2025 14:01:13] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Sep/2025 14:01:42] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Sep/2025 14:02:07] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Sep/2025 14:03:42] "POST / HTTP/1.1" 200 -
