# Sentiment Analysis of Real-time Flipkart Product Reviews

# Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import joblib

nltk.download("stopwords",quiet = True)
nltk.download("wordnet",quiet = True)

True

# Load the Dataset

In [2]:
df = pd.read_csv("reviews_badminton/data.csv")
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [3]:
df.shape

(8518, 8)

# EDA (Exploratary Data Analysis)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8518 entries, 0 to 8517
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Reviewer Name    8508 non-null   object 
 1   Review Title     8508 non-null   object 
 2   Place of Review  8468 non-null   object 
 3   Up Votes         8508 non-null   float64
 4   Down Votes       8508 non-null   float64
 5   Month            8053 non-null   object 
 6   Review text      8510 non-null   object 
 7   Ratings          8518 non-null   int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 532.5+ KB


In [5]:
df.isnull().sum()

Reviewer Name       10
Review Title        10
Place of Review     50
Up Votes            10
Down Votes          10
Month              465
Review text          8
Ratings              0
dtype: int64

In [6]:
# Remove neutral reviews
df = df[df["Ratings"] != 3]

# Create sentiment label
df["sentiment"] = df["Ratings"].apply(lambda x: 1 if x >= 4 else 0)

# Combine title and review text
df["text"] = df["Review Title"].fillna("") + " " + df["Review text"].fillna("")
df = df[["text", "sentiment"]]
df.head()

Unnamed: 0,text,sentiment
0,"Nice product Nice product, good quality, but p...",1
1,Don't waste your money They didn't supplied Yo...,0
2,Did not meet expectations Worst product. Damag...,0
4,Over priced Over pricedJust â?¹620 ..from reta...,0
5,Mind-blowing purchase Good quality product. De...,1


# Text Preprocessing

In [7]:
stop_words = set(stopwords.words("english")) - {"not", "no", "never"}
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

In [8]:
df["processed_text"] = df["text"].apply(preprocess)
df.head()

Unnamed: 0,text,sentiment,processed_text
0,"Nice product Nice product, good quality, but p...",1,nice product nice product good quality price r...
1,Don't waste your money They didn't supplied Yo...,0,dont waste money didnt supplied yonex mavis ou...
2,Did not meet expectations Worst product. Damag...,0,not meet expectation worst product damaged shu...
4,Over priced Over pricedJust â?¹620 ..from reta...,0,priced pricedjust retaileri didnt understand w...
5,Mind-blowing purchase Good quality product. De...,1,mindblowing purchase good quality product deli...


# Train-Test Split

In [9]:
X = df["processed_text"]
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.25,random_state=42,stratify=y)

# TF-IDF Feature Extraction

In [10]:
tfidf = TfidfVectorizer(max_features=7000,ngram_range=(1, 2))

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Machine Learning Models (Comparison)

In [11]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Linear SVM": LinearSVC(),
    "Random Forest": RandomForestClassifier(n_estimators=200)
}

print("TF-IDF MODEL RESULTS\n")

for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    preds = model.predict(X_test_tfidf)
    print(f"{name} F1-score: {f1_score(y_test, preds):.4f}")

TF-IDF MODEL RESULTS

Logistic Regression F1-score: 0.9632
Naive Bayes F1-score: 0.9437
Linear SVM F1-score: 0.9711
Random Forest F1-score: 0.9665


# Final Model Selection ---> (Important)

In [12]:
print("\nLogistic Regression selected as final model based on best F1-score and simplicity.")

best_model = models["Logistic Regression"]

final_preds = best_model.predict(X_test_tfidf)
print("\nFinal Model Classification Report:")
print(classification_report(y_test, final_preds))


Logistic Regression selected as final model based on best F1-score and simplicity.

Final Model Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.59      0.71       269
           1       0.94      0.99      0.96      1707

    accuracy                           0.93      1976
   macro avg       0.92      0.79      0.84      1976
weighted avg       0.93      0.93      0.93      1976



# Save Model and Vectorizer

In [13]:
joblib.dump(best_model, "sentiment_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

print("\n✅ Model and Vectorizer saved successfully")


✅ Model and Vectorizer saved successfully


# Deep Learning Model (LSTM) ---> Compaison Only

In [14]:
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=200)
X_test_pad = pad_sequences(X_test_seq, maxlen=200)

lstm_model = Sequential([Embedding(20000, 128, input_length=200),LSTM(128),Dense(1, activation="sigmoid")])

lstm_model.compile(optimizer="adam", loss="binary_crossentropy")
lstm_model.fit(X_train_pad,y_train,epochs=5,batch_size=64,validation_split=0.1,verbose=1)

lstm_preds = (lstm_model.predict(X_test_pad) > 0.5).astype(int)
print(f"\nLSTM F1-score (Comparison Only): {f1_score(y_test, lstm_preds):.4f}")

Epoch 1/5
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 286ms/step - loss: 0.3299 - val_loss: 0.1918
Epoch 2/5
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 171ms/step - loss: 0.1710 - val_loss: 0.1721
Epoch 3/5
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 213ms/step - loss: 0.1238 - val_loss: 0.1550
Epoch 4/5
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 204ms/step - loss: 0.0973 - val_loss: 0.1868
Epoch 5/5
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 201ms/step - loss: 0.0823 - val_loss: 0.1818
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 41ms/step

LSTM F1-score (Comparison Only): 0.9639
