In [1]:
import pandas as pd
import numpy as np

import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report, f1_score, confusion_matrix


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\umama\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\umama\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_csv(r"C:\Users\umama\INNOMATICS\INNOMATICS INTERNSHIP\Sentiment App\data.csv")
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


## EDA

In [3]:
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

In [4]:
df.shape

(8518, 8)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8518 entries, 0 to 8517
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   reviewer_name    8508 non-null   object 
 1   review_title     8508 non-null   object 
 2   place_of_review  8468 non-null   object 
 3   up_votes         8508 non-null   float64
 4   down_votes       8508 non-null   float64
 5   month            8053 non-null   object 
 6   review_text      8510 non-null   object 
 7   ratings          8518 non-null   int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 532.5+ KB


In [6]:
df.isna().sum() #checking for missing values

reviewer_name       10
review_title        10
place_of_review     50
up_votes            10
down_votes          10
month              465
review_text          8
ratings              0
dtype: int64

In [7]:
df.columns

Index(['reviewer_name', 'review_title', 'place_of_review', 'up_votes',
       'down_votes', 'month', 'review_text', 'ratings'],
      dtype='object')

In [8]:
df = df.dropna(subset=['review_text'])

In [9]:
df.head()

Unnamed: 0,reviewer_name,review_title,place_of_review,up_votes,down_votes,month,review_text,ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [10]:
df.columns

Index(['reviewer_name', 'review_title', 'place_of_review', 'up_votes',
       'down_votes', 'month', 'review_text', 'ratings'],
      dtype='object')

### Sentiment Label Creation
### Convert ratings into binary sentiment and remove neutral reviews.

In [11]:
df = df[['review_title','review_text','ratings']]

In [12]:
df.head()

Unnamed: 0,review_title,review_text,ratings
0,Nice product,"Nice product, good quality, but price is now r...",4
1,Don't waste your money,They didn't supplied Yonex Mavis 350. Outside ...,1
2,Did not meet expectations,Worst product. Damaged shuttlecocks packed in ...,1
3,Fair,"Quite O. K. , but nowadays the quality of the...",3
4,Over priced,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [13]:
df['sentiment'] = np.where(df['ratings'] <= 2, 0, 1)
df['sentiment'] = df['sentiment'].astype(int)
df['sentiment'].value_counts()


sentiment
1    7438
0    1072
Name: count, dtype: int64

In [14]:
# Merge title and review body to retain full context.
df['review'] = df['review_title'].fillna('') + " " + df['review_text'].fillna('')

In [15]:
df

Unnamed: 0,review_title,review_text,ratings,sentiment,review
0,Nice product,"Nice product, good quality, but price is now r...",4,1,"Nice product Nice product, good quality, but p..."
1,Don't waste your money,They didn't supplied Yonex Mavis 350. Outside ...,1,0,Don't waste your money They didn't supplied Yo...
2,Did not meet expectations,Worst product. Damaged shuttlecocks packed in ...,1,0,Did not meet expectations Worst product. Damag...
3,Fair,"Quite O. K. , but nowadays the quality of the...",3,1,"Fair Quite O. K. , but nowadays the quality o..."
4,Over priced,Over pricedJust â?¹620 ..from retailer.I didn'...,1,0,Over priced Over pricedJust â?¹620 ..from reta...
...,...,...,...,...,...
8505,Very Good,Delivered before time but price is high from m...,3,1,Very Good Delivered before time but price is h...
8506,Don't waste your money,up to the mark but same is available in market...,4,1,Don't waste your money up to the mark but same...
8507,Really Nice,Nice delivery speedREAD MORE,5,1,Really Nice Nice delivery speedREAD MORE
8508,,No complaints about the item . Its the best on...,5,1,No complaints about the item . Its the best o...


In [16]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [17]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [w for w in tokens if w not in stop_words]
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(tokens)

In [18]:
df['clean_review'] = df['review'].apply(preprocess_text)
df[['review', 'clean_review']].head()

Unnamed: 0,review,clean_review
0,"Nice product Nice product, good quality, but p...",nice product nice product good quality price r...
1,Don't waste your money They didn't supplied Yo...,dont waste money didnt supplied yonex mavis ou...
2,Did not meet expectations Worst product. Damag...,meet expectation worst product damaged shuttle...
3,"Fair Quite O. K. , but nowadays the quality o...",fair quite k nowadays quality cork like year b...
4,Over priced Over pricedJust â?¹620 ..from reta...,priced pricedjust retaileri didnt understand w...


In [19]:
X = df['clean_review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [20]:
tfidf = TfidfVectorizer(max_features=5000,ngram_range=(1,2),min_df=5)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [21]:
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Linear SVM": LinearSVC()
}

results = {}

for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    preds = model.predict(X_test_tfidf)
    f1 = f1_score(y_test, preds)
    results[name] = f1
    print(f"{name} F1-score: {f1:.4f}")

Naive Bayes F1-score: 0.9542
Logistic Regression F1-score: 0.9556
Linear SVM F1-score: 0.9538


### Model Selection

In [22]:
best_model_name = max(results, key=results.get)
best_f1 = results[best_model_name]

print(f"Best Model: {best_model_name}")
print(f"Best F1-score: {best_f1:.4f}")

Best Model: Logistic Regression
Best F1-score: 0.9556


In [23]:
best_model = models[best_model_name]
best_model.fit(X_train_tfidf, y_train)

y_pred = best_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.43      0.57       214
           1       0.92      0.99      0.96      1488

    accuracy                           0.92      1702
   macro avg       0.89      0.71      0.76      1702
weighted avg       0.92      0.92      0.91      1702



In [25]:
import pickle

In [26]:
with open("sentiment_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

In [27]:
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

**I compared Naive Bayes, Logistic Regression, and Linear SVM using TF-IDF features and selected Logistic Regression based on the highest F1-score, then deployed it for real-time sentiment analysis.**

In [28]:
negative_reviews = df[df['sentiment'] == 0]['clean_review']

neg_tfidf = TfidfVectorizer(max_features=20, ngram_range=(1,2))
X_neg = neg_tfidf.fit_transform(negative_reviews)

neg_tfidf.get_feature_names_out()

array(['bad', 'box', 'buy', 'day', 'dont', 'good', 'goodread', 'mavis',
       'money', 'one', 'poor', 'product', 'productread', 'purchase',
       'quality', 'qualityread', 'shuttle', 'terrific', 'waste', 'worst'],
      dtype=object)

Identify common complaint terms from negative reviews.