# **Sentiment Analysis on Starbucks Review Data using Naive Bayes**

In [12]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB,MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
import string
import nltk

In [3]:
# Downloading NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
# Loading the dataset
df = pd.read_csv('reviews_data.csv')
df.head()

Unnamed: 0,Rating,Review
0,5.0,Amber and LaDonna at the Starbucks on Southwes...
1,5.0,** at the Starbucks by the fire station on 436...
2,5.0,I just wanted to go out of my way to recognize...
3,5.0,Me and my friend were at Starbucks and my card...
4,5.0,I’m on this kick of drinking 5 cups of warm wa...


Data Pre-processing

In [7]:
# Drop missing and NA values
df.dropna(inplace=True)

In [8]:
# Function to preprocess text data
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [9]:
# Applying the preprocessing function to the 'reviews' column and displaying the cleaned dataframe
df['Review'] = df['Review'].apply(preprocess_text)
df1 = df.copy()
df1.head()

Unnamed: 0,Rating,Review
0,5.0,amber ladonna starbucks southwest parkway alwa...
1,5.0,starbucks fire station 436 altamonte springs f...
2,5.0,wanted go way recognize starbucks employee bil...
3,5.0,friend starbucks card didn’t work thankful wor...
4,5.0,i’m kick drinking 5 cups warm water work insta...


In [10]:
# Checking class distribution in the dataset
print("Class distribution in the dataset:")
print(df['Rating'].value_counts())

Class distribution in the dataset:
Rating
1.0    451
2.0     99
5.0     83
4.0     39
3.0     33
Name: count, dtype: int64


In [11]:
# Separating features and target
X = df1['Review']
y = df1['Rating']

**Bernoulli Naive Bayes**

In [13]:
# Initialize the Count Vectorizer and fit it on the data
vectorizer1 = CountVectorizer(binary = True)
X1 = vectorizer1.fit_transform(X)

In [14]:
# Applying SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X1, y)

In [15]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.25, random_state=42)

In [16]:
# Initializing the Bernoulli Naive Bayes model
bnb = BernoulliNB()

In [17]:
# Train the model on the training data
bnb.fit(X_train, y_train)

In [18]:
# Making predictions on the test set
y_pred = bnb.predict(X_test)

In [20]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.5035460992907801

**Multinomial Naive Bayes**

In [21]:
# Initializing the Count Vectorizer and fit it on the data
vectorizer = CountVectorizer(binary=False)
X_vectorized = vectorizer.fit_transform(X)

In [22]:
# Applying SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_vectorized, y)

In [23]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.25, random_state=42)

In [24]:
# Initializing the Multinomial Naive Bayes model
mnb = MultinomialNB()

In [25]:
# Train the model on the training data
mnb.fit(X_train, y_train)

In [27]:
# Making predictions on the test set
y_pred = mnb.predict(X_test)

In [29]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7163120567375887

**TF-IDF with Multinomial Naive Bayes**

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
# Initializing the TFIDF Vectorizer
vectorizer = TfidfVectorizer()

In [34]:
# Vectorize the text data
X_vectorized = vectorizer.fit_transform(X)

In [36]:
# Applying SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_vectorized, y)

In [38]:
# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.25, random_state=42)

In [39]:
# Initializing the Multinomial Naive Bayes model
mnb = MultinomialNB()

In [40]:
# Training the model on the training data
mnb.fit(X_train, y_train)

In [41]:
# Making predictions on the test set
y_pred = mnb.predict(X_test)

In [42]:
# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8918439716312057

**Conclusion on Model Performance:**

Accuracy for BernoulliNB = 0.5035460992907801

Accuracy for MultinomialNB = 0.7163120567375887

Accuracy for TFIDF = **0.8918439716312057**

The results clearly demonstrate that the TFIDF with Multinomial Naive Bayes approach is the most effective for sentiment analysis in this context. The accuracy of 89.18% indicates a robust model that can accurately classify the sentiment of Starbucks reviews. The significant performance difference highlights the importance of feature representation in text classification tasks. TFIDF provides a more detailed and discriminative representation of text data, leading to better performance when combined with a suitable classifier like Multinomial Naive Bayes.


# Testing the model with new user review

In [44]:
# Creating a Function to predict rating for a new review
def predict_rating(review):
    review_processed = preprocess_text(review)
    review_vectorized = vectorizer.transform([review_processed])
    prediction = mnb.predict(review_vectorized)
    return prediction[0]

# Dynamic input for user review and prediction
user_review = input("Enter a review: ")
predicted_rating = predict_rating(user_review)
print(f'The predicted rating for the review is: {predicted_rating}')

Enter a review: love it
The predicted rating for the review is: 5.0
