In [5]:
#importing necessary libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4') 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tneel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tneel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\tneel\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [7]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Define preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize, remove stopwords, and lemmatize
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

df = pd.read_csv(r'C:\Users\tneel\Downloads\final_project_sentiment_analysis\archive\Reviews.csv')

# Apply preprocessing to the 'Text' column
df['cleaned_text'] = df['Text'].apply(preprocess_text)

In [10]:
# Binary sentiment classification
def assign_sentiment(score):
    if score > 3:
        return 1  # Positive
    elif score < 3:
        return 0  # Negative
    else:
        return None  # Neutral (can drop these if focusing on binary classification)

df['sentiment'] = df['Score'].apply(assign_sentiment)

# Drop rows with neutral sentiment
df = df.dropna(subset=['sentiment'])

In [15]:
# Splitting the data into train and test sets
from sklearn.model_selection import train_test_split

x = df['cleaned_text']
y = df['sentiment']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # Limit features to 5000
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train logistic regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(x_train_tfidf, y_train)

# Predictions and evaluation
y_pred = log_reg.predict(x_test_tfidf)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.68      0.75     16407
         1.0       0.94      0.98      0.96     88756

    accuracy                           0.93    105163
   macro avg       0.89      0.83      0.86    105163
weighted avg       0.93      0.93      0.93    105163

Confusion Matrix:
 [[11232  5175]
 [ 2209 86547]]
Accuracy Score: 0.9297851906088643


In [38]:
# identifying top and worst performing products
product_sentiment = df.groupby('ProductId')['sentiment'].mean().reset_index()
product_sentiment_sorted = product_sentiment.sort_values(by='sentiment', ascending=False)

top_performing_products = product_sentiment_sorted.head(10)
worst_performing_products = product_sentiment_sorted.tail(10)

top_products_info = df[df['ProductId'].isin(top_performing_products['ProductId'])]
worst_products_info = df[df['ProductId'].isin(worst_performing_products['ProductId'])]

print("Top 10 Performing Products (Based on Sentiment):")
top_info = top_products_info[['ProductId', 'sentiment', 'Summary']].drop_duplicates(subset='ProductId')
print(top_info)

print("Worst 10 Performing Products (Based on Sentiment):")
worst_info = worst_products_info[['ProductId', 'sentiment', 'Summary']].drop_duplicates(subset='ProductId')
print(worst_info)

Top 10 Performing Products (Based on Sentiment):
         ProductId  sentiment  \
156     B0036VM05I        1.0   
35633   B0036UQTQA        1.0   
36332   B0036UWPJ0        1.0   
105943  B0036VE3V2        1.0   
228469  B0036V7K4E        1.0   
229039  B0036VFX5M        1.0   
309833  B0036UV8RU        1.0   
487525  B001XWR35Y        1.0   
539149  B0036VFTMO        1.0   
544019  B0036UUY9I        1.0   

                                                  Summary  
156                                            Great Deal  
35633                         New label, same great stuff  
36332                                   Works really well  
105943                   My Dog Thinks They Are The Best!  
228469  You can't buy cheddar like this at the grocery...  
229039                    Not for everyone but I LOVE it!  
309833                            I love the coffee, but:  
487525                          Best Risotto in the World  
539149                                Dog + cat