In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from flask import Flask, render_template, request
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest Classifier
from sklearn.metrics import f1_score
from sklearn.impute import SimpleImputer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\poppo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\poppo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\poppo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Load the dataset
df = pd.read_csv('../data.csv')

In [5]:
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [6]:
# Handle missing values
imputer = SimpleImputer(strategy='mean')
df['Up Votes'] = imputer.fit_transform(df[['Up Votes']])
imputer = SimpleImputer(strategy='median')
df['Down Votes'] = imputer.fit_transform(df[['Down Votes']])
imputer = SimpleImputer(strategy='most_frequent')
df['Place of Review'] = imputer.fit_transform(df[['Place of Review']]).flatten()
df['Review text'].fillna('', inplace=True)
df['Review Title'].fillna('', inplace=True)
df['Reviewer Name'].fillna('', inplace=True)
df['Month'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Review text'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Review Title'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always

In [7]:
# Define thresholds for positive and negative sentiment
positive_threshold = 3.5
negative_threshold = 2.5

In [8]:
# Infer sentiment based on rating
def infer_sentiment(rating):
    if rating >= positive_threshold:
        return 1  # Positive sentiment
    elif rating <= negative_threshold:
        return 0  # Negative sentiment
    else:
        return -1  # Neutral sentiment or other

In [9]:
# Apply sentiment inference to the dataset
df['Sentiment'] = df['Ratings'].apply(infer_sentiment)

# Preprocessing functions
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters and punctuation
    text = text.lower()  # Convert text to lowercase
    return text

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_text = [word for word in tokens if word not in stop_words]
    return " ".join(filtered_text)

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_text = [lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(lemmatized_text)

# Apply preprocessing to the dataset
df['Review text'] = df['Review text'].apply(clean_text)
df['Review text'] = df['Review text'].apply(remove_stopwords)
df['Review text'] = df['Review text'].apply(lemmatize_text)


In [10]:
# Train the model using the entire dataset
vectorizer = TfidfVectorizer(max_features=1000)
X_vect = vectorizer.fit_transform(df['Review text'])
y = df['Sentiment']

In [11]:
# Updated: Use Random Forest Classifier
model = RandomForestClassifier()
model.fit(X_vect, y)


In [12]:
# Predict on the entire dataset
y_pred = model.predict(X_vect)

In [13]:
# Compute F1 score
f1 = f1_score(y, y_pred, average='weighted')

print("F1 Score:", f1)

F1 Score: 0.9359958478201822


In [14]:
# Function to predict sentiment
def predict_sentiment(text):
    text = clean_text(text)
    text = remove_stopwords(text)
    text = lemmatize_text(text)
    text_vect = vectorizer.transform([text])
    prediction = model.predict(text_vect)
    return prediction[0]