In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [3]:
#load dataset
df = pd.read_csv("amazon.csv") 
df = df[['reviewText', 'overall']]
df.head()

Unnamed: 0,reviewText,overall
0,No issues.,4
1,"Purchased this for my device, it worked as adv...",5
2,it works as expected. I should have sprung for...,4
3,This think has worked out great.Had a diff. br...,5
4,"Bought it with Retail Packaging, arrived legit...",5


In [14]:
#keep only required columns
df = df[['reviewText', 'overall']]

#remove missing values
df = df.dropna(subset=['reviewText'])

#force text to string
df['reviewText'] = df['reviewText'].astype(str)

#remove empty views
df = df[df['reviewText'].str.strip() != ""]

In [15]:
#dataset overview
print("Dataset shape:", df.shape)
df['overall'].value_counts()

Dataset shape: (4772, 2)


overall
5    3921
4     527
1     244
2      80
Name: count, dtype: int64

In [16]:
#create sentiment labels
def label_sentiment(rating):
    if rating >= 4:
        return "positive"
    elif rating <= 2:
        return "negative"
    else:
        return "neutral"
df['sentiment'] = df['overall'].apply(label_sentiment)

#remove neutral views
df = df[df['sentiment'] != 'neutral']
df['sentiment'].value_counts()

sentiment
positive    4448
negative     324
Name: count, dtype: int64

In [17]:
#train-test split
X = df['reviewText']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [18]:
#text vectorization
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [19]:
#train model
model = LogisticRegression()
model.fit(X_train_vec, y_train)

In [20]:
#evaluation
y_pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9445026178010472
              precision    recall  f1-score   support

    negative       0.93      0.21      0.35        66
    positive       0.94      1.00      0.97       889

    accuracy                           0.94       955
   macro avg       0.94      0.61      0.66       955
weighted avg       0.94      0.94      0.93       955



In [21]:
#sample prediction
sample_review = ["The product quality is excellent and delivery was fast"]
sample_vec = vectorizer.transform(sample_review)
print("Predicted sentiment:", model.predict(sample_vec)[0])

Predicted sentiment: positive
