In [1]:
import sys
import os
# print(os.getcwd())

# Resolve the directory two levels above the notebook's working directory;
# the local `cvi` package is imported from its src/ sub-folder.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))

# Guard the append so re-running this cell does not stack duplicate
# entries on sys.path.
src_dir = os.path.join(project_root, "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)
# sys.path.append("src")

from cvi.preprocessing.cleaner import clean_text, preprocess_texts

In [2]:
import pandas as pd

# Load the raw review export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
# NOTE(review): the recorded run of this cell emitted a warning, presumably the
# mixed-dtype one that this option addresses; confirm on the next run.
df = pd.read_csv("../../data/amazon_reviews.csv", low_memory=False)

  df = pd.read_csv("../../data/amazon_reviews.csv")


In [3]:
# Size of the loaded dataset as (rows, columns).
df.shape

(67992, 27)

In [4]:
# List the column labels to locate the review text and rating fields.
df.columns

Index(['id', 'name', 'asins', 'brand', 'categories', 'keys', 'manufacturer',
       'reviews.date', 'reviews.dateAdded', 'reviews.dateSeen',
       'reviews.didPurchase', 'reviews.doRecommend', 'reviews.id',
       'reviews.numHelpful', 'reviews.rating', 'reviews.sourceURLs',
       'reviews.text', 'reviews.title', 'reviews.userCity',
       'reviews.userProvince', 'reviews.username', 'dateAdded', 'dateUpdated',
       'primaryCategories', 'imageURLs', 'manufacturerNumber', 'sourceURLs'],
      dtype='object')

In [5]:
# Run the project's text cleaner over the raw review text and store the result
# in a new "clean_text" column.
# NOTE(review): preprocess_texts comes from cvi.preprocessing.cleaner and is not
# shown here; confirm how it treats missing (NaN) reviews.
df["clean_text"] = preprocess_texts(df["reviews.text"])

In [8]:
# Preview the first few cleaned reviews.
df["clean_text"].head()

0    product far disappoint child love use like abi...
1      great beginner experienced person buy gift love
2    inexpensive tablet use learn step nabi thrille...
3    fire week love tablet great prime member table...
4    buy grand daughter come visit set user enter a...
Name: clean_text, dtype: object

In [10]:
# Preview the same rows of raw text for comparison with the cleaned column.
df["reviews.text"].head()

0    This product so far has not disappointed. My c...
1    great for beginner or experienced person. Boug...
2    Inexpensive tablet for him to use and learn on...
3    I've had my Fire HD 8 two weeks now and I love...
4    I bought this for my grand daughter when she c...
Name: reviews.text, dtype: object

In [11]:
def map_sentiment(rating):
    """Translate a star rating into a sentiment label.

    Args:
        rating: Numeric star rating of a review.

    Returns:
        "Negative" for a rating of 1 or 2, "Neutral" for 3, and "Positive"
        for every other value. Note that "every other value" includes
        anything that matches none of 1, 2 or 3 (for example a missing
        rating), not only 4 and 5.
    """
    if rating == 3:
        return "Neutral"
    return "Negative" if rating in (1, 2) else "Positive"

In [12]:
# Label every review from its star rating by applying map_sentiment
# element-wise to the "reviews.rating" column.
df["sentiment"] = df["reviews.rating"].apply(map_sentiment)

In [13]:
# Preview the derived labels.
df["sentiment"].head()

0    Positive
1    Positive
2    Positive
3    Positive
4    Positive
Name: sentiment, dtype: object

In [14]:
# Count reviews per sentiment label to see how balanced the classes are.
df["sentiment"].value_counts()

sentiment
Positive    62580
Neutral      2902
Negative     2510
Name: count, dtype: int64

In [15]:
# Keep every row whose label is not "Neutral"; copy so later edits to
# binary_df never write back into df.
is_not_neutral = df["sentiment"].ne("Neutral")
binary_df = df.loc[is_not_neutral].copy()

In [19]:
# Count the labels that remain after the Neutral rows were dropped.
binary_df["sentiment"].value_counts()

sentiment
Positive    62580
Negative     2510
Name: count, dtype: int64

In [20]:
# Compare the frame sizes before and after dropping the Neutral rows.
df.shape,binary_df.shape

((67992, 29), (65090, 29))

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Inputs are the cleaned review texts; targets are the sentiment labels.
X, y = binary_df["clean_text"], binary_df["sentiment"]

# Hold out 20% for evaluation. stratify=y asks for the class proportions to be
# preserved in both parts, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF = Term Frequency - Inverse Document Frequency.
# It weights each term by how often it occurs in a review, discounted by how many reviews contain it.

In [24]:
# Unigram + bigram TF-IDF features with the vocabulary capped at 10,000 entries.
tfidf_settings = {"max_features": 10000, "ngram_range": (1, 2)}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit on the training split only, then reuse the fitted vectorizer for the
# test split so no test-set statistics enter the features.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [25]:
from sklearn.linear_model import LogisticRegression

In [27]:
# class_weight="balanced" reweights the classes to offset the heavy skew
# towards Positive reviews; max_iter is set to 1000 iterations.
model = LogisticRegression(max_iter=1000, class_weight="balanced")

# Train on the TF-IDF features of the training split.
model.fit(X_train_tfidf, y_train)

In [31]:
from sklearn.metrics import classification_report

In [32]:
# Hard class predictions for the held-out split, using the model's own
# decision rule.
y_pred = model.predict(X_test_tfidf)

In [30]:
# Per-class precision, recall and F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Negative       0.41      0.87      0.56       502
    Positive       0.99      0.95      0.97     12516

    accuracy                           0.95     13018
   macro avg       0.70      0.91      0.76     13018
weighted avg       0.97      0.95      0.96     13018



In [33]:
# Show the label order the fitted model uses; the columns returned by
# predict_proba follow this order.
print(model.classes_)

['Negative' 'Positive']


In [34]:
# Probability of the "Negative" class for each test review.
# The column is looked up in model.classes_ instead of being hard-coded as 0,
# so the right column is used even if the label order ever changes.
negative_col = list(model.classes_).index("Negative")
y_probs = model.predict_proba(X_test_tfidf)[:, negative_col]

In [36]:
import numpy as np

In [38]:
def evaluate_threshold(threshold):
    """Print a classification report for a custom Negative-probability cutoff.

    A test review is labelled "Negative" when its entry in the notebook-level
    ``y_probs`` array is strictly greater than ``threshold``, and "Positive"
    otherwise. The resulting labels are scored against the notebook-level
    ``y_test``.

    Args:
        threshold: Cutoff applied to ``y_probs``.

    Returns:
        None. The threshold and the report are printed.
    """
    flagged_negative = y_probs > threshold
    custom_labels = np.where(flagged_negative, "Negative", "Positive")
    print(f"\nThreshold: {threshold}")
    print(classification_report(y_test, custom_labels))

# Sweep several cutoffs to see how precision and recall trade off.
for cutoff in (0.3, 0.5, 0.7, 0.8):
    evaluate_threshold(cutoff)


Threshold: 0.3
              precision    recall  f1-score   support

    Negative       0.26      0.93      0.40       502
    Positive       1.00      0.89      0.94     12516

    accuracy                           0.89     13018
   macro avg       0.63      0.91      0.67     13018
weighted avg       0.97      0.89      0.92     13018


Threshold: 0.5
              precision    recall  f1-score   support

    Negative       0.41      0.87      0.56       502
    Positive       0.99      0.95      0.97     12516

    accuracy                           0.95     13018
   macro avg       0.70      0.91      0.76     13018
weighted avg       0.97      0.95      0.96     13018


Threshold: 0.7
              precision    recall  f1-score   support

    Negative       0.60      0.80      0.68       502
    Positive       0.99      0.98      0.99     12516

    accuracy                           0.97     13018
   macro avg       0.79      0.89      0.83     13018
weighted avg       0.98   

In [39]:
import joblib

In [40]:
# Persist the fitted classifier and the fitted vectorizer side by side; both
# are needed to score new text later.
# Create the target directory first so the dump does not fail when the folder
# is missing (e.g. on a fresh checkout).
os.makedirs("../../models", exist_ok=True)
joblib.dump(model, "../../models/logistic_model.pkl")
joblib.dump(vectorizer, "../../models/tfidf_vectorizer.pkl")

['../../models/tfidf_vectorizer.pkl']

In [43]:
def predict_sentiment(text, threshold=0.8):
    """Classify a single raw review string as Negative or Positive.

    The text is cleaned with ``preprocess_texts``, vectorised with the
    notebook-level fitted ``vectorizer`` and scored with the notebook-level
    fitted ``model``.

    Args:
        text: Raw review text.
        threshold: The review is labelled "Negative" only when its Negative
            probability is strictly greater than this value. Defaults to 0.8.

    Returns:
        A ``(label, probability)`` tuple. For "Negative" the probability is
        the model's Negative probability; for "Positive" it is one minus
        that value.
    """
    text_clean = preprocess_texts([text])
    text_vec = vectorizer.transform(text_clean)

    # Look the Negative column up in model.classes_ rather than assuming it
    # is column 0.
    negative_col = list(model.classes_).index("Negative")
    prob_negative = model.predict_proba(text_vec)[0][negative_col]

    if prob_negative > threshold:
        return "Negative", prob_negative
    # The original returned the misspelled label "Postive" here, which would
    # never match the "Positive" label used everywhere else.
    return "Positive", 1 - prob_negative

In [45]:
# Smoke test with a clearly negative review.
predict_sentiment("This product is terrible and battery is worst")

('Negative', 0.9957704018762751)

In [46]:
# Smoke test with a clearly positive review.
predict_sentiment("Amazing quality and fast delivery")

('Postive', 0.9451738435398217)