In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re

In [2]:
import warnings
warnings.filterwarnings("ignore")
import nltk

from nltk.corpus import stopwords
stop = stopwords.words('english')
import string
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

In [3]:
amazon_data = pd.read_csv("amazon_co-ecommerce_sample.csv")

In [4]:
amazon_data['customer_reviews'] = amazon_data['customer_reviews'].astype(str)

In [5]:
rating_review_data = pd.DataFrame()

for idx, value in amazon_data['customer_reviews'].iteritems():
    # Split the single string into individual reviews
    reviews = re.split(r'\|', str(value))

    # Extract ratings and reviews from each individual review
    rating_pattern = r'//\s*(\d+\.\d)\s*//' # Matches the rating (e.g., 5.0)

    ratings = []
    review_texts = []

    for review in reviews:
        rating_match = re.search(rating_pattern, review)
        if rating_match:
            ratings.append(rating_match.group(1))
            review_text = review.replace(rating_match.group(0), '').strip()
            review_texts.append(review_text)

    # Create a new DataFrame
    new_df = pd.DataFrame({"rating": ratings, "review": review_texts})
    rating_review_data = pd.concat([rating_review_data, new_df])
    rating_review_data.reset_index(drop=True, inplace=True)


In [6]:
# remove user names
rating_review_data['review'] = [re.sub(r'//\s*By\s*[\w\s.]*\s*on\s*', '', text) for text in rating_review_data['review']]

In [7]:
# Remove repeated dates
rating_review_data['review'] = [re.sub(r'(\d+\s*[A-Za-z]+\s*\d{4}(\s*\.?)\s*)\1', r'\1', text) for text in rating_review_data['review']] 
rating_review_data['review'] = [re.sub(r'(\d+\s*[A-Za-z]+\.\s*\d{4})\s*\1', r'\1',text) for text in rating_review_data['review']] 


In [8]:
rating_review_data['review'] = [review.replace('//', ' ') for review in rating_review_data['review']] # remove //
rating_review_data['review'] = [review.replace('\n', ' ') for review in rating_review_data['review']] # remove \n
rating_review_data['review'] = [re.sub(r'\d+', '', text) for text in rating_review_data['review']] # remove numbers

In [9]:
# remove stopwords
rating_review_data['review'] = [' '.join([word for word in text.split() if word.lower() not in stop]) for text in rating_review_data['review']] 

In [10]:
# Remove all punctuations
rating_review_data['review'] = [re.sub(f"[{re.escape(string.punctuation)}]", "", text) for text in rating_review_data['review']]

In [11]:
def stemSentence(sentence):
    token_words = word_tokenize(sentence)
    stem_sentence = [porter.stem(word) for word in token_words]
    return " ".join(stem_sentence)

porter = PorterStemmer()

# Apply stemming
rating_review_data['review'] = [stemSentence(text) for text in rating_review_data['review']]

In [12]:
rating_review_data.head(3)

Unnamed: 0,rating,review
0,4.0,worth buy pictur alon as ever april part magic...
1,5.0,amaz detail fabul photographi april amaz detai...
2,5.0,great purchas april purchas behalf dad alway a...


## Model

In [14]:
rating_review_data['rating'] = pd.to_numeric(rating_review_data['rating'], errors='coerce')
rating_review_data['Label'] = (rating_review_data['rating'] >= 4.0).astype(int)

In [15]:
class_distribution = rating_review_data['Label'].value_counts()
print(class_distribution)

# Imbalanced data

1    26751
0     2783
Name: Label, dtype: int64


In [16]:
dev_text, test_text, dev_y, test_y = train_test_split(rating_review_data['review'], rating_review_data['Label'], stratify =  rating_review_data['Label'],test_size=0.2,  random_state=42)

print(dev_y.value_counts())
print(test_y.value_counts())

1    21401
0     2226
Name: Label, dtype: int64
1    5350
0     557
Name: Label, dtype: int64


### Bag Of Word

In [17]:
vector = CountVectorizer()

dev_X_bow = vector.fit_transform(dev_text)
test_X_bow = vector.transform(test_text)

print(dev_X_bow.shape)
print(test_X_bow.shape)

(23627, 24965)
(5907, 24965)


In [18]:
from sklearn.linear_model import LogisticRegressionCV

lr_bow = LogisticRegressionCV().fit(dev_X_bow, dev_y)
lr_bow.score(test_X_bow, test_y)

0.9400711020822753

In [19]:
test_pred_bow = lr_bow.predict(test_X_bow)

print("Classification Report for Bag of Words Model:")
print(classification_report(test_y, test_pred_bow))

Classification Report for Bag of Words Model:
              precision    recall  f1-score   support

           0       0.77      0.52      0.62       557
           1       0.95      0.98      0.97      5350

    accuracy                           0.94      5907
   macro avg       0.86      0.75      0.79      5907
weighted avg       0.93      0.94      0.93      5907



### TF-IDF

In [20]:
vector_tf = TfidfVectorizer()

dev_X_tf = vector_tf.fit_transform(dev_text)
test_X_tf  = vector_tf.transform(test_text)

print(dev_X_tf.shape)
print(test_X_tf.shape)

(23627, 24965)
(5907, 24965)


In [21]:
lr_tfidf = LogisticRegressionCV().fit(dev_X_tf, dev_y)
lr_tfidf.score(test_X_tf, test_y)

0.9407482647706111

In [22]:
test_pred_tfidf = lr_tfidf.predict(test_X_tf)


print("\nClassification Report for TF-IDF Model:")
print(classification_report(test_y, test_pred_tfidf))


Classification Report for TF-IDF Model:
              precision    recall  f1-score   support

           0       0.83      0.46      0.60       557
           1       0.95      0.99      0.97      5350

    accuracy                           0.94      5907
   macro avg       0.89      0.73      0.78      5907
weighted avg       0.94      0.94      0.93      5907



#### N-grams

In [23]:
vector_ngram = CountVectorizer(ngram_range=(1,2), min_df=4)

dev_X_ngram = vector_ngram.fit_transform(dev_text)
test_X_ngram = vector_ngram.transform(test_text)

lr_ngram = LogisticRegressionCV().fit(dev_X_ngram, dev_y)
lr_ngram.score(test_X_ngram, test_y)

0.9431183341797867

In [24]:
test_pred_ngram = lr_ngram.predict(test_X_ngram)

print(classification_report(test_y, test_pred_ngram))

              precision    recall  f1-score   support

           0       0.79      0.54      0.64       557
           1       0.95      0.99      0.97      5350

    accuracy                           0.94      5907
   macro avg       0.87      0.76      0.81      5907
weighted avg       0.94      0.94      0.94      5907



In [25]:
vector_bigram = CountVectorizer(ngram_range=(2,2), min_df=4)

dev_X_bigram = vector_bigram.fit_transform(dev_text)
test_X_bigram = vector_bigram.transform(test_text)

lr_bigram = LogisticRegressionCV().fit(dev_X_bigram, dev_y)
lr_bigram.score(test_X_bigram, test_y)

0.9300829524293212

In [26]:
test_pred_bigram = lr_bigram.predict(test_X_bigram)

print(classification_report(test_y, test_pred_bigram))

              precision    recall  f1-score   support

           0       0.75      0.39      0.51       557
           1       0.94      0.99      0.96      5350

    accuracy                           0.93      5907
   macro avg       0.84      0.69      0.74      5907
weighted avg       0.92      0.93      0.92      5907



### Imbalanced data

In [27]:
from imblearn.over_sampling import SMOTE

In [28]:
print(dev_y.value_counts())

1    21401
0     2226
Name: Label, dtype: int64


In [29]:
# dev_X_tf 
# test_X_tf

smote = SMOTE(random_state = 42)
dev_X_smote, dev_y_smote = smote.fit_resample(dev_X_tf, dev_y)

print("label counts After smote: ")
print(dev_y_smote.value_counts())

print("shape of development set Before smote: ", dev_X_tf.shape)
print("shape of development set After smote: ", dev_X_smote.shape)

label counts After smote: 
1    21401
0    21401
Name: Label, dtype: int64
shape of development set Before smote:  (23627, 24965)
shape of development set After smote:  (42802, 24965)


In [30]:
lr_smote = LogisticRegressionCV().fit(dev_X_smote, dev_y_smote)
lr_smote.score(test_X_tf, test_y)

0.9106145251396648

In [31]:
test_pred_smote = lr_smote.predict(test_X_tf)

print(classification_report(test_y, test_pred_smote))

              precision    recall  f1-score   support

           0       0.52      0.62      0.57       557
           1       0.96      0.94      0.95      5350

    accuracy                           0.91      5907
   macro avg       0.74      0.78      0.76      5907
weighted avg       0.92      0.91      0.91      5907



### More classifiers

In [32]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score, average_precision_score

In [33]:
dtc = DecisionTreeClassifier(max_depth=10, random_state=42)
dtc.fit(dev_X_tf, dev_y)

In [34]:
dtc.score(test_X_tf, test_y)

0.930929405789741

In [35]:
test_pred_dtc = dtc.predict(test_X_tf)

print(classification_report(test_y, test_pred_dtc))

              precision    recall  f1-score   support

           0       0.78      0.37      0.50       557
           1       0.94      0.99      0.96      5350

    accuracy                           0.93      5907
   macro avg       0.86      0.68      0.73      5907
weighted avg       0.92      0.93      0.92      5907



In [36]:
# Your Code Here
scores_smote = cross_validate(DecisionTreeClassifier(max_depth=10, random_state=42), 
                        dev_X_smote, dev_y_smote, cv=5, 
                        scoring = ['roc_auc', 'average_precision', 'precision', 'recall' , 'f1'])

print("AUC: ", scores_smote['test_roc_auc'])
print("Average Precision: ", scores_smote['test_average_precision'])
print("precision: ", scores_smote['test_precision'])
print("recall: ", scores_smote['test_recall'])
print("f1: ", scores_smote['test_f1'])

AUC:  [0.86381113 0.92405148 0.91879938 0.91913879 0.9203402 ]
Average Precision:  [0.81740093 0.89866056 0.89406443 0.89245167 0.89444764]
precision:  [0.72980251 0.83222665 0.80969    0.8228013  0.82423981]
recall:  [0.9495445  0.87850467 0.92149533 0.88528037 0.89299065]
f1:  [0.82529692 0.85473971 0.8619823  0.85289814 0.85723898]


#### Random Forest

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [40]:

rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(dev_X_tf, dev_y)

y_dev_pred = rf_classifier.predict(dev_X_tf)
y_test_pred = rf_classifier.predict(test_X_tf)

precision_test = precision_score(test_y, y_test_pred, pos_label = 0)
recall_test = recall_score(test_y, y_test_pred, pos_label = 0)
f1_score_test = f1_score(test_y, y_test_pred, pos_label = 0)


print("\nTest Set Metrics:")
print(f"Precision: {precision_test}")
print(f"Recall: {recall_test}")
print(f"F1 Score: {f1_score_test}")


Test Set Metrics:
Precision: 0.9772727272727273
Recall: 0.23159784560143626
F1 Score: 0.37445573294629897


#### Gradient Boosted Trees

In [41]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
import time

In [43]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [47]:
# dev_X_tf
# test_X_tf


optimal_gbc = GradientBoostingClassifier(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=5)
optimal_gbc.fit(dev_X_tf, dev_y)

y_pred_gbc = optimal_gbc.predict(test_X_tf) 
test_precision_gbc = precision_score(test_y, y_pred_gbc, pos_label = 0)
recall_test_gbc = recall_score(test_y, y_pred_gbc, pos_label = 0)
f1_score_test_gbc = f1_score(test_y, y_pred_gbc, pos_label = 0)


print("Test Set Metrics (GradientBoostingClassifier):")
print(f"Precision: {test_precision_gbc:.2f}")
print(f"Recall: {recall_test_gbc:.2f}")
print(f"F1 Score: {f1_score_test_gbc:.2f}")

Test Set Metrics (GradientBoostingClassifier):
Precision: 0.88
Recall: 0.41
F1 Score: 0.56


In [52]:
import xgboost as xgb

In [53]:
optimal_xgb = xgb.XGBClassifier(random_state=42, n_estimators=10, learning_rate=0.1, max_depth=5)
optimal_xgb.fit(dev_X_tf, dev_y)

y_pred_xgb = optimal_xgb.predict(test_X_tf) 
test_precision_xgb = precision_score(test_y, y_pred_xgb, pos_label = 0)
recall_test_xgb = recall_score(test_y, y_pred_xgb, pos_label = 0)
f1_score_test_xgb = f1_score(test_y, y_pred_xgb, pos_label = 0)


print("\nTest Set Metrics (XGBClassifier):")
print(f"Precision: {test_precision_xgb:.2f}")
print(f"Recall: {recall_test_xgb:.2f}")
print(f"F1 Score: {f1_score_test_xgb:.2f}")


Test Set Metrics (XGBClassifier):
Precision: 0.93
Recall: 0.19
F1 Score: 0.31
