## 3rd Text Analytics Assignment - Amazon Prediction Problem Using Embeddings

In [2]:
#Importing packages 
import gensim.downloader as api
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import torch

In [37]:
# Load the Amazon unlocked-mobile reviews and immediately keep a 10% sample
# to speed up computation. The fixed random_state makes the sample repeatable.
# Drop the .sample(...) call to work on the full dataset (matches the lecture).
df = pd.read_csv('Amazon_Unlocked_Mobile.csv').sample(frac=0.1, random_state=10)

df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,Very good one! Better than Samsung S and iphon...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0


In [38]:
# Drop rows with any missing value (reassign instead of mutating in place so
# the lineage of `df` stays explicit)
df = df.dropna()

# Remove any 'neutral' ratings equal to 3.
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignment below does not raise SettingWithCopyWarning.
df = df[df['Rating'] != 3].copy()

# Encode 4s and 5s as 1 (rated positively)
# Encode 1s and 2s as 0 (rated poorly)
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
df.head(5)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0,0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0,1
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0,0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0,1
277158,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.0,5,I fell in love with this phone because it did ...,0.0,1


In [39]:
# Pull the raw review texts out of the frame as a plain Python list
reviews = df['Reviews'].tolist()
print(reviews[1])

# Tokenize every review: lowercase it, then split on whitespace.
# Punctuation stays attached to the neighbouring word (e.g. 'me.').
processed_reviews = []
for sentence in reviews:
    processed_reviews.append(sentence.lower().split())
print(processed_reviews[1])

I was 3 months away from my upgrade and my Stratosphere kept crapping out on me. I didn't want to wait and every time Verizon sent me a new Stratosphere it would work fine for a few days and then it would crap out. I didn't want the HD version of this phone so I decided to try Amazon. They had a used one sold by GottaGetaPhone. It's works great!! I can't even tell that it was used. The battery lasts me 2 to 3 days between charges depending on usage. And best part is that I've had it for a month and it hasn't crapped out on me!!
['i', 'was', '3', 'months', 'away', 'from', 'my', 'upgrade', 'and', 'my', 'stratosphere', 'kept', 'crapping', 'out', 'on', 'me.', 'i', "didn't", 'want', 'to', 'wait', 'and', 'every', 'time', 'verizon', 'sent', 'me', 'a', 'new', 'stratosphere', 'it', 'would', 'work', 'fine', 'for', 'a', 'few', 'days', 'and', 'then', 'it', 'would', 'crap', 'out.', 'i', "didn't", 'want', 'the', 'hd', 'version', 'of', 'this', 'phone', 'so', 'i', 'decided', 'to', 'try', 'amazon.', 't

In [40]:
# Train a Word2Vec model on the tokenized reviews.
# A small vector size is used for simplicity.
model_w2v = Word2Vec(
    sentences=processed_reviews,
    vector_size=50,  # dimensionality of each word vector
    window=3,        # context window size
    min_count=1,     # keep every token, even ones that occur only once
    sg=0,            # NOTE(review): 0 should select CBOW per gensim docs — confirm
)

In [41]:
def get_review_embedding(sentence, model):
    """Return one fixed-size vector for a review by averaging its word vectors.

    The sentence is lowercased and split on whitespace; tokens that are not
    in ``model.wv`` are skipped.

    Parameters
    ----------
    sentence : str
        Raw review text.
    model : object
        Must expose ``wv`` (supports ``in`` and ``[]`` lookup by token) and
        ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the known tokens' vectors, or a zero vector of
        length ``model.vector_size`` when no token is known.
    """
    tokens = sentence.lower().split()
    known_vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not known_vectors:
        # No token of the sentence is in the vocabulary
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

In [42]:
# Print the Word2Vec embedding of reviews[1], i.e. the second review in the
# list (index 0 would be the first) — the same review printed earlier.
print(get_review_embedding(reviews[1],model_w2v))

[ 0.04549375 -0.08679199  0.30305657  0.4712545   0.39051977 -1.0729597
  0.46692002  0.5506285  -1.591848   -0.65773165  0.5630719  -0.8070552
  1.579028    1.3323407   0.12145619  1.887161    1.0613836   0.07092652
 -1.2110697  -0.88412184 -0.435929    0.38522837  0.64735067 -0.23414007
  0.80945796 -0.5019369   0.12207182 -0.02201697 -0.2562923  -0.13712391
  1.5399715   0.48792168 -0.0046762  -0.6691273  -0.3320624   0.8751267
  0.71120745  0.08565497  1.0539026   0.48299807  0.29207602 -0.23882143
  0.34673595 -0.45878658  0.8725728   0.6406814  -0.9444696   0.2921943
  0.24522027 -0.60130745]


In [59]:
#Train our first model with embeddings
from sklearn.model_selection import train_test_split, RandomizedSearchCV #RandomizedSearchCV is used below for hyper-parameter tuning

# One averaged Word2Vec vector per review, stacked into a single 2-D array
# with one row per review and one column per embedding dimension.
embeddings = np.array([get_review_embedding(review, model_w2v) for review in reviews])

# Split data into training and test sets (fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(embeddings,
                                                    df['Positively Rated'],
                                                    random_state=0)

In [60]:
import xgboost as xgb #Importing the XGBoost model
from sklearn.metrics import roc_auc_score

#Initialize the model with library defaults; the search below sets the tuned hyper-parameters
model = xgb.XGBClassifier()

#Candidate values for each hyper-parameter; the search samples combinations from these lists
param_dist = {
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 200],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 1, 10],
    'reg_lambda': [0, 1, 10]
}

#Random Search on XGBoost: 10 sampled combinations, each scored with 5-fold cross-validated ROC AUC.
#random_state pins which combinations are drawn, so re-running the cell repeats the same search
#instead of reporting different "best" parameters on every run.
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=5,
                                   scoring='roc_auc', random_state=0)
random_search.fit(X_train, y_train) #Fit the model with Random Search

print("Best Parameters:", random_search.best_params_)
print("Best AUC Score:", random_search.best_score_)

#Estimator selected by the search
best_model = random_search.best_estimator_

#Model Evaluation on the held-out test set, using the predicted probability of the positive class
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
print("AUC score:", auc)

Best Parameters: {'subsample': 0.7, 'reg_lambda': 0, 'reg_alpha': 0, 'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 0.8}
Best AUC Score: 0.9440428047600371
AUC score: 0.9433840561973388


In [61]:
#Importing Metrics for XGBoost
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Class predictions of the tuned XGBoost model on the held-out test set
y_pred = best_model.predict(X_test)

# Compute the four summary metrics in one pass, each as metric(y_true, y_pred)
xgb_accuracy, xgb_precision, xgb_recall, xgb_f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the results
for label, value in (
    ("Accuracy:", xgb_accuracy),
    ("Precision:", xgb_precision),
    ("Recall:", xgb_recall),
    ("F1 Score:", xgb_f1),
):
    print(label, value)

# Confusion matrix of true vs. predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.8852160333159813
Precision: 0.894251012145749
Recall: 0.9601808381151105
F1 Score: 0.9260439376152942
Confusion Matrix:
[[1280  653]
 [ 229 5522]]


In [62]:
# Sanity check: classify two short comments with the Word2Vec-based model.
# Both comments contain exactly the same whitespace-separated tokens, only in a
# different order, so their averaged embeddings (and therefore predictions)
# are identical.
new_comments = [
    'not an issue, phone is working',
    'an issue, phone is not working',
]
new_embeddings = list(map(lambda text: get_review_embedding(text, model_w2v), new_comments))
predictions = best_model.predict(new_embeddings)
print("Expected Predictions: [1,0] with", xgb_accuracy, "accuracy")
print("Predictions:", predictions)

Expected Predictions: [1,0] with 0.8852160333159813 accuracy
Predictions: [1 1]


In [63]:
# Switch from the self-trained Word2Vec vectors to pre-trained GloVe vectors,
# fetched through gensim's downloader API.
GLOVE_MODEL_NAME = "glove-wiki-gigaword-50"
print("\n--- Using Pre-trained Embeddings (GloVe) ---")
glove_model = api.load(GLOVE_MODEL_NAME)

def get_review_embedding_glove(sentence, model):
    """Return one fixed-size vector for a review by averaging its GloVe word vectors.

    The sentence is lowercased and split on whitespace; tokens that are not
    in ``model`` are skipped.

    Parameters
    ----------
    sentence : str
        Raw review text.
    model : object
        Keyed-vector style model: supports ``token in model``,
        ``model[token]`` and exposes ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the known tokens' vectors, or a zero vector of
        length ``model.vector_size`` when no token is known.
    """
    tokens = sentence.lower().split()
    known_vectors = [model[token] for token in tokens if token in model]
    if not known_vectors:
        # Read the dimensionality from the model itself rather than from the
        # vector of an arbitrary word ('data'), which raises KeyError whenever
        # that word is not in the vocabulary.
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)


--- Using Pre-trained Embeddings (GloVe) ---


In [64]:
# Print the GloVe embedding of reviews[1], i.e. the second review in the list
# (index 0 would be the first).
print(get_review_embedding_glove(reviews[1],glove_model))

[ 0.32685634  0.09039927  0.09105929 -0.15679033  0.31469712  0.11798217
 -0.48984507 -0.03299778 -0.21472326  0.01071966 -0.01179792  0.13765682
 -0.36748335 -0.11953119  0.52635264  0.24027763 -0.06242474  0.01702298
 -0.41186646 -0.3900368   0.15513963  0.21431924  0.32682326 -0.01186744
  0.14103818 -1.6343603  -0.3040641   0.09819191  0.35085532 -0.38845816
  3.2890372   0.18400379 -0.3511699  -0.13982666  0.1156856  -0.06760355
  0.19432129  0.18797386  0.12478791 -0.21413073 -0.05383534  0.0862225
 -0.02296573  0.11332116 -0.17208     0.07322724 -0.09066887 -0.19894427
 -0.1196489   0.05096698]


In [65]:
#Train the model again, this time with GloVe embeddings.
# One averaged GloVe vector per review, stacked into a single 2-D array
# with one row per review and one column per embedding dimension.
embeddings_glove = np.array([get_review_embedding_glove(review, glove_model) for review in reviews])

# Split data into training and test sets (fixed random_state for a repeatable split).
# Note: this rebinds X_train / X_test / y_train / y_test from the Word2Vec run.
X_train, X_test, y_train, y_test = train_test_split(embeddings_glove,
                                                    df['Positively Rated'],
                                                    random_state=0)

In [66]:
import xgboost as xgb #Importing XGBoost Model
from sklearn.metrics import roc_auc_score

#Initialize the model with library defaults; the search below sets the tuned hyper-parameters
model = xgb.XGBClassifier()

#Candidate values for each hyper-parameter; the search samples combinations from these lists
param_dist = {
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 200],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 1, 10],
    'reg_lambda': [0, 1, 10]
}

#Random Search on XGBoost: 10 sampled combinations, each scored with 5-fold cross-validated ROC AUC.
#random_state pins which combinations are drawn, so re-running the cell repeats the same search
#instead of reporting different "best" parameters on every run.
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=5,
                                   scoring='roc_auc', random_state=0)
random_search.fit(X_train, y_train) #Fit the model with Random Search

print("Best Parameters:", random_search.best_params_)
print("Best AUC Score:", random_search.best_score_)

#Estimator selected by the search (rebinds best_model from the Word2Vec run)
best_model = random_search.best_estimator_

#Model Evaluation on the held-out test set, using the predicted probability of the positive class
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
print("AUC score:", auc)

Best Parameters: {'subsample': 0.8, 'reg_lambda': 10, 'reg_alpha': 0, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.9}
Best AUC Score: 0.9238069498714822
AUC score: 0.916426284711006


In [67]:
# Class predictions of the GloVe-based XGBoost model on the held-out test set
y_pred = best_model.predict(X_test)

# Compute the four summary metrics in one pass, each as metric(y_true, y_pred)
xgb_accuracy, xgb_precision, xgb_recall, xgb_f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the results
for label, value in (
    ("Accuracy:", xgb_accuracy),
    ("Precision:", xgb_precision),
    ("Recall:", xgb_recall),
    ("F1 Score:", xgb_f1),
):
    print(label, value)

# Confusion matrix of true vs. predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.864263404476835
Precision: 0.8927260593927261
Recall: 0.9304468788036863
F1 Score: 0.9111962537249894
Confusion Matrix:
[[1290  643]
 [ 400 5351]]


In [68]:
# Sanity check: classify two short comments with the GloVe-based model.
# Both comments contain exactly the same whitespace-separated tokens, only in a
# different order, so their averaged embeddings (and therefore predictions)
# are identical.
new_comments = [
    'not an issue, phone is working',
    'an issue, phone is not working',
]
new_embeddings = list(map(lambda text: get_review_embedding_glove(text, glove_model), new_comments))
predictions = best_model.predict(new_embeddings)
print("Expected Predictions: [1,0] with", xgb_accuracy, "accuracy")
print("Predictions:", predictions)

Expected Predictions: [1,0] with 0.864263404476835 accuracy
Predictions: [0 0]
