# Case Study: Amazon Reviews

### Data Prep

In [None]:
import pandas as pd
import numpy as np

# Read in the data
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')

# Sample the data to speed up computation
# Comment out this line to match with lecture
df = df.sample(frac=0.1, random_state=10)

df.head()

In [58]:
# Drop missing values
df.dropna(inplace=True)

# Remove any 'neutral' ratings equal to 3
df = df[df['Rating'] != 3]

# Encode 4s and 5s as 1 (rated positively)
# Encode 1s and 2s as 0 (rated poorly)
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
df.head(10)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0,0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0,1
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0,0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0,1
277158,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.0,5,I fell in love with this phone because it did ...,0.0,1
100311,Blackberry Torch 2 9810 Unlocked Phone with 1....,BlackBerry,77.49,5,I am pleased with this Blackberry phone! The p...,0.0,1
251669,Motorola Moto E (1st Generation) - Black - 4 G...,Motorola,89.99,5,"Great product, best value for money smartphone...",0.0,1
279878,OtterBox 77-29864 Defender Series Hybrid Case ...,OtterBox,9.99,5,I've bought 3 no problems. Fast delivery.,0.0,1
406017,Verizon HTC Rezound 4G Android Smarphone - 8MP...,HTC,74.99,4,Great phone for the price...,0.0,1
302567,"RCA M1 Unlocked Cell Phone, Dual Sim, 5Mp Came...",RCA,159.99,5,My mom is not good with new technoloy but this...,4.0,1


In [59]:
# Most ratings are positive
df['Positively Rated'].mean()

0.7471694429984383

In [60]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV #Use of Random Search for better model perfomance 

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], 
                                                    df['Positively Rated'], 
                                                    random_state=0)

In [61]:
print('X_train first entry:\n\n', X_train.iloc[0])
print('\n\nX_train shape: ', X_train.shape)

X_train first entry:

 Received garbage its defeated phones


X_train shape:  (23052,)


# CountVectorizer

In [62]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the CountVectorizer to the training data
vect = CountVectorizer().fit(X_train)

In [63]:
vect

In [64]:
vect.get_feature_names_out()[::1000]

array(['00', '_did_', 'artifact', 'breeze2', 'comfy', 'decision', 'ea',
       'farthough', 'googletalk', 'impractical', 'launching', 'minimally',
       'oneself', 'plusmicro', 'realizada', 'saids', 'snap', 'talk',
       'ucpk', 'whereing'], dtype=object)

In [65]:
len(vect.get_feature_names_out())

19533

In [66]:
# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)

X_train_vectorized

<23052x19533 sparse matrix of type '<class 'numpy.int64'>'
	with 613452 stored elements in Compressed Sparse Row format>

In [67]:
X_train_vectorized[0]

<1x19533 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [68]:
import xgboost as xgb #Importing the model different than the original code
from sklearn.metrics import roc_auc_score 

#Initialize the model
model = xgb.XGBClassifier()

#Definition of parameters with plenty of different compinations
param_dist = {
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 200],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 1, 10],
    'reg_lambda': [0, 1, 10] 
}

#Use of Random Search on XGBoost with parallel Cross-Validation
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=5, scoring='roc_auc')
random_search.fit(X_train_vectorized, y_train) #Fit the model with Random Search

print("Best Parameters:", random_search.best_params_)
print("Best AUC Score:", random_search.best_score_)

best_model = random_search.best_estimator_

#Model Evaluation
y_pred_proba = best_model.predict_proba(vect.transform(X_test))[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
print("AUC score:", auc)

Best Parameters: {'subsample': 0.9, 'reg_lambda': 0, 'reg_alpha': 10, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.7}
Best AUC Score: 0.9546511529932473
AUC score: 0.955531294721636


In [69]:
# Predict the transformed test documents
predictions = best_model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.8389396369402635


In [70]:
#Name of features to a np-array
feature_names = np.array(vect.get_feature_names_out())

importances = best_model.feature_importances_ #Finding the importances of features
sorted_importances = importances.argsort() #Sort those importances

#Find the 10 smallest and 10 largest importances
print('Smallest Importances:\n{}\n'.format(feature_names[sorted_importances[:10]]))
print('Largest Importances: \n{}\n'.format(feature_names[sorted_importances[:-11:-1]]))

Smallest Importances:
['00' 'planner' 'planned' 'planing' 'planet' 'planes' 'plane' 'planning'
 'plan' 'plagued']

Largest Importances: 
['not' 'great' 'work' 'love' 'after' 'excellent' 'price' 'stopped' 'waste'
 'good']



# Tfidf

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 5
vect = TfidfVectorizer(min_df=5).fit(X_train)
len(vect.get_feature_names_out())

5459

In [72]:
X_train_vectorized = vect.transform(X_train)

#Definition of parameters with plenty of different compinations
param_dist = {
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 200],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 1, 10],
    'reg_lambda': [0, 1, 10] 
}

#Initialize the model
model = xgb.XGBClassifier()

#Use of Random Search on XGBoost with parallel Cross-Validation
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=5, scoring='roc_auc')
random_search.fit(X_train_vectorized, y_train) #Fit the model with Random Search


print("Best Parameters:", random_search.best_params_)
print("Best AUC Score:", random_search.best_score_)

best_model = random_search.best_estimator_

#Model Evaluation
y_pred_proba = best_model.predict_proba(vect.transform(X_test))[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
print("AUC score:", auc)

Best Parameters: {'subsample': 0.9, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.8}
Best AUC Score: 0.9637998508327306
AUC score: 0.9629263963000474


In [73]:
#Name of features to a np-array
feature_names = np.array(vect.get_feature_names_out())

importances = best_model.feature_importances_ #Finding the importances of the features
sorted_importances = importances.argsort() #Sort those importances

#Find the 10 smallest and 10 largest importances
print('Smallest Importances:\n{}\n'.format(feature_names[sorted_importances[:10]]))
print('Largest Importances: \n{}\n'.format(feature_names[sorted_importances[:-11:-1]]))

Smallest Importances:
['00' 'playlists' 'playlist' 'playing' 'player' 'played' 'playback' 'play'
 'plays' 'platform']

Largest Importances: 
['stopped' 'not' 'excellent' 'great' 'love' 'waste' 'disappointed'
 'excelente' 'perfect' 'worst']



In [74]:
# These reviews are treated the same by our current model
print(best_model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[0 0]


# n-grams

In [75]:
# Fit the CountVectorizer to the training data specifiying a minimum 
# document frequency of 5 and extracting 1-grams and 2-grams
vect = CountVectorizer(min_df=5, ngram_range=(1,3)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names_out())

44939

In [76]:
#Definition of parameters with plenty of different compinations
param_dist = {
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 200],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 1, 10],
    'reg_lambda': [0, 1, 10] 
}

#Initialize the model
model = xgb.XGBClassifier()

#Use of Random Search on XGBoost with parallel Cross-Validation
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=5, scoring='roc_auc')
random_search.fit(X_train_vectorized, y_train) #Fit the model with Random Search

print("Best Parameters:", random_search.best_params_)
print("Best AUC Score:", random_search.best_score_)

best_model = random_search.best_estimator_

#Model Evaluation
y_pred_proba = best_model.predict_proba(vect.transform(X_test))[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
print("AUC score:", auc)

Best Parameters: {'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 1, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.7}
Best AUC Score: 0.9627448923544384
AUC score: 0.9639975341565467


In [77]:
#Name of features to a np-array
feature_names = np.array(vect.get_feature_names_out())

importances = best_model.feature_importances_ #Finding the importances of the features
sorted_importances = importances.argsort() #Sort those importances

#Find the 10 smallest and 10 largest importances
print('Smallest Importances:\n{}\n'.format(feature_names[sorted_importances[:10]]))
print('Largest Importances: \n{}\n'.format(feature_names[sorted_importances[:-11:-1]]))

Smallest Importances:
['00' 'ratings' 'ratio' 'raw' 'razor' 'razr' 're' 're getting' 're going'
 're going to']

Largest Importances: 
['not' 'great' 'excellent' 'stopped' 'disappointed' 'love' 'work'
 'perfect' 'returned' 'return']



In [78]:
# These reviews are now correctly identified
print(best_model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[0 0]
