In [293]:
# Project: Final Team Predictive Models
# COMP_SCI 349-0 Machine Learning | Professor V.S.
# By Sengdao Inthavong, Lillian Torres, Haylie Wu, Ernie Wang

# This file contains the code for the predictive models used in our project.
# The models are trained on the training data and tested on the testing data for the CDs and vinyls.

In [294]:
# Import basic libraries
import os
import numpy as np
import pandas as pd

# Import libraries for text processing
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import MinMaxScaler

# Import libraries for machine learning
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB, CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Import libraries for logging
import logging
from tqdm import tqdm

In [295]:
# Load product and review data for CDs and vinyls from the training set
data_dir = ''
categories = ['CDs_and_Vinyl', 'Grocery_and_Gourmet_Food', 'Toys_and_Games']

file_path = os.path.join(data_dir, categories[0], 'train', 'product_training.json')
product_training = pd.read_json(file_path)

file_path = os.path.join(data_dir, categories[0], 'train', 'review_training.json')
review_training = pd.read_json(file_path)

# Merge product and review data
training_data = review_training.merge(product_training, on='asin', how='left')

In [296]:
# Fill in any missing values
training_data['reviewText'].fillna('', inplace=True)
training_data['summary'].fillna('', inplace=True)

In [297]:
# Give each review a unique ID
training_data['reviewID'] = training_data.index

In [298]:
# Run sentiment analysis on the review text and summary
# Columns: neg, neu, pos, compound

# sid = SentimentIntensityAnalyzer()

# review_sentiments = pd.DataFrame(columns=['reviewID', 'reviewText_neg', 'reviewText_neu', 'reviewText_pos', 'reviewText_compound', 'summary_neg', 'summary_neu', 'summary_pos', 'summary_compound'])

# for index, row in tqdm(training_data.iterrows(), total=training_data.shape[0], desc="Sentiment Analysis"):
#     review_text_sentiment = sid.polarity_scores(row['reviewText'])
#     summary_text_sentiment = sid.polarity_scores(row['summary'])
    
#     sentiment_row = {'reviewID': row['reviewID'],
#                      'reviewText_neg': review_text_sentiment['neg'],
#                      'reviewText_neu': review_text_sentiment['neu'],
#                      'reviewText_pos': review_text_sentiment['pos'],
#                      'reviewText_compound': review_text_sentiment['compound'],
#                      'summary_neg': summary_text_sentiment['neg'],
#                      'summary_neu': summary_text_sentiment['neu'],
#                      'summary_pos': summary_text_sentiment['pos'],
#                      'summary_compound': summary_text_sentiment['compound']}
    
#     review_sentiments = review_sentiments.append(sentiment_row, ignore_index=True)

# # Save the sentiment data to a csv file for future use
# file_path = os.path.join(data_dir, categories[0], 'csv', 'review_sentiments.csv')
# review_sentiments.to_csv(file_path, index=False)
review_sentiments = pd.read_csv('review_sentiments.csv')
review_sentiments

Unnamed: 0,reviewID,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,0.0,0.020,0.757,0.223,0.9926,0.0,1.000,0.000,0.0000
1,1.0,0.101,0.822,0.077,0.3222,0.0,1.000,0.000,0.0000
2,2.0,0.088,0.709,0.203,0.7845,0.0,1.000,0.000,0.0000
3,3.0,0.054,0.788,0.158,0.7430,0.0,0.426,0.574,0.4678
4,4.0,0.000,0.196,0.804,0.6249,0.0,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...
770781,770781.0,0.054,0.800,0.146,0.9555,0.0,1.000,0.000,0.0000
770782,770782.0,0.042,0.875,0.083,0.4767,0.0,1.000,0.000,0.0000
770783,770783.0,0.138,0.648,0.214,0.5371,0.0,1.000,0.000,0.0000
770784,770784.0,0.000,0.798,0.202,0.7444,0.0,0.328,0.672,0.6249


In [299]:
# Merge the sentiment data with the training data
training_data = training_data.merge(review_sentiments, on='reviewID', how='left')
training_data

Unnamed: 0,asin,reviewerID,unixReviewTime,vote,verified,reviewTime,style,reviewerName,reviewText,summary,...,awesomeness,reviewID,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,6673F1740E03573BCD64238FE711FC69,9C856D4A18E1355783B3B28B7ECC1848,1451520000,9,True,"12 31, 2015",{'Format:': ' MP3 Music'},8D88BB79AAC50277AEE82FCFD77F6744,Finding the Beatles all over again - and bette...,I sit listening - with my jaw to the floor - H...,...,1,0,0.020,0.757,0.223,0.9926,0.0,1.000,0.000,0.0000
1,690819436E20BB31657AF6B58B984DD4,6E9ABBD26A27C2B2851D1EC34A01CBDC,1113523200,9,False,"04 15, 2005",{'Format:': ' Audio CD'},8DC0611245A871AC51BBEEBB85F33A58,These guys can sing! Such classic tunes...poi...,Under Appreciated....,...,1,1,0.101,0.822,0.077,0.3222,0.0,1.000,0.000,0.0000
2,A680D4753F0CEA2252C168A6ACB2B623,B637C3C93E61094474710F456928BE9F,1126137600,,False,"09 8, 2005",,2259386624CFA0EC53A75A50A9BB57A5,"Snoop Doggy Dogg made a classic album, DoggySt...",DoggyStyle,...,1,2,0.088,0.709,0.203,0.7845,0.0,1.000,0.000,0.0000
3,F4A966F1FA340B16651D676BC246D227,AA7918E9410D650A076221C7B2934A09,954979200,,False,"04 6, 2000",{'Format:': ' Audio CD'},7A65A155C993535BC99CBCB39E7161B5,Stevie Nicks Has had Her Moments. I Like Some ...,Pretty Good but a Bit Dated,...,1,3,0.054,0.788,0.158,0.7430,0.0,0.426,0.574,0.4678
4,EF59DAF0C00319A48D4784266FD157EE,2293C9B7950A3356B95828419A677720,1477958400,,True,"11 1, 2016",{'Format:': ' Audio CD'},C69A09446009C500B1364B7DB5510497,Great cd.,Five Stars,...,1,4,0.000,0.196,0.804,0.6249,0.0,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770781,BD91503308A437374C3254EDC8BC24CB,936ED23AF4D23943786BBD44D0F1114B,1136246400,12,True,"01 3, 2006",{'Format:': ' Audio CD'},AAB61000438939C8E6165CFCCF02A488,This was the first Simple Minds album that I b...,Their Most Fully Realized Artistic Studio Stat...,...,1,770781,0.054,0.800,0.146,0.9555,0.0,1.000,0.000,0.0000
770782,9BC50277D18FAB423AD33C8CE4CC000D,EF922377A87E9D01F50065F2DA1722A8,1290556800,,False,"11 24, 2010",,0441BC4F6B7BD180769FDCDD8E603560,I have owned a CD copy of this show for at lea...,Forgettable R&H,...,0,770782,0.042,0.875,0.083,0.4767,0.0,1.000,0.000,0.0000
770783,4062627CA1586E517520483964299349,E1F0B0EBC6A36F33301E4FD0B3D62D52,1311120000,3,False,"07 20, 2011",{'Format:': ' Audio CD'},8AB3EEBF23F3583A4396A57DB291D548,carnival of souls to me is bad i gave my cd aw...,Darren d.,...,0,770783,0.138,0.648,0.214,0.5371,0.0,1.000,0.000,0.0000
770784,0AE44A6A9176E6A52507B6ABDDA80B00,DDDC81E6B8C3F8C91867F9AECB385135,1111968000,2,False,"03 28, 2005",{'Format:': ' Audio CD'},51B2E62E0A5864316BDB33FD4A729B37,This is an awesome slayer album. I love the th...,awesome slayer cd,...,1,770784,0.000,0.798,0.202,0.7444,0.0,0.328,0.672,0.6249


In [300]:
# Process the columns that are not numeric
encoder = LabelEncoder()

# Define a function to extract the format information from the "style" column
def extract_format(style):
    if style is None:
        return "None"
    else:
        return style.get("Format:", "None").strip()

# Apply the function to the "style" column to extract the format information
training_data["style"] = training_data["style"].apply(extract_format)
training_data

Unnamed: 0,asin,reviewerID,unixReviewTime,vote,verified,reviewTime,style,reviewerName,reviewText,summary,...,awesomeness,reviewID,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,6673F1740E03573BCD64238FE711FC69,9C856D4A18E1355783B3B28B7ECC1848,1451520000,9,True,"12 31, 2015",MP3 Music,8D88BB79AAC50277AEE82FCFD77F6744,Finding the Beatles all over again - and bette...,I sit listening - with my jaw to the floor - H...,...,1,0,0.020,0.757,0.223,0.9926,0.0,1.000,0.000,0.0000
1,690819436E20BB31657AF6B58B984DD4,6E9ABBD26A27C2B2851D1EC34A01CBDC,1113523200,9,False,"04 15, 2005",Audio CD,8DC0611245A871AC51BBEEBB85F33A58,These guys can sing! Such classic tunes...poi...,Under Appreciated....,...,1,1,0.101,0.822,0.077,0.3222,0.0,1.000,0.000,0.0000
2,A680D4753F0CEA2252C168A6ACB2B623,B637C3C93E61094474710F456928BE9F,1126137600,,False,"09 8, 2005",,2259386624CFA0EC53A75A50A9BB57A5,"Snoop Doggy Dogg made a classic album, DoggySt...",DoggyStyle,...,1,2,0.088,0.709,0.203,0.7845,0.0,1.000,0.000,0.0000
3,F4A966F1FA340B16651D676BC246D227,AA7918E9410D650A076221C7B2934A09,954979200,,False,"04 6, 2000",Audio CD,7A65A155C993535BC99CBCB39E7161B5,Stevie Nicks Has had Her Moments. I Like Some ...,Pretty Good but a Bit Dated,...,1,3,0.054,0.788,0.158,0.7430,0.0,0.426,0.574,0.4678
4,EF59DAF0C00319A48D4784266FD157EE,2293C9B7950A3356B95828419A677720,1477958400,,True,"11 1, 2016",Audio CD,C69A09446009C500B1364B7DB5510497,Great cd.,Five Stars,...,1,4,0.000,0.196,0.804,0.6249,0.0,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770781,BD91503308A437374C3254EDC8BC24CB,936ED23AF4D23943786BBD44D0F1114B,1136246400,12,True,"01 3, 2006",Audio CD,AAB61000438939C8E6165CFCCF02A488,This was the first Simple Minds album that I b...,Their Most Fully Realized Artistic Studio Stat...,...,1,770781,0.054,0.800,0.146,0.9555,0.0,1.000,0.000,0.0000
770782,9BC50277D18FAB423AD33C8CE4CC000D,EF922377A87E9D01F50065F2DA1722A8,1290556800,,False,"11 24, 2010",,0441BC4F6B7BD180769FDCDD8E603560,I have owned a CD copy of this show for at lea...,Forgettable R&H,...,0,770782,0.042,0.875,0.083,0.4767,0.0,1.000,0.000,0.0000
770783,4062627CA1586E517520483964299349,E1F0B0EBC6A36F33301E4FD0B3D62D52,1311120000,3,False,"07 20, 2011",Audio CD,8AB3EEBF23F3583A4396A57DB291D548,carnival of souls to me is bad i gave my cd aw...,Darren d.,...,0,770783,0.138,0.648,0.214,0.5371,0.0,1.000,0.000,0.0000
770784,0AE44A6A9176E6A52507B6ABDDA80B00,DDDC81E6B8C3F8C91867F9AECB385135,1111968000,2,False,"03 28, 2005",Audio CD,51B2E62E0A5864316BDB33FD4A729B37,This is an awesome slayer album. I love the th...,awesome slayer cd,...,1,770784,0.000,0.798,0.202,0.7444,0.0,0.328,0.672,0.6249


In [301]:
# Encode the columns
encoder = LabelEncoder()
training_data["style"] = encoder.fit_transform(training_data["style"])

# Encode the "verified" column
training_data["verified"] = encoder.fit_transform(training_data["verified"])

# Encode the "reviewerID" column
training_data["reviewerID"] = encoder.fit_transform(training_data["reviewerID"])

# Encode the "vote"" column
training_data["vote"] = training_data["vote"].apply(lambda x: float(x.replace(",", "")) if x is not None else 0)

# Encode the "image" column
training_data["image"] = training_data["image"].apply(lambda x: len(x) if x is not None else 0)

training_data

Unnamed: 0,asin,reviewerID,unixReviewTime,vote,verified,reviewTime,style,reviewerName,reviewText,summary,...,awesomeness,reviewID,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,6673F1740E03573BCD64238FE711FC69,67931,1451520000,9.0,1,"12 31, 2015",16,8D88BB79AAC50277AEE82FCFD77F6744,Finding the Beatles all over again - and bette...,I sit listening - with my jaw to the floor - H...,...,1,0,0.020,0.757,0.223,0.9926,0.0,1.000,0.000,0.0000
1,690819436E20BB31657AF6B58B984DD4,47952,1113523200,9.0,0,"04 15, 2005",3,8DC0611245A871AC51BBEEBB85F33A58,These guys can sing! Such classic tunes...poi...,Under Appreciated....,...,1,1,0.101,0.822,0.077,0.3222,0.0,1.000,0.000,0.0000
2,A680D4753F0CEA2252C168A6ACB2B623,79174,1126137600,0.0,0,"09 8, 2005",20,2259386624CFA0EC53A75A50A9BB57A5,"Snoop Doggy Dogg made a classic album, DoggySt...",DoggyStyle,...,1,2,0.088,0.709,0.203,0.7845,0.0,1.000,0.000,0.0000
3,F4A966F1FA340B16651D676BC246D227,74058,954979200,0.0,0,"04 6, 2000",3,7A65A155C993535BC99CBCB39E7161B5,Stevie Nicks Has had Her Moments. I Like Some ...,Pretty Good but a Bit Dated,...,1,3,0.054,0.788,0.158,0.7430,0.0,0.426,0.574,0.4678
4,EF59DAF0C00319A48D4784266FD157EE,15169,1477958400,0.0,1,"11 1, 2016",3,C69A09446009C500B1364B7DB5510497,Great cd.,Five Stars,...,1,4,0.000,0.196,0.804,0.6249,0.0,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770781,BD91503308A437374C3254EDC8BC24CB,63910,1136246400,12.0,1,"01 3, 2006",3,AAB61000438939C8E6165CFCCF02A488,This was the first Simple Minds album that I b...,Their Most Fully Realized Artistic Studio Stat...,...,1,770781,0.054,0.800,0.146,0.9555,0.0,1.000,0.000,0.0000
770782,9BC50277D18FAB423AD33C8CE4CC000D,104092,1290556800,0.0,0,"11 24, 2010",20,0441BC4F6B7BD180769FDCDD8E603560,I have owned a CD copy of this show for at lea...,Forgettable R&H,...,0,770782,0.042,0.875,0.083,0.4767,0.0,1.000,0.000,0.0000
770783,4062627CA1586E517520483964299349,98108,1311120000,3.0,0,"07 20, 2011",3,8AB3EEBF23F3583A4396A57DB291D548,carnival of souls to me is bad i gave my cd aw...,Darren d.,...,0,770783,0.138,0.648,0.214,0.5371,0.0,1.000,0.000,0.0000
770784,0AE44A6A9176E6A52507B6ABDDA80B00,96333,1111968000,2.0,0,"03 28, 2005",3,51B2E62E0A5864316BDB33FD4A729B37,This is an awesome slayer album. I love the th...,awesome slayer cd,...,1,770784,0.000,0.798,0.202,0.7444,0.0,0.328,0.672,0.6249


In [302]:
# Compute the length of reviewText and summary columns
training_data["reviewText_len"] = training_data["reviewText"].apply(len)
training_data["summary_len"] = training_data["summary"].apply(len)

In [303]:
training_data

Unnamed: 0,asin,reviewerID,unixReviewTime,vote,verified,reviewTime,style,reviewerName,reviewText,summary,...,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound,reviewText_len,summary_len
0,6673F1740E03573BCD64238FE711FC69,67931,1451520000,9.0,1,"12 31, 2015",16,8D88BB79AAC50277AEE82FCFD77F6744,Finding the Beatles all over again - and bette...,I sit listening - with my jaw to the floor - H...,...,0.020,0.757,0.223,0.9926,0.0,1.000,0.000,0.0000,1119,128
1,690819436E20BB31657AF6B58B984DD4,47952,1113523200,9.0,0,"04 15, 2005",3,8DC0611245A871AC51BBEEBB85F33A58,These guys can sing! Such classic tunes...poi...,Under Appreciated....,...,0.101,0.822,0.077,0.3222,0.0,1.000,0.000,0.0000,457,21
2,A680D4753F0CEA2252C168A6ACB2B623,79174,1126137600,0.0,0,"09 8, 2005",20,2259386624CFA0EC53A75A50A9BB57A5,"Snoop Doggy Dogg made a classic album, DoggySt...",DoggyStyle,...,0.088,0.709,0.203,0.7845,0.0,1.000,0.000,0.0000,321,10
3,F4A966F1FA340B16651D676BC246D227,74058,954979200,0.0,0,"04 6, 2000",3,7A65A155C993535BC99CBCB39E7161B5,Stevie Nicks Has had Her Moments. I Like Some ...,Pretty Good but a Bit Dated,...,0.054,0.788,0.158,0.7430,0.0,0.426,0.574,0.4678,294,27
4,EF59DAF0C00319A48D4784266FD157EE,15169,1477958400,0.0,1,"11 1, 2016",3,C69A09446009C500B1364B7DB5510497,Great cd.,Five Stars,...,0.000,0.196,0.804,0.6249,0.0,1.000,0.000,0.0000,9,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770781,BD91503308A437374C3254EDC8BC24CB,63910,1136246400,12.0,1,"01 3, 2006",3,AAB61000438939C8E6165CFCCF02A488,This was the first Simple Minds album that I b...,Their Most Fully Realized Artistic Studio Stat...,...,0.054,0.800,0.146,0.9555,0.0,1.000,0.000,0.0000,886,53
770782,9BC50277D18FAB423AD33C8CE4CC000D,104092,1290556800,0.0,0,"11 24, 2010",20,0441BC4F6B7BD180769FDCDD8E603560,I have owned a CD copy of this show for at lea...,Forgettable R&H,...,0.042,0.875,0.083,0.4767,0.0,1.000,0.000,0.0000,265,15
770783,4062627CA1586E517520483964299349,98108,1311120000,3.0,0,"07 20, 2011",3,8AB3EEBF23F3583A4396A57DB291D548,carnival of souls to me is bad i gave my cd aw...,Darren d.,...,0.138,0.648,0.214,0.5371,0.0,1.000,0.000,0.0000,215,9
770784,0AE44A6A9176E6A52507B6ABDDA80B00,96333,1111968000,2.0,0,"03 28, 2005",3,51B2E62E0A5864316BDB33FD4A729B37,This is an awesome slayer album. I love the th...,awesome slayer cd,...,0.000,0.798,0.202,0.7444,0.0,0.328,0.672,0.6249,145,17


In [304]:
# # Find correlation between reviewText_neg grouped by 0.1 (reviewText_neg group with values 0.0 to 0.1, etc.)
# training_data.groupby(pd.cut(training_data["reviewText_neg"], np.arange(0, 1.1, 0.1))).mean()

In [305]:
# # Find correlation between vote grouped by 10
# training_data.groupby(pd.cut(training_data["vote"], np.arange(0, 100, 10))).mean()

In [306]:
# # Find correlation between image grouped by 10
# training_data.groupby(pd.cut(training_data["image"], np.arange(0, 50, 5))).mean()

In [307]:
# # Find correlation between reviewText_len grouped by 100
# training_data.groupby(pd.cut(training_data["reviewText_len"], np.arange(0, 1000, 100))).mean()

In [308]:
# # Find correlation between summary_len grouped by 10
# training_data.groupby(pd.cut(training_data["summary_len"], np.arange(0, 100, 10))).mean()

In [309]:
# Group by asin
filteredData = training_data[(training_data["verified"] == 1) | (training_data["vote"] > 10) | (training_data["image"] >= 1)]

# Merge filtered_training_data on training_data if asin in filtered_training_data does not exist (right outer join)
new_training_data = filteredData._append(training_data[~training_data.asin.isin(filteredData.asin)])
training_data = new_training_data.groupby("asin")

# # Normalize the compound scores
# training_data["reviewText_compound_norm"] = (training_data["reviewText_compound"] - training_data["reviewText_compound"].mean()) / training_data["reviewText_compound"].std()
# training_data["summary_compound_norm"] = (training_data["summary_compound"] - training_data["summary_compound"].mean()) / training_data["summary_compound"].std()

# # Calculate the absolute difference between the normalized compound scores and the awesomeness
# training_data["reviewText_compound_diff"] = abs(training_data["reviewText_compound_norm"] - (1 - training_data["reviewText_neg"]))
# training_data["summary_compound_diff"] = abs(training_data["summary_compound_norm"] - (1 - training_data["reviewText_neg"]))

# # Calculate the average difference between the normalized compound scores and the awesomeness for each asin
# compound_diff_mean = training_data.groupby("asin")[["reviewText_compound_diff", "summary_compound_diff"]].mean()

# # Sort the reviews for each asin by the average difference between the normalized compound scores and the awesomeness
# compound_diff_mean["compound_diff_mean"] = compound_diff_mean.mean(axis=1)
# compound_diff_mean = compound_diff_mean.sort_values("compound_diff_mean", ascending=False)

# # Keep the top 2/3 of the reviews for each asin
# num_asins = len(compound_diff_mean)
# top_reviews_per_asin = int(num_asins * 2/3)
# top_asins = compound_diff_mean.iloc[:top_reviews_per_asin].index
# training_data = training_data[training_data["asin"].isin(top_asins)]
# training_data

In [310]:
# Aggregate the training data by asin
training_data = training_data.agg({
    "reviewerID": "count",
    "unixReviewTime": ["min", "max", "mean", "std"],
    "verified": ["mean", "sum"],
    "vote": ["mean", "sum"],
    "image": ["mean", "sum"],
    "style": ["mean", "sum"],
    "reviewText_neg": ["mean", "std"],
    "reviewText_neu": ["mean", "std"],
    "reviewText_pos": ["mean", "std"],
    "reviewText_compound": ["mean", "std"],
    "summary_neg": ["mean", "std"],
    "summary_neu": ["mean", "std"],
    "summary_pos": ["mean", "std"],
    "summary_compound": ["mean", "std"],
    "reviewText_len": ["mean", "std"],
    "summary_len": ["mean", "std"],
}).reset_index()

training_data

Unnamed: 0_level_0,asin,reviewerID,unixReviewTime,unixReviewTime,unixReviewTime,unixReviewTime,verified,verified,vote,vote,...,summary_neu,summary_neu,summary_pos,summary_pos,summary_compound,summary_compound,reviewText_len,reviewText_len,summary_len,summary_len
Unnamed: 0_level_1,Unnamed: 1_level_1,count,min,max,mean,std,mean,sum,mean,sum,...,mean,std,mean,std,mean,std,mean,std,mean,std
0,0000B049F5B33CD310EB1AB236E20191,2,1412294400,1427068800,1.419682e+09,1.044708e+07,1.000000,2,0.000000,0.0,...,0.293000,0.142836,0.707000,0.142836,0.731850,0.134280,686.000000,115.965512,31.500000,27.577164
1,00018184A9EC4D270219A296B2580303,4,997488000,1499558400,1.333433e+09,2.345825e+08,0.750000,3,5.500000,22.0,...,0.660750,0.417966,0.257250,0.297283,0.199700,0.235639,1325.750000,2472.037806,21.000000,8.041559
2,000281A9CAC43FF1F335726A390636DA,3,1206748800,1417219200,1.326499e+09,1.081967e+08,1.000000,3,0.000000,0.0,...,0.867333,0.229785,0.132667,0.229785,0.170200,0.294795,748.000000,1232.756667,20.666667,10.066446
3,00030884DF109F325638A6BFD5B13CFF,18,1060473600,1496880000,1.379150e+09,1.140571e+08,0.944444,17,2.388889,43.0,...,0.782500,0.323669,0.217500,0.323669,0.236078,0.316375,152.388889,238.397133,20.944444,13.531976
4,000325BA25966B5FC701D5D2B5DBA4E0,4,1283472000,1456617600,1.363802e+09,7.752298e+07,1.000000,4,1.750000,7.0,...,0.835250,0.329500,0.059500,0.119000,-0.096000,0.192000,469.250000,732.887156,21.750000,14.198005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73077,FFFDD3C72D23AF858D6E0ED92612370D,25,1006819200,1507507200,1.394282e+09,1.219028e+08,0.960000,24,1.160000,29.0,...,0.700960,0.345688,0.252960,0.347569,0.200232,0.361908,710.600000,1204.892768,21.840000,17.973777
73078,FFFDDE284A73B29B320381487EC7DE9E,2,1295913600,1506470400,1.401192e+09,1.488861e+08,1.000000,2,1.000000,2.0,...,0.344500,0.045962,0.311500,0.440528,0.107300,0.570352,369.500000,482.953932,15.500000,2.121320
73079,FFFEB3EE2372807964F024707D50FB21,2,1345852800,1352246400,1.349050e+09,4.520958e+06,1.000000,2,0.000000,0.0,...,0.661500,0.478711,0.177500,0.251023,0.038600,0.054589,651.500000,129.400541,33.500000,6.363961
73080,FFFF4545AB232D81D0F9B208388BB7AA,4,1417651200,1446076800,1.434283e+09,1.287727e+07,1.000000,4,0.000000,0.0,...,0.633750,0.160672,0.270750,0.066605,0.375075,0.355723,553.750000,226.483811,62.500000,12.974334


In [311]:
# Add +1 to compound columns to avoid negative values
training_data["reviewText_compound"] += 1
training_data["summary_compound"] += 1
# Replace NaN values with 0
training_data.fillna(0, inplace=True)
training_data

Unnamed: 0_level_0,asin,reviewerID,unixReviewTime,unixReviewTime,unixReviewTime,unixReviewTime,verified,verified,vote,vote,...,summary_neu,summary_neu,summary_pos,summary_pos,summary_compound,summary_compound,reviewText_len,reviewText_len,summary_len,summary_len
Unnamed: 0_level_1,Unnamed: 1_level_1,count,min,max,mean,std,mean,sum,mean,sum,...,mean,std,mean,std,mean,std,mean,std,mean,std
0,0000B049F5B33CD310EB1AB236E20191,2,1412294400,1427068800,1.419682e+09,1.044708e+07,1.000000,2,0.000000,0.0,...,0.293000,0.142836,0.707000,0.142836,1.731850,1.134280,686.000000,115.965512,31.500000,27.577164
1,00018184A9EC4D270219A296B2580303,4,997488000,1499558400,1.333433e+09,2.345825e+08,0.750000,3,5.500000,22.0,...,0.660750,0.417966,0.257250,0.297283,1.199700,1.235639,1325.750000,2472.037806,21.000000,8.041559
2,000281A9CAC43FF1F335726A390636DA,3,1206748800,1417219200,1.326499e+09,1.081967e+08,1.000000,3,0.000000,0.0,...,0.867333,0.229785,0.132667,0.229785,1.170200,1.294795,748.000000,1232.756667,20.666667,10.066446
3,00030884DF109F325638A6BFD5B13CFF,18,1060473600,1496880000,1.379150e+09,1.140571e+08,0.944444,17,2.388889,43.0,...,0.782500,0.323669,0.217500,0.323669,1.236078,1.316375,152.388889,238.397133,20.944444,13.531976
4,000325BA25966B5FC701D5D2B5DBA4E0,4,1283472000,1456617600,1.363802e+09,7.752298e+07,1.000000,4,1.750000,7.0,...,0.835250,0.329500,0.059500,0.119000,0.904000,1.192000,469.250000,732.887156,21.750000,14.198005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73077,FFFDD3C72D23AF858D6E0ED92612370D,25,1006819200,1507507200,1.394282e+09,1.219028e+08,0.960000,24,1.160000,29.0,...,0.700960,0.345688,0.252960,0.347569,1.200232,1.361908,710.600000,1204.892768,21.840000,17.973777
73078,FFFDDE284A73B29B320381487EC7DE9E,2,1295913600,1506470400,1.401192e+09,1.488861e+08,1.000000,2,1.000000,2.0,...,0.344500,0.045962,0.311500,0.440528,1.107300,1.570352,369.500000,482.953932,15.500000,2.121320
73079,FFFEB3EE2372807964F024707D50FB21,2,1345852800,1352246400,1.349050e+09,4.520958e+06,1.000000,2,0.000000,0.0,...,0.661500,0.478711,0.177500,0.251023,1.038600,1.054589,651.500000,129.400541,33.500000,6.363961
73080,FFFF4545AB232D81D0F9B208388BB7AA,4,1417651200,1446076800,1.434283e+09,1.287727e+07,1.000000,4,0.000000,0.0,...,0.633750,0.160672,0.270750,0.066605,1.375075,1.355723,553.750000,226.483811,62.500000,12.974334


In [312]:
# Define the columns to normalize
cols_to_normalize = [('unixReviewTime', 'max'),
('reviewText_neg', 'mean'),
('reviewText_neg', 'std'),
('reviewText_pos', 'mean'),
('reviewText_pos', 'std'),
('reviewText_compound', 'mean'),
('reviewText_compound', 'std'),
('summary_neg', 'mean'),
('summary_neg', 'std'),
('summary_neu', 'mean'),
('summary_neu', 'std'),
('summary_pos', 'mean'),
('summary_pos', 'std'),
('summary_compound', 'mean'),
('summary_compound', 'std'),
('reviewText_len', 'mean'),
('reviewText_len', 'std'),
('summary_len', 'mean'),
('summary_len', 'std')]

# Normalize the data using the MinMaxScaler
scaler = MinMaxScaler()
training_data[cols_to_normalize] = scaler.fit_transform(training_data[cols_to_normalize])
training_data

Unnamed: 0_level_0,asin,reviewerID,unixReviewTime,unixReviewTime,unixReviewTime,unixReviewTime,verified,verified,vote,vote,...,summary_neu,summary_neu,summary_pos,summary_pos,summary_compound,summary_compound,reviewText_len,reviewText_len,summary_len,summary_len
Unnamed: 0_level_1,Unnamed: 1_level_1,count,min,max,mean,std,mean,sum,mean,sum,...,mean,std,mean,std,mean,std,mean,std,mean,std
0,0000B049F5B33CD310EB1AB236E20191,2,1412294400,0.826568,1.419682e+09,1.044708e+07,1.000000,2,0.000000,0.0,...,0.293000,0.202000,0.707000,0.202000,0.867899,0.515674,0.027723,0.005495,0.169355,0.325000
1,00018184A9EC4D270219A296B2580303,4,997488000,0.939717,1.333433e+09,2.345825e+08,0.750000,3,5.500000,22.0,...,0.660750,0.591093,0.257250,0.420421,0.582426,0.561754,0.053576,0.117134,0.112903,0.094771
2,000281A9CAC43FF1F335726A390636DA,3,1206748800,0.811194,1.326499e+09,1.081967e+08,1.000000,3,0.000000,0.0,...,0.867333,0.324966,0.132667,0.324966,0.566601,0.588648,0.030228,0.058413,0.111111,0.118634
3,00030884DF109F325638A6BFD5B13CFF,18,1060473600,0.935536,1.379150e+09,1.140571e+08,0.944444,17,2.388889,43.0,...,0.782500,0.457737,0.217500,0.457737,0.601941,0.598459,0.006158,0.011296,0.112605,0.159476
4,000325BA25966B5FC701D5D2B5DBA4E0,4,1283472000,0.872690,1.363802e+09,7.752298e+07,1.000000,4,1.750000,7.0,...,0.835250,0.465983,0.059500,0.168291,0.423797,0.541915,0.018963,0.034727,0.116935,0.167325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73077,FFFDD3C72D23AF858D6E0ED92612370D,25,1006819200,0.952124,1.394282e+09,1.219028e+08,0.960000,24,1.160000,29.0,...,0.700960,0.488877,0.252960,0.491537,0.582711,0.619160,0.028717,0.057092,0.117419,0.211823
73078,FFFDDE284A73B29B320381487EC7DE9E,2,1295913600,0.950506,1.401192e+09,1.488861e+08,1.000000,2,1.000000,2.0,...,0.344500,0.065000,0.311500,0.623000,0.532858,0.713924,0.014932,0.022884,0.083333,0.025000
73079,FFFEB3EE2372807964F024707D50FB21,2,1345852800,0.709777,1.349050e+09,4.520958e+06,1.000000,2,0.000000,0.0,...,0.661500,0.677000,0.177500,0.355000,0.496003,0.479444,0.026329,0.006131,0.180108,0.075000
73080,FFFF4545AB232D81D0F9B208388BB7AA,4,1417651200,0.856237,1.434283e+09,1.287727e+07,1.000000,4,0.000000,0.0,...,0.633750,0.227225,0.270750,0.094194,0.676506,0.616348,0.022378,0.010732,0.336022,0.152904


In [313]:
# Merge the training data with the awesomeness data
file_path = os.path.join(data_dir, categories[0], 'train', 'product_training.json')
product_training = pd.read_json(file_path)

# # merge column to 1D

training_data.columns = ['_'.join(col).strip() for col in training_data.columns.values]
# change "asin_" to "asin"
training_data.rename(columns={'asin_': 'asin'}, inplace=True)
training_data

training_data = training_data.merge(product_training, on='asin', how='left')

# Visualize the absolute correlation between the features on "awesomeness"
# training_data.corr()["awesomeness"].abs().sort_values(ascending=False)

In [314]:
training_data = training_data.drop(training_data.columns[1], axis=1)
training_data


Unnamed: 0,asin,unixReviewTime_min,unixReviewTime_max,unixReviewTime_mean,unixReviewTime_std,verified_mean,verified_sum,vote_mean,vote_sum,image_mean,...,summary_neu_std,summary_pos_mean,summary_pos_std,summary_compound_mean,summary_compound_std,reviewText_len_mean,reviewText_len_std,summary_len_mean,summary_len_std,awesomeness
0,0000B049F5B33CD310EB1AB236E20191,1412294400,0.826568,1.419682e+09,1.044708e+07,1.000000,2,0.000000,0.0,0.0,...,0.202000,0.707000,0.202000,0.867899,0.515674,0.027723,0.005495,0.169355,0.325000,1
1,00018184A9EC4D270219A296B2580303,997488000,0.939717,1.333433e+09,2.345825e+08,0.750000,3,5.500000,22.0,0.0,...,0.591093,0.257250,0.420421,0.582426,0.561754,0.053576,0.117134,0.112903,0.094771,0
2,000281A9CAC43FF1F335726A390636DA,1206748800,0.811194,1.326499e+09,1.081967e+08,1.000000,3,0.000000,0.0,0.0,...,0.324966,0.132667,0.324966,0.566601,0.588648,0.030228,0.058413,0.111111,0.118634,0
3,00030884DF109F325638A6BFD5B13CFF,1060473600,0.935536,1.379150e+09,1.140571e+08,0.944444,17,2.388889,43.0,0.0,...,0.457737,0.217500,0.457737,0.601941,0.598459,0.006158,0.011296,0.112605,0.159476,1
4,000325BA25966B5FC701D5D2B5DBA4E0,1283472000,0.872690,1.363802e+09,7.752298e+07,1.000000,4,1.750000,7.0,0.0,...,0.465983,0.059500,0.168291,0.423797,0.541915,0.018963,0.034727,0.116935,0.167325,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73077,FFFDD3C72D23AF858D6E0ED92612370D,1006819200,0.952124,1.394282e+09,1.219028e+08,0.960000,24,1.160000,29.0,0.0,...,0.488877,0.252960,0.491537,0.582711,0.619160,0.028717,0.057092,0.117419,0.211823,1
73078,FFFDDE284A73B29B320381487EC7DE9E,1295913600,0.950506,1.401192e+09,1.488861e+08,1.000000,2,1.000000,2.0,0.0,...,0.065000,0.311500,0.623000,0.532858,0.713924,0.014932,0.022884,0.083333,0.025000,1
73079,FFFEB3EE2372807964F024707D50FB21,1345852800,0.709777,1.349050e+09,4.520958e+06,1.000000,2,0.000000,0.0,0.0,...,0.677000,0.177500,0.355000,0.496003,0.479444,0.026329,0.006131,0.180108,0.075000,0
73080,FFFF4545AB232D81D0F9B208388BB7AA,1417651200,0.856237,1.434283e+09,1.287727e+07,1.000000,4,0.000000,0.0,0.0,...,0.227225,0.270750,0.094194,0.676506,0.616348,0.022378,0.010732,0.336022,0.152904,1


In [315]:
# Prepare the data for training
# Keep only the most important features for predicting awesomeness
X = training_data[[    
    'reviewText_neg_mean',
    'summary_neg_std',
    # 'reviewText_pos_mean',
    'summary_neg_mean',
    # 'reviewText_neu_mean',
    # 'unixReviewTime_max',
]].values
y = training_data["awesomeness"].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [316]:
X_train

array([[0.04733333, 0.29393877, 0.12      ],
       [0.00825   , 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       ...,
       [0.        , 0.        , 0.        ],
       [0.04566667, 0.        , 0.        ],
       [0.102     , 0.        , 0.        ]])

In [317]:
# Train the model on naive bayes, decision tree, and random forest classifiers

# Define hyperparameters for each classifier
gnb_params = {
'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}
mnb_params = {
'alpha': [0.1, 0.5, 1.0, 2.0]
}
bnb_params = {
'alpha': [0.1, 0.5, 1.0, 2.0],
'binarize': [0.0, 0.1, 0.5, 1.0]
}
cnb_params = {
'alpha': [0.1, 0.5, 1.0, 2.0]
}

# fit classifiers and make predictions on test set
best_f1_score = 0
best_classifier = None
classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB(), ComplementNB()]
classifier_params = [gnb_params, mnb_params, bnb_params, cnb_params]
classifier_names = ["Gaussian Naive Bayes", "Multinomial Naive Bayes", "Bernoulli Naive Bayes", "Complement Naive Bayes"]

for classifier, params, name in zip(classifiers, classifier_params, classifier_names):
    clf = GridSearchCV(classifier, params, scoring='f1', cv=10)
    with tqdm(total=100, desc=name) as pbar:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

    # compute evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # print results
    print(f"Results for {name}:")
    print(f"Best parameters: {clf.best_params_}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 score: {f1}")
    print(f"Confusion matrix:\n{cm}")

    # check if current classifier is the best one
    if f1 > best_f1_score:
        best_f1_score = f1
        best_classifier = name
        # save the best model
        joblib.dump(clf, 'finalmodel.pkl')

print(f"\nBest classifier: {best_classifier} (F1 score: {best_f1_score:.4f})")

Gaussian Naive Bayes:   0%|          | 0/100 [00:00<?, ?it/s]


Results for Gaussian Naive Bayes:
Best parameters: {'var_smoothing': 1e-09}
Accuracy: 0.5766573168228775
Precision: 0.5672150216018611
Recall: 0.8745836535997951
F1 score: 0.6881362765850216
Confusion matrix:
[[1602 5209]
 [ 979 6827]]


Multinomial Naive Bayes:   0%|          | 0/100 [00:00<?, ?it/s]


Results for Multinomial Naive Bayes:
Best parameters: {'alpha': 0.5}
Accuracy: 0.5340357118423753
Precision: 0.5340357118423753
Recall: 1.0
F1 score: 0.6962493867903492
Confusion matrix:
[[   0 6811]
 [   0 7806]]


Bernoulli Naive Bayes:   0%|          | 0/100 [00:01<?, ?it/s]


Results for Bernoulli Naive Bayes:
Best parameters: {'alpha': 0.1, 'binarize': 0.5}
Accuracy: 0.5454607648628309
Precision: 0.5410252789154074
Recall: 0.9815526518063028
F1 score: 0.6975600873998543
Confusion matrix:
[[ 311 6500]
 [ 144 7662]]


Complement Naive Bayes:   0%|          | 0/100 [00:00<?, ?it/s]

Results for Complement Naive Bayes:
Best parameters: {'alpha': 0.5}
Accuracy: 0.5067387288773346
Precision: 0.5357057272945124
Recall: 0.572764540097361
F1 score: 0.553615651312531
Confusion matrix:
[[2936 3875]
 [3335 4471]]

Best classifier: Bernoulli Naive Bayes (F1 score: 0.6976)



