In [48]:
# Project: Final Team Predictive Models
# COMP_SCI 349-0 Machine Learning | Professor V.S.
# By Sengdao Inthavong, Lillian Torres, Haylie Wu, Ernie Wang

# This file contains the code for the predictive models used in our project.
# The models are trained on the training data and tested on the testing data for the CDs and vinyls.

In [49]:
# Import basic libraries
import os
import numpy as np
import pandas as pd

# Import libraries for text processing
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import LabelEncoder 

# Import libraries for machine learning
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB, CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Import libraries for logging
import logging
from tqdm import tqdm

In [50]:
# Load product and review data for CDs and vinyls from the training set
data_dir = ''
categories = ['CDs_and_Vinyl', 'Grocery_and_Gourmet_Food', 'Toys_and_Games']

file_path = os.path.join(data_dir, categories[0], 'train', 'product_training.json')
product_training = pd.read_json(file_path)

file_path = os.path.join(data_dir, categories[0], 'train', 'review_training.json')
review_training = pd.read_json(file_path)

# Merge product and review data
training_data = review_training.merge(product_training, on='asin', how='left')

In [51]:
# Fill in any missing values
training_data['reviewText'].fillna('', inplace=True)
training_data['summary'].fillna('', inplace=True)

In [52]:
# Give each review a unique ID
training_data['reviewID'] = training_data.index

In [53]:
# Run sentiment analysis on the review text and summary
# Columns: neg, neu, pos, compound

# sid = SentimentIntensityAnalyzer()

# review_sentiments = pd.DataFrame(columns=['reviewID', 'reviewText_neg', 'reviewText_neu', 'reviewText_pos', 'reviewText_compound', 'summary_neg', 'summary_neu', 'summary_pos', 'summary_compound'])

# for index, row in tqdm(training_data.iterrows(), total=training_data.shape[0], desc="Sentiment Analysis"):
#     review_text_sentiment = sid.polarity_scores(row['reviewText'])
#     summary_text_sentiment = sid.polarity_scores(row['summary'])
    
#     sentiment_row = {'reviewID': row['reviewID'],
#                      'reviewText_neg': review_text_sentiment['neg'],
#                      'reviewText_neu': review_text_sentiment['neu'],
#                      'reviewText_pos': review_text_sentiment['pos'],
#                      'reviewText_compound': review_text_sentiment['compound'],
#                      'summary_neg': summary_text_sentiment['neg'],
#                      'summary_neu': summary_text_sentiment['neu'],
#                      'summary_pos': summary_text_sentiment['pos'],
#                      'summary_compound': summary_text_sentiment['compound']}
    
#     review_sentiments = review_sentiments.append(sentiment_row, ignore_index=True)

# # Save the sentiment data to a csv file for future use
# file_path = os.path.join(data_dir, categories[0], 'csv', 'review_sentiments.csv')
# review_sentiments.to_csv(file_path, index=False)
review_sentiments = pd.read_csv('review_sentiments.csv')
review_sentiments

Unnamed: 0,reviewID,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,0.0,0.020,0.757,0.223,0.9926,0.0,1.000,0.000,0.0000
1,1.0,0.101,0.822,0.077,0.3222,0.0,1.000,0.000,0.0000
2,2.0,0.088,0.709,0.203,0.7845,0.0,1.000,0.000,0.0000
3,3.0,0.054,0.788,0.158,0.7430,0.0,0.426,0.574,0.4678
4,4.0,0.000,0.196,0.804,0.6249,0.0,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...
770781,770781.0,0.054,0.800,0.146,0.9555,0.0,1.000,0.000,0.0000
770782,770782.0,0.042,0.875,0.083,0.4767,0.0,1.000,0.000,0.0000
770783,770783.0,0.138,0.648,0.214,0.5371,0.0,1.000,0.000,0.0000
770784,770784.0,0.000,0.798,0.202,0.7444,0.0,0.328,0.672,0.6249


In [54]:
# Merge the sentiment data with the training data
training_data = training_data.merge(review_sentiments, on='reviewID', how='left')
training_data

Unnamed: 0,asin,reviewerID,unixReviewTime,vote,verified,reviewTime,style,reviewerName,reviewText,summary,...,awesomeness,reviewID,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,6673F1740E03573BCD64238FE711FC69,9C856D4A18E1355783B3B28B7ECC1848,1451520000,9,True,"12 31, 2015",{'Format:': ' MP3 Music'},8D88BB79AAC50277AEE82FCFD77F6744,Finding the Beatles all over again - and bette...,I sit listening - with my jaw to the floor - H...,...,1,0,0.020,0.757,0.223,0.9926,0.0,1.000,0.000,0.0000
1,690819436E20BB31657AF6B58B984DD4,6E9ABBD26A27C2B2851D1EC34A01CBDC,1113523200,9,False,"04 15, 2005",{'Format:': ' Audio CD'},8DC0611245A871AC51BBEEBB85F33A58,These guys can sing! Such classic tunes...poi...,Under Appreciated....,...,1,1,0.101,0.822,0.077,0.3222,0.0,1.000,0.000,0.0000
2,A680D4753F0CEA2252C168A6ACB2B623,B637C3C93E61094474710F456928BE9F,1126137600,,False,"09 8, 2005",,2259386624CFA0EC53A75A50A9BB57A5,"Snoop Doggy Dogg made a classic album, DoggySt...",DoggyStyle,...,1,2,0.088,0.709,0.203,0.7845,0.0,1.000,0.000,0.0000
3,F4A966F1FA340B16651D676BC246D227,AA7918E9410D650A076221C7B2934A09,954979200,,False,"04 6, 2000",{'Format:': ' Audio CD'},7A65A155C993535BC99CBCB39E7161B5,Stevie Nicks Has had Her Moments. I Like Some ...,Pretty Good but a Bit Dated,...,1,3,0.054,0.788,0.158,0.7430,0.0,0.426,0.574,0.4678
4,EF59DAF0C00319A48D4784266FD157EE,2293C9B7950A3356B95828419A677720,1477958400,,True,"11 1, 2016",{'Format:': ' Audio CD'},C69A09446009C500B1364B7DB5510497,Great cd.,Five Stars,...,1,4,0.000,0.196,0.804,0.6249,0.0,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770781,BD91503308A437374C3254EDC8BC24CB,936ED23AF4D23943786BBD44D0F1114B,1136246400,12,True,"01 3, 2006",{'Format:': ' Audio CD'},AAB61000438939C8E6165CFCCF02A488,This was the first Simple Minds album that I b...,Their Most Fully Realized Artistic Studio Stat...,...,1,770781,0.054,0.800,0.146,0.9555,0.0,1.000,0.000,0.0000
770782,9BC50277D18FAB423AD33C8CE4CC000D,EF922377A87E9D01F50065F2DA1722A8,1290556800,,False,"11 24, 2010",,0441BC4F6B7BD180769FDCDD8E603560,I have owned a CD copy of this show for at lea...,Forgettable R&H,...,0,770782,0.042,0.875,0.083,0.4767,0.0,1.000,0.000,0.0000
770783,4062627CA1586E517520483964299349,E1F0B0EBC6A36F33301E4FD0B3D62D52,1311120000,3,False,"07 20, 2011",{'Format:': ' Audio CD'},8AB3EEBF23F3583A4396A57DB291D548,carnival of souls to me is bad i gave my cd aw...,Darren d.,...,0,770783,0.138,0.648,0.214,0.5371,0.0,1.000,0.000,0.0000
770784,0AE44A6A9176E6A52507B6ABDDA80B00,DDDC81E6B8C3F8C91867F9AECB385135,1111968000,2,False,"03 28, 2005",{'Format:': ' Audio CD'},51B2E62E0A5864316BDB33FD4A729B37,This is an awesome slayer album. I love the th...,awesome slayer cd,...,1,770784,0.000,0.798,0.202,0.7444,0.0,0.328,0.672,0.6249


In [55]:
# Process the columns that are not numeric
encoder = LabelEncoder()

# Define a function to extract the format information from the "style" column
def extract_format(style):
    if style is None:
        return "None"
    else:
        return style.get("Format:", "None").strip()

# Apply the function to the "style" column to extract the format information
training_data["style"] = training_data["style"].apply(extract_format)
training_data

Unnamed: 0,asin,reviewerID,unixReviewTime,vote,verified,reviewTime,style,reviewerName,reviewText,summary,...,awesomeness,reviewID,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,6673F1740E03573BCD64238FE711FC69,9C856D4A18E1355783B3B28B7ECC1848,1451520000,9,True,"12 31, 2015",MP3 Music,8D88BB79AAC50277AEE82FCFD77F6744,Finding the Beatles all over again - and bette...,I sit listening - with my jaw to the floor - H...,...,1,0,0.020,0.757,0.223,0.9926,0.0,1.000,0.000,0.0000
1,690819436E20BB31657AF6B58B984DD4,6E9ABBD26A27C2B2851D1EC34A01CBDC,1113523200,9,False,"04 15, 2005",Audio CD,8DC0611245A871AC51BBEEBB85F33A58,These guys can sing! Such classic tunes...poi...,Under Appreciated....,...,1,1,0.101,0.822,0.077,0.3222,0.0,1.000,0.000,0.0000
2,A680D4753F0CEA2252C168A6ACB2B623,B637C3C93E61094474710F456928BE9F,1126137600,,False,"09 8, 2005",,2259386624CFA0EC53A75A50A9BB57A5,"Snoop Doggy Dogg made a classic album, DoggySt...",DoggyStyle,...,1,2,0.088,0.709,0.203,0.7845,0.0,1.000,0.000,0.0000
3,F4A966F1FA340B16651D676BC246D227,AA7918E9410D650A076221C7B2934A09,954979200,,False,"04 6, 2000",Audio CD,7A65A155C993535BC99CBCB39E7161B5,Stevie Nicks Has had Her Moments. I Like Some ...,Pretty Good but a Bit Dated,...,1,3,0.054,0.788,0.158,0.7430,0.0,0.426,0.574,0.4678
4,EF59DAF0C00319A48D4784266FD157EE,2293C9B7950A3356B95828419A677720,1477958400,,True,"11 1, 2016",Audio CD,C69A09446009C500B1364B7DB5510497,Great cd.,Five Stars,...,1,4,0.000,0.196,0.804,0.6249,0.0,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770781,BD91503308A437374C3254EDC8BC24CB,936ED23AF4D23943786BBD44D0F1114B,1136246400,12,True,"01 3, 2006",Audio CD,AAB61000438939C8E6165CFCCF02A488,This was the first Simple Minds album that I b...,Their Most Fully Realized Artistic Studio Stat...,...,1,770781,0.054,0.800,0.146,0.9555,0.0,1.000,0.000,0.0000
770782,9BC50277D18FAB423AD33C8CE4CC000D,EF922377A87E9D01F50065F2DA1722A8,1290556800,,False,"11 24, 2010",,0441BC4F6B7BD180769FDCDD8E603560,I have owned a CD copy of this show for at lea...,Forgettable R&H,...,0,770782,0.042,0.875,0.083,0.4767,0.0,1.000,0.000,0.0000
770783,4062627CA1586E517520483964299349,E1F0B0EBC6A36F33301E4FD0B3D62D52,1311120000,3,False,"07 20, 2011",Audio CD,8AB3EEBF23F3583A4396A57DB291D548,carnival of souls to me is bad i gave my cd aw...,Darren d.,...,0,770783,0.138,0.648,0.214,0.5371,0.0,1.000,0.000,0.0000
770784,0AE44A6A9176E6A52507B6ABDDA80B00,DDDC81E6B8C3F8C91867F9AECB385135,1111968000,2,False,"03 28, 2005",Audio CD,51B2E62E0A5864316BDB33FD4A729B37,This is an awesome slayer album. I love the th...,awesome slayer cd,...,1,770784,0.000,0.798,0.202,0.7444,0.0,0.328,0.672,0.6249


In [56]:
# Encode the columns
encoder = LabelEncoder()
training_data["style"] = encoder.fit_transform(training_data["style"])

# Encode the "verified" column
training_data["verified"] = encoder.fit_transform(training_data["verified"])

# Encode the "reviewerID" column
training_data["reviewerID"] = encoder.fit_transform(training_data["reviewerID"])

# Encode the "vote"" column
training_data["vote"] = training_data["vote"].apply(lambda x: float(x.replace(",", "")) if x is not None else 0)

# Encode the "image" column
training_data["image"] = training_data["image"].apply(lambda x: len(x) if x is not None else 0)

training_data

Unnamed: 0,asin,reviewerID,unixReviewTime,vote,verified,reviewTime,style,reviewerName,reviewText,summary,...,awesomeness,reviewID,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,6673F1740E03573BCD64238FE711FC69,67931,1451520000,9.0,1,"12 31, 2015",16,8D88BB79AAC50277AEE82FCFD77F6744,Finding the Beatles all over again - and bette...,I sit listening - with my jaw to the floor - H...,...,1,0,0.020,0.757,0.223,0.9926,0.0,1.000,0.000,0.0000
1,690819436E20BB31657AF6B58B984DD4,47952,1113523200,9.0,0,"04 15, 2005",3,8DC0611245A871AC51BBEEBB85F33A58,These guys can sing! Such classic tunes...poi...,Under Appreciated....,...,1,1,0.101,0.822,0.077,0.3222,0.0,1.000,0.000,0.0000
2,A680D4753F0CEA2252C168A6ACB2B623,79174,1126137600,0.0,0,"09 8, 2005",20,2259386624CFA0EC53A75A50A9BB57A5,"Snoop Doggy Dogg made a classic album, DoggySt...",DoggyStyle,...,1,2,0.088,0.709,0.203,0.7845,0.0,1.000,0.000,0.0000
3,F4A966F1FA340B16651D676BC246D227,74058,954979200,0.0,0,"04 6, 2000",3,7A65A155C993535BC99CBCB39E7161B5,Stevie Nicks Has had Her Moments. I Like Some ...,Pretty Good but a Bit Dated,...,1,3,0.054,0.788,0.158,0.7430,0.0,0.426,0.574,0.4678
4,EF59DAF0C00319A48D4784266FD157EE,15169,1477958400,0.0,1,"11 1, 2016",3,C69A09446009C500B1364B7DB5510497,Great cd.,Five Stars,...,1,4,0.000,0.196,0.804,0.6249,0.0,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770781,BD91503308A437374C3254EDC8BC24CB,63910,1136246400,12.0,1,"01 3, 2006",3,AAB61000438939C8E6165CFCCF02A488,This was the first Simple Minds album that I b...,Their Most Fully Realized Artistic Studio Stat...,...,1,770781,0.054,0.800,0.146,0.9555,0.0,1.000,0.000,0.0000
770782,9BC50277D18FAB423AD33C8CE4CC000D,104092,1290556800,0.0,0,"11 24, 2010",20,0441BC4F6B7BD180769FDCDD8E603560,I have owned a CD copy of this show for at lea...,Forgettable R&H,...,0,770782,0.042,0.875,0.083,0.4767,0.0,1.000,0.000,0.0000
770783,4062627CA1586E517520483964299349,98108,1311120000,3.0,0,"07 20, 2011",3,8AB3EEBF23F3583A4396A57DB291D548,carnival of souls to me is bad i gave my cd aw...,Darren d.,...,0,770783,0.138,0.648,0.214,0.5371,0.0,1.000,0.000,0.0000
770784,0AE44A6A9176E6A52507B6ABDDA80B00,96333,1111968000,2.0,0,"03 28, 2005",3,51B2E62E0A5864316BDB33FD4A729B37,This is an awesome slayer album. I love the th...,awesome slayer cd,...,1,770784,0.000,0.798,0.202,0.7444,0.0,0.328,0.672,0.6249


In [57]:
# Compute the length of reviewText and summary columns
training_data["reviewText_len"] = training_data["reviewText"].apply(len)
training_data["summary_len"] = training_data["summary"].apply(len)

In [58]:
# # Filter out the reviews that are not verified, have no votes, and have no images unless there is no verified and voted reviews
filteredData = training_data.copy()
filteredData = filteredData[(filteredData["verified"] == 1) | (filteredData["vote"] >= 5) | (filteredData["image"] >= 1)]

# joining filteredData and training_data so we can have at least one review for each product
filteredData = filteredData._append(training_data[~training_data.asin.isin(filteredData.asin)])

In [59]:
# Aggregate the training data by asin
training_data = training_data.groupby("asin").agg({
    "reviewerID": "count",
    "unixReviewTime": ["min", "max", "mean", "std"],
    "verified": ["mean", "sum"],
    "vote": ["mean", "sum"],
    "image": ["mean", "sum"],
    "style": ["mean", "sum"],
    "reviewText_neg": ["mean", "std"],
    "reviewText_neu": ["mean", "std"],
    "reviewText_pos": ["mean", "std"],
    "reviewText_compound": ["mean", "std"],
    "summary_neg": ["mean", "std"],
    "summary_neu": ["mean", "std"],
    "summary_pos": ["mean", "std"],
    "summary_compound": ["mean", "std"],
    "reviewText_len": ["mean", "std"],
    "summary_len": ["mean", "std"],
}).reset_index()

training_data

Unnamed: 0_level_0,asin,reviewerID,unixReviewTime,unixReviewTime,unixReviewTime,unixReviewTime,verified,verified,vote,vote,...,summary_neu,summary_neu,summary_pos,summary_pos,summary_compound,summary_compound,reviewText_len,reviewText_len,summary_len,summary_len
Unnamed: 0_level_1,Unnamed: 1_level_1,count,min,max,mean,std,mean,sum,mean,sum,...,mean,std,mean,std,mean,std,mean,std,mean,std
0,0000B049F5B33CD310EB1AB236E20191,4,933552000,1427068800,1.296583e+09,2.421140e+08,0.500000,2,1.500000,6.0,...,0.646500,0.416434,0.353500,0.416434,0.365925,0.429587,500.750000,334.137073,27.750000,20.072784
1,00018184A9EC4D270219A296B2580303,18,983836800,1499558400,1.111282e+09,1.782869e+08,0.166667,3,2.500000,45.0,...,0.760389,0.256622,0.198111,0.219135,0.181272,0.298701,1338.277778,1149.440058,27.055556,14.529776
2,000281A9CAC43FF1F335726A390636DA,4,1040342400,1417219200,1.254960e+09,1.681540e+08,0.750000,3,1.250000,5.0,...,0.706000,0.373249,0.099500,0.199000,-0.007925,0.429942,700.000000,1011.109292,18.500000,9.291573
3,00030884DF109F325638A6BFD5B13CFF,29,1001030400,1496880000,1.315392e+09,1.490887e+08,0.586207,17,2.862069,83.0,...,0.744276,0.321352,0.234276,0.325211,0.230669,0.349602,449.448276,783.090288,26.241379,17.683033
4,000325BA25966B5FC701D5D2B5DBA4E0,4,1283472000,1456617600,1.363802e+09,7.752298e+07,1.000000,4,1.750000,7.0,...,0.835250,0.329500,0.059500,0.119000,-0.096000,0.192000,469.250000,732.887156,21.750000,14.198005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73077,FFFDD3C72D23AF858D6E0ED92612370D,64,914025600,1507507200,1.258704e+09,1.867436e+08,0.375000,24,0.890625,57.0,...,0.623687,0.321136,0.294453,0.314120,0.197958,0.387439,949.968750,1175.008490,27.406250,17.130423
73078,FFFDDE284A73B29B320381487EC7DE9E,4,1127088000,1506470400,1.353758e+09,1.783429e+08,0.500000,2,2.000000,8.0,...,0.522250,0.320673,0.305750,0.353175,0.168350,0.385339,929.000000,718.593534,22.250000,10.781929
73079,FFFEB3EE2372807964F024707D50FB21,2,1345852800,1352246400,1.349050e+09,4.520958e+06,1.000000,2,0.000000,0.0,...,0.661500,0.478711,0.177500,0.251023,0.038600,0.054589,651.500000,129.400541,33.500000,6.363961
73080,FFFF4545AB232D81D0F9B208388BB7AA,7,1416182400,1446076800,1.427846e+09,1.244540e+07,0.571429,4,1.571429,11.0,...,0.700429,0.179107,0.245000,0.124519,0.389014,0.324535,1079.714286,1174.519578,55.571429,20.517124


In [60]:
# Add +1 to compound columns to avoid negative values
training_data["reviewText_compound"] += 1
training_data["summary_compound"] += 1
# Replace NaN values with 0
training_data.fillna(0, inplace=True)
training_data

Unnamed: 0_level_0,asin,reviewerID,unixReviewTime,unixReviewTime,unixReviewTime,unixReviewTime,verified,verified,vote,vote,...,summary_neu,summary_neu,summary_pos,summary_pos,summary_compound,summary_compound,reviewText_len,reviewText_len,summary_len,summary_len
Unnamed: 0_level_1,Unnamed: 1_level_1,count,min,max,mean,std,mean,sum,mean,sum,...,mean,std,mean,std,mean,std,mean,std,mean,std
0,0000B049F5B33CD310EB1AB236E20191,4,933552000,1427068800,1.296583e+09,2.421140e+08,0.500000,2,1.500000,6.0,...,0.646500,0.416434,0.353500,0.416434,1.365925,1.429587,500.750000,334.137073,27.750000,20.072784
1,00018184A9EC4D270219A296B2580303,18,983836800,1499558400,1.111282e+09,1.782869e+08,0.166667,3,2.500000,45.0,...,0.760389,0.256622,0.198111,0.219135,1.181272,1.298701,1338.277778,1149.440058,27.055556,14.529776
2,000281A9CAC43FF1F335726A390636DA,4,1040342400,1417219200,1.254960e+09,1.681540e+08,0.750000,3,1.250000,5.0,...,0.706000,0.373249,0.099500,0.199000,0.992075,1.429942,700.000000,1011.109292,18.500000,9.291573
3,00030884DF109F325638A6BFD5B13CFF,29,1001030400,1496880000,1.315392e+09,1.490887e+08,0.586207,17,2.862069,83.0,...,0.744276,0.321352,0.234276,0.325211,1.230669,1.349602,449.448276,783.090288,26.241379,17.683033
4,000325BA25966B5FC701D5D2B5DBA4E0,4,1283472000,1456617600,1.363802e+09,7.752298e+07,1.000000,4,1.750000,7.0,...,0.835250,0.329500,0.059500,0.119000,0.904000,1.192000,469.250000,732.887156,21.750000,14.198005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73077,FFFDD3C72D23AF858D6E0ED92612370D,64,914025600,1507507200,1.258704e+09,1.867436e+08,0.375000,24,0.890625,57.0,...,0.623687,0.321136,0.294453,0.314120,1.197958,1.387439,949.968750,1175.008490,27.406250,17.130423
73078,FFFDDE284A73B29B320381487EC7DE9E,4,1127088000,1506470400,1.353758e+09,1.783429e+08,0.500000,2,2.000000,8.0,...,0.522250,0.320673,0.305750,0.353175,1.168350,1.385339,929.000000,718.593534,22.250000,10.781929
73079,FFFEB3EE2372807964F024707D50FB21,2,1345852800,1352246400,1.349050e+09,4.520958e+06,1.000000,2,0.000000,0.0,...,0.661500,0.478711,0.177500,0.251023,1.038600,1.054589,651.500000,129.400541,33.500000,6.363961
73080,FFFF4545AB232D81D0F9B208388BB7AA,7,1416182400,1446076800,1.427846e+09,1.244540e+07,0.571429,4,1.571429,11.0,...,0.700429,0.179107,0.245000,0.124519,1.389014,1.324535,1079.714286,1174.519578,55.571429,20.517124


In [63]:
from sklearn.preprocessing import MinMaxScaler

# Define the columns to normalize
cols_to_normalize = ['unixReviewTime', 'verified', 'vote', 'image', 'style', 'reviewText_neg', 'reviewText_neu', 'reviewText_pos', 'reviewText_compound', 'summary_neg', 'summary_neu', 'summary_pos', 'summary_compound', 'reviewText_len', 'summary_len']

# Normalize the data using the MinMaxScaler
scaler = MinMaxScaler()
training_data[cols_to_normalize] = scaler.fit_transform(training_data[cols_to_normalize])
training_data

ValueError: Columns must be same length as key

In [62]:
# Merge the training data with the awesomeness data
file_path = os.path.join(data_dir, categories[0], 'train', 'product_training.json')
product_training = pd.read_json(file_path)
training_data = training_data.merge(product_training, on='asin', how='left')

# Visualize the absolute correlation between the features on "awesomeness"
training_data.corr()["awesomeness"].abs().sort_values(ascending=False)

MergeError: Not allowed to merge between different levels. (2 levels on the left, 1 on the right)

In [None]:
training_data = training_data.drop(training_data.columns[1], axis=1)

In [None]:
# Prepare the data for training
# Keep only the most important features for predicting awesomeness
X = training_data[[    
    ('reviewText_neg', 'mean'),
    ('summary_neg', 'mean'),
    ('summary_neg', 'std')
]].values
y = training_data["awesomeness"].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


KeyError: 'awesomeness'

In [None]:
X_train

array([[0.        , 0.        , 0.        ],
       [0.06166667, 0.        , 0.        ],
       [0.0404    , 0.        , 0.        ],
       ...,
       [0.075     , 0.096     , 0.271529  ],
       [0.        , 0.        , 0.        ],
       [0.04284615, 0.03046154, 0.10516117]])

In [None]:
# Train the model on naive bayes, decision tree, and random forest classifiers

# Define hyperparameters for each classifier
gnb_params = {
'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}
mnb_params = {
'alpha': [0.1, 0.5, 1.0, 2.0]
}
bnb_params = {
'alpha': [0.1, 0.5, 1.0, 2.0],
'binarize': [0.0, 0.1, 0.5, 1.0]
}
cnb_params = {
'alpha': [0.1, 0.5, 1.0, 2.0]
}

# fit classifiers and make predictions on test set
best_f1_score = 0
best_classifier = None
classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB(), ComplementNB()]
classifier_params = [gnb_params, mnb_params, bnb_params, cnb_params]
classifier_names = ["Gaussian Naive Bayes", "Multinomial Naive Bayes", "Bernoulli Naive Bayes", "Complement Naive Bayes"]

for classifier, params, name in zip(classifiers, classifier_params, classifier_names):
    clf = GridSearchCV(classifier, params, scoring='f1', cv=10)
    with tqdm(total=100, desc=name) as pbar:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

    # compute evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # print results
    print(f"Results for {name}:")
    print(f"Best parameters: {clf.best_params_}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 score: {f1}")
    print(f"Confusion matrix:\n{cm}")

    # check if current classifier is the best one
    if f1 > best_f1_score:
        best_f1_score = f1
        best_classifier = name
        # save the best model
        joblib.dump(clf, best_classifier + '_model.pkl')

print(f"\nBest classifier: {best_classifier} (F1 score: {best_f1_score:.4f})")

Gaussian Naive Bayes:   0%|          | 0/100 [00:01<?, ?it/s]
Multinomial Naive Bayes:   0%|          | 0/100 [00:00<?, ?it/s]

Results for Gaussian Naive Bayes:
Best parameters: {'var_smoothing': 1e-05}
Accuracy: 0.5851540616246499
Precision: 0.5810489202291759
Recall: 0.8491562540255057
F1 score: 0.689972786267532
Confusion matrix:
[[1764 4753]
 [1171 6592]]


Multinomial Naive Bayes:   0%|          | 0/100 [00:01<?, ?it/s]
Bernoulli Naive Bayes:   0%|          | 0/100 [00:00<?, ?it/s]

Results for Multinomial Naive Bayes:
Best parameters: {'alpha': 0.1}
Accuracy: 0.5436274509803921
Precision: 0.5436274509803921
Recall: 1.0
F1 score: 0.7043505874880914
Confusion matrix:
[[   0 6517]
 [   0 7763]]


Bernoulli Naive Bayes:   0%|          | 0/100 [00:05<?, ?it/s]
Complement Naive Bayes:   0%|          | 0/100 [00:00<?, ?it/s]

Results for Bernoulli Naive Bayes:
Best parameters: {'alpha': 0.1, 'binarize': 0.5}
Accuracy: 0.5521008403361345
Precision: 0.5493822700671916
Recall: 0.9795182274893727
F1 score: 0.7039437141270135
Confusion matrix:
[[ 280 6237]
 [ 159 7604]]


Complement Naive Bayes:   0%|          | 0/100 [00:01<?, ?it/s]

Results for Complement Naive Bayes:
Best parameters: {'alpha': 0.1}
Accuracy: 0.5348739495798319
Precision: 0.5658250146799765
Recall: 0.6206363519258019
F1 score: 0.5919646148175451
Confusion matrix:
[[2820 3697]
 [2945 4818]]

Best classifier: Multinomial Naive Bayes (F1 score: 0.7044)



