In [75]:
# Project: Final Team Predictive Models
# COMP_SCI 349-0 Machine Learning | Professor V.S.
# By Sengdao Inthavong, Lillian Torres, Haylie Wu, Ernie Wang

# This file contains the code for the predictive models used in our project.
# The models are trained on the training data and tested on the testing data for the CDs and vinyls.

In [94]:
# Import basic libraries
import os
import numpy as np
import pandas as pd

# Import libraries for text processing
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import LabelEncoder 

# Import libraries for machine learning
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB, CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Import libraries for logging
import logging
from tqdm import tqdm

In [95]:
#GETTING THE DATA
#getting to the data directory
data_dir = ''
categories = ['CDs_and_Vinyl', 'Grocery_and_Gourmet_Food', 'Toys_and_Games']
#getting the products and the reviews for the shit
resultpath = os.path.join(data_dir, categories[0], 'train', 'product_training.json')
trainingpath = os.path.join(data_dir, categories[0], 'train', 'review_training.json')
#reading the file with json
reviewdata = pd.read_json(trainingpath)
resultpath = pd.read_json(resultpath)
#combine the 2 shits together
training_data = resultpath.merge(reviewdata, how='left', on='asin')

In [96]:
# Fill in any missing values
training_data['reviewText'].fillna('', inplace=True)
training_data['summary'].fillna('', inplace=True)

In [97]:
# Give each review a unique ID
training_data['reviewID'] = training_data.index

In [98]:
# Run sentiment analysis on the review text and summary
# Columns: neg, neu, pos, compound

# sid = SentimentIntensityAnalyzer()

# review_sentiments = pd.DataFrame(columns=['reviewID', 'reviewText_neg', 'reviewText_neu', 'reviewText_pos', 'reviewText_compound', 'summary_neg', 'summary_neu', 'summary_pos', 'summary_compound'])

# for index, row in tqdm(training_data.iterrows(), total=training_data.shape[0], desc="Sentiment Analysis"):
#     review_text_sentiment = sid.polarity_scores(row['reviewText'])
#     summary_text_sentiment = sid.polarity_scores(row['summary'])
    
#     sentiment_row = {'reviewID': row['reviewID'],
#                      'reviewText_neg': review_text_sentiment['neg'],
#                      'reviewText_neu': review_text_sentiment['neu'],
#                      'reviewText_pos': review_text_sentiment['pos'],
#                      'reviewText_compound': review_text_sentiment['compound'],
#                      'summary_neg': summary_text_sentiment['neg'],
#                      'summary_neu': summary_text_sentiment['neu'],
#                      'summary_pos': summary_text_sentiment['pos'],
#                      'summary_compound': summary_text_sentiment['compound']}
    
#     review_sentiments = review_sentiments.append(sentiment_row, ignore_index=True)

# # Save the sentiment data to a csv file for future use
# file_path = os.path.join(data_dir, categories[0], 'csv', 'review_sentiments.csv')
# review_sentiments.to_csv(file_path, index=False)
review_sentiments = pd.read_csv('review_sentiments.csv')
review_sentiments

Unnamed: 0,reviewID,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,0.0,0.020,0.757,0.223,0.9926,0.0,1.000,0.000,0.0000
1,1.0,0.101,0.822,0.077,0.3222,0.0,1.000,0.000,0.0000
2,2.0,0.088,0.709,0.203,0.7845,0.0,1.000,0.000,0.0000
3,3.0,0.054,0.788,0.158,0.7430,0.0,0.426,0.574,0.4678
4,4.0,0.000,0.196,0.804,0.6249,0.0,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...
770781,770781.0,0.054,0.800,0.146,0.9555,0.0,1.000,0.000,0.0000
770782,770782.0,0.042,0.875,0.083,0.4767,0.0,1.000,0.000,0.0000
770783,770783.0,0.138,0.648,0.214,0.5371,0.0,1.000,0.000,0.0000
770784,770784.0,0.000,0.798,0.202,0.7444,0.0,0.328,0.672,0.6249


In [99]:
# Merge the sentiment data with the training data
training_data = training_data.merge(review_sentiments, on='reviewID', how='left')
training_data

Unnamed: 0,asin,awesomeness,reviewerID,unixReviewTime,vote,verified,reviewTime,style,reviewerName,reviewText,...,image,reviewID,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,0000B049F5B33CD310EB1AB236E20191,1,D99D95C2B351EF6AF85580B6639D397C,1412294400,,True,"10 3, 2014",{'Format:': ' Audio CD'},12A80DAD02AB007538C670D2CF5F0999,"Even tho I love this album, I am having proble...",...,,0,0.020,0.757,0.223,0.9926,0.0,1.000,0.000,0.0000
1,0000B049F5B33CD310EB1AB236E20191,1,B22BD9411BCB845478B2F7DDC16DBC9F,1413417600,,False,"10 16, 2014",{'Format:': ' Audio CD'},08434218ABA526223A66E2A8B4C38DA8,I LOVE IT!!!,...,,1,0.101,0.822,0.077,0.3222,0.0,1.000,0.000,0.0000
2,0000B049F5B33CD310EB1AB236E20191,1,348C9063667D083543986178A70BCF2F,1427068800,,True,"03 23, 2015",{'Format:': ' Audio CD'},3AA76E176E4BE75233EB0557F9C1738E,Nancy Wilson is still one of the most distinct...,...,,2,0.088,0.709,0.203,0.7845,0.0,1.000,0.000,0.0000
3,0000B049F5B33CD310EB1AB236E20191,1,D5098BA77D43EC644CF38CE59ED03DF6,933552000,6,False,"08 2, 1999",{'Format:': ' Audio CD'},802D103F0C999BF1E71DF82E52BA5F0D,Having been a Nancy Wilson fan for over twenty...,...,,3,0.054,0.788,0.158,0.7430,0.0,0.426,0.574,0.4678
4,00018184A9EC4D270219A296B2580303,0,D29172D2A2B9887A8AEC9A9DF9F3DA78,997488000,22,False,"08 11, 2001",{'Format:': ' Audio CD'},EA0DDA7564052267BCAB87316B53E400,"The musical genre of dance/electronica is, at ...",...,,4,0.000,0.196,0.804,0.6249,0.0,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770781,FFFF5A3D9CB0B40FF0FE6B95F05D26FE,1,196A1B3BDB2769C74DE42F26AB907DA1,1100563200,5,False,"11 16, 2004",{'Format:': ' Audio CD'},2AF77877103CB27C4FA8F432847460BD,a come back from the very suscessful last albu...,...,,770781,0.054,0.800,0.146,0.9555,0.0,1.000,0.000,0.0000
770782,FFFF5A3D9CB0B40FF0FE6B95F05D26FE,1,B5BCF12F1FE81482A985C1FCADEEEA4A,1100044800,3,False,"11 10, 2004",{'Format:': ' Audio CD'},459BB37B144704161F27BCFC13A8071D,"Welll first thier young and hopeless, and suck...",...,,770782,0.042,0.875,0.083,0.4767,0.0,1.000,0.000,0.0000
770783,FFFF5A3D9CB0B40FF0FE6B95F05D26FE,1,C63DE291C79BE53DD6596437916EC34E,1391126400,,False,"01 31, 2014",{'Format:': ' Audio CD'},21F0635609927C23142499715DA76546,So I'm riding to work with this dude. Nice br...,...,,770783,0.138,0.648,0.214,0.5371,0.0,1.000,0.000,0.0000
770784,FFFF5A3D9CB0B40FF0FE6B95F05D26FE,1,6A43CE26BAB6A6D18AC5DF5B635238E0,1097020800,3,False,"10 6, 2004",{'Format:': ' Audio CD'},8734979613D4F8D97756DE9AEFC245B3,"Oh s**t, first they TRIED to be punk and now t...",...,,770784,0.000,0.798,0.202,0.7444,0.0,0.328,0.672,0.6249


In [100]:
# Process the columns that are not numeric
encoder = LabelEncoder()

# Define a function to extract the format information from the "style" column
def extract_format(style):
    if style is None:
        return "None"
    else:
        return style.get("Format:", "None").strip()

# Apply the function to the "style" column to extract the format information
training_data["style"] = training_data["style"].apply(extract_format)
training_data

Unnamed: 0,asin,awesomeness,reviewerID,unixReviewTime,vote,verified,reviewTime,style,reviewerName,reviewText,...,image,reviewID,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,0000B049F5B33CD310EB1AB236E20191,1,D99D95C2B351EF6AF85580B6639D397C,1412294400,,True,"10 3, 2014",Audio CD,12A80DAD02AB007538C670D2CF5F0999,"Even tho I love this album, I am having proble...",...,,0,0.020,0.757,0.223,0.9926,0.0,1.000,0.000,0.0000
1,0000B049F5B33CD310EB1AB236E20191,1,B22BD9411BCB845478B2F7DDC16DBC9F,1413417600,,False,"10 16, 2014",Audio CD,08434218ABA526223A66E2A8B4C38DA8,I LOVE IT!!!,...,,1,0.101,0.822,0.077,0.3222,0.0,1.000,0.000,0.0000
2,0000B049F5B33CD310EB1AB236E20191,1,348C9063667D083543986178A70BCF2F,1427068800,,True,"03 23, 2015",Audio CD,3AA76E176E4BE75233EB0557F9C1738E,Nancy Wilson is still one of the most distinct...,...,,2,0.088,0.709,0.203,0.7845,0.0,1.000,0.000,0.0000
3,0000B049F5B33CD310EB1AB236E20191,1,D5098BA77D43EC644CF38CE59ED03DF6,933552000,6,False,"08 2, 1999",Audio CD,802D103F0C999BF1E71DF82E52BA5F0D,Having been a Nancy Wilson fan for over twenty...,...,,3,0.054,0.788,0.158,0.7430,0.0,0.426,0.574,0.4678
4,00018184A9EC4D270219A296B2580303,0,D29172D2A2B9887A8AEC9A9DF9F3DA78,997488000,22,False,"08 11, 2001",Audio CD,EA0DDA7564052267BCAB87316B53E400,"The musical genre of dance/electronica is, at ...",...,,4,0.000,0.196,0.804,0.6249,0.0,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770781,FFFF5A3D9CB0B40FF0FE6B95F05D26FE,1,196A1B3BDB2769C74DE42F26AB907DA1,1100563200,5,False,"11 16, 2004",Audio CD,2AF77877103CB27C4FA8F432847460BD,a come back from the very suscessful last albu...,...,,770781,0.054,0.800,0.146,0.9555,0.0,1.000,0.000,0.0000
770782,FFFF5A3D9CB0B40FF0FE6B95F05D26FE,1,B5BCF12F1FE81482A985C1FCADEEEA4A,1100044800,3,False,"11 10, 2004",Audio CD,459BB37B144704161F27BCFC13A8071D,"Welll first thier young and hopeless, and suck...",...,,770782,0.042,0.875,0.083,0.4767,0.0,1.000,0.000,0.0000
770783,FFFF5A3D9CB0B40FF0FE6B95F05D26FE,1,C63DE291C79BE53DD6596437916EC34E,1391126400,,False,"01 31, 2014",Audio CD,21F0635609927C23142499715DA76546,So I'm riding to work with this dude. Nice br...,...,,770783,0.138,0.648,0.214,0.5371,0.0,1.000,0.000,0.0000
770784,FFFF5A3D9CB0B40FF0FE6B95F05D26FE,1,6A43CE26BAB6A6D18AC5DF5B635238E0,1097020800,3,False,"10 6, 2004",Audio CD,8734979613D4F8D97756DE9AEFC245B3,"Oh s**t, first they TRIED to be punk and now t...",...,,770784,0.000,0.798,0.202,0.7444,0.0,0.328,0.672,0.6249


In [101]:
# Encode the columns
encoder = LabelEncoder()
training_data["style"] = encoder.fit_transform(training_data["style"])

# Encode the "verified" column
training_data["verified"] = encoder.fit_transform(training_data["verified"])

# Encode the "reviewerID" column
training_data["reviewerID"] = encoder.fit_transform(training_data["reviewerID"])

# Encode the "vote"" column
training_data["vote"] = training_data["vote"].apply(lambda x: float(x.replace(",", "")) if x is not None else 0)

# Encode the "image" column
training_data["image"] = training_data["image"].apply(lambda x: len(x) if x is not None else 0)

training_data

Unnamed: 0,asin,awesomeness,reviewerID,unixReviewTime,vote,verified,reviewTime,style,reviewerName,reviewText,...,image,reviewID,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,0000B049F5B33CD310EB1AB236E20191,1,94522,1412294400,0.0,1,"10 3, 2014",3,12A80DAD02AB007538C670D2CF5F0999,"Even tho I love this album, I am having proble...",...,0,0,0.020,0.757,0.223,0.9926,0.0,1.000,0.000,0.0000
1,0000B049F5B33CD310EB1AB236E20191,1,77388,1413417600,0.0,0,"10 16, 2014",3,08434218ABA526223A66E2A8B4C38DA8,I LOVE IT!!!,...,0,1,0.101,0.822,0.077,0.3222,0.0,1.000,0.000,0.0000
2,0000B049F5B33CD310EB1AB236E20191,1,22879,1427068800,0.0,1,"03 23, 2015",3,3AA76E176E4BE75233EB0557F9C1738E,Nancy Wilson is still one of the most distinct...,...,0,2,0.088,0.709,0.203,0.7845,0.0,1.000,0.000,0.0000
3,0000B049F5B33CD310EB1AB236E20191,1,92550,933552000,6.0,0,"08 2, 1999",3,802D103F0C999BF1E71DF82E52BA5F0D,Having been a Nancy Wilson fan for over twenty...,...,0,3,0.054,0.788,0.158,0.7430,0.0,0.426,0.574,0.4678
4,00018184A9EC4D270219A296B2580303,0,91472,997488000,22.0,0,"08 11, 2001",3,EA0DDA7564052267BCAB87316B53E400,"The musical genre of dance/electronica is, at ...",...,0,4,0.000,0.196,0.804,0.6249,0.0,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770781,FFFF5A3D9CB0B40FF0FE6B95F05D26FE,1,11206,1100563200,5.0,0,"11 16, 2004",3,2AF77877103CB27C4FA8F432847460BD,a come back from the very suscessful last albu...,...,0,770781,0.054,0.800,0.146,0.9555,0.0,1.000,0.000,0.0000
770782,FFFF5A3D9CB0B40FF0FE6B95F05D26FE,1,78948,1100044800,3.0,0,"11 10, 2004",3,459BB37B144704161F27BCFC13A8071D,"Welll first thier young and hopeless, and suck...",...,0,770782,0.042,0.875,0.083,0.4767,0.0,1.000,0.000,0.0000
770783,FFFF5A3D9CB0B40FF0FE6B95F05D26FE,1,86162,1391126400,0.0,0,"01 31, 2014",3,21F0635609927C23142499715DA76546,So I'm riding to work with this dude. Nice br...,...,0,770783,0.138,0.648,0.214,0.5371,0.0,1.000,0.000,0.0000
770784,FFFF5A3D9CB0B40FF0FE6B95F05D26FE,1,46059,1097020800,3.0,0,"10 6, 2004",3,8734979613D4F8D97756DE9AEFC245B3,"Oh s**t, first they TRIED to be punk and now t...",...,0,770784,0.000,0.798,0.202,0.7444,0.0,0.328,0.672,0.6249


In [102]:
# Compute the length of reviewText and summary columns
training_data["reviewText_len"] = training_data["reviewText"].apply(len)
training_data["summary_len"] = training_data["summary"].apply(len)

In [103]:
# # Filter out the reviews that are not verified, have no votes, and have no images unless there is no verified and voted reviews
filteredData = training_data.copy()
filteredData = filteredData[(filteredData["verified"] == 1) | (filteredData["vote"] >= 10) | (filteredData["image"] >= 1)]

# joining filteredData and training_data so we can have at least one review for each product
filteredData = filteredData._append(training_data[~training_data.asin.isin(filteredData.asin)])
training_data = filteredData.groupby('asin')
training_data

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x384f73ed0>

In [104]:
# Aggregate the training data by asin
training_data = training_data.agg({
    "reviewerID": "count",
    "unixReviewTime": ["min", "max", "mean", "std"],
    "verified": ["mean", "sum"],
    "vote": ["mean", "sum"],
    "image": ["mean", "sum"],
    "style": ["mean", "sum"],
    "reviewText_neg": ["mean", "std"],
    "reviewText_neu": ["mean", "std"],
    "reviewText_pos": ["mean", "std"],
    "reviewText_compound": ["mean", "std"],
    "summary_neg": ["mean", "std"],
    "summary_neu": ["mean", "std"],
    "summary_pos": ["mean", "std"],
    "summary_compound": ["mean", "std"],
    "reviewText_len": ["mean", "std"],
    "summary_len": ["mean", "std"],
}).reset_index()

training_data

Unnamed: 0_level_0,asin,reviewerID,unixReviewTime,unixReviewTime,unixReviewTime,unixReviewTime,verified,verified,vote,vote,...,summary_neu,summary_neu,summary_pos,summary_pos,summary_compound,summary_compound,reviewText_len,reviewText_len,summary_len,summary_len
Unnamed: 0_level_1,Unnamed: 1_level_1,count,min,max,mean,std,mean,sum,mean,sum,...,mean,std,mean,std,mean,std,mean,std,mean,std
0,0000B049F5B33CD310EB1AB236E20191,2,1412294400,1427068800,1.419682e+09,1.044708e+07,1.000000,2,0.000000,0.0,...,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,686.000000,115.965512,31.500000,27.577164
1,00018184A9EC4D270219A296B2580303,4,997488000,1499558400,1.333433e+09,2.345825e+08,0.750000,3,5.500000,22.0,...,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1325.750000,2472.037806,21.000000,8.041559
2,000281A9CAC43FF1F335726A390636DA,3,1206748800,1417219200,1.326499e+09,1.081967e+08,1.000000,3,0.000000,0.0,...,0.303667,0.066463,0.696333,0.066463,0.647967,0.161507,748.000000,1232.756667,20.666667,10.066446
3,00030884DF109F325638A6BFD5B13CFF,18,1060473600,1496880000,1.379150e+09,1.140571e+08,0.944444,17,2.388889,43.0,...,0.636000,0.337530,0.272111,0.344625,0.179011,0.351419,152.388889,238.397133,20.944444,13.531976
4,000325BA25966B5FC701D5D2B5DBA4E0,4,1283472000,1456617600,1.363802e+09,7.752298e+07,1.000000,4,1.750000,7.0,...,0.817000,0.366000,0.183000,0.366000,0.212025,0.424050,469.250000,732.887156,21.750000,14.198005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73077,FFFDD3C72D23AF858D6E0ED92612370D,25,1006819200,1507507200,1.394282e+09,1.219028e+08,0.960000,24,1.160000,29.0,...,0.701560,0.329317,0.263760,0.343665,0.228032,0.401541,710.600000,1204.892768,21.840000,17.973777
73078,FFFDDE284A73B29B320381487EC7DE9E,2,1295913600,1506470400,1.401192e+09,1.488861e+08,1.000000,2,1.000000,2.0,...,0.659500,0.109602,0.340500,0.109602,0.505900,0.204637,369.500000,482.953932,15.500000,2.121320
73079,FFFEB3EE2372807964F024707D50FB21,2,1345852800,1352246400,1.349050e+09,4.520958e+06,1.000000,2,0.000000,0.0,...,0.810500,0.267993,0.189500,0.267993,0.353650,0.500137,651.500000,129.400541,33.500000,6.363961
73080,FFFF4545AB232D81D0F9B208388BB7AA,4,1417651200,1446076800,1.434283e+09,1.287727e+07,1.000000,4,0.000000,0.0,...,0.493000,0.170151,0.507000,0.170151,0.567075,0.300785,553.750000,226.483811,62.500000,12.974334


In [105]:
# Add +1 to compound columns to avoid negative values
training_data["reviewText_compound"] += 1
training_data["summary_compound"] += 1
# Replace NaN values with 0
training_data.fillna(0, inplace=True)
training_data

Unnamed: 0_level_0,asin,reviewerID,unixReviewTime,unixReviewTime,unixReviewTime,unixReviewTime,verified,verified,vote,vote,...,summary_neu,summary_neu,summary_pos,summary_pos,summary_compound,summary_compound,reviewText_len,reviewText_len,summary_len,summary_len
Unnamed: 0_level_1,Unnamed: 1_level_1,count,min,max,mean,std,mean,sum,mean,sum,...,mean,std,mean,std,mean,std,mean,std,mean,std
0,0000B049F5B33CD310EB1AB236E20191,2,1412294400,1427068800,1.419682e+09,1.044708e+07,1.000000,2,0.000000,0.0,...,1.000000,0.000000,0.000000,0.000000,1.000000,1.000000,686.000000,115.965512,31.500000,27.577164
1,00018184A9EC4D270219A296B2580303,4,997488000,1499558400,1.333433e+09,2.345825e+08,0.750000,3,5.500000,22.0,...,1.000000,0.000000,0.000000,0.000000,1.000000,1.000000,1325.750000,2472.037806,21.000000,8.041559
2,000281A9CAC43FF1F335726A390636DA,3,1206748800,1417219200,1.326499e+09,1.081967e+08,1.000000,3,0.000000,0.0,...,0.303667,0.066463,0.696333,0.066463,1.647967,1.161507,748.000000,1232.756667,20.666667,10.066446
3,00030884DF109F325638A6BFD5B13CFF,18,1060473600,1496880000,1.379150e+09,1.140571e+08,0.944444,17,2.388889,43.0,...,0.636000,0.337530,0.272111,0.344625,1.179011,1.351419,152.388889,238.397133,20.944444,13.531976
4,000325BA25966B5FC701D5D2B5DBA4E0,4,1283472000,1456617600,1.363802e+09,7.752298e+07,1.000000,4,1.750000,7.0,...,0.817000,0.366000,0.183000,0.366000,1.212025,1.424050,469.250000,732.887156,21.750000,14.198005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73077,FFFDD3C72D23AF858D6E0ED92612370D,25,1006819200,1507507200,1.394282e+09,1.219028e+08,0.960000,24,1.160000,29.0,...,0.701560,0.329317,0.263760,0.343665,1.228032,1.401541,710.600000,1204.892768,21.840000,17.973777
73078,FFFDDE284A73B29B320381487EC7DE9E,2,1295913600,1506470400,1.401192e+09,1.488861e+08,1.000000,2,1.000000,2.0,...,0.659500,0.109602,0.340500,0.109602,1.505900,1.204637,369.500000,482.953932,15.500000,2.121320
73079,FFFEB3EE2372807964F024707D50FB21,2,1345852800,1352246400,1.349050e+09,4.520958e+06,1.000000,2,0.000000,0.0,...,0.810500,0.267993,0.189500,0.267993,1.353650,1.500137,651.500000,129.400541,33.500000,6.363961
73080,FFFF4545AB232D81D0F9B208388BB7AA,4,1417651200,1446076800,1.434283e+09,1.287727e+07,1.000000,4,0.000000,0.0,...,0.493000,0.170151,0.507000,0.170151,1.567075,1.300785,553.750000,226.483811,62.500000,12.974334


In [107]:
column_dict = {
    #"reviewerID": ["count"],
    "unixReviewTime": ["min", "max", "mean", "std"],
    "verified": ["mean", "sum"],
    "vote": ["mean", "sum"],
    "image": ["mean", "sum"],
    "style": ["mean", "sum"],
    "reviewText_neg": ["mean", "std"],
    "reviewText_neu": ["mean", "std"],
    "reviewText_pos": ["mean", "std"],
    "reviewText_compound": ["mean", "std"],
    "summary_neg": ["mean", "std"],
    "summary_neu": ["mean", "std"],
    "summary_pos": ["mean", "std"],
    "summary_compound": ["mean", "std"],
    "reviewText_len": ["mean", "std"],
    "summary_len": ["mean", "std"],
}
column_list = []
for k in column_dict:
    for n in column_dict[k]:
        column_list.append((k, n))

In [108]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Define the columns to normalize
#cols_to_normalize = ['unixReviewTime', 'verified', 'vote', 'image', 'style', 'reviewText_neg', 'reviewText_neu', 'reviewText_pos', 'reviewText_compound', 'summary_neg', 'summary_neu', 'summary_pos', 'summary_compound', 'reviewText_len', 'summary_len']
#cols_to_normalize = [('verified','mean')]
# Normalize the data using the MinMaxScaler
#scaled_df = scaler.fit_transform(training_data[cols_to_normalize])
# training_data
training_data[column_list] = scaler.fit_transform(training_data[column_list])
# training_data

In [109]:
# Merge the training data with the awesomeness data
file_path = os.path.join(data_dir, categories[0], 'train', 'product_training.json')
product_training = pd.read_json(file_path)
training_data.columns = training_data.columns.to_flat_index()

In [110]:
training_data['asin'] = training_data[('asin', '')]

In [111]:
# Merge the training data with the awesomeness data
file_path = os.path.join(data_dir, categories[0], 'train', 'product_training.json')
product_training = pd.read_json(file_path)
training_data = training_data.merge(product_training, on='asin', how='left')

# Visualize the absolute correlation between the features on "awesomeness"
training_data.corr()["awesomeness"].abs().sort_values(ascending=False)
training_data

ValueError: could not convert string to float: '0000B049F5B33CD310EB1AB236E20191'

In [112]:
training_data = training_data.drop(training_data.columns[1], axis=1)

In [113]:
# Prepare the data for training
# Keep only the most important features for predicting awesomeness
X = training_data[[    
    ('reviewText_neg', 'mean'),
    ('summary_neg', 'mean'),
    ('summary_neg', 'std')
]].values
y = training_data["awesomeness"].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
X_train

array([[0.04466667, 0.        , 0.        ],
       [0.04566667, 0.        , 0.        ],
       [0.0464    , 0.0642    , 0.20301823],
       ...,
       [0.009     , 0.        , 0.        ],
       [0.0175    , 0.        , 0.        ],
       [0.02915385, 0.00946154, 0.04824457]])

In [114]:
# Train the model on naive bayes, decision tree, and random forest classifiers

# Define hyperparameters for each classifier
gnb_params = {
'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}
mnb_params = {
'alpha': [0.1, 0.5, 1.0, 2.0]
}
bnb_params = {
'alpha': [0.1, 0.5, 1.0, 2.0],
'binarize': [0.0, 0.1, 0.5, 1.0]
}
cnb_params = {
'alpha': [0.1, 0.5, 1.0, 2.0]
}

# fit classifiers and make predictions on test set
best_f1_score = 0
best_classifier = None
classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB(), ComplementNB()]
classifier_params = [gnb_params, mnb_params, bnb_params, cnb_params]
classifier_names = ["Gaussian Naive Bayes", "Multinomial Naive Bayes", "Bernoulli Naive Bayes", "Complement Naive Bayes"]

for classifier, params, name in zip(classifiers, classifier_params, classifier_names):
    clf = GridSearchCV(classifier, params, scoring='f1', cv=10)
    with tqdm(total=100, desc=name) as pbar:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

    # compute evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # print results
    print(f"Results for {name}:")
    print(f"Best parameters: {clf.best_params_}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 score: {f1}")
    print(f"Confusion matrix:\n{cm}")

    # check if current classifier is the best one
    if f1 > best_f1_score:
        best_f1_score = f1
        best_classifier = name
        # save the best model
        joblib.dump(clf, best_classifier + '_model.pkl')

print(f"\nBest classifier: {best_classifier} (F1 score: {best_f1_score:.4f})")

Gaussian Naive Bayes:   0%|          | 0/100 [00:00<?, ?it/s]


Results for Gaussian Naive Bayes:
Best parameters: {'var_smoothing': 1e-09}
Accuracy: 0.5340357118423753
Precision: 0.5340357118423753
Recall: 1.0
F1 score: 0.6962493867903492
Confusion matrix:
[[   0 6811]
 [   0 7806]]


Multinomial Naive Bayes:   0%|          | 0/100 [00:00<?, ?it/s]


Results for Multinomial Naive Bayes:
Best parameters: {'alpha': 0.1}
Accuracy: 0.5340357118423753
Precision: 0.5340357118423753
Recall: 1.0
F1 score: 0.6962493867903492
Confusion matrix:
[[   0 6811]
 [   0 7806]]


Bernoulli Naive Bayes:   0%|          | 0/100 [00:01<?, ?it/s]


Results for Bernoulli Naive Bayes:
Best parameters: {'alpha': 0.1, 'binarize': 0.0}
Accuracy: 0.5340357118423753
Precision: 0.5340357118423753
Recall: 1.0
F1 score: 0.6962493867903492
Confusion matrix:
[[   0 6811]
 [   0 7806]]


Complement Naive Bayes:   0%|          | 0/100 [00:00<?, ?it/s]

Results for Complement Naive Bayes:
Best parameters: {'alpha': 2.0}
Accuracy: 0.4746528015324622
Precision: 0.556045895851721
Recall: 0.08070714834742505
F1 score: 0.14095536413469065
Confusion matrix:
[[6308  503]
 [7176  630]]

Best classifier: Gaussian Naive Bayes (F1 score: 0.6962)





In [None]:
# Define hyperparameters for the decision tree classifier
dt_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# fit the decision tree classifier and make predictions on test set
dt_clf = GridSearchCV(DecisionTreeClassifier(), dt_params, scoring='f1', cv=10)
with tqdm(total=100, desc="Decision Tree") as pbar:
    dt_clf.fit(X_train, y_train)
    y_pred = dt_clf.predict(X_test)

# compute evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# print results
print("Results for Decision Tree:")
print(f"Best parameters: {dt_clf.best_params_}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")
print(f"Confusion matrix:\n{cm}")

# save the best model
joblib.dump(dt_clf, 'Decision_Tree_model.pkl')

Decision Tree:   0%|          | 0/100 [01:39<?, ?it/s]

Results for Decision Tree:
Best parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
Accuracy: 0.5432072829131652
Precision: 0.5436804283500071
Recall: 0.9940744557516424
F1 score: 0.7029193423509587
Confusion matrix:
[[  40 6477]
 [  46 7717]]





['Decision_Tree_model.pkl']

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm
import joblib

# Define hyperparameters for each classifier
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
gb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'learning_rate': [0.01, 0.1, 1.0]
}

# fit classifiers and make predictions on test set
best_f1_score = 0
best_classifier = None
classifiers = [RandomForestClassifier(), GradientBoostingClassifier()]
classifier_params = [rf_params, gb_params]
classifier_names = ["Random Forest", "Gradient Boosting"]

for classifier, params, name in zip(classifiers, classifier_params, classifier_names):
    #clf = GridSearchCV(classifier, params, scoring='f1', cv=10)
    #with tqdm(total=100, desc=name) as pbar:
        #clf.fit(X_train, y_train)
        #y_pred = clf.predict(X_test)

    # compute evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # print results
    print(f"Results for {name}:")
    print(f"Best parameters: {clf.best_params_}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 score: {f1}")
    print(f"Confusion matrix:\n{cm}")

    # check if current classifier is the best one
    if f1 > best_f1_score:
        best_f1_score = f1
        best_classifier = name
        # save the best model
        joblib.dump(clf, best_classifier + '_model.pkl')

print(f"\nBest classifier: {best_classifier} (F1 score: {best_f1_score:.4f})")

Results for Random Forest:
Best parameters: {'alpha': 2.0}
Accuracy: 0.5432072829131652
Precision: 0.5436804283500071
Recall: 0.9940744557516424
F1 score: 0.7029193423509587
Confusion matrix:
[[  40 6477]
 [  46 7717]]
Results for Gradient Boosting:
Best parameters: {'alpha': 2.0}
Accuracy: 0.5432072829131652
Precision: 0.5436804283500071
Recall: 0.9940744557516424
F1 score: 0.7029193423509587
Confusion matrix:
[[  40 6477]
 [  46 7717]]

Best classifier: Random Forest (F1 score: 0.7029)
