In [14]:
# Project: Final Team Predictive Models
# COMP_SCI 349-0 Machine Learning | Professor V.S.
# By Sengdao Inthavong, Lillian Torres, Haylie Wu, Ernie Wang

# This file contains the code for the predictive models used in our project.
# The models are trained on the training data and tested on the testing data for the CDs and vinyls.

In [22]:
# Import basic libraries
import os
import numpy as np
import pandas as pd

# Import libraries for text processing
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import LabelEncoder 

# Import libraries for machine learning
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB, CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Import libraries for logging
import logging
from tqdm import tqdm

In [60]:
# Load product and review data for CDs and vinyls from the training set
data_dir = ''
categories = ['CDs_and_Vinyl', 'Grocery_and_Gourmet_Food', 'Toys_and_Games']

file_path = os.path.join(data_dir, categories[0], 'train', 'product_training.json')
product_training = pd.read_json(file_path)

file_path = os.path.join(data_dir, categories[0], 'train', 'review_training.json')
review_training = pd.read_json(file_path)

# Merge product and review data
training_data = review_training.merge(product_training, on='asin', how='left')

In [61]:
# Fill in any missing values
training_data['reviewText'].fillna('', inplace=True)
training_data['summary'].fillna('', inplace=True)

In [62]:
# Give each review a unique ID
training_data['reviewID'] = training_data.index

In [63]:
# Run sentiment analysis on the review text and summary
# Columns: neg, neu, pos, compound

# sid = SentimentIntensityAnalyzer()

# review_sentiments = pd.DataFrame(columns=['reviewID', 'reviewText_neg', 'reviewText_neu', 'reviewText_pos', 'reviewText_compound', 'summary_neg', 'summary_neu', 'summary_pos', 'summary_compound'])

# for index, row in tqdm(training_data.iterrows(), total=training_data.shape[0], desc="Sentiment Analysis"):
#     review_text_sentiment = sid.polarity_scores(row['reviewText'])
#     summary_text_sentiment = sid.polarity_scores(row['summary'])
    
#     sentiment_row = {'reviewID': row['reviewID'],
#                      'reviewText_neg': review_text_sentiment['neg'],
#                      'reviewText_neu': review_text_sentiment['neu'],
#                      'reviewText_pos': review_text_sentiment['pos'],
#                      'reviewText_compound': review_text_sentiment['compound'],
#                      'summary_neg': summary_text_sentiment['neg'],
#                      'summary_neu': summary_text_sentiment['neu'],
#                      'summary_pos': summary_text_sentiment['pos'],
#                      'summary_compound': summary_text_sentiment['compound']}
    
#     review_sentiments = review_sentiments.append(sentiment_row, ignore_index=True)

# # Save the sentiment data to a csv file for future use
# file_path = os.path.join(data_dir, categories[0], 'csv', 'review_sentiments.csv')
# review_sentiments.to_csv(file_path, index=False)
review_sentiments = pd.read_csv('review_sentiments.csv')
review_sentiments

Unnamed: 0,reviewID,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,0.0,0.020,0.757,0.223,0.9926,0.0,1.000,0.000,0.0000
1,1.0,0.101,0.822,0.077,0.3222,0.0,1.000,0.000,0.0000
2,2.0,0.088,0.709,0.203,0.7845,0.0,1.000,0.000,0.0000
3,3.0,0.054,0.788,0.158,0.7430,0.0,0.426,0.574,0.4678
4,4.0,0.000,0.196,0.804,0.6249,0.0,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...
770781,770781.0,0.054,0.800,0.146,0.9555,0.0,1.000,0.000,0.0000
770782,770782.0,0.042,0.875,0.083,0.4767,0.0,1.000,0.000,0.0000
770783,770783.0,0.138,0.648,0.214,0.5371,0.0,1.000,0.000,0.0000
770784,770784.0,0.000,0.798,0.202,0.7444,0.0,0.328,0.672,0.6249


In [64]:
# Merge the sentiment data with the training data
training_data = training_data.merge(review_sentiments, on='reviewID', how='left')
training_data

Unnamed: 0,asin,reviewerID,unixReviewTime,vote,verified,reviewTime,style,reviewerName,reviewText,summary,...,awesomeness,reviewID,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,6673F1740E03573BCD64238FE711FC69,9C856D4A18E1355783B3B28B7ECC1848,1451520000,9,True,"12 31, 2015",{'Format:': ' MP3 Music'},8D88BB79AAC50277AEE82FCFD77F6744,Finding the Beatles all over again - and bette...,I sit listening - with my jaw to the floor - H...,...,1,0,0.020,0.757,0.223,0.9926,0.0,1.000,0.000,0.0000
1,690819436E20BB31657AF6B58B984DD4,6E9ABBD26A27C2B2851D1EC34A01CBDC,1113523200,9,False,"04 15, 2005",{'Format:': ' Audio CD'},8DC0611245A871AC51BBEEBB85F33A58,These guys can sing! Such classic tunes...poi...,Under Appreciated....,...,1,1,0.101,0.822,0.077,0.3222,0.0,1.000,0.000,0.0000
2,A680D4753F0CEA2252C168A6ACB2B623,B637C3C93E61094474710F456928BE9F,1126137600,,False,"09 8, 2005",,2259386624CFA0EC53A75A50A9BB57A5,"Snoop Doggy Dogg made a classic album, DoggySt...",DoggyStyle,...,1,2,0.088,0.709,0.203,0.7845,0.0,1.000,0.000,0.0000
3,F4A966F1FA340B16651D676BC246D227,AA7918E9410D650A076221C7B2934A09,954979200,,False,"04 6, 2000",{'Format:': ' Audio CD'},7A65A155C993535BC99CBCB39E7161B5,Stevie Nicks Has had Her Moments. I Like Some ...,Pretty Good but a Bit Dated,...,1,3,0.054,0.788,0.158,0.7430,0.0,0.426,0.574,0.4678
4,EF59DAF0C00319A48D4784266FD157EE,2293C9B7950A3356B95828419A677720,1477958400,,True,"11 1, 2016",{'Format:': ' Audio CD'},C69A09446009C500B1364B7DB5510497,Great cd.,Five Stars,...,1,4,0.000,0.196,0.804,0.6249,0.0,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770781,BD91503308A437374C3254EDC8BC24CB,936ED23AF4D23943786BBD44D0F1114B,1136246400,12,True,"01 3, 2006",{'Format:': ' Audio CD'},AAB61000438939C8E6165CFCCF02A488,This was the first Simple Minds album that I b...,Their Most Fully Realized Artistic Studio Stat...,...,1,770781,0.054,0.800,0.146,0.9555,0.0,1.000,0.000,0.0000
770782,9BC50277D18FAB423AD33C8CE4CC000D,EF922377A87E9D01F50065F2DA1722A8,1290556800,,False,"11 24, 2010",,0441BC4F6B7BD180769FDCDD8E603560,I have owned a CD copy of this show for at lea...,Forgettable R&H,...,0,770782,0.042,0.875,0.083,0.4767,0.0,1.000,0.000,0.0000
770783,4062627CA1586E517520483964299349,E1F0B0EBC6A36F33301E4FD0B3D62D52,1311120000,3,False,"07 20, 2011",{'Format:': ' Audio CD'},8AB3EEBF23F3583A4396A57DB291D548,carnival of souls to me is bad i gave my cd aw...,Darren d.,...,0,770783,0.138,0.648,0.214,0.5371,0.0,1.000,0.000,0.0000
770784,0AE44A6A9176E6A52507B6ABDDA80B00,DDDC81E6B8C3F8C91867F9AECB385135,1111968000,2,False,"03 28, 2005",{'Format:': ' Audio CD'},51B2E62E0A5864316BDB33FD4A729B37,This is an awesome slayer album. I love the th...,awesome slayer cd,...,1,770784,0.000,0.798,0.202,0.7444,0.0,0.328,0.672,0.6249


In [65]:
# Process the columns that are not numeric
encoder = LabelEncoder()

# Define a function to extract the format information from the "style" column
def extract_format(style):
    if style is None:
        return "None"
    else:
        return style.get("Format:", "None").strip()

# Apply the function to the "style" column to extract the format information
training_data["style"] = training_data["style"].apply(extract_format)
training_data

Unnamed: 0,asin,reviewerID,unixReviewTime,vote,verified,reviewTime,style,reviewerName,reviewText,summary,...,awesomeness,reviewID,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,6673F1740E03573BCD64238FE711FC69,9C856D4A18E1355783B3B28B7ECC1848,1451520000,9,True,"12 31, 2015",MP3 Music,8D88BB79AAC50277AEE82FCFD77F6744,Finding the Beatles all over again - and bette...,I sit listening - with my jaw to the floor - H...,...,1,0,0.020,0.757,0.223,0.9926,0.0,1.000,0.000,0.0000
1,690819436E20BB31657AF6B58B984DD4,6E9ABBD26A27C2B2851D1EC34A01CBDC,1113523200,9,False,"04 15, 2005",Audio CD,8DC0611245A871AC51BBEEBB85F33A58,These guys can sing! Such classic tunes...poi...,Under Appreciated....,...,1,1,0.101,0.822,0.077,0.3222,0.0,1.000,0.000,0.0000
2,A680D4753F0CEA2252C168A6ACB2B623,B637C3C93E61094474710F456928BE9F,1126137600,,False,"09 8, 2005",,2259386624CFA0EC53A75A50A9BB57A5,"Snoop Doggy Dogg made a classic album, DoggySt...",DoggyStyle,...,1,2,0.088,0.709,0.203,0.7845,0.0,1.000,0.000,0.0000
3,F4A966F1FA340B16651D676BC246D227,AA7918E9410D650A076221C7B2934A09,954979200,,False,"04 6, 2000",Audio CD,7A65A155C993535BC99CBCB39E7161B5,Stevie Nicks Has had Her Moments. I Like Some ...,Pretty Good but a Bit Dated,...,1,3,0.054,0.788,0.158,0.7430,0.0,0.426,0.574,0.4678
4,EF59DAF0C00319A48D4784266FD157EE,2293C9B7950A3356B95828419A677720,1477958400,,True,"11 1, 2016",Audio CD,C69A09446009C500B1364B7DB5510497,Great cd.,Five Stars,...,1,4,0.000,0.196,0.804,0.6249,0.0,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770781,BD91503308A437374C3254EDC8BC24CB,936ED23AF4D23943786BBD44D0F1114B,1136246400,12,True,"01 3, 2006",Audio CD,AAB61000438939C8E6165CFCCF02A488,This was the first Simple Minds album that I b...,Their Most Fully Realized Artistic Studio Stat...,...,1,770781,0.054,0.800,0.146,0.9555,0.0,1.000,0.000,0.0000
770782,9BC50277D18FAB423AD33C8CE4CC000D,EF922377A87E9D01F50065F2DA1722A8,1290556800,,False,"11 24, 2010",,0441BC4F6B7BD180769FDCDD8E603560,I have owned a CD copy of this show for at lea...,Forgettable R&H,...,0,770782,0.042,0.875,0.083,0.4767,0.0,1.000,0.000,0.0000
770783,4062627CA1586E517520483964299349,E1F0B0EBC6A36F33301E4FD0B3D62D52,1311120000,3,False,"07 20, 2011",Audio CD,8AB3EEBF23F3583A4396A57DB291D548,carnival of souls to me is bad i gave my cd aw...,Darren d.,...,0,770783,0.138,0.648,0.214,0.5371,0.0,1.000,0.000,0.0000
770784,0AE44A6A9176E6A52507B6ABDDA80B00,DDDC81E6B8C3F8C91867F9AECB385135,1111968000,2,False,"03 28, 2005",Audio CD,51B2E62E0A5864316BDB33FD4A729B37,This is an awesome slayer album. I love the th...,awesome slayer cd,...,1,770784,0.000,0.798,0.202,0.7444,0.0,0.328,0.672,0.6249


In [66]:
# Encode the columns
encoder = LabelEncoder()
training_data["style"] = encoder.fit_transform(training_data["style"])

# Encode the "verified" column
training_data["verified"] = encoder.fit_transform(training_data["verified"])

# Encode the "reviewerID" column
training_data["reviewerID"] = encoder.fit_transform(training_data["reviewerID"])

# Encode the "vote"" column
training_data["vote"] = training_data["vote"].apply(lambda x: float(x.replace(",", "")) if x is not None else 0)

# Encode the "image" column
training_data["image"] = training_data["image"].apply(lambda x: len(x) if x is not None else 0)

training_data

Unnamed: 0,asin,reviewerID,unixReviewTime,vote,verified,reviewTime,style,reviewerName,reviewText,summary,...,awesomeness,reviewID,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,6673F1740E03573BCD64238FE711FC69,67931,1451520000,9.0,1,"12 31, 2015",16,8D88BB79AAC50277AEE82FCFD77F6744,Finding the Beatles all over again - and bette...,I sit listening - with my jaw to the floor - H...,...,1,0,0.020,0.757,0.223,0.9926,0.0,1.000,0.000,0.0000
1,690819436E20BB31657AF6B58B984DD4,47952,1113523200,9.0,0,"04 15, 2005",3,8DC0611245A871AC51BBEEBB85F33A58,These guys can sing! Such classic tunes...poi...,Under Appreciated....,...,1,1,0.101,0.822,0.077,0.3222,0.0,1.000,0.000,0.0000
2,A680D4753F0CEA2252C168A6ACB2B623,79174,1126137600,0.0,0,"09 8, 2005",20,2259386624CFA0EC53A75A50A9BB57A5,"Snoop Doggy Dogg made a classic album, DoggySt...",DoggyStyle,...,1,2,0.088,0.709,0.203,0.7845,0.0,1.000,0.000,0.0000
3,F4A966F1FA340B16651D676BC246D227,74058,954979200,0.0,0,"04 6, 2000",3,7A65A155C993535BC99CBCB39E7161B5,Stevie Nicks Has had Her Moments. I Like Some ...,Pretty Good but a Bit Dated,...,1,3,0.054,0.788,0.158,0.7430,0.0,0.426,0.574,0.4678
4,EF59DAF0C00319A48D4784266FD157EE,15169,1477958400,0.0,1,"11 1, 2016",3,C69A09446009C500B1364B7DB5510497,Great cd.,Five Stars,...,1,4,0.000,0.196,0.804,0.6249,0.0,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770781,BD91503308A437374C3254EDC8BC24CB,63910,1136246400,12.0,1,"01 3, 2006",3,AAB61000438939C8E6165CFCCF02A488,This was the first Simple Minds album that I b...,Their Most Fully Realized Artistic Studio Stat...,...,1,770781,0.054,0.800,0.146,0.9555,0.0,1.000,0.000,0.0000
770782,9BC50277D18FAB423AD33C8CE4CC000D,104092,1290556800,0.0,0,"11 24, 2010",20,0441BC4F6B7BD180769FDCDD8E603560,I have owned a CD copy of this show for at lea...,Forgettable R&H,...,0,770782,0.042,0.875,0.083,0.4767,0.0,1.000,0.000,0.0000
770783,4062627CA1586E517520483964299349,98108,1311120000,3.0,0,"07 20, 2011",3,8AB3EEBF23F3583A4396A57DB291D548,carnival of souls to me is bad i gave my cd aw...,Darren d.,...,0,770783,0.138,0.648,0.214,0.5371,0.0,1.000,0.000,0.0000
770784,0AE44A6A9176E6A52507B6ABDDA80B00,96333,1111968000,2.0,0,"03 28, 2005",3,51B2E62E0A5864316BDB33FD4A729B37,This is an awesome slayer album. I love the th...,awesome slayer cd,...,1,770784,0.000,0.798,0.202,0.7444,0.0,0.328,0.672,0.6249


In [67]:
# Aggregate the training data by asin
training_data = training_data.groupby("asin").agg({
    "reviewerID": "count",
    "unixReviewTime": "mean",
    "verified": "mean",
    "vote": "mean",
    "image": "mean",
    "style": "mean",
    "reviewText_neg": "mean",
    "reviewText_neu": "mean",
    "reviewText_pos": "mean",
    "reviewText_compound": "mean",
    "summary_neg": "mean",
    "summary_neu": "mean",
    "summary_pos": "mean",
    "summary_compound": "mean"
}).reset_index()
training_data

Unnamed: 0,asin,reviewerID,unixReviewTime,verified,vote,image,style,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,0000B049F5B33CD310EB1AB236E20191,4,1.296583e+09,0.500000,1.500000,0.0,3.000000,0.028000,0.595000,0.376750,0.908350,0.000000,0.646500,0.353500,0.365925
1,00018184A9EC4D270219A296B2580303,18,1.111282e+09,0.166667,2.500000,0.0,3.000000,0.048833,0.761889,0.189333,0.736117,0.041556,0.760389,0.198111,0.181272
2,000281A9CAC43FF1F335726A390636DA,4,1.254960e+09,0.750000,1.250000,0.0,3.000000,0.025250,0.618000,0.356750,0.581325,0.194500,0.706000,0.099500,-0.007925
3,00030884DF109F325638A6BFD5B13CFF,29,1.315392e+09,0.586207,2.862069,0.0,3.862069,0.024207,0.629690,0.346138,0.628945,0.021448,0.744276,0.234276,0.230669
4,000325BA25966B5FC701D5D2B5DBA4E0,4,1.363802e+09,1.000000,1.750000,0.0,3.000000,0.005500,0.585000,0.409500,0.856500,0.105250,0.835250,0.059500,-0.096000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73077,FFFDD3C72D23AF858D6E0ED92612370D,64,1.258704e+09,0.375000,0.890625,0.0,5.562500,0.065938,0.648469,0.285578,0.698494,0.081844,0.623688,0.294453,0.197958
73078,FFFDDE284A73B29B320381487EC7DE9E,4,1.353758e+09,0.500000,2.000000,0.0,3.000000,0.017000,0.651750,0.331250,0.949700,0.172000,0.522250,0.305750,0.168350
73079,FFFEB3EE2372807964F024707D50FB21,2,1.349050e+09,1.000000,0.000000,0.0,3.000000,0.024500,0.741500,0.234000,0.955100,0.161500,0.661500,0.177500,0.038600
73080,FFFF4545AB232D81D0F9B208388BB7AA,7,1.427846e+09,0.571429,1.571429,0.0,3.000000,0.095857,0.740714,0.163429,0.510000,0.054429,0.700429,0.245000,0.389014


In [68]:
# Add +1 to compound columns to avoid negative values
training_data["reviewText_compound"] += 1
training_data["summary_compound"] += 1
training_data

Unnamed: 0,asin,reviewerID,unixReviewTime,verified,vote,image,style,reviewText_neg,reviewText_neu,reviewText_pos,reviewText_compound,summary_neg,summary_neu,summary_pos,summary_compound
0,0000B049F5B33CD310EB1AB236E20191,4,1.296583e+09,0.500000,1.500000,0.0,3.000000,0.028000,0.595000,0.376750,1.908350,0.000000,0.646500,0.353500,1.365925
1,00018184A9EC4D270219A296B2580303,18,1.111282e+09,0.166667,2.500000,0.0,3.000000,0.048833,0.761889,0.189333,1.736117,0.041556,0.760389,0.198111,1.181272
2,000281A9CAC43FF1F335726A390636DA,4,1.254960e+09,0.750000,1.250000,0.0,3.000000,0.025250,0.618000,0.356750,1.581325,0.194500,0.706000,0.099500,0.992075
3,00030884DF109F325638A6BFD5B13CFF,29,1.315392e+09,0.586207,2.862069,0.0,3.862069,0.024207,0.629690,0.346138,1.628945,0.021448,0.744276,0.234276,1.230669
4,000325BA25966B5FC701D5D2B5DBA4E0,4,1.363802e+09,1.000000,1.750000,0.0,3.000000,0.005500,0.585000,0.409500,1.856500,0.105250,0.835250,0.059500,0.904000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73077,FFFDD3C72D23AF858D6E0ED92612370D,64,1.258704e+09,0.375000,0.890625,0.0,5.562500,0.065938,0.648469,0.285578,1.698494,0.081844,0.623688,0.294453,1.197958
73078,FFFDDE284A73B29B320381487EC7DE9E,4,1.353758e+09,0.500000,2.000000,0.0,3.000000,0.017000,0.651750,0.331250,1.949700,0.172000,0.522250,0.305750,1.168350
73079,FFFEB3EE2372807964F024707D50FB21,2,1.349050e+09,1.000000,0.000000,0.0,3.000000,0.024500,0.741500,0.234000,1.955100,0.161500,0.661500,0.177500,1.038600
73080,FFFF4545AB232D81D0F9B208388BB7AA,7,1.427846e+09,0.571429,1.571429,0.0,3.000000,0.095857,0.740714,0.163429,1.510000,0.054429,0.700429,0.245000,1.389014


In [115]:
# Merge the training data with the awesomeness data
file_path = os.path.join(data_dir, categories[0], 'train', 'product_training.json')
product_training = pd.read_json(file_path)
training_data = training_data.merge(product_training, on='asin', how='left')

# Visualize the absolute correlation between the features on "awesomeness"
training_data.corr()["awesomeness"].abs().sort_values(ascending=False)


awesomeness            1.000000
awesomeness_x          1.000000
awesomeness_y          1.000000
awesomeness_x          1.000000
awesomeness_y          1.000000
reviewText_neg         0.142073
reviewText_pos         0.131876
summary_neg            0.123938
reviewText_neu         0.098996
summary_compound       0.091541
reviewText_compound    0.076495
unixReviewTime         0.076253
verified               0.073970
summary_pos            0.045333
style                  0.012240
summary_neu            0.010901
image                  0.004757
reviewerID             0.003925
vote                   0.001789
Name: awesomeness, dtype: float64

In [118]:
# Prepare the data for training
# Drop all columns except for reviewText_pos, summary_compound, reviewText_compound, unixReviewTime, and verified
X = training_data.drop(["asin", "awesomeness", "reviewerID", "vote", "image", "style", "unixReviewTime", "verified", "summary_neu", "summary_pos", "reviewText_compound", "reviewText_neu", "summary_compound"], axis=1)
# X = training_data.drop(["asin", "awesomeness"], axis=1)
y = training_data["awesomeness"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [119]:
X_train

Unnamed: 0,reviewText_neg,reviewText_pos,summary_neg,awesomeness_x,awesomeness_y,awesomeness_x.1,awesomeness_y.1
54375,0.052667,0.350333,0.000000,1,1,1,1
72310,0.043636,0.349091,0.029182,1,1,1,1
24225,0.009800,0.357000,0.000000,1,1,1,1
70762,0.015500,0.196667,0.000000,1,1,1,1
6212,0.035084,0.243825,0.053385,1,1,1,1
...,...,...,...,...,...,...,...
61653,0.041000,0.368800,0.028600,1,1,1,1
29194,0.000000,0.577250,0.000000,1,1,1,1
52351,0.000000,0.618750,0.000000,1,1,1,1
61572,0.016000,0.262000,0.000000,1,1,1,1


In [111]:
# Train the model

# Define hyperparameters for each classifier
gnb_params = {
    'var_smoothing': [1e-5]
}
mnb_params = {}
bnb_params = {}
cnb_params = {}

# fit classifiers and make predictions on test set
best_f1_score = 0
best_classifier = None
classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB(), ComplementNB()]
classifier_params = [gnb_params, mnb_params, bnb_params, cnb_params]
classifier_names = ["Gaussian Naive Bayes", "Multinomial Naive Bayes", "Bernoulli Naive Bayes", "Complement Naive Bayes"]

for classifier, params, name in zip(classifiers, classifier_params, classifier_names):
    clf = GridSearchCV(classifier, params, scoring='f1', cv=5)
    with tqdm(total=100, desc=name) as pbar:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

    # compute evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # print results
    print(f"Results for {name}:")
    print(f"Best parameters: {clf.best_params_}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 score: {f1}")
    print(f"Confusion matrix:\n{cm}")

    # check if current classifier is the best one
    if f1 > best_f1_score:
        best_f1_score = f1
        best_classifier = name
        # save the best model
        joblib.dump(clf, best_classifier + '_model.pkl')

print(f"\nBest classifier: {best_classifier} (F1 score: {best_f1_score:.4f})")

Gaussian Naive Bayes:   0%|          | 0/100 [00:00<?, ?it/s]
Multinomial Naive Bayes:   0%|          | 0/100 [00:00<?, ?it/s]


Results for Gaussian Naive Bayes:
Best parameters: {}
Accuracy: 0.5755568934376881
Precision: 0.5735271730857056
Recall: 0.7778465985341179
F1 score: 0.6602409638554216
Confusion matrix:
[[2981 5603]
 [2152 7535]]
Results for Multinomial Naive Bayes:
Best parameters: {}
Accuracy: 0.5370258880192655
Precision: 0.534382349647217
Recall: 0.9851347166305358


Bernoulli Naive Bayes:   0%|          | 0/100 [00:00<?, ?it/s]
Complement Naive Bayes:   0%|          | 0/100 [00:00<?, ?it/s]

F1 score: 0.6929025231439463
Confusion matrix:
[[ 269 8315]
 [ 144 9543]]
Results for Bernoulli Naive Bayes:
Best parameters: {}
Accuracy: 0.5595205516939412
Precision: 0.5788359788359788
Recall: 0.6211417363476824
F1 score: 0.599243103276566
Confusion matrix:
[[4206 4378]
 [3670 6017]]


Complement Naive Bayes:   0%|          | 0/100 [00:00<?, ?it/s]

Results for Complement Naive Bayes:
Best parameters: {}
Accuracy: 0.5782387389852772
Precision: 0.5920624593363696
Recall: 0.6575823268297719
F1 score: 0.6231047637679742
Confusion matrix:
[[4195 4389]
 [3317 6370]]

Best classifier: Multinomial Naive Bayes (F1 score: 0.6929)



