In [103]:
# Project: Final Team Predictive Models
# COMP_SCI 349-0 Machine Learning | Professor V.S.
# By Sengdao Inthavong, Lillian Torres, Haylie Wu, Ernie Wang

# This file contains the code for the predictive models used in our project.
# The models are trained on the training data and tested on the testing data for the CDs and vinyls.

In [104]:
# Import basic libraries
import os
import numpy as np
import pandas as pd

# Import libraries for text processing
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import LabelEncoder 

# Import libraries for machine learning
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB, CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Import libraries for logging
import logging
from tqdm import tqdm

In [105]:
# #GETTING THE DATA
# #getting to the data directory
# data_dir = 'devided_dataset_v2'
# categories = ['CDs_and_Vinyl', 'Grocery_and_Gourmet_Food', 'Toys_and_Games']
# #getting the products and the reviews for the shit
# resultpath = os.path.join(data_dir, categories[0], 'train', 'product_training.json')
# trainingpath = os.path.join(data_dir, categories[0], 'train', 'review_training.json')
# #reading the file with json
# reviewdata = pd.read_json(trainingpath)
# resultpath = pd.read_json(resultpath)
# #combine the 2 shits together
# training_data = resultpath.merge(reviewdata, how='left', on='asin')

In [106]:
# Load product and review data for CDs and vinyls from the training set
data_dir = ''
categories = ['CDs_and_Vinyl', 'Grocery_and_Gourmet_Food', 'Toys_and_Games']

file_path = os.path.join(data_dir, categories[0], 'train', 'product_training.json')
product_training = pd.read_json(file_path)

file_path = os.path.join(data_dir, categories[0], 'train', 'review_training.json')
review_training = pd.read_json(file_path)

# Merge product and review data
training_data = review_training.merge(product_training, on='asin', how='left')

In [107]:
# Fill in any missing values
training_data['reviewText'].fillna('', inplace=True)
training_data['summary'].fillna('', inplace=True)

In [108]:
# Give each review a unique ID
training_data['reviewID'] = training_data.index

In [109]:
# Run sentiment analysis on the review text and summary
# Columns: neg, neu, pos, compound

# sid = SentimentIntensityAnalyzer()

# review_sentiments = pd.DataFrame(columns=['reviewID', 'reviewText_neg', 'reviewText_neu', 'reviewText_pos', 'reviewText_compound', 'summary_neg', 'summary_neu', 'summary_pos', 'summary_compound'])

# for index, row in tqdm(training_data.iterrows(), total=training_data.shape[0], desc="Sentiment Analysis"):
#     review_text_sentiment = sid.polarity_scores(row['reviewText'])
#     summary_text_sentiment = sid.polarity_scores(row['summary'])
    
#     sentiment_row = {'reviewID': row['reviewID'],
#                      'reviewText_neg': review_text_sentiment['neg'],
#                      'reviewText_neu': review_text_sentiment['neu'],
#                      'reviewText_pos': review_text_sentiment['pos'],
#                      'reviewText_compound': review_text_sentiment['compound'],
#                      'summary_neg': summary_text_sentiment['neg'],
#                      'summary_neu': summary_text_sentiment['neu'],
#                      'summary_pos': summary_text_sentiment['pos'],
#                      'summary_compound': summary_text_sentiment['compound']}
    
#     review_sentiments = review_sentiments.append(sentiment_row, ignore_index=True)

# # Save the sentiment data to a csv file for future use
# file_path = os.path.join(data_dir, categories[0], 'csv', 'review_sentiments.csv')
# review_sentiments.to_csv(file_path, index=False)
review_sentiments = pd.read_csv('review_sentiments.csv')

In [110]:
# Merge the sentiment data with the training data
training_data = training_data.merge(review_sentiments, on='reviewID', how='left')

In [111]:
# Process the columns that are not numeric
encoder = LabelEncoder()

# Define a function to extract the format information from the "style" column
def extract_format(style):
    if style is None:
        return "None"
    else:
        return style.get("Format:", "None").strip()

# Apply the function to the "style" column to extract the format information
training_data["style"] = training_data["style"].apply(extract_format)

In [112]:
# Encode the columns
encoder = LabelEncoder()
training_data["style"] = encoder.fit_transform(training_data["style"])

# Encode the "verified" column
training_data["verified"] = encoder.fit_transform(training_data["verified"])

# Encode the "reviewerID" column
training_data["reviewerID"] = encoder.fit_transform(training_data["reviewerID"])

# Encode the "vote"" column
training_data["vote"] = training_data["vote"].apply(lambda x: float(x.replace(",", "")) if x is not None else 0)

# Encode the "image" column
training_data["image"] = training_data["image"].apply(lambda x: len(x) if x is not None else 0)

In [113]:
# Compute the length of reviewText and summary columns
training_data["reviewText_len"] = training_data["reviewText"].apply(len)
training_data["summary_len"] = training_data["summary"].apply(len)

In [114]:
# # Filter out the reviews that are not verified, have no votes, and have no images unless there is no verified and voted reviews
# training_data = training_data[(training_data["verified"] == 1) | (training_data["vote"] > 0) | (training_data["image"] > 0)]

# Normalize the compound scores
training_data["reviewText_compound_norm"] = (training_data["reviewText_compound"] - training_data["reviewText_compound"].mean()) / training_data["reviewText_compound"].std()
training_data["summary_compound_norm"] = (training_data["summary_compound"] - training_data["summary_compound"].mean()) / training_data["summary_compound"].std()

# Calculate the absolute difference between the normalized compound scores and the awesomeness
training_data["reviewText_compound_diff"] = abs(training_data["reviewText_compound_norm"] - training_data["awesomeness"])
training_data["summary_compound_diff"] = abs(training_data["summary_compound_norm"] - training_data["awesomeness"])

# Calculate the average difference between the normalized compound scores and the awesomeness for each asin
compound_diff_mean = training_data.groupby("asin")[["reviewText_compound_diff", "summary_compound_diff"]].mean()

# Sort the reviews for each asin by the average difference between the normalized compound scores and the awesomeness
compound_diff_mean["compound_diff_mean"] = compound_diff_mean.mean(axis=1)
compound_diff_mean = compound_diff_mean.sort_values("compound_diff_mean", ascending=False)

# Keep the top 2/3 of the reviews for each asin
num_asins = len(compound_diff_mean)
top_reviews_per_asin = int(num_asins * 2/3)
top_asins = compound_diff_mean.iloc[:top_reviews_per_asin].index
training_data = training_data[training_data["asin"].isin(top_asins)]
training_data

Unnamed: 0,asin,reviewerID,unixReviewTime,vote,verified,reviewTime,style,reviewerName,reviewText,summary,...,summary_neg,summary_neu,summary_pos,summary_compound,reviewText_len,summary_len,reviewText_compound_norm,summary_compound_norm,reviewText_compound_diff,summary_compound_diff
0,6673F1740E03573BCD64238FE711FC69,67931,1451520000,9.0,1,"12 31, 2015",16,8D88BB79AAC50277AEE82FCFD77F6744,Finding the Beatles all over again - and bette...,I sit listening - with my jaw to the floor - H...,...,0.0,1.000,0.000,0.0000,1119,128,0.709300,-0.616331,0.290700,1.616331
1,690819436E20BB31657AF6B58B984DD4,47952,1113523200,9.0,0,"04 15, 2005",3,8DC0611245A871AC51BBEEBB85F33A58,These guys can sing! Such classic tunes...poi...,Under Appreciated....,...,0.0,1.000,0.000,0.0000,457,21,-0.889340,-0.616331,1.889340,1.616331
2,A680D4753F0CEA2252C168A6ACB2B623,79174,1126137600,0.0,0,"09 8, 2005",20,2259386624CFA0EC53A75A50A9BB57A5,"Snoop Doggy Dogg made a classic album, DoggySt...",DoggyStyle,...,0.0,1.000,0.000,0.0000,321,10,0.213063,-0.616331,0.786937,1.616331
3,F4A966F1FA340B16651D676BC246D227,74058,954979200,0.0,0,"04 6, 2000",3,7A65A155C993535BC99CBCB39E7161B5,Stevie Nicks Has had Her Moments. I Like Some ...,Pretty Good but a Bit Dated,...,0.0,0.426,0.574,0.4678,294,27,0.114102,0.754990,0.885898,0.245010
4,EF59DAF0C00319A48D4784266FD157EE,15169,1477958400,0.0,1,"11 1, 2016",3,C69A09446009C500B1364B7DB5510497,Great cd.,Five Stars,...,0.0,1.000,0.000,0.0000,9,10,-0.167520,-0.616331,1.167520,1.616331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770781,BD91503308A437374C3254EDC8BC24CB,63910,1136246400,12.0,1,"01 3, 2006",3,AAB61000438939C8E6165CFCCF02A488,This was the first Simple Minds album that I b...,Their Most Fully Realized Artistic Studio Stat...,...,0.0,1.000,0.000,0.0000,886,53,0.620831,-0.616331,0.379169,1.616331
770782,9BC50277D18FAB423AD33C8CE4CC000D,104092,1290556800,0.0,0,"11 24, 2010",20,0441BC4F6B7BD180769FDCDD8E603560,I have owned a CD copy of this show for at lea...,Forgettable R&H,...,0.0,1.000,0.000,0.0000,265,15,-0.520918,-0.616331,0.520918,0.616331
770783,4062627CA1586E517520483964299349,98108,1311120000,3.0,0,"07 20, 2011",3,8AB3EEBF23F3583A4396A57DB291D548,carnival of souls to me is bad i gave my cd aw...,Darren d.,...,0.0,1.000,0.000,0.0000,215,9,-0.376888,-0.616331,0.376888,0.616331
770784,0AE44A6A9176E6A52507B6ABDDA80B00,96333,1111968000,2.0,0,"03 28, 2005",3,51B2E62E0A5864316BDB33FD4A729B37,This is an awesome slayer album. I love the th...,awesome slayer cd,...,0.0,0.328,0.672,0.6249,145,17,0.117441,1.215517,0.882559,0.215517


In [115]:
# Aggregate the training data by asin
training_data = training_data.groupby("asin").agg({
    "reviewerID": "count",
    "unixReviewTime": ["min", "max", "mean", "std"],
    "verified": ["mean", "sum"],
    "vote": ["mean", "sum"],
    "image": ["mean", "sum"],
    "style": ["mean", "sum"],
    "reviewText_neg": ["mean", "std"],
    "reviewText_neu": ["mean", "std"],
    "reviewText_pos": ["mean", "std"],
    "reviewText_compound": ["mean", "std"],
    "summary_neg": ["mean", "std"],
    "summary_neu": ["mean", "std"],
    "summary_pos": ["mean", "std"],
    "summary_compound": ["mean", "std"],
    "reviewText_len": ["mean", "std"],
    "summary_len": ["mean", "std"],
}).reset_index()

In [116]:
# Add +1 to compound columns to avoid negative values
training_data["reviewText_compound"] += 1
training_data["summary_compound"] += 1
# Replace NaN values with 0
training_data.fillna(0, inplace=True)

In [117]:
column_dict = {
    #"reviewerID": ["count"],
    "unixReviewTime": ["min", "max", "mean", "std"],
    "verified": ["mean", "sum"],
    "vote": ["mean", "sum"],
    "image": ["mean", "sum"],
    "style": ["mean", "sum"],
    "reviewText_neg": ["mean", "std"],
    "reviewText_neu": ["mean", "std"],
    "reviewText_pos": ["mean", "std"],
    "reviewText_compound": ["mean", "std"],
    "summary_neg": ["mean", "std"],
    "summary_neu": ["mean", "std"],
    "summary_pos": ["mean", "std"],
    "summary_compound": ["mean", "std"],
    "reviewText_len": ["mean", "std"],
    "summary_len": ["mean", "std"],
}
column_list = []
for k in column_dict:
    for n in column_dict[k]:
        column_list.append((k, n))

In [118]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Define the columns to normalize
#cols_to_normalize = ['unixReviewTime', 'verified', 'vote', 'image', 'style', 'reviewText_neg', 'reviewText_neu', 'reviewText_pos', 'reviewText_compound', 'summary_neg', 'summary_neu', 'summary_pos', 'summary_compound', 'reviewText_len', 'summary_len']
#cols_to_normalize = [('verified','mean')]
# Normalize the data using the MinMaxScaler
#scaled_df = scaler.fit_transform(training_data[cols_to_normalize])
# training_data
training_data[column_list] = scaler.fit_transform(training_data[column_list])
# training_data

In [119]:
# Merge the training data with the awesomeness data
file_path = os.path.join(data_dir, categories[0], 'train', 'product_training.json')
product_training = pd.read_json(file_path)
training_data.columns = training_data.columns.to_flat_index()

In [120]:
training_data['asin'] = training_data[('asin', '')]

In [121]:
training_data = training_data.merge(product_training, on='asin', how='left')

# Visualize the absolute correlation between the features on "awesomeness"
#training_data.corr()["awesomeness"].abs().sort_values(ascending=False)
training_data

Unnamed: 0,"(asin, )","(reviewerID, count)","(unixReviewTime, min)","(unixReviewTime, max)","(unixReviewTime, mean)","(unixReviewTime, std)","(verified, mean)","(verified, sum)","(vote, mean)","(vote, sum)",...,"(summary_pos, mean)","(summary_pos, std)","(summary_compound, mean)","(summary_compound, std)","(reviewText_len, mean)","(reviewText_len, std)","(summary_len, mean)","(summary_len, std)",asin,awesomeness
0,0000B049F5B33CD310EB1AB236E20191,4,0.071141,0.826499,0.628401,0.578536,0.500000,0.000137,0.010638,0.000061,...,0.353500,0.588926,0.666662,0.685984,0.045284,0.019032,0.198000,0.240570,0000B049F5B33CD310EB1AB236E20191,1
1,000281A9CAC43FF1F335726A390636DA,4,0.237047,0.811071,0.562543,0.401807,0.750000,0.000206,0.008865,0.000051,...,0.099500,0.281428,0.461148,0.686154,0.063303,0.057590,0.124000,0.111358,000281A9CAC43FF1F335726A390636DA,0
2,00030884DF109F325638A6BFD5B13CFF,29,0.175973,0.935851,0.658161,0.356251,0.586207,0.001166,0.020298,0.000839,...,0.234276,0.459918,0.592309,0.647604,0.040645,0.044603,0.185931,0.211929,00030884DF109F325638A6BFD5B13CFF,1
3,00039B53F332D3A911B0B18F88051C80,6,0.608188,0.967113,0.724858,0.207527,0.666667,0.000274,0.031915,0.000273,...,0.225833,0.572720,0.565967,0.616179,0.077139,0.036285,0.201333,0.155603,00039B53F332D3A911B0B18F88051C80,0
4,0004D01A4CED3FE007D35FB3933B3A6C,5,0.321879,0.907971,0.601121,0.336764,0.400000,0.000137,0.041135,0.000293,...,0.278000,0.405501,0.627893,0.653000,0.186272,0.092939,0.288000,0.183529,0004D01A4CED3FE007D35FB3933B3A6C,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48716,FFFC4E5BE7E879DD32C396CC4E92FACC,17,0.249530,0.929084,0.643255,0.360012,0.529412,0.000617,0.004172,0.000101,...,0.146706,0.395296,0.553328,0.623103,0.039460,0.035130,0.172235,0.184656,FFFC4E5BE7E879DD32C396CC4E92FACC,1
48717,FFFCA74DB5BAF31CDB9CB1E0D7299197,2,0.466040,0.487887,0.474504,0.039124,0.000000,0.000000,0.053191,0.000152,...,0.102500,0.205000,0.521109,0.548489,0.171143,0.073260,0.336000,0.101695,FFFCA74DB5BAF31CDB9CB1E0D7299197,1
48718,FFFDD3C72D23AF858D6E0ED92612370D,64,0.040805,0.952497,0.568466,0.446228,0.375000,0.001646,0.006316,0.000576,...,0.294453,0.444233,0.574327,0.665759,0.085908,0.066925,0.195250,0.205306,FFFDD3C72D23AF858D6E0ED92612370D,1
48719,FFFF4545AB232D81D0F9B208388BB7AA,7,0.820940,0.856273,0.836090,0.029739,0.571429,0.000274,0.011145,0.000111,...,0.245000,0.176097,0.679355,0.635575,0.097641,0.066898,0.420571,0.245895,FFFF4545AB232D81D0F9B208388BB7AA,1


In [122]:
training_data = training_data.drop(training_data.columns[1], axis=1)

In [123]:
training_data

Unnamed: 0,"(asin, )","(unixReviewTime, min)","(unixReviewTime, max)","(unixReviewTime, mean)","(unixReviewTime, std)","(verified, mean)","(verified, sum)","(vote, mean)","(vote, sum)","(image, mean)",...,"(summary_pos, mean)","(summary_pos, std)","(summary_compound, mean)","(summary_compound, std)","(reviewText_len, mean)","(reviewText_len, std)","(summary_len, mean)","(summary_len, std)",asin,awesomeness
0,0000B049F5B33CD310EB1AB236E20191,0.071141,0.826499,0.628401,0.578536,0.500000,0.000137,0.010638,0.000061,0.0,...,0.353500,0.588926,0.666662,0.685984,0.045284,0.019032,0.198000,0.240570,0000B049F5B33CD310EB1AB236E20191,1
1,000281A9CAC43FF1F335726A390636DA,0.237047,0.811071,0.562543,0.401807,0.750000,0.000206,0.008865,0.000051,0.0,...,0.099500,0.281428,0.461148,0.686154,0.063303,0.057590,0.124000,0.111358,000281A9CAC43FF1F335726A390636DA,0
2,00030884DF109F325638A6BFD5B13CFF,0.175973,0.935851,0.658161,0.356251,0.586207,0.001166,0.020298,0.000839,0.0,...,0.234276,0.459918,0.592309,0.647604,0.040645,0.044603,0.185931,0.211929,00030884DF109F325638A6BFD5B13CFF,1
3,00039B53F332D3A911B0B18F88051C80,0.608188,0.967113,0.724858,0.207527,0.666667,0.000274,0.031915,0.000273,0.0,...,0.225833,0.572720,0.565967,0.616179,0.077139,0.036285,0.201333,0.155603,00039B53F332D3A911B0B18F88051C80,0
4,0004D01A4CED3FE007D35FB3933B3A6C,0.321879,0.907971,0.601121,0.336764,0.400000,0.000137,0.041135,0.000293,0.0,...,0.278000,0.405501,0.627893,0.653000,0.186272,0.092939,0.288000,0.183529,0004D01A4CED3FE007D35FB3933B3A6C,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48716,FFFC4E5BE7E879DD32C396CC4E92FACC,0.249530,0.929084,0.643255,0.360012,0.529412,0.000617,0.004172,0.000101,0.0,...,0.146706,0.395296,0.553328,0.623103,0.039460,0.035130,0.172235,0.184656,FFFC4E5BE7E879DD32C396CC4E92FACC,1
48717,FFFCA74DB5BAF31CDB9CB1E0D7299197,0.466040,0.487887,0.474504,0.039124,0.000000,0.000000,0.053191,0.000152,0.0,...,0.102500,0.205000,0.521109,0.548489,0.171143,0.073260,0.336000,0.101695,FFFCA74DB5BAF31CDB9CB1E0D7299197,1
48718,FFFDD3C72D23AF858D6E0ED92612370D,0.040805,0.952497,0.568466,0.446228,0.375000,0.001646,0.006316,0.000576,0.0,...,0.294453,0.444233,0.574327,0.665759,0.085908,0.066925,0.195250,0.205306,FFFDD3C72D23AF858D6E0ED92612370D,1
48719,FFFF4545AB232D81D0F9B208388BB7AA,0.820940,0.856273,0.836090,0.029739,0.571429,0.000274,0.011145,0.000111,0.0,...,0.245000,0.176097,0.679355,0.635575,0.097641,0.066898,0.420571,0.245895,FFFF4545AB232D81D0F9B208388BB7AA,1


In [124]:
# Prepare the data for training
# Keep only the most important features for predicting awesomeness
X = training_data[[    
    ('reviewText_pos', 'mean'),    
    ('summary_neu', 'mean'),
    ('reviewText_neg', 'mean'),
    ('summary_neg', 'std'),
    ('summary_neg', 'mean'),
    ('reviewText_neu', 'mean'),
    ('reviewText_neu', 'std')

]].values
y = training_data["awesomeness"].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [125]:
X_train

array([[0.22966667, 0.67522222, 0.05111111, ..., 0.        , 0.71922222,
        0.28145584],
       [0.29333333, 1.        , 0.13633333, ..., 0.        , 0.57      ,
        0.14974645],
       [0.17592308, 0.55292308, 0.04669231, ..., 0.07730769, 0.77730769,
        0.1140751 ],
       ...,
       [0.26328571, 0.66714286, 0.01228571, ..., 0.        , 0.72442857,
        0.48342932],
       [0.39755102, 0.79722449, 0.02581633, ..., 0.02434694, 0.57659184,
        0.39044344],
       [0.15233333, 0.65316667, 0.017     , ..., 0.        , 0.83066667,
        0.15208857]])

In [127]:
# Train the model on naive bayes, decision tree, and random forest classifiers

# Define hyperparameters for each classifier
gnb_params = {
'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}
mnb_params = {
'alpha': [0.1, 0.5, 1.0, 2.0]
}
bnb_params = {
'alpha': [0.1, 0.5, 1.0, 2.0],
'binarize': [0.0, 0.1, 0.5, 1.0]
}
cnb_params = {
'alpha': [0.1, 0.5, 1.0, 2.0]
},
# Parameters for DecisionTreeClassifier
# dt_params = {
#     'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
#     'splitter': ['best', 'random'],  # Strategy used to choose the split at each node
#     'max_depth': [None, 5, 10, 20, 30],  # Maximum depth of the tree
#     'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
#     'min_samples_leaf': [1, 2, 5],  # Minimum number of samples required to be at a leaf node
#     'max_features': [None, 'auto', 'sqrt', 'log2']  # Number of features to consider when looking for the best split
# }

# Parameters for RandomForestClassifier
rf_params = {
    'n_estimators': [100, 200, 300, 500],  # Number of trees in the forest
    'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
    'max_depth': [None, 5, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 5],  # Minimum number of samples required to be at a leaf node
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider when looking for the best split
    'bootstrap': [True, False]  # Method for sampling data points (with or without replacement)
}


# fit classifiers and make predictions on test set
best_f1_score = 0
best_classifier = None
classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB(), ComplementNB()]
classifier_params = [gnb_params, mnb_params, bnb_params, cnb_params]
classifier_names = ["Gaussian Naive Bayes", "Multinomial Naive Bayes", "Bernoulli Naive Bayes", "Complement Naive Bayes"]

for classifier, params, name in zip(classifiers, classifier_params, classifier_names):
    clf = GridSearchCV(classifier, params, scoring='f1', cv=10)
    with tqdm(total=100, desc=name) as pbar:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

    # compute evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # print results
    print(f"Results for {name}:")
    print(f"Best parameters: {clf.best_params_}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 score: {f1}")
    print(f"Confusion matrix:\n{cm}")

    # check if current classifier is the best one
    if f1 > best_f1_score:
        best_f1_score = f1
        best_classifier = name
        # save the best model
        joblib.dump(clf, 'final_model.pkl')

print(f"\nBest classifier: {best_classifier} (F1 score: {best_f1_score:.4f})")

Gaussian Naive Bayes:   0%|          | 0/100 [00:00<?, ?it/s]


Results for Gaussian Naive Bayes:
Best parameters: {'var_smoothing': 1e-09}
Accuracy: 0.6849666495638789
Precision: 0.7416599516518937
Recall: 0.7582797825012358
F1 score: 0.7498777904513607
Confusion matrix:
[[2073 1603]
 [1467 4602]]


Multinomial Naive Bayes:   0%|          | 0/100 [00:00<?, ?it/s]


Results for Multinomial Naive Bayes:
Best parameters: {'alpha': 0.1}
Accuracy: 0.6278091328886608
Precision: 0.6268176152887411
Recall: 0.9943977591036415
F1 score: 0.7689367395043638
Confusion matrix:
[[  83 3593]
 [  34 6035]]


Bernoulli Naive Bayes:   0%|          | 0/100 [00:01<?, ?it/s]


Results for Bernoulli Naive Bayes:
Best parameters: {'alpha': 0.1, 'binarize': 0.5}
Accuracy: 0.6504874294510005
Precision: 0.6529580700746698
Recall: 0.936562860438293
F1 score: 0.7694598619195886
Confusion matrix:
[[ 655 3021]
 [ 385 5684]]


Complement Naive Bayes:   0%|          | 0/100 [00:00<?, ?it/s]

Results for Complement Naive Bayes:
Best parameters: {'alpha': 2.0}
Accuracy: 0.6536685479733196
Precision: 0.726462676529926
Recall: 0.7119789092107431
F1 score: 0.7191478738453857
Confusion matrix:
[[2049 1627]
 [1748 4321]]

Best classifier: Bernoulli Naive Bayes (F1 score: 0.7695)



