In [1]:
# Project: Final Team Predictive Models
# COMP_SCI 349-0 Machine Learning | Professor V.S.
# By Sengdao Inthavong, Lillian Torres, Haylie Wu, Ernie Wang

# This file contains the code for the predictive models used in our project.
# The models are trained on the training data and tested on the testing data for the CDs and vinyls.

In [2]:
# Import basic libraries
import os
import numpy as np
import pandas as pd

# Import libraries for text processing
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import LabelEncoder 

# Import libraries for machine learning
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB, CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Import libraries for logging
import logging
from tqdm import tqdm

In [3]:
# #GETTING THE DATA
# #getting to the data directory
# data_dir = 'devided_dataset_v2'
# categories = ['CDs_and_Vinyl', 'Grocery_and_Gourmet_Food', 'Toys_and_Games']
# #getting the products and the reviews for the shit
# resultpath = os.path.join(data_dir, categories[0], 'train', 'product_training.json')
# trainingpath = os.path.join(data_dir, categories[0], 'train', 'review_training.json')
# #reading the file with json
# reviewdata = pd.read_json(trainingpath)
# resultpath = pd.read_json(resultpath)
# #combine the 2 shits together
# training_data = resultpath.merge(reviewdata, how='left', on='asin')

In [None]:
# Load product and review data for CDs and vinyls from the training set
data_dir = ''
categories = ['CDs_and_Vinyl', 'Grocery_and_Gourmet_Food', 'Toys_and_Games']

file_path = os.path.join(data_dir, categories[0], 'train', 'product_training.json')
product_training = pd.read_json(file_path)

file_path = os.path.join(data_dir, categories[0], 'train', 'review_training.json')
review_training = pd.read_json(file_path)

# Merge product and review data
training_data = review_training.merge(product_training, on='asin', how='left')

In [4]:
# Fill in any missing values
training_data['reviewText'].fillna('', inplace=True)
training_data['summary'].fillna('', inplace=True)

In [5]:
# Give each review a unique ID
training_data['reviewID'] = training_data.index

In [6]:
# Run sentiment analysis on the review text and summary
# Columns: neg, neu, pos, compound

# sid = SentimentIntensityAnalyzer()

# review_sentiments = pd.DataFrame(columns=['reviewID', 'reviewText_neg', 'reviewText_neu', 'reviewText_pos', 'reviewText_compound', 'summary_neg', 'summary_neu', 'summary_pos', 'summary_compound'])

# for index, row in tqdm(training_data.iterrows(), total=training_data.shape[0], desc="Sentiment Analysis"):
#     review_text_sentiment = sid.polarity_scores(row['reviewText'])
#     summary_text_sentiment = sid.polarity_scores(row['summary'])
    
#     sentiment_row = {'reviewID': row['reviewID'],
#                      'reviewText_neg': review_text_sentiment['neg'],
#                      'reviewText_neu': review_text_sentiment['neu'],
#                      'reviewText_pos': review_text_sentiment['pos'],
#                      'reviewText_compound': review_text_sentiment['compound'],
#                      'summary_neg': summary_text_sentiment['neg'],
#                      'summary_neu': summary_text_sentiment['neu'],
#                      'summary_pos': summary_text_sentiment['pos'],
#                      'summary_compound': summary_text_sentiment['compound']}
    
#     review_sentiments = review_sentiments.append(sentiment_row, ignore_index=True)

# # Save the sentiment data to a csv file for future use
# file_path = os.path.join(data_dir, categories[0], 'csv', 'review_sentiments.csv')
# review_sentiments.to_csv(file_path, index=False)
review_sentiments = pd.read_csv('review_sentiments.csv')

In [7]:
# Merge the sentiment data with the training data
training_data = training_data.merge(review_sentiments, on='reviewID', how='left')

In [8]:
# Process the columns that are not numeric
encoder = LabelEncoder()

# Define a function to extract the format information from the "style" column
def extract_format(style):
    if style is None:
        return "None"
    else:
        return style.get("Format:", "None").strip()

# Apply the function to the "style" column to extract the format information
training_data["style"] = training_data["style"].apply(extract_format)

In [9]:
# Encode the columns
encoder = LabelEncoder()
training_data["style"] = encoder.fit_transform(training_data["style"])

# Encode the "verified" column
training_data["verified"] = encoder.fit_transform(training_data["verified"])

# Encode the "reviewerID" column
training_data["reviewerID"] = encoder.fit_transform(training_data["reviewerID"])

# Encode the "vote"" column
training_data["vote"] = training_data["vote"].apply(lambda x: float(x.replace(",", "")) if x is not None else 0)

# Encode the "image" column
training_data["image"] = training_data["image"].apply(lambda x: len(x) if x is not None else 0)

In [10]:
# Compute the length of reviewText and summary columns
training_data["reviewText_len"] = training_data["reviewText"].apply(len)
training_data["summary_len"] = training_data["summary"].apply(len)

In [11]:
# # Filter out the reviews that are not verified, have no votes, and have no images unless there is no verified and voted reviews
# training_data = training_data[(training_data["verified"] == 1) | (training_data["vote"] > 0) | (training_data["image"] > 0)]

# Normalize the compound scores
training_data["reviewText_compound_norm"] = (training_data["reviewText_compound"] - training_data["reviewText_compound"].mean()) / training_data["reviewText_compound"].std()
training_data["summary_compound_norm"] = (training_data["summary_compound"] - training_data["summary_compound"].mean()) / training_data["summary_compound"].std()

# Calculate the absolute difference between the normalized compound scores and the awesomeness
training_data["reviewText_compound_diff"] = abs(training_data["reviewText_compound_norm"] - training_data["awesomeness"])
training_data["summary_compound_diff"] = abs(training_data["summary_compound_norm"] - training_data["awesomeness"])

# Calculate the average difference between the normalized compound scores and the awesomeness for each asin
compound_diff_mean = training_data.groupby("asin")[["reviewText_compound_diff", "summary_compound_diff"]].mean()

# Sort the reviews for each asin by the average difference between the normalized compound scores and the awesomeness
compound_diff_mean["compound_diff_mean"] = compound_diff_mean.mean(axis=1)
compound_diff_mean = compound_diff_mean.sort_values("compound_diff_mean", ascending=False)

# Keep the top 2/3 of the reviews for each asin
num_asins = len(compound_diff_mean)
top_reviews_per_asin = int(num_asins * 2/3)
top_asins = compound_diff_mean.iloc[:top_reviews_per_asin].index
training_data = training_data[training_data["asin"].isin(top_asins)]
training_data

Unnamed: 0,asin,awesomeness,reviewerID,unixReviewTime,vote,verified,reviewTime,style,reviewerName,reviewText,...,summary_neg,summary_neu,summary_pos,summary_compound,reviewText_len,summary_len,reviewText_compound_norm,summary_compound_norm,reviewText_compound_diff,summary_compound_diff
0,0000B049F5B33CD310EB1AB236E20191,1,94522,1412294400,0.0,1,"10 3, 2014",3,12A80DAD02AB007538C670D2CF5F0999,"Even tho I love this album, I am having proble...",...,0.0,1.000,0.000,0.0000,604,12,0.709300,-0.616331,0.290700,1.616331
1,0000B049F5B33CD310EB1AB236E20191,1,77388,1413417600,0.0,0,"10 16, 2014",3,08434218ABA526223A66E2A8B4C38DA8,I LOVE IT!!!,...,0.0,1.000,0.000,0.0000,12,10,-0.889340,-0.616331,1.889340,1.616331
2,0000B049F5B33CD310EB1AB236E20191,1,22879,1427068800,0.0,1,"03 23, 2015",3,3AA76E176E4BE75233EB0557F9C1738E,Nancy Wilson is still one of the most distinct...,...,0.0,1.000,0.000,0.0000,768,51,0.213063,-0.616331,0.786937,1.616331
3,0000B049F5B33CD310EB1AB236E20191,1,92550,933552000,6.0,0,"08 2, 1999",3,802D103F0C999BF1E71DF82E52BA5F0D,Having been a Nancy Wilson fan for over twenty...,...,0.0,0.426,0.574,0.4678,619,38,0.114102,0.754990,0.885898,0.245010
22,000281A9CAC43FF1F335726A390636DA,0,24191,1417219200,0.0,1,"11 29, 2014",3,29E396D64652C1F41CE996C30AD91968,Great,...,0.0,0.227,0.773,0.5267,5,10,0.724323,0.927651,0.724323,0.927651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770781,FFFF5A3D9CB0B40FF0FE6B95F05D26FE,1,11206,1100563200,5.0,0,"11 16, 2004",3,2AF77877103CB27C4FA8F432847460BD,a come back from the very suscessful last albu...,...,0.0,1.000,0.000,0.0000,968,43,0.620831,-0.616331,0.379169,1.616331
770782,FFFF5A3D9CB0B40FF0FE6B95F05D26FE,1,78948,1100044800,3.0,0,"11 10, 2004",3,459BB37B144704161F27BCFC13A8071D,"Welll first thier young and hopeless, and suck...",...,0.0,1.000,0.000,0.0000,680,15,-0.520918,-0.616331,1.520918,1.616331
770783,FFFF5A3D9CB0B40FF0FE6B95F05D26FE,1,86162,1391126400,0.0,0,"01 31, 2014",3,21F0635609927C23142499715DA76546,So I'm riding to work with this dude. Nice br...,...,0.0,1.000,0.000,0.0000,2454,24,-0.376888,-0.616331,1.376888,1.616331
770784,FFFF5A3D9CB0B40FF0FE6B95F05D26FE,1,46059,1097020800,3.0,0,"10 6, 2004",3,8734979613D4F8D97756DE9AEFC245B3,"Oh s**t, first they TRIED to be punk and now t...",...,0.0,0.328,0.672,0.6249,2109,26,0.117441,1.215517,0.882559,0.215517


In [12]:
# Aggregate the training data by asin
training_data = training_data.groupby("asin").agg({
    "reviewerID": "count",
    "unixReviewTime": ["min", "max", "mean", "std"],
    "verified": ["mean", "sum"],
    "vote": ["mean", "sum"],
    "image": ["mean", "sum"],
    "style": ["mean", "sum"],
    "reviewText_neg": ["mean", "std"],
    "reviewText_neu": ["mean", "std"],
    "reviewText_pos": ["mean", "std"],
    "reviewText_compound": ["mean", "std"],
    "summary_neg": ["mean", "std"],
    "summary_neu": ["mean", "std"],
    "summary_pos": ["mean", "std"],
    "summary_compound": ["mean", "std"],
    "reviewText_len": ["mean", "std"],
    "summary_len": ["mean", "std"],
}).reset_index()

In [13]:
# Add +1 to compound columns to avoid negative values
training_data["reviewText_compound"] += 1
training_data["summary_compound"] += 1
# Replace NaN values with 0
training_data.fillna(0, inplace=True)

In [14]:
column_dict = {
    #"reviewerID": ["count"],
    "unixReviewTime": ["min", "max", "mean", "std"],
    "verified": ["mean", "sum"],
    "vote": ["mean", "sum"],
    "image": ["mean", "sum"],
    "style": ["mean", "sum"],
    "reviewText_neg": ["mean", "std"],
    "reviewText_neu": ["mean", "std"],
    "reviewText_pos": ["mean", "std"],
    "reviewText_compound": ["mean", "std"],
    "summary_neg": ["mean", "std"],
    "summary_neu": ["mean", "std"],
    "summary_pos": ["mean", "std"],
    "summary_compound": ["mean", "std"],
    "reviewText_len": ["mean", "std"],
    "summary_len": ["mean", "std"],
}
column_list = []
for k in column_dict:
    for n in column_dict[k]:
        column_list.append((k, n))

In [15]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Define the columns to normalize
#cols_to_normalize = ['unixReviewTime', 'verified', 'vote', 'image', 'style', 'reviewText_neg', 'reviewText_neu', 'reviewText_pos', 'reviewText_compound', 'summary_neg', 'summary_neu', 'summary_pos', 'summary_compound', 'reviewText_len', 'summary_len']
#cols_to_normalize = [('verified','mean')]
# Normalize the data using the MinMaxScaler
#scaled_df = scaler.fit_transform(training_data[cols_to_normalize])
# training_data
training_data[column_list] = scaler.fit_transform(training_data[column_list])
# training_data

In [16]:
# Merge the training data with the awesomeness data
file_path = os.path.join(data_dir, categories[0], 'train', 'product_training.json')
product_training = pd.read_json(file_path)
training_data.columns = training_data.columns.to_flat_index()

In [17]:
training_data['asin'] = training_data[('asin', '')]

In [18]:
training_data = training_data.merge(product_training, on='asin', how='left')

# Visualize the absolute correlation between the features on "awesomeness"
#training_data.corr()["awesomeness"].abs().sort_values(ascending=False)
training_data

Unnamed: 0,"(asin, )","(reviewerID, count)","(unixReviewTime, min)","(unixReviewTime, max)","(unixReviewTime, mean)","(unixReviewTime, std)","(verified, mean)","(verified, sum)","(vote, mean)","(vote, sum)",...,"(summary_pos, mean)","(summary_pos, std)","(summary_compound, mean)","(summary_compound, std)","(reviewText_len, mean)","(reviewText_len, std)","(summary_len, mean)","(summary_len, std)",asin,awesomeness
0,0000B049F5B33CD310EB1AB236E20191,4,0.071141,0.826052,0.628401,0.576181,0.500000,0.000137,0.010638,0.000061,...,0.143500,0.405879,0.545472,0.593393,0.045040,0.019032,0.198000,0.240570,0000B049F5B33CD310EB1AB236E20191,1
1,000281A9CAC43FF1F335726A390636DA,4,0.237047,0.810632,0.562543,0.400172,0.750000,0.000206,0.008865,0.000051,...,0.620000,0.229138,0.841406,0.544539,0.062961,0.057590,0.124000,0.111358,000281A9CAC43FF1F335726A390636DA,0
2,00030884DF109F325638A6BFD5B13CFF,29,0.175973,0.935344,0.658161,0.354800,0.586207,0.001166,0.020298,0.000839,...,0.255138,0.506831,0.562077,0.656340,0.040425,0.044603,0.185931,0.211929,00030884DF109F325638A6BFD5B13CFF,1
3,000577BC760B4C7BD980939F0CB41F65,4,0.652081,0.834573,0.787491,0.142609,0.750000,0.000206,0.019504,0.000111,...,0.393500,0.376160,0.769233,0.652596,0.064872,0.076245,0.120000,0.183855,000577BC760B4C7BD980939F0CB41F65,0
4,0006BBE1B5FFF50EF4715D8AE6F86572,6,0.327651,0.930069,0.734267,0.344701,0.833333,0.000343,0.009456,0.000081,...,0.251667,0.411379,0.660065,0.656912,0.014301,0.015770,0.081333,0.082135,0006BBE1B5FFF50EF4715D8AE6F86572,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48716,FFFCA74DB5BAF31CDB9CB1E0D7299197,2,0.466040,0.487623,0.474504,0.038965,0.000000,0.000000,0.053191,0.000152,...,0.163000,0.326000,0.602433,0.630668,0.170219,0.073260,0.336000,0.101695,FFFCA74DB5BAF31CDB9CB1E0D7299197,1
48717,FFFDD3C72D23AF858D6E0ED92612370D,64,0.040805,0.951982,0.568466,0.444411,0.375000,0.001646,0.006316,0.000576,...,0.232609,0.483928,0.571888,0.660855,0.085444,0.066925,0.195250,0.205306,FFFDD3C72D23AF858D6E0ED92612370D,1
48718,FFFEB3EE2372807964F024707D50FB21,2,0.711678,0.708914,0.711415,0.010759,1.000000,0.000137,0.000000,0.000000,...,0.189500,0.379000,0.676054,0.721428,0.058599,0.007370,0.244000,0.076271,FFFEB3EE2372807964F024707D50FB21,0
48719,FFFF4545AB232D81D0F9B208388BB7AA,7,0.820940,0.855810,0.836090,0.029617,0.571429,0.000274,0.011145,0.000111,...,0.289714,0.419329,0.636393,0.685847,0.097114,0.066898,0.420571,0.245895,FFFF4545AB232D81D0F9B208388BB7AA,1


In [19]:
training_data = training_data.drop(training_data.columns[1], axis=1)

In [20]:
training_data

Unnamed: 0,"(asin, )","(unixReviewTime, min)","(unixReviewTime, max)","(unixReviewTime, mean)","(unixReviewTime, std)","(verified, mean)","(verified, sum)","(vote, mean)","(vote, sum)","(image, mean)",...,"(summary_pos, mean)","(summary_pos, std)","(summary_compound, mean)","(summary_compound, std)","(reviewText_len, mean)","(reviewText_len, std)","(summary_len, mean)","(summary_len, std)",asin,awesomeness
0,0000B049F5B33CD310EB1AB236E20191,0.071141,0.826052,0.628401,0.576181,0.500000,0.000137,0.010638,0.000061,0.0,...,0.143500,0.405879,0.545472,0.593393,0.045040,0.019032,0.198000,0.240570,0000B049F5B33CD310EB1AB236E20191,1
1,000281A9CAC43FF1F335726A390636DA,0.237047,0.810632,0.562543,0.400172,0.750000,0.000206,0.008865,0.000051,0.0,...,0.620000,0.229138,0.841406,0.544539,0.062961,0.057590,0.124000,0.111358,000281A9CAC43FF1F335726A390636DA,0
2,00030884DF109F325638A6BFD5B13CFF,0.175973,0.935344,0.658161,0.354800,0.586207,0.001166,0.020298,0.000839,0.0,...,0.255138,0.506831,0.562077,0.656340,0.040425,0.044603,0.185931,0.211929,00030884DF109F325638A6BFD5B13CFF,1
3,000577BC760B4C7BD980939F0CB41F65,0.652081,0.834573,0.787491,0.142609,0.750000,0.000206,0.019504,0.000111,0.0,...,0.393500,0.376160,0.769233,0.652596,0.064872,0.076245,0.120000,0.183855,000577BC760B4C7BD980939F0CB41F65,0
4,0006BBE1B5FFF50EF4715D8AE6F86572,0.327651,0.930069,0.734267,0.344701,0.833333,0.000343,0.009456,0.000081,0.0,...,0.251667,0.411379,0.660065,0.656912,0.014301,0.015770,0.081333,0.082135,0006BBE1B5FFF50EF4715D8AE6F86572,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48716,FFFCA74DB5BAF31CDB9CB1E0D7299197,0.466040,0.487623,0.474504,0.038965,0.000000,0.000000,0.053191,0.000152,0.0,...,0.163000,0.326000,0.602433,0.630668,0.170219,0.073260,0.336000,0.101695,FFFCA74DB5BAF31CDB9CB1E0D7299197,1
48717,FFFDD3C72D23AF858D6E0ED92612370D,0.040805,0.951982,0.568466,0.444411,0.375000,0.001646,0.006316,0.000576,0.0,...,0.232609,0.483928,0.571888,0.660855,0.085444,0.066925,0.195250,0.205306,FFFDD3C72D23AF858D6E0ED92612370D,1
48718,FFFEB3EE2372807964F024707D50FB21,0.711678,0.708914,0.711415,0.010759,1.000000,0.000137,0.000000,0.000000,0.0,...,0.189500,0.379000,0.676054,0.721428,0.058599,0.007370,0.244000,0.076271,FFFEB3EE2372807964F024707D50FB21,0
48719,FFFF4545AB232D81D0F9B208388BB7AA,0.820940,0.855810,0.836090,0.029617,0.571429,0.000274,0.011145,0.000111,0.0,...,0.289714,0.419329,0.636393,0.685847,0.097114,0.066898,0.420571,0.245895,FFFF4545AB232D81D0F9B208388BB7AA,1


In [21]:
# Prepare the data for training
# Keep only the most important features for predicting awesomeness
X = training_data[[    
    ('reviewText_pos', 'mean'),    
    ('summary_neu', 'mean'),
    ('reviewText_neg', 'mean'),
    ('summary_neg', 'std'),
    ('summary_neg', 'mean'),
    ('reviewText_neu', 'mean'),
    ('reviewText_neu', 'std')
]].values
y = training_data["awesomeness"].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [22]:
X_train

array([[0.32966667, 1.        , 0.044     , ..., 0.        , 0.62633333,
        0.26195165],
       [0.19077778, 0.75644444, 0.05177778, ..., 0.        , 0.75755556,
        0.23639809],
       [0.193     , 0.616     , 0.016     , ..., 0.        , 0.791     ,
        0.        ],
       ...,
       [0.18114286, 0.81357143, 0.07642857, ..., 0.08614286, 0.74228571,
        0.17403393],
       [0.20113043, 0.65256522, 0.02721739, ..., 0.02595652, 0.7716087 ,
        0.23617395],
       [0.28308333, 0.77858333, 0.032     , ..., 0.01966667, 0.68483333,
        0.3552591 ]])

In [23]:
# Train the model on naive bayes, decision tree, and random forest classifiers

# Define hyperparameters for each classifier
gnb_params = {
'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}
mnb_params = {
'alpha': [0.1, 0.5, 1.0, 2.0]
}
bnb_params = {
'alpha': [0.1, 0.5, 1.0, 2.0],
'binarize': [0.0, 0.1, 0.5, 1.0]
}
cnb_params = {
'alpha': [0.1, 0.5, 1.0, 2.0]
}

# fit classifiers and make predictions on test set
best_f1_score = 0
best_classifier = None
classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB(), ComplementNB()]
classifier_params = [gnb_params, mnb_params, bnb_params, cnb_params]
classifier_names = ["Gaussian Naive Bayes", "Multinomial Naive Bayes", "Bernoulli Naive Bayes", "Complement Naive Bayes"]

for classifier, params, name in zip(classifiers, classifier_params, classifier_names):
    clf = GridSearchCV(classifier, params, scoring='f1', cv=10)
    with tqdm(total=100, desc=name) as pbar:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

    # compute evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # print results
    print(f"Results for {name}:")
    print(f"Best parameters: {clf.best_params_}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 score: {f1}")
    print(f"Confusion matrix:\n{cm}")

    # check if current classifier is the best one
    if f1 > best_f1_score:
        best_f1_score = f1
        best_classifier = name
        # save the best model
        joblib.dump(clf, best_classifier + '_model.pkl')

print(f"\nBest classifier: {best_classifier} (F1 score: {best_f1_score:.4f})")

Gaussian Naive Bayes:   0%|                                                                     | 0/100 [00:00<?, ?it/s]


Results for Gaussian Naive Bayes:
Best parameters: {'var_smoothing': 1e-06}
Accuracy: 0.6551051821446896
Precision: 0.7166465135026199
Recall: 0.8098997874278773
F1 score: 0.7604248342718654
Confusion matrix:
[[1050 2109]
 [1252 5334]]


Multinomial Naive Bayes:   0%|                                                                  | 0/100 [00:00<?, ?it/s]


Results for Multinomial Naive Bayes:
Best parameters: {'alpha': 0.1}
Accuracy: 0.6758337609030272
Precision: 0.6758337609030272
Recall: 1.0
F1 score: 0.8065642030494152
Confusion matrix:
[[   0 3159]
 [   0 6586]]


Bernoulli Naive Bayes:   0%|                                                                    | 0/100 [00:01<?, ?it/s]


Results for Bernoulli Naive Bayes:
Best parameters: {'alpha': 0.1, 'binarize': 1.0}
Accuracy: 0.6758337609030272
Precision: 0.6758337609030272
Recall: 1.0
F1 score: 0.8065642030494152
Confusion matrix:
[[   0 3159]
 [   0 6586]]


Complement Naive Bayes:   0%|                                                                   | 0/100 [00:00<?, ?it/s]

Results for Complement Naive Bayes:
Best parameters: {'alpha': 2.0}
Accuracy: 0.5942534633145202
Precision: 0.7272099447513812
Recall: 0.6395384148193137
F1 score: 0.6805622879301988
Confusion matrix:
[[1579 1580]
 [2374 4212]]

Best classifier: Multinomial Naive Bayes (F1 score: 0.8066)



