In [68]:
# sklearn for various machine learning algorithms
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# CountVectorizer for converting text data into a matrix of token counts
# Not the only way to do this, but a simple and common way
from sklearn.feature_extraction.text import CountVectorizer

# Pandas for working with dataframes (including the one already created by the data processing notebook)
import pandas as pd

In [69]:
# Load the data
data = pd.read_csv('Datasets/goodreads_data_onehot_genres.csv')

# Print the first few rows of the data
print(data.head())
# Print the shape of the data
print(data.shape)

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                          The Diary of a Young Girl   
4                                  The Little Prince   

                                         Description  Fantasy  Adult  \
0  The unforgettable novel of a childhood in a sl...        0      1   
1  Harry Potter thinks he is an ordinary boy - un...        1      1   
2  Since its immediate success in 1813, Pride and...        0      0   
3  Discovered in the attic in which she spent the...        0      0   
4  A pilot stranded in the desert awakes one morn...        1      1   

   Historical  Roman  Romance  Young Adult  Historical Fiction  Science  ...  \
0           1      0        0            1                   1        0  ...   
1           0      0        0            1                   0        

In [70]:
# Get the list of genres from the columns of the dataframe
genres = data.columns[2:]
print(genres)

Index(['Fantasy', 'Adult', 'Historical', 'Roman', 'Romance', 'Young Adult',
       'Historical Fiction', 'Science', 'Mystery', 'Contemporary', 'Thriller',
       'Science Fiction', 'History', 'Adventure', 'Philosophy', 'Biography',
       'Crime', 'Self Help', 'Psychology', 'Mystery Thriller', 'Memoir',
       'Childrens', 'Humor', 'Suspense', 'Horror'],
      dtype='object')


In [71]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Description'], data[genres], test_size=0.2, random_state=42)

# Create a vectorizer to convert the text data into a matrix of token counts
vectorizer = CountVectorizer()

# Fit the vectorizer to the training data
vectorizer.fit(X_train)

# Transform the training and testing data
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [72]:
# Function to train a Naive Bayes classifier for a given genre
def train_genre_classifier(genre):
    # Create a Naive Bayes classifier
    classifier = MultinomialNB()
    # Train the classifier on the training data
    classifier.fit(X_train, y_train[genre])
    return classifier

In [73]:
# Get the list of genres from the columns of the dataframe
genres = data.columns[2:]
print(genres)

# Create a dictionary to store the classifiers for each genre
classifiers = {}

Index(['Fantasy', 'Adult', 'Historical', 'Roman', 'Romance', 'Young Adult',
       'Historical Fiction', 'Science', 'Mystery', 'Contemporary', 'Thriller',
       'Science Fiction', 'History', 'Adventure', 'Philosophy', 'Biography',
       'Crime', 'Self Help', 'Psychology', 'Mystery Thriller', 'Memoir',
       'Childrens', 'Humor', 'Suspense', 'Horror'],
      dtype='object')


In [74]:
# Train a classifier for each genre
# Check if the classifier is already trained
for genre in genres:
    if genre not in classifiers:
        print('Training classifier for', genre)
        classifiers[genre] = train_genre_classifier(genre)
        print('Done training classifier for', genre)

Training classifier for Fantasy
Done training classifier for Fantasy
Training classifier for Adult
Done training classifier for Adult
Training classifier for Historical
Done training classifier for Historical
Training classifier for Roman
Done training classifier for Roman
Training classifier for Romance
Done training classifier for Romance
Training classifier for Young Adult
Done training classifier for Young Adult
Training classifier for Historical Fiction
Done training classifier for Historical Fiction
Training classifier for Science
Done training classifier for Science
Training classifier for Mystery
Done training classifier for Mystery
Training classifier for Contemporary
Done training classifier for Contemporary
Training classifier for Thriller
Done training classifier for Thriller
Training classifier for Science Fiction
Done training classifier for Science Fiction
Training classifier for History
Done training classifier for History
Training classifier for Adventure
Done training

In [75]:
# Function to calculate the accurace, precision, recall, and F1 score of a classifier for a given genre
def evaluate_classifier(classifier, genre):
    # Get the predictions of the classifier
    predictions = classifier.predict(X_test)
    # Calculate the accuracy of the classifier
    accuracy = classifier.score(X_test, y_test[genre])
    # Calculate the precision, recall, and F1 score of the classifier
    true_positives = (predictions == 1) & (y_test[genre] == 1)
    false_positives = (predictions == 1) & (y_test[genre] == 0)
    false_negatives = (predictions == 0) & (y_test[genre] == 1)
    precision = true_positives.sum() / (true_positives.sum() + false_positives.sum())
    recall = true_positives.sum() / (true_positives.sum() + false_negatives.sum())
    f1 = 2 * precision * recall / (precision + recall)
    # Create a dictionary to store the evaluation metrics
    evaluation = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    return evaluation

In [76]:
# Create a dictionary to store the evaluation metrics for each genre
evaluations = {}
for genre in genres:
    print('Evaluating classifier for', genre)
    evaluations[genre] = evaluate_classifier(classifiers[genre], genre)
    print('Done evaluating classifier for', genre)

Evaluating classifier for Fantasy
Done evaluating classifier for Fantasy
Evaluating classifier for Adult
Done evaluating classifier for Adult
Evaluating classifier for Historical
Done evaluating classifier for Historical
Evaluating classifier for Roman
Done evaluating classifier for Roman
Evaluating classifier for Romance
Done evaluating classifier for Romance
Evaluating classifier for Young Adult
Done evaluating classifier for Young Adult
Evaluating classifier for Historical Fiction
Done evaluating classifier for Historical Fiction
Evaluating classifier for Science
Done evaluating classifier for Science
Evaluating classifier for Mystery
Done evaluating classifier for Mystery
Evaluating classifier for Contemporary
Done evaluating classifier for Contemporary
Evaluating classifier for Thriller
Done evaluating classifier for Thriller
Evaluating classifier for Science Fiction
Done evaluating classifier for Science Fiction
Evaluating classifier for History
Done evaluating classifier for His

In [77]:
# Print the evaluation metrics for each genre
for genre in genres:
    print(genre)
    print(evaluations[genre])

Fantasy
{'accuracy': 0.8721291123525761, 'precision': 0.792507204610951, 'recall': 0.6723716381418093, 'f1': 0.7275132275132276}
Adult
{'accuracy': 0.7970204841713222, 'precision': 0.6431535269709544, 'recall': 0.6666666666666666, 'f1': 0.6546990496304119}
Historical
{'accuracy': 0.8646803227808815, 'precision': 0.7316017316017316, 'recall': 0.52, 'f1': 0.6079136690647482}
Roman
{'accuracy': 0.8584729981378026, 'precision': 0.6956521739130435, 'recall': 0.5382262996941896, 'f1': 0.6068965517241379}
Romance
{'accuracy': 0.8659217877094972, 'precision': 0.6976744186046512, 'recall': 0.5660377358490566, 'f1': 0.625}
Young Adult
{'accuracy': 0.8578522656734947, 'precision': 0.7017543859649122, 'recall': 0.4984423676012461, 'f1': 0.5828779599271402}
Historical Fiction
{'accuracy': 0.8715083798882681, 'precision': 0.7388888888888889, 'recall': 0.4539249146757679, 'f1': 0.5623678646934461}
Science
{'accuracy': 0.8888888888888888, 'precision': 0.8760330578512396, 'recall': 0.3925925925925926, 

In [78]:
# Write the evaluation metrics to a CSV file
evaluation_data = pd.DataFrame(evaluations)
evaluation_data.to_csv('naive_bayes_goodreads_evaluation_data.csv', index=False)

In [79]:
# Create a dataset that contains the title, description, and scores for each genre
# Copy the title and description columns from the original data
genre_scores = data[['Book', 'Description']]
# Add a column for each genre that contains the score of the classifier for that genre
for genre in genres:
    genre_scores[genre] = classifiers[genre].predict_proba(vectorizer.transform(data['Description']))[:, 1]
# Print the first few rows of the dataset
print(genre_scores.head())
# Write the dataset to a CSV file
genre_scores.to_csv('Datasets/naive_bayes_goodreads_genre_scores.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_scores[genre] = classifiers[genre].predict_proba(vectorizer.transform(data['Description']))[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_scores[genre] = classifiers[genre].predict_proba(vectorizer.transform(data['Description']))[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                          The Diary of a Young Girl   
4                                  The Little Prince   

                                         Description       Fantasy  \
0  The unforgettable novel of a childhood in a sl...  1.001910e-20   
1  Harry Potter thinks he is an ordinary boy - un...  1.000000e+00   
2  Since its immediate success in 1813, Pride and...  1.763531e-09   
3  Discovered in the attic in which she spent the...  5.384492e-18   
4  A pilot stranded in the desert awakes one morn...  4.711487e-01   

          Adult    Historical         Roman       Romance   Young Adult  \
0  3.867999e-13  9.937054e-01  1.511318e-23  8.771378e-24  7.511400e-22   
1  9.999911e-01  5.386035e-05  1.200312e-07  1.852226e-07  9.999975e-01   
2  1.122488e-09  