In [28]:
# sklearn for various machine learning algorithms
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# CountVectorizer for converting text data into a matrix of token counts
# Not the only way to do this, but a simple and common way
from sklearn.feature_extraction.text import CountVectorizer

# Pandas for working with dataframes (including the one already created by the data processing notebook)
import pandas as pd

In [32]:
# Read the one-hot encoded Goodreads dataset produced by the data processing notebook
data = pd.read_csv('Datasets/goodreads_data_onehot_genres.csv')

# Quick sanity check: preview the first rows, then the (rows, columns) shape on its own line
print(data.head(), data.shape, sep='\n')

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                          The Diary of a Young Girl   
4                                        Animal Farm   

                                         Description  Fiction  Nonfiction  \
0  The unforgettable novel of a childhood in a sl...        1           0   
1  Harry Potter thinks he is an ordinary boy - un...        1           0   
2  Since its immediate success in 1813, Pride and...        1           0   
3  Discovered in the attic in which she spent the...        0           1   
4  Librarian's note: There is an Alternate Cover ...        1           0   

   Fantasy  Adult  Classics  Historical  Roman  Literature  ...  Audiobook  \
0        0      1         1           1      0           1  ...          0   
1        1      1         1           0     

In [30]:
# Get the list of genres from the columns of the dataframe.
# The first two columns are 'Book' and 'Description'; every column after
# them is a genre flag (0/1 values in the preview above), so slicing from
# position 2 yields the genre labels.
genres = data.columns[2:]
print(genres)

Index(['Fiction', 'Nonfiction', 'Fantasy', 'Adult', 'Classics', 'Historical',
       'Roman', 'Literature', 'Romance', 'Young Adult', 'Historical Fiction',
       'Science', 'Mystery', 'Contemporary', 'Novels', 'Audiobook', 'Thriller',
       'Science Fiction', 'History', 'Adventure', 'Philosophy', 'Biography',
       'Crime', 'Self Help', 'Psychology'],
      dtype='object')


In [34]:
# Split the data into training and testing sets.
# 20% of the books are held out for evaluation; the fixed random_state
# keeps the split reproducible across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    data['Description'],
    data[genres],
    test_size=0.2,
    random_state=42,
)

# Create a vectorizer to convert the text data into a matrix of token counts
vectorizer = CountVectorizer()

# Learn the vocabulary from the training descriptions and encode them in a
# single pass (equivalent to fit() followed by transform(), but the text is
# only tokenized once). The vocabulary comes from the training split only.
X_train = vectorizer.fit_transform(X_train)

# Encode the held-out descriptions with the vocabulary learned above
X_test = vectorizer.transform(X_test)

In [35]:
# Function to train a Naive Bayes classifier for a given genre
def train_genre_classifier(genre):
    """Fit and return a fresh MultinomialNB model for one genre.

    The model is trained on the notebook-level ``X_train`` matrix, using
    the ``genre`` column of ``y_train`` as the target.
    """
    # fit() returns the fitted estimator itself, so it can be returned directly
    return MultinomialNB().fit(X_train, y_train[genre])

In [36]:
# Empty mapping that will hold one trained classifier per genre (genre -> model)
classifiers = dict()

# Genre labels are all dataframe columns from position 2 onward
genres = data.columns[2:]
print(genres)

Index(['Fiction', 'Nonfiction', 'Fantasy', 'Adult', 'Classics', 'Historical',
       'Roman', 'Literature', 'Romance', 'Young Adult', 'Historical Fiction',
       'Science', 'Mystery', 'Contemporary', 'Novels', 'Audiobook', 'Thriller',
       'Science Fiction', 'History', 'Adventure', 'Philosophy', 'Biography',
       'Crime', 'Self Help', 'Psychology'],
      dtype='object')


In [37]:
# Train one classifier per genre.
# Genres that already have an entry in `classifiers` are skipped, so
# re-running this cell does not retrain them.
for genre in genres:
    if genre in classifiers:
        continue
    print('Training classifier for', genre)
    classifiers[genre] = train_genre_classifier(genre)
    print('Done training classifier for', genre)

Training classifier for Fiction
Done training classifier for Fiction
Training classifier for Nonfiction
Done training classifier for Nonfiction
Training classifier for Fantasy
Done training classifier for Fantasy
Training classifier for Adult
Done training classifier for Adult
Training classifier for Classics
Done training classifier for Classics
Training classifier for Historical
Done training classifier for Historical
Training classifier for Roman
Done training classifier for Roman
Training classifier for Literature
Done training classifier for Literature
Training classifier for Romance
Done training classifier for Romance
Training classifier for Young Adult
Done training classifier for Young Adult
Training classifier for Historical Fiction
Done training classifier for Historical Fiction
Training classifier for Science
Done training classifier for Science
Training classifier for Mystery
Done training classifier for Mystery
Training classifier for Contemporary
Done training classifier

In [38]:
# Function to calculate the accuracy, precision, recall, and F1 score of a classifier for a given genre
def evaluate_classifier(classifier, genre, features=None, labels=None):
    """Score a per-genre binary classifier on held-out data.

    Parameters
    ----------
    classifier
        Fitted model exposing ``predict(X)`` and ``score(X, y)``.
    genre
        Key selecting the ground-truth 0/1 column from ``labels``.
    features
        Optional feature matrix to evaluate on. Defaults to the
        notebook-level ``X_test``.
    labels
        Optional table of label columns, indexable by ``genre``. Defaults
        to the notebook-level ``y_test``.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``.
        Precision, recall and F1 are 0.0 whenever their denominator is zero
        (no predicted positives, no actual positives, or both), rather
        than NaN.
    """
    # Fall back to the notebook's held-out split when no data is passed in
    if features is None:
        features = X_test
    if labels is None:
        labels = y_test
    actual = labels[genre]

    # Get the predictions of the classifier
    predictions = classifier.predict(features)
    # Calculate the accuracy of the classifier
    accuracy = classifier.score(features, actual)

    # Confusion-matrix counts for the positive class, as plain Python ints
    true_positives = int(((predictions == 1) & (actual == 1)).sum())
    false_positives = int(((predictions == 1) & (actual == 0)).sum())
    false_negatives = int(((predictions == 0) & (actual == 1)).sum())

    # Guard each ratio: a rare genre can have zero predicted or zero actual
    # positives in the test split, which would otherwise evaluate 0/0.
    predicted_positives = true_positives + false_positives
    actual_positives = true_positives + false_negatives
    precision = true_positives / predicted_positives if predicted_positives else 0.0
    recall = true_positives / actual_positives if actual_positives else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    # Create a dictionary to store the evaluation metrics
    evaluation = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    return evaluation

In [39]:
# Collect the evaluation metrics of every genre's classifier, keyed by genre
evaluations = dict()
for genre in genres:
    print('Evaluating classifier for', genre)
    evaluations.update({genre: evaluate_classifier(classifiers[genre], genre)})
    print('Done evaluating classifier for', genre)

Evaluating classifier for Fiction
Done evaluating classifier for Fiction
Evaluating classifier for Nonfiction
Done evaluating classifier for Nonfiction
Evaluating classifier for Fantasy
Done evaluating classifier for Fantasy
Evaluating classifier for Adult
Done evaluating classifier for Adult
Evaluating classifier for Classics
Done evaluating classifier for Classics
Evaluating classifier for Historical
Done evaluating classifier for Historical
Evaluating classifier for Roman
Done evaluating classifier for Roman
Evaluating classifier for Literature
Done evaluating classifier for Literature
Evaluating classifier for Romance
Done evaluating classifier for Romance
Evaluating classifier for Young Adult
Done evaluating classifier for Young Adult
Evaluating classifier for Historical Fiction
Done evaluating classifier for Historical Fiction
Evaluating classifier for Science
Done evaluating classifier for Science
Evaluating classifier for Mystery
Done evaluating classifier for Mystery
Evaluatin

In [40]:
# Show each genre name followed by its metrics dictionary on the next line
for genre in genres:
    print(genre, evaluations[genre], sep='\n')

Fiction
{'accuracy': 0.8482880755608029, 'precision': 0.8505392912172574, 'recall': 0.9460154241645244, 'f1': 0.895740365111562}
Nonfiction
{'accuracy': 0.9096812278630461, 'precision': 0.8524590163934426, 'recall': 0.7591240875912408, 'f1': 0.803088803088803}
Fantasy
{'accuracy': 0.8636363636363636, 'precision': 0.7868421052631579, 'recall': 0.6659242761692651, 'f1': 0.721351025331725}
Adult
{'accuracy': 0.8057851239669421, 'precision': 0.6328600405679513, 'recall': 0.6782608695652174, 'f1': 0.6547743966421826}
Classics
{'accuracy': 0.8258559622195986, 'precision': 0.7185185185185186, 'recall': 0.46973365617433416, 'f1': 0.568081991215227}
Historical
{'accuracy': 0.8476977567886659, 'precision': 0.6633663366336634, 'recall': 0.41358024691358025, 'f1': 0.5095057034220533}
Roman
{'accuracy': 0.8689492325855962, 'precision': 0.6506024096385542, 'recall': 0.5454545454545454, 'f1': 0.5934065934065933}
Literature
{'accuracy': 0.8435655253837072, 'precision': 0.7054794520547946, 'recall': 0.