In [2]:
# sklearn for various machine learning algorithms
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# CountVectorizer for converting text data into a matrix of token counts
# Not the only way to do this, but a simple and common way
from sklearn.feature_extraction.text import CountVectorizer

# Pandas for working with dataframes (including the one already created by the data processing notebook)
import pandas as pd
# Numpy for working with arrays
import numpy as np

In [3]:
# Load the data
data = pd.read_csv('Datasets/goodreads_data_onehot_genres.csv')

# Print the first few rows of the data
print(data.head())
# Print the shape of the data
print(data.shape)

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                                  The Little Prince   
4                                               1984   

                                         Description  \
0  The unforgettable novel of a childhood in a sl...   
1  Harry Potter thinks he is an ordinary boy - un...   
2  Since its immediate success in 1813, Pride and...   
3  A pilot stranded in the desert awakes one morn...   
4  The new novel by George Orwell is the major wo...   

                     Author  Fantasy  Adult  Romance  Young Adult  Historical  \
0                Harper Lee        0      1        0            1           1   
1              J.K. Rowling        1      1        0            1           0   
2               Jane Austen        0      0        1            0           1   
3 

In [4]:
# Get the list of genres from the columns of the dataframe
genres = data.columns[2:]
print(genres)

Index(['Author', 'Fantasy', 'Adult', 'Romance', 'Young Adult', 'Historical',
       'Historical Fiction', 'Mystery', 'Contemporary', 'Thriller', 'Science',
       'Science Fiction', 'Adventure', 'Mystery Thriller', 'Crime',
       'Childrens', 'Suspense', 'Horror', 'Paranormal', 'Magic',
       'Science Fiction Fantasy', 'Humor', 'Middle Grade', 'Literary Fiction',
       'Drama', 'American'],
      dtype='object')


In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Description'], data[genres], test_size=0.2, random_state=42)

# Create a vectorizer to convert the text data into a matrix of token counts
vectorizer = CountVectorizer()

# Fit the vectorizer to the training data
vectorizer.fit(X_train)

# Transform the training and testing data
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [6]:
# Function to train a Naive Bayes classifier for a given genre
def train_genre_classifier(genre):
    # Create a Naive Bayes classifier
    classifier = MultinomialNB()
    # Train the classifier on the training data
    classifier.fit(X_train, y_train[genre])
    return classifier

In [7]:
# Get the list of genres from the columns of the dataframe
genres = data.columns[3:]
print(genres)

# Create a dictionary to store the classifiers for each genre
classifiers = {}

Index(['Fantasy', 'Adult', 'Romance', 'Young Adult', 'Historical',
       'Historical Fiction', 'Mystery', 'Contemporary', 'Thriller', 'Science',
       'Science Fiction', 'Adventure', 'Mystery Thriller', 'Crime',
       'Childrens', 'Suspense', 'Horror', 'Paranormal', 'Magic',
       'Science Fiction Fantasy', 'Humor', 'Middle Grade', 'Literary Fiction',
       'Drama', 'American'],
      dtype='object')


In [8]:
# Train a classifier for each genre
# Check if the classifier is already trained
for genre in genres:
    if genre not in classifiers:
        print('Training classifier for', genre)
        classifiers[genre] = train_genre_classifier(genre)
        print('Done training classifier for', genre)

Training classifier for Fantasy
Done training classifier for Fantasy
Training classifier for Adult
Done training classifier for Adult
Training classifier for Romance
Done training classifier for Romance
Training classifier for Young Adult
Done training classifier for Young Adult
Training classifier for Historical
Done training classifier for Historical
Training classifier for Historical Fiction
Done training classifier for Historical Fiction
Training classifier for Mystery
Done training classifier for Mystery
Training classifier for Contemporary
Done training classifier for Contemporary
Training classifier for Thriller
Done training classifier for Thriller
Training classifier for Science
Done training classifier for Science
Training classifier for Science Fiction
Done training classifier for Science Fiction
Training classifier for Adventure
Done training classifier for Adventure
Training classifier for Mystery Thriller
Done training classifier for Mystery Thriller
Training classifier f

In [9]:
# Function to calculate the accurace, precision, recall, and F1 score of a classifier for a given genre
def evaluate_classifier(classifier, genre):
    # Get the predictions of the classifier
    predictions = classifier.predict(X_test)
    # Calculate the accuracy of the classifier
    accuracy = classifier.score(X_test, y_test[genre])
    # Calculate the precision, recall, and F1 score of the classifier
    true_positives = (predictions == 1) & (y_test[genre] == 1)
    false_positives = (predictions == 1) & (y_test[genre] == 0)
    false_negatives = (predictions == 0) & (y_test[genre] == 1)
    precision = true_positives.sum() / (true_positives.sum() + false_positives.sum())
    recall = true_positives.sum() / (true_positives.sum() + false_negatives.sum())
    f1 = 2 * precision * recall / (precision + recall)
    # Create a dictionary to store the evaluation metrics
    evaluation = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    return evaluation

In [10]:
# Create a dictionary to store the evaluation metrics for each genre
evaluations = {}
for genre in genres:
    print('Evaluating classifier for', genre)
    evaluations[genre] = evaluate_classifier(classifiers[genre], genre)
    print('Done evaluating classifier for', genre)

Evaluating classifier for Fantasy
Done evaluating classifier for Fantasy
Evaluating classifier for Adult
Done evaluating classifier for Adult
Evaluating classifier for Romance
Done evaluating classifier for Romance
Evaluating classifier for Young Adult
Done evaluating classifier for Young Adult
Evaluating classifier for Historical
Done evaluating classifier for Historical
Evaluating classifier for Historical Fiction
Done evaluating classifier for Historical Fiction
Evaluating classifier for Mystery
Done evaluating classifier for Mystery
Evaluating classifier for Contemporary
Done evaluating classifier for Contemporary
Evaluating classifier for Thriller
Done evaluating classifier for Thriller
Evaluating classifier for Science
Done evaluating classifier for Science
Evaluating classifier for Science Fiction
Done evaluating classifier for Science Fiction
Evaluating classifier for Adventure
Done evaluating classifier for Adventure
Evaluating classifier for Mystery Thriller
Done evaluating c

In [11]:
# Print the evaluation metrics for each genre
for genre in genres:
    print(genre)
    print(evaluations[genre])

Fantasy
{'accuracy': 0.8667763157894737, 'precision': 0.8259587020648967, 'recall': 0.7310704960835509, 'f1': 0.7756232686980609}
Adult
{'accuracy': 0.7672697368421053, 'precision': 0.6794055201698513, 'recall': 0.7079646017699115, 'f1': 0.6933911159263271}
Romance
{'accuracy': 0.8379934210526315, 'precision': 0.7262773722627737, 'recall': 0.6199376947040498, 'f1': 0.6689075630252099}
Young Adult
{'accuracy': 0.834703947368421, 'precision': 0.7295081967213115, 'recall': 0.5686900958466453, 'f1': 0.6391382405745063}
Historical
{'accuracy': 0.8338815789473685, 'precision': 0.7280701754385965, 'recall': 0.5424836601307189, 'f1': 0.6217228464419475}
Historical Fiction
{'accuracy': 0.8379934210526315, 'precision': 0.7297297297297297, 'recall': 0.5418060200668896, 'f1': 0.6218809980806141}
Mystery
{'accuracy': 0.8692434210526315, 'precision': 0.8658536585365854, 'recall': 0.5089605734767025, 'f1': 0.6410835214446953}
Contemporary
{'accuracy': 0.8289473684210527, 'precision': 0.67295597484276

In [12]:
# The cosine similarity function
def cosine_similarity(v1, v2):
    # Convert the vectors to numpy arrays
    v1 = np.array(v1)
    v2 = np.array(v2)
    # Calculate the dot product of the two vectors
    dot_product = np.dot(v1, v2)
    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)
    return dot_product / (norm_v1 * norm_v2)

In [13]:
# Load the recommendation dataset
recommendations = pd.read_csv('Datasets/goodreads_recommendations.csv')

# Print the first few rows of the recommendations
print(recommendations.head())

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                                               1984   
4                                   The Great Gatsby   

                                             Similar  \
0                                  Go Set a Watchman   
1  Harry Potter and the Order of the Phoenix (Har...   
2                              Sense and Sensibility   
3                                 Animal Farm / 1984   
4                The Curious Case of Benjamin Button   

                                          Dissimilar  \
0              The Complete Poems of Emily Dickinson   
1                                        Strip Tease   
2  Homeland (Forgotten Realms: The Dark Elf Trilo...   
3                                   Everything Flows   
4                           Ojo por ojo (Talio

In [14]:
# Get the book descriptions from the recommendations
descriptions = recommendations['Book Description']
similardescriptions = recommendations['Similar Description']
dissimilardescriptions = recommendations['Dissimilar Description']

# Run the vectorizer on the descriptions
descriptions = vectorizer.transform(descriptions)
similardescriptions = vectorizer.transform(similardescriptions)
dissimilardescriptions = vectorizer.transform(dissimilardescriptions)

# Create a list of genre scores for each
description_scores = []
similardescription_scores = []
dissimilardescription_scores = []

# Get the first set of book descriptions
book1 = descriptions[0]
book2 = similardescriptions[0]
book3 = dissimilardescriptions[0]

# Calculate the genre scores for each book
for genre in genres:
    classifier = classifiers[genre]
    # Instead of using the predict method, we use the predict_proba method
    # This gives us the probability of the book being in the genre
    # We use the probability of the book being in the genre as the score
    description_scores.append(classifier.predict_proba(book1)[0][1])
    similardescription_scores.append(classifier.predict_proba(book2)[0][1])
    dissimilardescription_scores.append(classifier.predict_proba(book3)[0][1])

# Print the genre scores for each book
print(description_scores)
print(similardescription_scores)
print(dissimilardescription_scores)

# Calculate the cosine similarity between the first book and the second book
similarity = cosine_similarity(np.array(description_scores), np.array(similardescription_scores))
print(similarity)

# Calculate the cosine similarity between the first book and the third book
dissimilarity = cosine_similarity(np.array(description_scores), np.array(dissimilardescription_scores))
print(dissimilarity)

[1.6693803215528125e-18, 2.1144844006661158e-11, 5.853368626857321e-21, 8.965822319073162e-17, 0.9999999983517682, 0.999999994271775, 4.7779008380892954e-20, 2.1577602768446062e-15, 1.6711028165942523e-23, 1.0331997425121541e-22, 8.427481583513628e-23, 3.4386599653113344e-30, 3.021645248588356e-31, 8.874706280965021e-29, 1.5970985004815294e-28, 1.7345135629090753e-32, 7.65893158467698e-27, 5.992297704584142e-38, 9.096062460636482e-37, 1.2754058926336668e-33, 1.6477968747050187e-33, 8.866549342447523e-40, 1.470023213396665e-20, 1.282933175584514e-29, 2.2611412650697167e-26]
[3.729425623281907e-27, 3.2440544781039293e-07, 3.361472505659429e-28, 2.845626844830449e-28, 1.0, 1.0, 1.2200263874830398e-22, 4.372686341662931e-17, 2.527088877350162e-27, 7.308839231079749e-35, 1.076655578265218e-34, 2.658015192775016e-39, 3.6634648051702356e-36, 2.8656430187598515e-34, 8.866716672739465e-49, 6.322837254993188e-41, 8.875641146769144e-39, 3.913730968113749e-48, 1.4953493810652463e-43, 1.05603284267

In [15]:
# Function to take a row of the recommendations dataframe and calculate the similarity scores
def calculate_similarity(row):
    # Get the book descriptions
    book1 = vectorizer.transform([row['Book Description']])
    book2 = vectorizer.transform([row['Similar Description']])
    book3 = vectorizer.transform([row['Dissimilar Description']])
    # Create a list of genre scores for each
    description_scores = []
    similardescription_scores = []
    dissimilardescription_scores = []
    # Calculate the genre scores for each book
    for genre in genres:
        classifier = classifiers[genre]
        description_scores.append(classifier.predict_proba(book1)[0][1])
        similardescription_scores.append(classifier.predict_proba(book2)[0][1])
        dissimilardescription_scores.append(classifier.predict_proba(book3)[0][1])
    # Calculate the cosine similarity between the first book and the second book
    similarity = cosine_similarity(description_scores, similardescription_scores)
    # Calculate the cosine similarity between the first book and the third book
    dissimilarity = cosine_similarity(description_scores, dissimilardescription_scores)
    return similarity, dissimilarity

In [17]:
# Create a new dataframe to store the similarity scores for each row
similarity_scores = pd.DataFrame(columns=['Similarity', 'Dissimilarity'])

# Calculate the similarity scores for each row
for index, row in recommendations.iterrows():
    #print(index)
    similarity, dissimilarity = calculate_similarity(row)
    similarity_scores.loc[index] = [similarity, dissimilarity]

# Print the similarity scores
print(similarity_scores.head())

   Similarity  Dissimilarity
0    1.000000   7.168083e-13
1    0.774487   2.367824e-06
2    1.000000   3.468775e-06
3    0.831147   9.500289e-02
4    0.994236   1.578341e-08


In [18]:
print(similarity_scores.head())

   Similarity  Dissimilarity
0    1.000000   7.168083e-13
1    0.774487   2.367824e-06
2    1.000000   3.468775e-06
3    0.831147   9.500289e-02
4    0.994236   1.578341e-08


In [19]:
# Calculate the percentage of correct predictions
correct = (similarity_scores['Similarity'] > similarity_scores['Dissimilarity']).mean()
print("Percentage of correct predictions")
print(correct)

# Count the percentage of times the similarity score is greater than 0.5
similar_correct = (similarity_scores['Similarity'] > 0.5).mean()
print("Percentage of similarity score greater than 0.5")
print(similar_correct)

# Count the percentage of times the dissimilarity score is less than 0.5
dissimilar_correct = (similarity_scores['Dissimilarity'] < 0.5).mean()
print("Percentage of dissimilarity score less than 0.5")
print(dissimilar_correct)

Percentage of correct predictions
0.9260812581913499
Percentage of similarity score greater than 0.5
0.7491480996068152
Percentage of dissimilarity score less than 0.5
0.9425950196592399


In [29]:
# Calculate the precision, recall, and F1 score of the similarity scores
true_positives = sum(similarity_scores['Similarity'] > similarity_scores['Dissimilarity'])
print(true_positives)
false_positives = sum(similarity_scores['Similarity'] < 0.5)
print(false_positives)
false_negatives = sum(similarity_scores['Dissimilarity'] > 0.5)
print(false_negatives)

accuracy = correct
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
f1 = 2 * precision * recall / (precision + recall)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Save the accuracy, precision, recall, and F1 score to a file
with open('NBresults.txt', 'w') as f:
    f.write('Accuracy: ' + str(accuracy) + '\n')
    f.write('Precision: ' + str(precision) + '\n')
    f.write('Recall: ' + str(recall) + '\n')
    f.write('F1 Score: ' + str(f1) + '\n')

3533
957
219
Accuracy: 0.9260812581913499
Precision: 0.7868596881959911
Recall: 0.9416311300639659
F1 Score: 0.8573161853918951
