In [None]:
# CountVectorizer for converting text data into a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

# Pandas for working with dataframes (including the one already created by the data processing notebook)
import pandas as pd
# Numpy for working with arrays
import numpy as np

In [2]:
# Load the data
data = pd.read_csv('Datasets/goodreads_data_onehot_genres.csv')

# Print the first few rows of the data
print(data.head())
# Print the shape of the data
print(data.shape)

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                                  The Little Prince   
4                                               1984   

                                         Description  \
0  The unforgettable novel of a childhood in a sl...   
1  Harry Potter thinks he is an ordinary boy - un...   
2  Since its immediate success in 1813, Pride and...   
3  A pilot stranded in the desert awakes one morn...   
4  The new novel by George Orwell is the major wo...   

                     Author  Fantasy  Adult  Romance  Young Adult  Historical  \
0                Harper Lee        0      1        0            1           1   
1              J.K. Rowling        1      1        0            1           0   
2               Jane Austen        0      0        1            0           1   
3 

In [3]:
# Drop the genre columns (only the first three columns are needed)
reduced_data = data[:].drop(data.columns[3:], axis=1)
print(reduced_data.head())

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                                  The Little Prince   
4                                               1984   

                                         Description                    Author  
0  The unforgettable novel of a childhood in a sl...                Harper Lee  
1  Harry Potter thinks he is an ordinary boy - un...              J.K. Rowling  
2  Since its immediate success in 1813, Pride and...               Jane Austen  
3  A pilot stranded in the desert awakes one morn...  Antoine de Saint-Exupéry  
4  The new novel by George Orwell is the major wo...             George Orwell  


In [4]:
# Create a CountVectorizer object to convert the text data into a matrix of token counts
vectorizer = CountVectorizer()

# Convert the text data into a matrix of token counts
X = vectorizer.fit_transform(reduced_data['Description'])

# Print the shape of the matrix
print(X.shape)

print(X)

(6079, 49471)
  (0, 41245)	4
  (0, 43177)	1
  (0, 28713)	1
  (0, 29069)	4
  (0, 7705)	1
  (0, 20647)	4
  (0, 37984)	1
  (0, 38557)	1
  (0, 41970)	1
  (0, 2299)	9
  (0, 9908)	1
  (0, 9048)	1
  (0, 41239)	1
  (0, 35265)	1
  (0, 21762)	4
  (0, 41706)	6
  (0, 22914)	2
  (0, 26986)	2
  (0, 4466)	1
  (0, 5697)	1
  (0, 2245)	2
  (0, 21214)	1
  (0, 4867)	1
  (0, 9926)	1
  (0, 39795)	1
  :	:
  (6078, 41462)	1
  (6078, 27433)	1
  (6078, 18872)	1
  (6078, 39840)	1
  (6078, 40284)	1
  (6078, 41862)	1
  (6078, 9579)	1
  (6078, 2903)	1
  (6078, 12548)	1
  (6078, 8207)	1
  (6078, 44715)	1
  (6078, 31057)	1
  (6078, 36851)	1
  (6078, 6698)	1
  (6078, 34426)	1
  (6078, 27123)	1
  (6078, 40268)	1
  (6078, 38097)	1
  (6078, 12527)	1
  (6078, 42568)	2
  (6078, 5623)	1
  (6078, 45539)	1
  (6078, 1353)	2
  (6078, 39186)	1
  (6078, 15124)	1


In [5]:
# Create a normalized version of the matrix by dividing each count by the total number of counts for that row
X_norm = X / X.sum(axis=1)

# Print the shape of the normalized matrix
print(X_norm.shape)

print(X_norm)

(6079, 49471)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


  return np.true_divide(self.todense(), other)


In [18]:
# Function that measures the cosine similarity between two vectors
def cosine_similarity(v1, v2):
    return np.dot(v1, v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))

In [7]:
# Relative similarity of one book compared to two others
# Takes three book vectors, calculates the cosine similarity between the first and the other two
# Returns the ratio of the two similarities
# If the first book is more similar to the second book, the result will be greater than 1
# If the first book is more similar to the third book, the result will be less than 1
def relative_similarity(v1, v2, v3):
    sim1 = cosine_similarity(v1, v2)
    sim2 = cosine_similarity(v1, v3)
    return sim1 / sim2

In [8]:
# Load the recommendation dataset
recommendations = pd.read_csv('Datasets/goodreads_recommendations.csv')

# Print the first few rows of the recommendations
print(recommendations.head())

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                                               1984   
4                                   The Great Gatsby   

                                             Similar  \
0                                  Go Set a Watchman   
1  Harry Potter and the Order of the Phoenix (Har...   
2                              Sense and Sensibility   
3                                 Animal Farm / 1984   
4                The Curious Case of Benjamin Button   

                                          Dissimilar  \
0              The Complete Poems of Emily Dickinson   
1                                        Strip Tease   
2  Homeland (Forgotten Realms: The Dark Elf Trilo...   
3                                   Everything Flows   
4                           Ojo por ojo (Talio

In [9]:
# Get the book descriptions from the recommendations
descriptions = recommendations['Book Description']
similardescriptions = recommendations['Similar Description']
dissimilardescriptions = recommendations['Dissimilar Description']

# Get the first set of book descriptions
book1 = descriptions[0]
book2 = similardescriptions[0]
book3 = dissimilardescriptions[0]

In [10]:
print(book1)
print(book2)
print(book3)

The unforgettable novel of a childhood in a sleepy Southern town and the crisis of conscience that rocked it. "To Kill A Mockingbird" became both an instant bestseller and a critical success when it was first published in 1960. It went on to win the Pulitzer Prize in 1961 and was later made into an Academy Award-winning film, also a classic.Compassionate, dramatic, and deeply moving, "To Kill A Mockingbird" takes readers to the roots of human behavior - to innocence and experience, kindness and cruelty, love and hatred, humor and pathos. Now with over 18 million copies in print and translated into forty languages, this regional story by a young Alabama woman claims universal appeal. Harper Lee always considered her book to be a simple love story. Today it is regarded as a masterpiece of American literature.
From Harper Lee comes a landmark new novel set two decades after her beloved Pulitzer Prize-winning masterpiece, To Kill a Mockingbird. Maycomb, Alabama. Twenty-six-year-old Jean Lo

In [19]:
# Convert the book descriptions into a matrix of token counts
X1 = vectorizer.transform([book1])
X2 = vectorizer.transform([book2])
X3 = vectorizer.transform([book3])

# Normalize the matrices
X1_norm = X1 / X1.sum(axis=1)
X2_norm = X2 / X2.sum(axis=1)
X3_norm = X3 / X3.sum(axis=1)

# Calculate the relative similarity of the first book compared to the other two
print(relative_similarity(X1_norm, X2_norm, X3_norm))

[[1.4041236]]


In [21]:
# Convert all book descriptions into a matrix of token counts
X_descriptions = vectorizer.transform(descriptions)
X_similardescriptions = vectorizer.transform(similardescriptions)
X_dissimilardescriptions = vectorizer.transform(dissimilardescriptions)

# Normalize the matrices
X_descriptions_norm = X_descriptions / X_descriptions.sum(axis=1)
X_similardescriptions_norm = X_similardescriptions / X_similardescriptions.sum(axis=1)
X_dissimilardescriptions_norm = X_dissimilardescriptions / X_dissimilardescriptions.sum(axis=1)

# Calculate the relative similarities of all books compared to the other two
similarities = []
for i in range(len(recommendations)):
    similarities.append(relative_similarity(X_descriptions_norm[i], X_similardescriptions_norm[i], X_dissimilardescriptions_norm[i]))

  return sim1 / sim2
  return sim1 / sim2


ZeroDivisionError: division by zero

In [24]:

# Score the recommendations based on the relative similarities - if the book is more similar to the similar book, it scores 1, otherwise it scores 0
recommendations['Score'] = [1 if s > 1 else 0 for s in similarities]

print(recommendations['Score'])
print(recommendations['Score'].sum())

0       1
1       1
2       1
3       1
4       1
       ..
3810    1
3811    1
3812    1
3813    0
3814    1
Name: Score, Length: 3815, dtype: int64
2596


In [37]:

# Function to calculate the accuracy of the recommendations
def accuracy(predictions, actual):
    return sum(predictions == actual) / len(actual)

# Function to calculate the precision of the recommendations
def precision(predictions, actual):
    true_positives = sum(m == 1 and n == 1 for m, n in zip(predictions, actual))
    false_positives = sum(m == 1 and n == 0 for m, n in zip(predictions, actual))
    return true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0

# Function to calculate the recall of the recommendations
def recall(predictions, actual):
    true_positives = sum(m == 1 and n == 1 for m, n in zip(predictions, actual))
    false_negatives = sum(m == 0 and n == 1 for m, n in zip(predictions, actual))
    return true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

# Function to calculate the F1 score of the recommendations
def f1_score(predictions, actual):
    prec = precision(predictions, actual)
    rec = recall(predictions, actual)
    return 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0

# Calculate the accuracy, precision, recall, and F1 score of the recommendations
acc = accuracy(recommendations['Score'], [1] * len(recommendations))
prec = precision(recommendations['Score'], [1] * len(recommendations))
rec = recall(recommendations['Score'], [1] * len(recommendations))
f1 = f1_score(recommendations['Score'], [1] * len(recommendations))

# Print the results
print('Accuracy:', acc)
print('Precision:', prec)
print('Recall:', rec)
print('F1 Score:', f1)

# Save the accuracy, precision, recall, and F1 score to a file
with open('bagofwordsresults.txt', 'w') as f:
    f.write('Accuracy: ' + str(acc) + '\n')
    f.write('Precision: ' + str(prec) + '\n')
    f.write('Recall: ' + str(rec) + '\n')
    f.write('F1 Score: ' + str(f1) + '\n')

Accuracy: 0.6804718217562254
Precision: 1.0
Recall: 0.6804718217562254
F1 Score: 0.80985805646545
