In [None]:
# Import tf-idf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Pandas for working with dataframes (including the one already created by the data processing notebook)
import pandas as pd
# Numpy for working with arrays
import numpy as np

In [14]:
# Read the one-hot encoded Goodreads dataset written by the data processing notebook
data = pd.read_csv('Datasets/goodreads_data_onehot_genres.csv')

# Preview the leading rows, followed by the (rows, columns) shape
print(data.head(), data.shape, sep='\n')

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                                  The Little Prince   
4                                               1984   

                                         Description  \
0  The unforgettable novel of a childhood in a sl...   
1  Harry Potter thinks he is an ordinary boy - un...   
2  Since its immediate success in 1813, Pride and...   
3  A pilot stranded in the desert awakes one morn...   
4  The new novel by George Orwell is the major wo...   

                     Author  Fantasy  Adult  Romance  Young Adult  Historical  \
0                Harper Lee        0      1        0            1           1   
1              J.K. Rowling        1      1        0            1           0   
2               Jane Austen        0      0        1            0           1   
3 

In [15]:
# Keep only Book, Description and Author; every column from position 3 onward is a genre flag
genre_columns = data.columns[3:]
reduced_data = data.drop(columns=genre_columns)
print(reduced_data.head())

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                                  The Little Prince   
4                                               1984   

                                         Description                    Author  
0  The unforgettable novel of a childhood in a sl...                Harper Lee  
1  Harry Potter thinks he is an ordinary boy - un...              J.K. Rowling  
2  Since its immediate success in 1813, Pride and...               Jane Austen  
3  A pilot stranded in the desert awakes one morn...  Antoine de Saint-Exupéry  
4  The new novel by George Orwell is the major wo...             George Orwell  


In [16]:
# Build a tf-idf model using the vectorizer's default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary and idf weights from the book descriptions and
# transform those same descriptions in one step
corpus = reduced_data['Description']
X = vectorizer.fit_transform(corpus)

# Report the matrix dimensions
print(X.shape)

# Show the stored entries of the first five rows
print(X[:5])

(6079, 49471)
  (0, 41245)	0.07974148622171125
  (0, 43177)	0.08299254695480573
  (0, 28713)	0.04939525054260856
  (0, 29069)	0.08245421907326173
  (0, 7705)	0.084385305033751
  (0, 20647)	0.08882066230265245
  (0, 37984)	0.1238533774970916
  (0, 38557)	0.10476901517355929
  (0, 41970)	0.06765601181593785
  (0, 2299)	0.18242800891718797
  (0, 9908)	0.10952213623405813
  (0, 9048)	0.1191451853405405
  (0, 41239)	0.02806147792008456
  (0, 35265)	0.14132915921553943
  (0, 21762)	0.13284467056936236
  (0, 41706)	0.12766781702257898
  (0, 22914)	0.1741403258967615
  (0, 26986)	0.30214356866135283
  (0, 4466)	0.09508256774530054
  (0, 5697)	0.06273861488275119
  (0, 2245)	0.06407344665889238
  (0, 21214)	0.11307153677152156
  (0, 4867)	0.09300570583859526
  (0, 9926)	0.11307153677152156
  (0, 39795)	0.09612313365563192
  :	:
  (4, 37223)	0.03383127528596821
  (4, 18630)	0.07582038526823667
  (4, 38018)	0.1104700435223645
  (4, 33647)	0.07920111165762894
  (4, 46083)	0.0479736375066311
  (4, 

In [17]:
# Function to calculate the cosine similarity between two vectors
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two vectors.

    Parameters
    ----------
    v1 : array-like
        First vector.
    v2 : numpy.ndarray
        Second vector. It must expose ``.T``, so a plain list is not accepted.

    Returns
    -------
    numpy scalar
        ``dot(v1, v2) / (||v1|| * ||v2||)`` for 1-D inputs, or ``0.0`` when
        either vector has zero norm.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    # An all-zero vector has no direction, so the cosine is undefined. Report
    # "no similarity" instead of dividing by zero and propagating NaN. A NumPy
    # scalar is returned so callers keep NumPy arithmetic semantics.
    if denominator == 0:
        return np.float64(0.0)
    return np.dot(v1, v2.T) / denominator

In [18]:
# Relative similarity of one book compared to two others
# Takes three book vectors, calculates the cosine similarity between the first and the other two
# Returns the ratio of the two similarities
# If the first book is more similar to the second book, the result will be greater than 1
# If the first book is more similar to the third book, the result will be less than 1
def relative_similarity(v1, v2, v3):
    """Return how much more similar ``v1`` is to ``v2`` than to ``v3``.

    The result is ``cosine_similarity(v1, v2) / cosine_similarity(v1, v3)``:
    above 1 when ``v1`` is closer to ``v2``, below 1 when it is closer to ``v3``.

    Parameters
    ----------
    v1, v2, v3 : numpy.ndarray
        Vectors to compare; intended for 1-D vectors.

    Returns
    -------
    float
        The similarity ratio. When the similarity to ``v3`` is exactly zero the
        ratio is ``inf`` (or ``-inf``) with the sign of the similarity to
        ``v2``, and ``nan`` when both similarities are zero.
    """
    sim1 = cosine_similarity(v1, v2)
    sim2 = cosine_similarity(v1, v3)
    # Handle a zero denominator explicitly: this yields the same values NumPy
    # would produce, but without a RuntimeWarning, and without a
    # ZeroDivisionError if the similarities are plain Python floats.
    if sim2 == 0:
        if sim1 > 0:
            return np.inf
        if sim1 < 0:
            return -np.inf
        return np.nan
    return sim1 / sim2

In [19]:
# Load the recommendation dataset used for evaluation; the preview below shows
# that each row pairs a book with a similar and a dissimilar title
recommendations = pd.read_csv('Datasets/goodreads_recommendations.csv')

# Print the first few rows of the recommendations as a sanity check
print(recommendations.head())

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                                               1984   
4                                   The Great Gatsby   

                                             Similar  \
0                                  Go Set a Watchman   
1  Harry Potter and the Order of the Phoenix (Har...   
2                              Sense and Sensibility   
3                                 Animal Farm / 1984   
4                The Curious Case of Benjamin Button   

                                          Dissimilar  \
0              The Complete Poems of Emily Dickinson   
1                                        Strip Tease   
2  Homeland (Forgotten Realms: The Dark Elf Trilo...   
3                                   Everything Flows   
4                           Ojo por ojo (Talio

In [20]:
# Pull out the three description columns of the recommendation triples
descriptions, similardescriptions, dissimilardescriptions = (
    recommendations[column]
    for column in ('Book Description', 'Similar Description', 'Dissimilar Description')
)

# Descriptions of the first triple (row label 0), used below as a worked example
book1, book2, book3 = descriptions[0], similardescriptions[0], dissimilardescriptions[0]

In [21]:
# Embed each of the three example descriptions as a dense 1-D tf-idf vector
book1_vector, book2_vector, book3_vector = (
    vectorizer.transform([text]).toarray().flatten()
    for text in (book1, book2, book3)
)

# Score the reference book against its similar and its dissimilar counterpart
similarity = cosine_similarity(book1_vector, book2_vector)
dissimilarity = cosine_similarity(book1_vector, book3_vector)

# Show both scores, similar pair first
print(similarity, dissimilarity, sep='\n')

0.319450349043018
0.06950528901461778


In [22]:
# Show the three example descriptions, one per line: reference, similar, dissimilar
print(book1, book2, book3, sep='\n')

The unforgettable novel of a childhood in a sleepy Southern town and the crisis of conscience that rocked it. "To Kill A Mockingbird" became both an instant bestseller and a critical success when it was first published in 1960. It went on to win the Pulitzer Prize in 1961 and was later made into an Academy Award-winning film, also a classic.Compassionate, dramatic, and deeply moving, "To Kill A Mockingbird" takes readers to the roots of human behavior - to innocence and experience, kindness and cruelty, love and hatred, humor and pathos. Now with over 18 million copies in print and translated into forty languages, this regional story by a young Alabama woman claims universal appeal. Harper Lee always considered her book to be a simple love story. Today it is regarded as a masterpiece of American literature.
From Harper Lee comes a landmark new novel set two decades after her beloved Pulitzer Prize-winning masterpiece, To Kill a Mockingbird. Maycomb, Alabama. Twenty-six-year-old Jean Lo

In [23]:
# Project every description column into the fitted tf-idf space as dense arrays
vectors, similarvectors, dissimilarvectors = (
    vectorizer.transform(texts).toarray()
    for texts in (descriptions, similardescriptions, dissimilardescriptions)
)

# For each row: the book's similarity to its "similar" title divided by its
# similarity to its "dissimilar" title (> 1 means the similar title scored higher)
relative_similarities = [
    relative_similarity(book, similar, dissimilar)
    for book, similar, dissimilar in zip(vectors, similarvectors, dissimilarvectors)
]

# Print the relative similarities
print(relative_similarities)

# Function to calculate the accuracy of the recommendations
def accuracy(predictions, actual):
    """Return the fraction of predictions that equal the corresponding label.

    Parameters
    ----------
    predictions, actual : array-like
        Equal-length sequences of predicted and reference labels. Plain lists
        are accepted as well as NumPy arrays.

    Returns
    -------
    float
        Share of positions where both sequences agree; ``0.0`` for empty input.

    Raises
    ------
    ValueError
        If the two sequences do not have the same shape.
    """
    # Coerce first: comparing two plain lists with ``==`` yields a single bool
    # rather than an element-wise result.
    predictions = np.asarray(predictions)
    actual = np.asarray(actual)
    if predictions.shape != actual.shape:
        raise ValueError('predictions and actual must have the same shape')
    if actual.size == 0:
        # No samples to score; avoid dividing by zero.
        return 0.0
    return np.mean(predictions == actual)

# Function to calculate the precision of the recommendations
def precision(predictions, actual):
    """Return the share of positive predictions whose label is also positive.

    A pair is a hit when both prediction and label equal 1, and a false alarm
    when the prediction equals 1 but the label equals 0. Returns 0 when there
    are neither hits nor false alarms.
    """
    hits = sum(guess == 1 and truth == 1 for guess, truth in zip(predictions, actual))
    false_alarms = sum(guess == 1 and truth == 0 for guess, truth in zip(predictions, actual))
    flagged = hits + false_alarms
    if flagged > 0:
        return hits / flagged
    return 0

# Function to calculate the recall of the recommendations
def recall(predictions, actual):
    """Return the share of positive labels that were predicted positive.

    A pair is a hit when both prediction and label equal 1, and a miss when
    the prediction equals 0 but the label equals 1. Returns 0 when there are
    neither hits nor misses.
    """
    hits = sum(guess == 1 and truth == 1 for guess, truth in zip(predictions, actual))
    misses = sum(guess == 0 and truth == 1 for guess, truth in zip(predictions, actual))
    relevant = hits + misses
    if relevant > 0:
        return hits / relevant
    return 0

# Function to calculate the F1 score of the recommendations
def f1(predictions, actual):
    """Return the F1 score: the harmonic mean of precision and recall.

    Parameters
    ----------
    predictions, actual : sequence
        Predicted and reference labels, passed unchanged to ``precision`` and
        ``recall``.

    Returns
    -------
    float or int
        ``2 * precision * recall / (precision + recall)``, or 0 when precision
        and recall are both zero.
    """
    prec = precision(predictions, actual)
    rec = recall(predictions, actual)
    # With no true positives both terms are zero and the harmonic mean is
    # undefined; return 0, as precision and recall do in their own degenerate
    # cases, instead of dividing by zero.
    if prec + rec == 0:
        return 0
    return 2 * prec * rec / (prec + rec)

# Calculate the accuracy, precision, recall, and F1 score of the recommendations
accuracy_score = accuracy(np.array(relative_similarities) > 1, np.ones(len(relative_similarities)))
precision_score = precision(np.array(relative_similarities) > 1, np.ones(len(relative_similarities)))
recall_score = recall(np.array(relative_similarities) > 1, np.ones(len(relative_similarities)))
f1_score = f1(np.array(relative_similarities) > 1, np.ones(len(relative_similarities)))

# Print the accuracy, precision, recall, and F1 score
print(accuracy_score)
print(precision_score)
print(recall_score)
print(f1_score)

# Save the accuracy, precision, recall, and F1 score to a file
with open('tfidfresults.txt', 'w') as f:
    f.write('Accuracy: ' + str(accuracy_score) + '\n')
    f.write('Precision: ' + str(precision_score) + '\n')
    f.write('Recall: ' + str(recall_score) + '\n')
    f.write('F1 Score: ' + str(f1_score) + '\n')

  return sim1 / sim2
  return sim1 / sim2


[4.596058135602225, 10.516949521292602, 1.941088309525666, 2.029058157477883, 30.957146213574486, 0.849942526068553, 4.619323879977728, 0.7778796467665692, 3.7448324122232926, 1.4037495141097405, 1.000812551364679, 1.9481637148316275, 4.030290190072119, 3.0028896104364025, 1.9697451376371111, 8.150831008973473, 3.7496089015034983, 2.8632091162935547, 3.218757360621811, 0.09956784619387228, 0.8522133481131604, 0.7826748594529684, 2.2212032069612575, 4.91438194850472, 0.6796122403925235, 1.6286245489866147, 8.202218703172028, 7.421359026328917, 2.1282429564988625, 1.259512508892602, 0.5622779701445507, 0.7629596090591089, 3.4323938815559663, 2.0974425032726938, 0.683974347182344, 1.51550645163592, 2.9584330337768976, 0.653296883383456, 0.9498342659530146, 3.4106965001422203, 1.2358526663306941, 5.853619598621408, 5.179652625939937, 5.988449122968768, 1.062493319222202, 0.3272425436611657, 1.8743113485469731, 1.8069348868414448, 0.500072719466295, 1.7295821931545472, 2.3976251863133906, 1