In [24]:
# Pandas for working with dataframes (including the one already created by the data processing notebook)
import pandas as pd
# Numpy for working with arrays
import numpy as np
# Gensim for downloading and loading pretrained word-embedding models (e.g. GloVe, word2vec)
import gensim
import gensim.downloader

In [25]:
# Read the Goodreads dataset (book, description, author plus genre indicator columns)
data = pd.read_csv('Datasets/goodreads_data_onehot_genres.csv')

# Preview the leading rows, then report the (rows, columns) shape on the next line
print(data.head(), data.shape, sep='\n')

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                                  The Little Prince   
4                                               1984   

                                         Description  \
0  The unforgettable novel of a childhood in a sl...   
1  Harry Potter thinks he is an ordinary boy - un...   
2  Since its immediate success in 1813, Pride and...   
3  A pilot stranded in the desert awakes one morn...   
4  The new novel by George Orwell is the major wo...   

                     Author  Fantasy  Adult  Romance  Young Adult  Historical  \
0                Harper Lee        0      1        0            1           1   
1              J.K. Rowling        1      1        0            1           0   
2               Jane Austen        0      0        1            0           1   
3 

In [26]:
# Keep only the leading three columns; every column from position 3 onward is dropped
reduced_data = data.drop(columns=data.columns[3:])
print(reduced_data.head())

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                                  The Little Prince   
4                                               1984   

                                         Description                    Author  
0  The unforgettable novel of a childhood in a sl...                Harper Lee  
1  Harry Potter thinks he is an ordinary boy - un...              J.K. Rowling  
2  Since its immediate success in 1813, Pride and...               Jane Austen  
3  A pilot stranded in the desert awakes one morn...  Antoine de Saint-Exupéry  
4  The new novel by George Orwell is the major wo...             George Orwell  


In [27]:
# Read the recommendations table; its 'Book Description', 'Similar Description'
# and 'Dissimilar Description' columns are used further down in this notebook
recommendations = pd.read_csv(filepath_or_buffer='Datasets/goodreads_recommendations.csv')

In [28]:
# Function to calculate the cosine similarity between two vectors
def cosine_similarity(v1, v2):
    """Cosine of the angle between two vectors.

    Computes ``dot(v1, v2.T)`` divided by the product of the two norms
    (``np.linalg.norm`` with its default settings). No guard is applied for
    zero-length vectors, so a zero norm propagates as a division by zero.
    """
    dot_product = np.dot(v1, v2.T)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return dot_product / norm_product

# Relative similarity of one book compared to two others
# Takes three book vectors, calculates the cosine similarity between the first and the other two
# Returns the ratio of the two similarities
# If the first book is more similar to the second book, the result will be greater than 1
# If the first book is more similar to the third book, the result will be less than 1
def relative_similarity(v1, v2, v3):
    """Compare how close ``v1`` is to ``v2`` versus how close it is to ``v3``.

    Returns ``cosine_similarity(v1, v2) / cosine_similarity(v1, v3)``.
    When both similarities are positive, a value above 1 means ``v1`` is
    closer to ``v2`` and a value below 1 means it is closer to ``v3``.
    """
    similarity_to_second, similarity_to_third = (
        cosine_similarity(v1, other) for other in (v2, v3)
    )
    return similarity_to_second / similarity_to_third

In [29]:
# Pull the three description columns out of the recommendations frame
descriptions, similardescriptions, dissimilardescriptions = (
    recommendations[column]
    for column in ('Book Description', 'Similar Description', 'Dissimilar Description')
)

# Take the first row's triple: the book, its similar book and its dissimilar book
book1, book2, book3 = descriptions[0], similardescriptions[0], dissimilardescriptions[0]

# Print all three descriptions, one per line
print(book1, book2, book3, sep='\n')


The unforgettable novel of a childhood in a sleepy Southern town and the crisis of conscience that rocked it. "To Kill A Mockingbird" became both an instant bestseller and a critical success when it was first published in 1960. It went on to win the Pulitzer Prize in 1961 and was later made into an Academy Award-winning film, also a classic.Compassionate, dramatic, and deeply moving, "To Kill A Mockingbird" takes readers to the roots of human behavior - to innocence and experience, kindness and cruelty, love and hatred, humor and pathos. Now with over 18 million copies in print and translated into forty languages, this regional story by a young Alabama woman claims universal appeal. Harper Lee always considered her book to be a simple love story. Today it is regarded as a masterpiece of American literature.
From Harper Lee comes a landmark new novel set two decades after her beloved Pulitzer Prize-winning masterpiece, To Kill a Mockingbird. Maycomb, Alabama. Twenty-six-year-old Jean Lo

In [30]:
 # Show the list of available models
for model_name in gensim.downloader.info()['models']:
    print(model_name)

# Load the pretrained 'glove-wiki-gigaword-50' embeddings through the gensim downloader
glove_model = gensim.downloader.load('glove-wiki-gigaword-50')

fasttext-wiki-news-subwords-300
conceptnet-numberbatch-17-06-300
word2vec-ruscorpora-300
word2vec-google-news-300
glove-wiki-gigaword-50
glove-wiki-gigaword-100
glove-wiki-gigaword-200
glove-wiki-gigaword-300
glove-twitter-25
glove-twitter-50
glove-twitter-100
glove-twitter-200
__testing_word2vec-matrix-synopsis


In [31]:
# Sanity check: the model's nearest neighbours for the word 'king'
king_neighbours = glove_model.most_similar('king')
print(king_neighbours)

# Raw embedding vector the model stores for 'king'
king_vector = glove_model['king']
print(king_vector)

[('prince', 0.8236179351806641), ('queen', 0.7839043140411377), ('ii', 0.7746230363845825), ('emperor', 0.7736246585845947), ('son', 0.766719400882721), ('uncle', 0.7627151012420654), ('kingdom', 0.7542160749435425), ('throne', 0.753991425037384), ('brother', 0.7492411136627197), ('ruler', 0.7434253692626953)]
[ 0.50451   0.68607  -0.59517  -0.022801  0.60046  -0.13498  -0.08813
  0.47377  -0.61798  -0.31012  -0.076666  1.493    -0.034189 -0.98173
  0.68229   0.81722  -0.51874  -0.31503  -0.55809   0.66421   0.1961
 -0.13495  -0.11476  -0.30344   0.41177  -2.223    -1.0756   -1.0783
 -0.34354   0.33505   1.9927   -0.04234  -0.64319   0.71125   0.49159
  0.16754   0.34344  -0.25663  -0.8523    0.1661    0.40102   1.1685
 -1.0137   -0.21585  -0.15155   0.78321  -0.91241  -1.6106   -0.64426
 -0.51042 ]


In [32]:
# Function to get the average word vector for a piece of text
def get_average_word_vectors(text, model):
    """Return the mean embedding of the whitespace-separated tokens in ``text``.

    Parameters
    ----------
    text : str
        Text to embed. It is split with ``str.split()``; tokens are looked up
        verbatim (case and attached punctuation are kept).
    model : mapping-like
        Any object supporting ``model[word]`` that raises ``KeyError`` for
        unknown words (a plain dict works, as does the gensim model loaded
        in this notebook).

    Returns
    -------
    numpy.ndarray or numpy.float64
        Element-wise mean of the vectors that were found. If no token is
        known to the model, a scalar ``nan`` is returned (the same value the
        previous ``np.mean([])`` produced, but without the RuntimeWarnings).
    """
    # NOTE(review): if the model's vocabulary is lowercase-only, capitalised
    # tokens and tokens with trailing punctuation will all miss — confirm
    # whether the text should be normalised before lookup.
    word_vectors = []
    for word in text.split():
        try:
            word_vectors.append(model[word])
        except KeyError:
            # Only "word not in vocabulary" is expected here; any other
            # error is a real problem and must not be silently swallowed.
            continue

    if not word_vectors:
        # Nothing to average: return NaN explicitly instead of letting
        # np.mean warn about an empty slice.
        return np.float64('nan')

    return np.mean(word_vectors, axis=0)

In [33]:
# Embed each of the three example descriptions as an average word vector
book1_vector, book2_vector, book3_vector = (
    get_average_word_vectors(description, glove_model)
    for description in (book1, book2, book3)
)

# Show the three embeddings, one after another
print(book1_vector, book2_vector, book3_vector, sep='\n')

[ 2.36608520e-01  2.58960456e-01 -2.72087485e-01 -1.21809594e-01
  3.76585633e-01  3.87032539e-01 -4.39929545e-01 -1.96883142e-01
 -1.09586582e-01  1.12416349e-01  6.24890148e-04  1.04548791e-02
 -2.82425106e-01 -1.29612461e-01  3.79126132e-01 -9.76402685e-02
 -4.98268716e-02 -2.11072434e-03 -4.05108273e-01 -7.25806132e-02
  9.22659412e-02  2.74150938e-01  1.84227109e-01 -5.15049463e-03
  1.58968389e-01 -1.53424549e+00 -4.76014167e-01  9.69733670e-03
  1.23563215e-01 -9.14772823e-02  3.12074876e+00 -1.16522983e-01
 -1.48111358e-01 -3.50349128e-01 -1.84679627e-02  6.72897091e-03
 -3.35613154e-02  1.35838926e-01 -2.16751359e-02 -2.07677528e-01
 -2.37321910e-02  8.81130472e-02 -7.96069875e-02 -3.36265638e-02
 -1.20714180e-01  1.25129446e-01 -1.60482109e-01 -2.05376938e-01
 -2.42251325e-02 -1.69615149e-01]
[ 3.54482949e-01  2.87642270e-01 -2.94670552e-01 -9.26085338e-02
  4.27579045e-01  3.43130231e-01 -2.71142811e-01 -1.44135267e-01
 -1.02071054e-01  9.60824117e-02 -4.33698297e-02  1.0435

In [39]:
# Ratio of book1's similarity to book2 over its similarity to book3
similarity = relative_similarity(book1_vector, book2_vector, book3_vector)

# Show the ratio followed by the two underlying cosine similarities
print(
    similarity,
    cosine_similarity(book1_vector, book2_vector),
    cosine_similarity(book1_vector, book3_vector),
    sep='\n',
)

1.0099155
0.9884774
0.97877246


In [41]:
# Embed every description in each of the three columns as an average word vector
vectors = [get_average_word_vectors(text, glove_model) for text in descriptions]
similarvectors = [get_average_word_vectors(text, glove_model) for text in similardescriptions]
dissimilarvectors = [get_average_word_vectors(text, glove_model) for text in dissimilardescriptions]

# Show the embeddings of the first triple
print(vectors[0], similarvectors[0], dissimilarvectors[0], sep='\n')

# Relative similarity of the first triple, as a spot check
simtest = relative_similarity(vectors[0], similarvectors[0], dissimilarvectors[0])
print(simtest)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


[ 2.36608520e-01  2.58960456e-01 -2.72087485e-01 -1.21809594e-01
  3.76585633e-01  3.87032539e-01 -4.39929545e-01 -1.96883142e-01
 -1.09586582e-01  1.12416349e-01  6.24890148e-04  1.04548791e-02
 -2.82425106e-01 -1.29612461e-01  3.79126132e-01 -9.76402685e-02
 -4.98268716e-02 -2.11072434e-03 -4.05108273e-01 -7.25806132e-02
  9.22659412e-02  2.74150938e-01  1.84227109e-01 -5.15049463e-03
  1.58968389e-01 -1.53424549e+00 -4.76014167e-01  9.69733670e-03
  1.23563215e-01 -9.14772823e-02  3.12074876e+00 -1.16522983e-01
 -1.48111358e-01 -3.50349128e-01 -1.84679627e-02  6.72897091e-03
 -3.35613154e-02  1.35838926e-01 -2.16751359e-02 -2.07677528e-01
 -2.37321910e-02  8.81130472e-02 -7.96069875e-02 -3.36265638e-02
 -1.20714180e-01  1.25129446e-01 -1.60482109e-01 -2.05376938e-01
 -2.42251325e-02 -1.69615149e-01]
[ 3.54482949e-01  2.87642270e-01 -2.94670552e-01 -9.26085338e-02
  4.27579045e-01  3.43130231e-01 -2.71142811e-01 -1.44135267e-01
 -1.02071054e-01  9.60824117e-02 -4.33698297e-02  1.0435

In [53]:
# Calculate the relative similarity for every (book, similar, dissimilar) triple
relative_similarities = [
    relative_similarity(book_vec, similar_vec, dissimilar_vec)
    for book_vec, similar_vec, dissimilar_vec in zip(vectors, similarvectors, dissimilarvectors)
]

# Drop triples that did not yield a usable number:
#  - ndarray results, which appear when one of the embeddings was a scalar NaN
#    (no token found) and the dot product broadcast into an array;
#  - scalar NaN results, which the ndarray check alone lets through and which
#    would otherwise be counted as wrong predictions (NaN > 1 is False).
relative_similarities = [
    ratio
    for ratio in relative_similarities
    if not isinstance(ratio, np.ndarray) and not np.isnan(ratio)
]

print(relative_similarities)

[1.0099155, 0.99990964, 0.98972267, 0.99651605, 4.0830097, 0.9904781, 1.0490496, 1.0069635, 1.0042297, 0.99977535, 0.9998361, 1.0038545, 1.0098085, 1.0394763, 1.0035534, 1.0323029, 1.0042803, 1.0070385, 1.0039861, 0.4740075, 1.0035489, 0.9240066, 1.0113846, 0.9921292, 0.93880934, 0.99360543, 1.0318866, 1.0102654, 0.99851453, 0.99935246, 0.9905554, 1.0001137, 1.0255967, 1.0061262, 1.0050471, 1.0171617, 1.0108875, 1.0012689, 0.99343246, 1.0181895, 1.0136633, 0.99660164, 1.0140717, 1.0268065, 1.0094422, 0.9936324, 1.0165535, 1.0078146, 0.95331925, 0.9957033, 0.9951893, 0.9975993, 0.9978442, 1.0232954, 1.0957158, 1.0148777, 0.9984308, 0.97962874, 0.98184806, 0.9926954, 1.0116887, 1.0074776, 1.033715, 1.0066254, 0.98978454, 0.98784673, 1.000873, 1.0068786, 1.0303552, 1.0005491, 1.0183096, 1.0084409, 0.98145527, 1.001554, 0.99621457, 1.0063796, 0.99087644, 0.9772308, 0.98879766, 0.9925688, 1.043217, 1.0346979, 0.9997143, 0.99288523, 1.000838, 0.9996281, 1.024834, 0.99961853, 0.9934449, 1.000

In [54]:

# Function to calculate the accuracy of the recommendations
def accuracy(predictions, actual):
    """Fraction of positions where ``predictions`` equals ``actual``.

    Parameters
    ----------
    predictions, actual : array-like
        Equal-length sequences of labels. Lists, tuples and numpy arrays are
        all accepted; both are converted with ``np.asarray`` so the comparison
        is element-wise.

    Returns
    -------
    float or int
        Number of matching positions divided by ``len(actual)``; ``0`` when
        ``actual`` is empty (same fallback the precision/recall helpers use
        for an undefined ratio).
    """
    predicted = np.asarray(predictions)
    expected = np.asarray(actual)
    if expected.size == 0:
        # Nothing to score: avoid dividing by zero.
        return 0
    return np.count_nonzero(predicted == expected) / len(expected)

# Function to calculate the precision of the recommendations
def precision(predictions, actual):
    """Share of positive predictions that are actually positive.

    Counts pairs with prediction == 1 and label == 1 (true positives) and
    pairs with prediction == 1 and label == 0 (false positives). Returns
    TP / (TP + FP), or 0 when nothing was predicted positive.
    """
    hits = sum(pred == 1 and truth == 1 for pred, truth in zip(predictions, actual))
    false_alarms = sum(pred == 1 and truth == 0 for pred, truth in zip(predictions, actual))
    flagged = hits + false_alarms
    if flagged > 0:
        return hits / flagged
    return 0

# Function to calculate the recall of the recommendations
def recall(predictions, actual):
    """Share of actual positives that were predicted positive.

    Counts pairs with prediction == 1 and label == 1 (true positives) and
    pairs with prediction == 0 and label == 1 (false negatives). Returns
    TP / (TP + FN), or 0 when there are no actual positives.
    """
    hits = sum(pred == 1 and truth == 1 for pred, truth in zip(predictions, actual))
    misses = sum(pred == 0 and truth == 1 for pred, truth in zip(predictions, actual))
    positives = hits + misses
    if positives > 0:
        return hits / positives
    return 0

# Function to calculate the F1 score of the recommendations
def f1(predictions, actual):
    """Harmonic mean of precision and recall for the given predictions.

    Returns ``2 * P * R / (P + R)`` where P and R come from the ``precision``
    and ``recall`` helpers. When both are 0 the score is undefined; 0 is
    returned in that case, matching the fallback those helpers use, instead
    of dividing by zero.
    """
    prec = precision(predictions, actual)
    rec = recall(predictions, actual)
    denominator = prec + rec
    if denominator == 0:
        return 0
    return 2 * prec * rec / denominator

# A triple counts as a positive prediction when its ratio exceeds 1;
# the reference labels are all ones (one per remaining triple)
predicted_similar = np.array(relative_similarities) > 1
expected_labels = np.ones(len(relative_similarities))

# Score the predictions against the reference labels
accuracy_score = accuracy(predicted_similar, expected_labels)
precision_score = precision(predicted_similar, expected_labels)
recall_score = recall(predicted_similar, expected_labels)
f1_score = f1(predicted_similar, expected_labels)

# Print accuracy, precision, recall and F1, one per line
print(accuracy_score, precision_score, recall_score, f1_score, sep='\n')

# Write the same four scores to a text file, one labelled line each
with open('word2vecresults.txt', 'w') as f:
    f.writelines(
        label + str(score) + '\n'
        for label, score in (
            ('Accuracy: ', accuracy_score),
            ('Precision: ', precision_score),
            ('Recall: ', recall_score),
            ('F1 Score: ', f1_score),
        )
    )

0.7241561181434599
1.0
0.7241561181434599
0.8400122361578465
