# 

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel

In [2]:
import json

data = []

# Load the JSON data from a file with multiple JSON objects
with open('goodreads_books_poetry.json', 'r', encoding='utf-8') as file:
    for line in file:
        try:
            record = json.loads(line)
            data.append(record)
        except json.JSONDecodeError:
            pass  # Handle invalid lines if needed

# Now 'data' contains a list of dictionaries, each representing a JSON object



In [3]:
type(data[0])

dict

In [4]:
df = pd.DataFrame(data)

In [5]:
df.columns


Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'],
      dtype='object')

In [6]:
df[df['language_code']=='eng']

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,,1,[],US,eng,"[{'count': '8', 'name': 'to-read'}, {'count': ...",,false,3.83,,...,11,,1887,https://www.goodreads.com/book/show/16037549-v...,https://images.gr-assets.com/books/1348176637m...,16037549,3,5212748,Vision of Sir Launfal and Other Poems,Vision of Sir Launfal and Other Poems
4,1942004192,4,[],US,eng,"[{'count': '228', 'name': 'to-read'}, {'count'...",,false,5.00,,...,12,First,2015,https://www.goodreads.com/book/show/29065952-l...,https://images.gr-assets.com/books/1455198396m...,29065952,9,49294781,Louder Than Everything You Love,Louder Than Everything You Love
6,,3,[],US,eng,"[{'count': '5', 'name': 'to-read'}, {'count': ...",,false,4.75,,...,,,2009,https://www.goodreads.com/book/show/15861988-i...,https://images.gr-assets.com/books/1346225281m...,15861988,8,21611807,Into Temptation,Into Temptation
10,0692265295,10,[],US,eng,"[{'count': '853', 'name': 'to-read'}, {'count'...",,false,3.95,B00SM9ITQS,...,1,,2015,https://www.goodreads.com/book/show/24849837-n...,https://images.gr-assets.com/books/1423580531m...,24849837,27,44304270,Naked Soul: The Erotic Love Poems,Naked Soul: The Erotic Love Poems
17,,1,[],US,eng,"[{'count': '206', 'name': 'to-read'}, {'count'...",,true,4.00,,...,10,,2000,https://www.goodreads.com/book/show/17729612-t...,https://s.gr-assets.com/assets/nophoto/book/11...,17729612,13,24801816,The More Loving One,The More Loving One
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36491,,5,[],US,eng,"[{'count': '131', 'name': 'to-read'}, {'count'...",,false,4.55,B0076BTK2K,...,6,,2011,https://www.goodreads.com/book/show/13452060-r...,https://images.gr-assets.com/books/1328356634m...,13452060,10,18977757,Ramblings & Rhymes: An Anthology of Poetry,Ramblings & Rhymes: An Anthology of Poetry
36492,1609640101,5,[],US,eng,"[{'count': '15', 'name': 'to-read'}, {'count':...",,false,4.89,,...,4,,2011,https://www.goodreads.com/book/show/11066682-f...,https://images.gr-assets.com/books/1329313687m...,11066682,9,15987970,"Field Work: Notes, Songs, Poems 1997-2010","Field Work: Notes, Songs, Poems 1997-2010"
36494,,2,[],US,eng,"[{'count': '21', 'name': 'to-read'}, {'count':...",,false,4.92,,...,6,,2009,https://www.goodreads.com/book/show/6554908-at...,https://s.gr-assets.com/assets/nophoto/book/11...,6554908,12,6747498,"At night, the dead:","At night, the dead:"
36504,1943977046,6,[],US,eng,"[{'count': '200', 'name': 'to-read'}, {'count'...",,false,4.28,,...,3,,2016,https://www.goodreads.com/book/show/28923921-c...,https://images.gr-assets.com/books/1455223614m...,28923921,17,46806042,Call Me by My Other Name,Call Me by My Other Name


In [7]:
total_books = len(df)

# Set threshold frequencies for common and rare shelves
common_threshold = 0.5  # 50%
rare_threshold = 0.001  # 0.1%

# Define a function to filter shelves
def filter_shelves(shelf_list):
    filtered_shelves = []
    for shelf in shelf_list:
        shelf_occurrence = int(shelf['count']) / total_books
        if common_threshold >= shelf_occurrence >= rare_threshold:
            filtered_shelves.append(shelf)
    return filtered_shelves

In [8]:
# Apply the filter_shelves function to the 'shelves' column
df['filtered_shelves'] = df['popular_shelves'].apply(filter_shelves)

# Now 'filtered_shelves' column contains shelves that meet your filtering criteria
print(df['filtered_shelves'])





0                                                       []
1                    [{'count': '100', 'name': 'to-read'}]
2                                                       []
3        [{'count': '554', 'name': 'to-read'}, {'count'...
4                    [{'count': '228', 'name': 'to-read'}]
                               ...                        
36509                                                   []
36510    [{'count': '1158', 'name': 'to-read'}, {'count...
36511                                                   []
36512    [{'count': '14252', 'name': 'classics'}, {'cou...
36513                 [{'count': '37', 'name': 'to-read'}]
Name: filtered_shelves, Length: 36514, dtype: object


In [9]:
df['description']

0        Number 30 in a series of literary pamphlets pu...
1        Fairy Tales gathers the unconventional verse d...
2        Three poems describe the nighttime adventures ...
3        A modern verse play about the search for meani...
4        Louder Than Everything You Love is about trans...
                               ...                        
36509    Appetizers\n*Poetry- Acrostic\nWo(Man)- Diaman...
36510    There was an Old Derry down Derry, who loved t...
36511                                                     
36512    'Muse, tell me of a man: a man of much resourc...
36513    Gathers poems by William Blake, Emily Bronte, ...
Name: description, Length: 36514, dtype: object

In [10]:
df['authors'][8]

[{'author_id': '619932', 'role': ''}]

In [11]:
df['filtered_shelves_text'] = df['filtered_shelves'].apply(lambda x: ' '.join([shelf['name'] for shelf in x]))


In [12]:
df['filtered_shelves_text']

0                                                         
1                                                  to-read
2                                                         
3                             to-read plays drama classics
4                                                  to-read
                               ...                        
36509                                                     
36510                     to-read poetry currently-reading
36511                                                     
36512    classics fiction poetry classic favorites myth...
36513                                              to-read
Name: filtered_shelves_text, Length: 36514, dtype: object

In [13]:
df['description']

0        Number 30 in a series of literary pamphlets pu...
1        Fairy Tales gathers the unconventional verse d...
2        Three poems describe the nighttime adventures ...
3        A modern verse play about the search for meani...
4        Louder Than Everything You Love is about trans...
                               ...                        
36509    Appetizers\n*Poetry- Acrostic\nWo(Man)- Diaman...
36510    There was an Old Derry down Derry, who loved t...
36511                                                     
36512    'Muse, tell me of a man: a man of much resourc...
36513    Gathers poems by William Blake, Emily Bronte, ...
Name: description, Length: 36514, dtype: object

In [14]:
df['description'][844]

"Le poesie di Michelstaedter, finora troppo poco conosciute, ci fanno sentire, in un'altra forma, la stessa vibrazione estrema di La persuasione e la rettorica. Composte fra il 1905 e il 1910, risentono solo superficialmente del clima letterario italiano di quegli anni. Mentre subito vi affiorano quei temi ultimi a cui Michelstaedter dedico la sua riflessione filosofica: i temi di chi e mosso da un'invincibile vocazione a spingersi di la dal bordo della vita, <>. All'inizio con timbro adolescenziale, e ancora tenuto alla sudditanza verso temi obbligati, poi con un piglio sempre piu sicuro, e distaccandosi rapidamente da ogni dipendenza, Michelstaedter svela anche qui il suo dono specifico, quello dell'immediatezza nel pensiero, e ci guida <>attraverso un mare sempre piu aperto e pericoloso, il vero <>, un mare assente, rispetto al quale si puo dire che <>."

In [15]:
df['description'] = df['description'].str.replace('-', ' ')


In [16]:
df['description'] = df['description'].str.replace('\s+', ' ')


In [17]:
df['description'] = df['description'].str.replace('\n', ' ')

In [18]:
df['description'] = df['description'].str.replace('*', ' ')

In [19]:
df.head()

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series,filtered_shelves,filtered_shelves_text
0,,1,[],US,eng,"[{'count': '8', 'name': 'to-read'}, {'count': ...",,False,3.83,,...,1887,https://www.goodreads.com/book/show/16037549-v...,https://images.gr-assets.com/books/1348176637m...,16037549,3,5212748,Vision of Sir Launfal and Other Poems,Vision of Sir Launfal and Other Poems,[],
1,811223981.0,2,[],US,,"[{'count': '100', 'name': 'to-read'}, {'count'...",,False,3.83,B00U2WY9U8,...,2015,https://www.goodreads.com/book/show/22466716-f...,https://images.gr-assets.com/books/1404958407m...,22466716,37,41905435,Fairy Tales: Dramolettes,Fairy Tales: Dramolettes,"[{'count': '100', 'name': 'to-read'}]",to-read
2,374428115.0,7,[],US,,"[{'count': '32', 'name': 'to-read'}, {'count':...",,False,4.38,,...,2008,https://www.goodreads.com/book/show/926662.Gro...,https://s.gr-assets.com/assets/nophoto/book/11...,926662,45,911665,Growltiger's Last Stand and Other Poems,Growltiger's Last Stand and Other Poems,[],
3,156182890.0,12,[],US,,"[{'count': '554', 'name': 'to-read'}, {'count'...",,False,3.71,B00IWTRB1W,...,1964,https://www.goodreads.com/book/show/926667.The...,https://images.gr-assets.com/books/1382939971m...,926667,115,995066,The Cocktail Party,The Cocktail Party,"[{'count': '554', 'name': 'to-read'}, {'count'...",to-read plays drama classics
4,1942004192.0,4,[],US,eng,"[{'count': '228', 'name': 'to-read'}, {'count'...",,False,5.0,,...,2015,https://www.goodreads.com/book/show/29065952-l...,https://images.gr-assets.com/books/1455198396m...,29065952,9,49294781,Louder Than Everything You Love,Louder Than Everything You Love,"[{'count': '228', 'name': 'to-read'}]",to-read


In [20]:
df.columns

Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series', 'filtered_shelves', 'filtered_shelves_text'],
      dtype='object')

In [21]:

df_english = df[df['language_code'] == 'eng'].copy()

# Now, 'df_english' contains only the rows where the language code is 'eng', which corresponds to English books.
df_english.reset_index(drop=True, inplace=True)


In [22]:
len(df_english)

8393

In [23]:
df['description']

0        Number 30 in a series of literary pamphlets pu...
1        Fairy Tales gathers the unconventional verse d...
2        Three poems describe the nighttime adventures ...
3        A modern verse play about the search for meani...
4        Louder Than Everything You Love is about trans...
                               ...                        
36509    Appetizers  Poetry  Acrostic Wo(Man)  Diamante...
36510    There was an Old Derry down Derry, who loved t...
36511                                                     
36512    'Muse, tell me of a man: a man of much resourc...
36513    Gathers poems by William Blake, Emily Bronte, ...
Name: description, Length: 36514, dtype: object

In [24]:
import spacy

nlp = spacy.load("en_core_web_sm")
def lemmatize_text(text):
    doc = nlp(text)
    lemmatized_text = " ".join([token.lemma_ for token in doc])
    return lemmatized_text



In [25]:
# Apply lemmatization to the 'descriptions' column
df_english['lemmatized_description'] = df_english['description'].apply(lemmatize_text)

In [26]:

df_english[['description','lemmatized_description']].sample(10)

Unnamed: 0,description,lemmatized_description
4719,"Black Misery was first published in 1969, but ...","Black Misery be first publish in 1969 , but th..."
432,At once an intimate autobiography and a collec...,at once an intimate autobiography and a collec...
5804,"'Muse, tell me of a man: a man of much resourc...","' muse , tell I of a man : a man of much resou..."
394,This book was converted from its physical edit...,this book be convert from its physical edition...
4543,"We are not born to stake a claim, but to claim...","we be not bear to stake a claim , but to claim..."
4596,,
4663,Bestselling poet r.h. Sin completes the trilog...,bestselle poet r.h . sin complete the trilogy ...
2068,$0.99 limited time ebook price Everyone wants ...,$ 0.99 limited time ebook price everyone want ...
4299,Annabel Lee is the last complete poem composed...,Annabel Lee be the last complete poem compose ...
1529,I believe that everyone who wishes to can writ...,I believe that everyone who wish to can write ...


In [27]:
print(df_english['popular_shelves'].head(1))

0    [{'count': '8', 'name': 'to-read'}, {'count': ...
Name: popular_shelves, dtype: object


In [28]:
df_english[['filtered_shelves_text','popular_shelves']].sample(20)

Unnamed: 0,filtered_shelves_text,popular_shelves
1654,to-read,"[{'count': '155', 'name': 'to-read'}, {'count'..."
2248,to-read thanksgiving,"[{'count': '172', 'name': 'to-read'}, {'count'..."
2939,to-read,"[{'count': '75', 'name': 'to-read'}, {'count':..."
7899,classics poetry fiction favorites classic myth...,"[{'count': '8816', 'name': 'classics'}, {'coun..."
1606,,"[{'count': '10', 'name': 'to-read'}, {'count':..."
468,to-read,"[{'count': '56', 'name': 'to-read'}, {'count':..."
8236,classics fiction poetry classic favorites myth...,"[{'count': '246940', 'name': 'to-read'}, {'cou..."
2630,to-read poetry currently-reading classics favo...,"[{'count': '11003', 'name': 'to-read'}, {'coun..."
5706,to-read poetry,"[{'count': '124', 'name': 'to-read'}, {'count'..."
7334,to-read,"[{'count': '81', 'name': 'to-read'}, {'count':..."


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Combine the features into a single text feature
df_english['combined_features'] = df_english['title'] + ' ' + df_english['description'] + ' ' + df_english['publication_year'].astype(str) + ' ' + df_english['filtered_shelves_text']

# Initialize a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=5, stop_words='english')

# Create the TF-IDF matrix for the combined text feature
tfidf_matrix_combined = tfidf_vectorizer.fit_transform(df_english['combined_features'])

# tfidf_matrix_combined now contains the TF-IDF representation of all four features.


In [30]:

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix_combined, tfidf_matrix_combined)


In [31]:
# Create a reverse mapping of book titles and DataFrame indices
indices = pd.Series(df_english.index, index=df_english['title']).drop_duplicates()

def get_recommendations(title, k):
    # Get the index of the book that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores of all books with that book
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the books based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top 10 most similar books
    sim_scores = sim_scores[1:k]

    # Get the book indices
    book_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar books
    return df_english['title'].iloc[book_indices]


In [32]:
recommendations = get_recommendations('The More Loving One',20)
print(recommendations)


1019                                         Lady Lazarus
1323                    The Voice of the Poet: T.S. Eliot
7134                      What W. H. Auden Can Do for You
7114                                       Selected Poems
3469                               The Sea and the Mirror
2005                 The Voice of the Poet: Robert Lowell
1996                       Selected Poetry of W. H. Auden
6446                                                House
3309                                       Conamara Blues
5234                   What the Dragon Said: A Love Story
4848    A Million Little Paper Airplane Stories, (Writ...
2934                               Complete Minimal Poems
5746                       Selected Poetry of W. H. Auden
2589                                      Collected Poems
8146                                      Collected Poems
5629                         Tell Me the Truth about Love
594               Tell me the truth about love: ten poems
7376    The No

In [33]:
import numpy as np

In [34]:
from sklearn.metrics import pairwise_distances

manhattan_distance = pairwise_distances(tfidf_matrix_combined, metric='manhattan')


In [35]:
manhattan_similarity = 1 / (1 + manhattan_distance)


In [36]:
manhattan_similarity

array([[1.        , 0.08672591, 0.09431457, ..., 0.14737435, 0.08037976,
        0.06586129],
       [0.08672591, 1.        , 0.07633685, ..., 0.11438895, 0.06821498,
        0.05799686],
       [0.09431457, 0.07633685, 1.        , ..., 0.12280961, 0.07092711,
        0.06013502],
       ...,
       [0.14737435, 0.11438895, 0.12280961, ..., 1.        , 0.09815348,
        0.07834065],
       [0.08037976, 0.06821498, 0.07092711, ..., 0.09815348, 1.        ,
        0.05460326],
       [0.06586129, 0.05799686, 0.06013502, ..., 0.07834065, 0.05460326,
        1.        ]])

In [37]:


# Calculate the Pearson Correlation Coefficient
pearson_correlation = np.corrcoef(tfidf_matrix_combined.toarray())


  c /= stddev[:, None]
  c /= stddev[None, :]


In [38]:
print(pearson_correlation.shape)


(8393, 8393)


In [39]:
indices = pd.Series(df_english.index, index=df_english['title']).to_dict()

def get_recommendations(title, k):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores with that movie
    sim_scores = list(enumerate(manhattan_similarity[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:k]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
   
    # Return the top 10 most similar movies
    return df_english['title'].iloc[movie_indices]

In [40]:
rec_combined_without_year_manhattan=get_recommendations('The More Loving One',20).head(20)

In [41]:
indices = pd.Series(df_english.index, index=df_english['title']).to_dict()

def get_recommendations(title, k):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores with that movie
    sim_scores = list(enumerate(pearson_correlation[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:k]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
   
    # Return the top 10 most similar movies
    return df_english['title'].iloc[movie_indices]

In [42]:
rec_combined_without_year_pearsons=get_recommendations('The More Loving One',20).head(20)

In [43]:
rec_combined_without_year_manhattan

1019                     Lady Lazarus
75                          Fairyland
2544                    Shapeshifting
2664                       Alphaville
3733                          Chelate
5347                     The Seafarer
5419                           Fledge
6479                         LETTERRS
7537                        Dreamland
8142                   Womanslaughter
2179                            Fasti
2890           Some More of Me Poetry
171                           You & I
1788                         Tannoura
5301                          Atticus
6250                              Him
6997                       Isla Negra
8217                        Whitsongs
1283    Write About an Empty Birdcage
Name: title, dtype: object

In [44]:
rec_combined_without_year_pearsons

1019                                    Lady Lazarus
1323               The Voice of the Poet: T.S. Eliot
2005            The Voice of the Poet: Robert Lowell
1996                  Selected Poetry of W. H. Auden
2934                          Complete Minimal Poems
2589                                 Collected Poems
2502          Pictures from Brueghel and Other Poems
403                            The Insomniac's House
2125                                         Tell Me
2414                                Rational Numbers
1596                          Poetry of the Thirties
2302                   Upon the Burning of Our House
1492    All Watched Over by Machines of Loving Grace
1446               An Introduction to English Poetry
1569                                          peluda
1174                                    Fool's House
1038                 Great Poets of the Romantic Age
2146                                Whispers of LOVE
1800            Barking Spiders and Other Such

In [45]:
recommendations

1019                                         Lady Lazarus
1323                    The Voice of the Poet: T.S. Eliot
7134                      What W. H. Auden Can Do for You
7114                                       Selected Poems
3469                               The Sea and the Mirror
2005                 The Voice of the Poet: Robert Lowell
1996                       Selected Poetry of W. H. Auden
6446                                                House
3309                                       Conamara Blues
5234                   What the Dragon Said: A Love Story
4848    A Million Little Paper Airplane Stories, (Writ...
2934                               Complete Minimal Poems
5746                       Selected Poetry of W. H. Auden
2589                                      Collected Poems
8146                                      Collected Poems
5629                         Tell Me the Truth about Love
594               Tell me the truth about love: ten poems
7376    The No

In [46]:
df_english['word2vec_features'] = df_english['title'] + ' ' + df_english['description'] + ' ' + df_english['filtered_shelves_text']


In [47]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Convert titles to lowercase and tokenize
df_english['title_tokens'] = df_english['title'].str.lower().apply(word_tokenize)


[nltk_data] Downloading package punkt to
[nltk_data]     /home/FYP/nipun001/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [48]:
# Combine title tokens, description, and filtered shelves
df_english['combined_tokens'] = df_english.apply(lambda row: row['title_tokens'] + row['description'].split() + row['filtered_shelves_text'].split(), axis=1)


In [49]:
df_english['word2vec_features'] = df_english['title'] + ' ' + df_english['description'] + ' ' + df_english['filtered_shelves_text']


In [50]:
from gensim.models import Word2Vec

# Train the model
model = Word2Vec(vector_size=100,      # Dimensionality of the word vectors
                 window=5,             # Maximum distance between current and predicted word within a sentence
                 min_count=1,          # Ignores words with total frequency lower than this
                 workers=4,            # Use these many worker threads to train the model (parallelization)
                 sg=0)        

In [51]:
corpus = df_english['combined_tokens'].tolist()


In [52]:
# Build the vocabulary
model.build_vocab(corpus)

# Train the model
model.train(corpus, total_examples=model.corpus_count, epochs=10)

(7599780, 9366780)

In [53]:
model.save("word2vec_model.model")


In [54]:
import numpy as np

def get_document_embedding(book_tokens, model):
    embeddings = [model.wv[word] for word in book_tokens if word in model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

df_english['doc_embedding'] = df_english['combined_tokens'].apply(lambda x: get_document_embedding(x, model))

In [55]:
def get_document_embedding(book_tokens, model):
    embeddings = [model.wv[word] for word in book_tokens if word in model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

df_english['doc_embedding'] = df_english['combined_tokens'].apply(lambda x: get_document_embedding(x, model))


In [56]:
def recommend_books_by_title(target_title, df, model, top_n=5):
    # Retrieve the embedding of the target book using its title
    target_embedding = df[df['title'] == target_title]['doc_embedding'].iloc[0]
    
    # Compute cosine similarities between the target and all other books
    similarities = linear_kernel([target_embedding], df['doc_embedding'].tolist())[0]
    
    # Get indices of books sorted by similarity
    sorted_indices = np.argsort(similarities)
    
    # Get the indices of the top_n most similar books
    # Exclude the target book itself by using [-top_n-1:-1]
    most_similar_indices = sorted_indices[-top_n-1:-1][::-1]
    
    # Retrieve the titles of the most similar books
    similar_books = df['title'].iloc[most_similar_indices].tolist()
    
    return similar_books

# Example usage:



In [57]:
target_title = "The More Loving One"
recommended = recommend_books_by_title(target_title, df_english, model,10)
recommended

['After One',
 'Selected Works',
 'The End of the World',
 'The Complete Poems',
 'Here In The',
 'An Anthology of Verse',
 '100 American Poems of the Twentieth Century',
 'The Best Poems of the English Language',
 'Collection of Poems',
 'The Selected Poems']

# Evaluation

In [58]:
import json

data = []

with open('goodreads_interactions_poetry.json', 'r', encoding='utf-8') as file:
    for line in file:
        try:
            record = json.loads(line)
            data.append(record)
        except json.JSONDecodeError:
            pass  # Handle invalid lines if needed

# Now 'data' contains a list of dictionaries, each representing a JSON object


In [59]:
df_users = pd.DataFrame(data)

In [60]:


with open('goodreads_books_poetry.json', 'r', encoding='utf-8') as file:
    for line in file:
        try:
            record = json.loads(line)
            data.append(record)
        except json.JSONDecodeError:
            pass  # Handle invalid lines if needed


In [61]:
df_books = pd.DataFrame(data)

In [202]:
#filtering all eng language books
df_books = df_books[df_books['language_code'] == 'eng']

# Extract the unique book_id values from df_books
valid_book_ids = df_books['book_id'].unique()

# Filter the rows in df_users
df_users = df_users[df_users['book_id'].isin(valid_book_ids)]

In [203]:
df_users=df_users[['user_id', 'book_id', 'rating']]

book_data = df_users[df_users['rating'] == 5]
book_data

Unnamed: 0,user_id,book_id,rating
2,8842281e1d1347389f2ab93d60773d4d,30119,5
12,06316bec7a49286f1f98d5acce24f923,30119,5
13,06316bec7a49286f1f98d5acce24f923,23919,5
14,06316bec7a49286f1f98d5acce24f923,1420,5
20,220ef9c058a2132e6a9827f93a821d87,52820,5
...,...,...,...
2734291,ec8f7af656d6e448eea5f47ae504e706,15997,5
2734298,b939ec4533958d7da7942253c81e421f,13123245,5
2734300,9176ecf11d54ab4be384a12353a73e24,1420,5
2734321,bfc558b791304f0ce74ad1c3a6ab08f7,1381,5


In [91]:
movie_id_to_row_index = {movie_id: index for index, movie_id in enumerate(df_english['movie_id'].unique())}


In [81]:
book_data

Unnamed: 0,user_id,book_id,rating
2,8842281e1d1347389f2ab93d60773d4d,30119,5
12,06316bec7a49286f1f98d5acce24f923,30119,5
13,06316bec7a49286f1f98d5acce24f923,23919,5
14,06316bec7a49286f1f98d5acce24f923,1420,5
20,220ef9c058a2132e6a9827f93a821d87,52820,5
...,...,...,...
2734291,ec8f7af656d6e448eea5f47ae504e706,15997,5
2734298,b939ec4533958d7da7942253c81e421f,13123245,5
2734300,9176ecf11d54ab4be384a12353a73e24,1420,5
2734321,bfc558b791304f0ce74ad1c3a6ab08f7,1381,5


In [83]:
book_data[book_data['user_id'] == '8842281e1d1347389f2ab93d60773d4d']

Unnamed: 0,user_id,book_id,rating
2,8842281e1d1347389f2ab93d60773d4d,30119,5


In [113]:
def get_user_book_recommendations(user_id, tfidf_matrix, book_data, k=10):
    # Create a mapping from book_id to row index in the matrix
    book_id_to_row_index = {book_id: index for index, book_id in enumerate(book_data['book_id'].unique())}
    
    # Fetch books liked or rated highly by the user
    liked_books = book_data[book_data['user_id'] == user_id]
    

    # For simplicity, let's consider books rated 5 as liked books
    liked_book_ids = liked_books[liked_books['rating'] == 5]['book_id'].tolist()
    
    # Convert liked_book_ids to liked_book_indices using the mapping
    liked_book_indices = [book_id_to_row_index[book_id] for book_id in liked_book_ids if book_id in book_id_to_row_index]
    # print(liked_book_indices)
    # If no liked books found, return an empty list
    if not liked_book_indices:
        return []

    # Create a user profile by averaging the TF-IDF vectors of books they liked
    user_profile = np.mean(np.vstack([tfidf_matrix[i].toarray() for i in liked_book_indices]), axis=0)
    
    # Compute cosine similarity between the user profile and all book vectors
    sim_scores = linear_kernel([user_profile], tfidf_matrix).flatten()
    
    # Get the indices of the top k books
    book_indices = sim_scores.argsort()[-k:][::-1]
    
    
    return book_indices


In [125]:
def precision_at_k_for_books(user_id, tfidf_matrix, book_data, k=10):
    # Get the top k recommendations for the user
    recommended_book_indices = get_user_book_recommendations(user_id, tfidf_matrix, book_data, k)
    
    # Fetch books liked or rated highly by the user
    liked_books = book_data[book_data['user_id'] == user_id]
    
    liked_book_indices = liked_books[liked_books['rating'] == 5]['book_id'].tolist()
    
    # Calculate how many of the top k recommended books were actually liked by the user
    relevant_recommendations = len(set(recommended_book_indices) & set(liked_book_indices))
    

    return relevant_recommendations / k


In [126]:
all_users = book_data['user_id'].unique()
average_P_at_k_for_books = np.mean([precision_at_k_for_books(user_id, tfidf_matrix_combined, book_data) for user_id in all_users])


In [128]:
print(f"Average Precision at k=10: {average_P_at_k_for_books}")

Average Precision at k=10: 0.002458622449184


In [218]:
from sklearn.metrics.pairwise import cosine_similarity

def get_user_book_recommendations_word2vec(user_id, model, book_data, k=10):
    # Create a mapping from book_id to row index in the matrix
    book_id_to_row_index = {book_id: index for index, book_id in enumerate(book_data['book_id'].unique())}
    
    # Fetch books liked or rated highly by the user
    liked_books = book_data[book_data['user_id'] == user_id]
    
    # For simplicity, let's consider books rated 5 as liked books
    liked_book_ids = liked_books[liked_books['rating'] == 5]['book_id'].tolist()
    
    # Create a user profile by averaging the Word2Vec vectors of books they liked
    liked_book_vectors = [model.wv[book_id] for book_id in liked_book_ids if book_id in model.wv]
    if not liked_book_vectors:
        return []
    
    user_profile = np.mean(liked_book_vectors, axis=0)

    # Compute cosine similarity between the user profile and all book vectors
    all_book_ids = book_data['book_id'].unique()
    all_book_vectors = [model.wv[book_id] for book_id in all_book_ids if book_id in model.wv]
    sim_scores = cosine_similarity([user_profile], all_book_vectors).flatten()
    
    # Get the indices of the top k books
    book_indices = sim_scores.argsort()[-k:][::-1]
    
    # Return the book IDs of the top k recommendations
    recommended_book_ids = [all_book_ids[i] for i in book_indices]
    
    return recommended_book_ids


In [221]:
def precision_at_k_for_books_word2vec(user_id, model, book_data, k=10):
    # Get the top k recommendations for the user
    recommended_book_indices = get_user_book_recommendations_word2vec(user_id, model, book_data, k)
    
    # Fetch books liked or rated highly by the user
    liked_books = book_data[book_data['user_id'] == user_id]
    liked_book_ids = liked_books[liked_books['rating'] == 5]['book_id'].tolist()

    # Calculate how many of the top k recommended books were actually liked by the user
    relevant_recommendations = len(set(recommended_book_indices) & set(liked_book_ids))

    return relevant_recommendations / k


In [223]:
all_users = book_data['user_id'].unique()
average_P_at_k_for_books_word2vec = np.mean([precision_at_k_for_books_word2vec(user_id, model, book_data) for user_id in all_users])


In [226]:
print(f"Average Precision at k=10: {average_P_at_k_for_books_word2vec}")

Average Precision at k=10: 0.009759678393435


# Some extra experiments done for content based, but not significant enough for our repor

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
# Individual TF-IDF for Description
tfidf_description = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix_description = tfidf_description.fit_transform(df_english['description'])
tfidf_matrix_description = normalize(tfidf_matrix_description, axis=1)  # Normalize TF-IDF matrix


In [74]:
tfidf_matrix_description

<8393x343522 sparse matrix of type '<class 'numpy.float64'>'
	with 827828 stored elements in Compressed Sparse Row format>

In [75]:
# Individual TF-IDF for Niche Shelves

tfidf_niche_shelves = TfidfVectorizer(analyzer='word', min_df=0, stop_words='english')
tfidf_matrix_niche_shelves = tfidf_niche_shelves.fit_transform(df_english['filtered_shelves_text'])
tfidf_matrix_niche_shelves = normalize(tfidf_matrix_niche_shelves, axis=1)


In [76]:
tfidf_matrix_niche_shelves

<8393x386 sparse matrix of type '<class 'numpy.float64'>'
	with 42812 stored elements in Compressed Sparse Row format>

In [77]:
df_english['publication_year']

0        1887
4        2015
6        2009
10       2015
17       2000
         ... 
36491    2011
36492    2011
36494    2009
36504    2016
36509        
Name: publication_year, Length: 8393, dtype: object

In [78]:


df_english['Pub_Year_Title'] = df_english['publication_year'].astype(str) + ' ' + df_english['title']

# Common TF-IDF for Publication Year and Title
tfidf_pub_year_title = TfidfVectorizer(analyzer='word', min_df=0, stop_words='english')
tfidf_matrix_common = tfidf_pub_year_title.fit_transform(df_english['Pub_Year_Title'])
tfidf_matrix_common= normalize(tfidf_matrix_common, axis=1)





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [79]:
tfidf_matrix_common

<8393x7134 sparse matrix of type '<class 'numpy.float64'>'
	with 31395 stored elements in Compressed Sparse Row format>

In [40]:
import numpy as np



# Save each TF-IDF matrix to a separate file
np.save('tfidf_matrix_description.npy', tfidf_matrix_description)
np.save('tfidf_matrix_niche_shelves.npy', tfidf_matrix_niche_shelves)
np.save('tfidf_matrix_common.npy', tfidf_matrix_common)


In [11]:
print(tfidf_matrix_description.shape)
print(tfidf_matrix_niche_shelves.shape)
print(tfidf_matrix_common.shape)


()
()
()


In [14]:
import numpy as np
# Load the TF-IDF matrices from the saved files
tfidf_matrix_description = np.load('/Users/nipunbhatia/Desktop/y4s1/dam/tfidf_matrix_common.npy',allow_pickle=True)

tfidf_matrix_niche_shelves = np.load('tfidf_matrix_niche_shelves.npy',)
tfidf_matrix_common = np.load('tfidf_matrix_common.npy')


ValueError: Object arrays cannot be loaded when allow_pickle=False

In [17]:
tfidf_matrix_description

array(<23205x14494 sparse matrix of type '<class 'numpy.float64'>'
	with 91665 stored elements in Compressed Sparse Row format>, dtype=object)

In [80]:
import numpy as np
from tqdm import tqdm

# Define batch size (the number of columns to process in each batch)
batch_size = 1000

# Initialize an empty final matrix
final_tfidf_matrix = None

# Calculate the number of batches needed
num_batches = tfidf_matrix_description.shape[1] // batch_size + 1

# Process data in batches
for i in tqdm(range(num_batches)):
    start_col = i * batch_size
    end_col = (i + 1) * batch_size

    # Slice and horizontally concatenate the TF-IDF matrices in this batch
    batch_matrix = np.hstack((tfidf_matrix_description[:, start_col:end_col].toarray(),
                               tfidf_matrix_niche_shelves[:, start_col:end_col].toarray(),
                               tfidf_matrix_common[:, start_col:end_col].toarray()))

    if final_tfidf_matrix is None:
        final_tfidf_matrix = batch_matrix
    else:
        final_tfidf_matrix = np.hstack((final_tfidf_matrix, batch_matrix))

    # Free up memory by deleting the batch_matrix
    del batch_matrix

# Now, final_tfidf_matrix contains the concatenated TF-IDF matrices in batches


 37%|███▋      | 127/344 [13:34<47:01, 13.00s/it]

KeyboardInterrupt: 

In [None]:
np.save('final_tfidf_matrix.npy', final_tfidf_matrix)

In [42]:
import numpy as np

In [43]:
tfidf_matrix_final = np.load('final_tfidf_matrix.npy',allow_pickle=True)


In [81]:
len(tfidf_matrix_final[0])

351042