In [38]:
import pandas as pd
import numpy as np
# Libraries for vectorization of text and one hot encoding, BERT, GPT, RoBERTa
import torch
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertTokenizer, BertModel, GPT2Tokenizer, GPT2Model, RobertaTokenizer, RobertaModel
from torch.utils.data import DataLoader, Dataset
import ast
import re
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [229]:
# Import dataset
df1 = pd.read_csv('processed_books.csv')

# Drop some columns
df1 = df1.drop(['series','price', 'language', 'primary_lists'], axis=1)

# Strip book_id column
df1['book_id'] = df1['book_id'].str.strip()

# Get duplicate count in book_id column
print(df1['book_id'].duplicated().sum())
df1.drop_duplicates(subset=['book_id'], inplace=True)

# Drop rows with missing values
df1 = df1.dropna()


df1.head()

0


Unnamed: 0,book_id,title,author,genres,publisher,year_published,description,current_readers,wanted_to_read,num_reviews,num_ratings,rating,awards
0,77203.The_Kite_Runner,The Kite Runner,Khaled Hosseini,"['Fiction', 'Historical Fiction', 'Classics', ...",Riverhead Books,2004-05-01,1970s Afghanistan: Twelve-year-old Amir is des...,42900.0,1000000.0,90234,2935385,4.33,['Borders Original Voices Award for Fiction (2...
1,929.Memoirs_of_a_Geisha,Memoirs of a Geisha,Arthur Golden,"['Fiction', 'Historical Fiction', 'Romance', '...",Vintage Books USA,2005-11-22,"A literary sensation and runaway bestseller, t...",12300.0,793000.0,34102,1922540,4.14,[]
2,128029.A_Thousand_Splendid_Suns,A Thousand Splendid Suns,Khaled Hosseini,"['Fiction', 'Historical Fiction', 'Contemporar...",Riverhead Books,2007-06-01,Mariam is only fifteen when she is sent to Kab...,32700.0,760000.0,69431,1417260,4.42,['British Book Award for Best Read of the Year...
3,19063.The_Book_Thief,The Book Thief,Markus Zusak,"['Historical Fiction', 'Fiction', 'Young Adult...",Alfred A. Knopf,2006-03-14,Librarian's note: An alternate cover edition c...,86000.0,2000000.0,134883,2345385,4.39,['National Jewish Book Award for Children’s an...
4,4214.Life_of_Pi,Life of Pi,Yann Martel,"['Fiction', 'Fantasy', 'Classics', 'Adventure'...",Seal Books,2006-08-29,Life of Pi is a fantasy adventure novel by Yan...,24900.0,726000.0,51257,1544622,3.93,"['Booker Prize (2002)', 'Bollinger Everyman Wo..."


In [41]:
# For title and description we will build three different word embeddings:
# Bert, GPT and RoBERTa

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        input_ids = self.tokenizer.encode(self.texts[idx], add_special_tokens=True, max_length=self.max_length, truncation=True)
        return torch.tensor(input_ids)



def get_embeddings(texts, tokenizer, model, batch_size=8):
    dataset = TextDataset(texts, tokenizer)
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=lambda x: torch.nn.utils.rnn.pad_sequence(x, batch_first=True))

    embeddings = []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            outputs = model(batch)
            batch_embeddings = outputs[0].mean(dim=1).cpu().numpy()
            embeddings.extend(batch_embeddings)
    return embeddings



# BERT
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
title_bert = get_embeddings(df1['title'].tolist(), bert_tokenizer, bert_model)
description_bert = get_embeddings(df1['description'].tolist(), bert_tokenizer, bert_model)

del bert_model
torch.cuda.empty_cache()

# GPT-2
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_model = GPT2Model.from_pretrained('gpt2').to(device)
title_gpt = get_embeddings(df1['title'].tolist(), gpt_tokenizer, gpt_model)
description_gpt = get_embeddings(df1['description'].tolist(), gpt_tokenizer, gpt_model)

del gpt_model
torch.cuda.empty_cache()

# RoBERTa
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base').to(device)
title_roberta = get_embeddings(df1['title'].tolist(), roberta_tokenizer, roberta_model)
description_roberta = get_embeddings(df1['description'].tolist(), roberta_tokenizer, roberta_model)

del roberta_model
torch.cuda.empty_cache()



cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 

In [204]:
# Create dataframes for each set of embeddings
title_bert_df = pd.DataFrame(title_bert, columns=[f"title_bert_{i}" for i in range(len(title_bert[0]))])
description_bert_df = pd.DataFrame(description_bert, columns=[f"description_bert_{i}" for i in range(len(description_bert[0]))])

title_gpt_df = pd.DataFrame(title_gpt, columns=[f"title_gpt_{i}" for i in range(len(title_gpt[0]))])
description_gpt_df = pd.DataFrame(description_gpt, columns=[f"description_gpt_{i}" for i in range(len(description_gpt[0]))])

title_roberta_df = pd.DataFrame(title_roberta, columns=[f"title_roberta_{i}" for i in range(len(title_roberta[0]))])
description_roberta_df = pd.DataFrame(description_roberta, columns=[f"description_roberta_{i}" for i in range(len(description_roberta[0]))])

# Reset the index of the original dataframe
df1.reset_index(drop=True, inplace=True)

# Reset the index of each embeddings dataframe
title_bert_df.reset_index(drop=True, inplace=True)
description_bert_df.reset_index(drop=True, inplace=True)
title_gpt_df.reset_index(drop=True, inplace=True)
description_gpt_df.reset_index(drop=True, inplace=True)
title_roberta_df.reset_index(drop=True, inplace=True)
description_roberta_df.reset_index(drop=True, inplace=True)

# Concatenate the embeddings dataframes with the original dataframe
df_text_desc_embeds = pd.concat([df1, title_bert_df, description_bert_df, title_gpt_df, description_gpt_df, title_roberta_df, description_roberta_df], axis=1)

In [206]:
df_text_desc_embeds.head()
# Drop the original title and description columns as well as year_published
df_text_desc_embeds = df_text_desc_embeds.drop(['title', 'description', 'year_published'], axis=1)

In [207]:
def standardize_publisher_name(publisher):
    # Replace ".com" with an empty string
    publisher = publisher.replace(".com", "")
    
    # Replace 'self-published' and 'self published' with 'self_published'
    publisher = re.sub(r"self[-\s]?published", "self_published", publisher, flags=re.IGNORECASE)

    # Remove all non-alphanumeric characters (except underscores)
    publisher = re.sub(r"[^\w\s]", "", publisher)

    # Convert the publisher name to lowercase
    publisher = publisher.lower()
    
    # Remove leading and trailing whitespaces
    publisher = publisher.strip()
    
    return publisher

# Apply the function to the publisher column
df_text_desc_embeds['publisher'] = df_text_desc_embeds['publisher'].apply(standardize_publisher_name)

In [208]:
min_freq = 10
# Group less frequent authors into "Others"
author_counts = df_text_desc_embeds['author'].value_counts()
authors_to_replace = author_counts[author_counts < min_freq].index
df_text_desc_embeds['authors_reduced'] = df_text_desc_embeds['author'].replace(authors_to_replace, 'Others')

# Group less frequent publishers into "Others"
publisher_counts = df_text_desc_embeds['publisher'].value_counts()
publishers_to_replace = publisher_counts[publisher_counts < min_freq].index
df_text_desc_embeds['publisher_reduced'] = df_text_desc_embeds['publisher'].replace(publishers_to_replace, 'Others')

In [209]:
duplicate_columns = df_text_desc_embeds.columns.duplicated().sum()

# Print the number of duplicate column names
print(f"Number of duplicate column names: {duplicate_columns}")

# Show the shape of the dataframe
print(f"Shape of the dataframe: {df_text_desc_embeds.shape}")

Number of duplicate column names: 0
Shape of the dataframe: (4668, 4620)


In [210]:
""" # Apply ast.literal_eval() using a lambda operation to the 'awards' column
print(df_text_desc_embeds.shape)
df_text_desc_embeds['genres'] = df_text_desc_embeds['genres'].apply(lambda x: ast.literal_eval(x))
print(df_text_desc_embeds.shape)

# One hot encode genres and publishers and author with separate multi label binarizers
mlb_genres = MultiLabelBinarizer()
genres = mlb_genres.fit_transform(df_text_desc_embeds['genres'])
print(genres.shape)

# Create dataframes for each set of one hot encoded features
genres_df = pd.DataFrame(genres, columns=[f"genre_{c}" for c in mlb_genres.classes_])
print(genres_df.shape)

# Reset the index of each one hot encoded dataframe
df_text_desc_embeds.reset_index(drop=True, inplace=True)
genres_df.reset_index(drop=True, inplace=True)

# Concatenate the one hot encoded dataframes with the original dataframe
df_text_desc_embeds = pd.concat([df_text_desc_embeds, genres_df], axis=1)
print(df_text_desc_embeds.shape)

# For the author and publisher a simple one hot encoding will be used
df_text_desc_embeds_authors = pd.get_dummies(df_text_desc_embeds, columns=['authors_reduced'], prefix='author')
df_text_desc_embeds_authors_publishers = pd.get_dummies(df_text_desc_embeds_authors, columns=['publisher_reduced'], prefix='publisher')
print(df_text_desc_embeds_authors.shape)
print(df_text_desc_embeds_authors_publishers.shape)

# Reset the index of each one hot encoded dataframe
df_text_desc_embeds_authors.reset_index(drop=True, inplace=True)
df_text_desc_embeds_authors_publishers.reset_index(drop=True, inplace=True)

# Concat the original dataframe with the one hot encoded author and publisher dataframes
df_text_desc_embeds = pd.concat([df_text_desc_embeds_authors_publishers, df_text_desc_embeds_authors], axis=1)
print(df_text_desc_embeds.shape)

# Drop the original authors, genres and publishers columns
df_text_desc_embeds.drop(columns=['genres', 'authors_reduced', 'publisher_reduced'], inplace=True)
print(df_text_desc_embeds.shape)
 """

' # Apply ast.literal_eval() using a lambda operation to the \'awards\' column\nprint(df_text_desc_embeds.shape)\ndf_text_desc_embeds[\'genres\'] = df_text_desc_embeds[\'genres\'].apply(lambda x: ast.literal_eval(x))\nprint(df_text_desc_embeds.shape)\n\n# One hot encode genres and publishers and author with separate multi label binarizers\nmlb_genres = MultiLabelBinarizer()\ngenres = mlb_genres.fit_transform(df_text_desc_embeds[\'genres\'])\nprint(genres.shape)\n\n# Create dataframes for each set of one hot encoded features\ngenres_df = pd.DataFrame(genres, columns=[f"genre_{c}" for c in mlb_genres.classes_])\nprint(genres_df.shape)\n\n# Reset the index of each one hot encoded dataframe\ndf_text_desc_embeds.reset_index(drop=True, inplace=True)\ngenres_df.reset_index(drop=True, inplace=True)\n\n# Concatenate the one hot encoded dataframes with the original dataframe\ndf_text_desc_embeds = pd.concat([df_text_desc_embeds, genres_df], axis=1)\nprint(df_text_desc_embeds.shape)\n\n# For the 

In [211]:
# Apply ast.literal_eval() using a lambda operation to the 'awards' column
print(df_text_desc_embeds.shape)
df_text_desc_embeds['genres'] = df_text_desc_embeds['genres'].apply(lambda x: ast.literal_eval(x))
print(df_text_desc_embeds.shape)

# One hot encode genres and publishers and author with separate multi label binarizers
mlb_genres = MultiLabelBinarizer()
genres = mlb_genres.fit_transform(df_text_desc_embeds['genres'])
print(genres.shape)

# Create dataframes for each set of one hot encoded features
genres_df = pd.DataFrame(genres, columns=[f"genre_{c}" for c in mlb_genres.classes_])
print(genres_df.shape)

# Reset the index of each one hot encoded dataframe
df_text_desc_embeds.reset_index(drop=True, inplace=True)
genres_df.reset_index(drop=True, inplace=True)

# Concatenate the one hot encoded dataframes with the original dataframe
df_text_desc_embeds = pd.concat([df_text_desc_embeds, genres_df], axis=1)
print(df_text_desc_embeds.shape)

# For the author and publisher a simple one hot encoding will be used
df_text_desc_embeds_authors_publishers = pd.get_dummies(df_text_desc_embeds, columns=['authors_reduced', 'publisher_reduced'], prefix=['author', 'publisher'])
print(df_text_desc_embeds_authors_publishers.shape)

# Reset the index of the one hot encoded dataframe
df_text_desc_embeds_authors_publishers.reset_index(drop=True, inplace=True)

# Assign the modified dataframe to the original variable
df_text_desc_embeds = df_text_desc_embeds_authors_publishers
print(df_text_desc_embeds.shape)

# Drop the original authors, genres, and publishers columns
df_text_desc_embeds.drop(columns=['genres', 'author', 'publisher'], inplace=True)
print(df_text_desc_embeds.shape)


(4668, 4620)
(4668, 4620)
(4668, 552)
(4668, 552)
(4668, 5172)
(4668, 5283)
(4668, 5283)
(4668, 5280)


In [213]:
df_text_desc_embeds.head()

Unnamed: 0,book_id,current_readers,wanted_to_read,num_reviews,num_ratings,rating,awards,title_bert_0,title_bert_1,title_bert_2,...,publisher_st martins paperbacks,publisher_st martins press,publisher_tor books,publisher_university of chicago press,publisher_viking,publisher_vintage,publisher_vintage crimeblack lizard,publisher_w w norton company,publisher_william morrow,publisher_william morrow paperbacks
0,77203.The_Kite_Runner,42900.0,1000000.0,90234,2935385,4.33,['Borders Original Voices Award for Fiction (2...,-0.07898,-0.442372,0.061085,...,0,0,0,0,0,0,0,0,0,0
1,929.Memoirs_of_a_Geisha,12300.0,793000.0,34102,1922540,4.14,[],-0.416897,-0.30487,-0.202104,...,0,0,0,0,0,0,0,0,0,0
2,128029.A_Thousand_Splendid_Suns,32700.0,760000.0,69431,1417260,4.42,['British Book Award for Best Read of the Year...,0.121623,0.220515,0.253245,...,0,0,0,0,0,0,0,0,0,0
3,19063.The_Book_Thief,86000.0,2000000.0,134883,2345385,4.39,['National Jewish Book Award for Children’s an...,0.00188,-0.438034,0.189876,...,0,0,0,0,0,0,0,0,0,0
4,4214.Life_of_Pi,24900.0,726000.0,51257,1544622,3.93,"['Booker Prize (2002)', 'Bollinger Everyman Wo...",-0.123796,-0.479276,0.318562,...,0,0,0,0,0,0,0,0,0,0


In [215]:
# For every entry in the 'awards' column only use alphanumeric characters and '[]' and ',' and convert to lowercase
df_text_desc_embeds['awards'] = df_text_desc_embeds['awards'].astype(str).apply(lambda x: re.sub(r"[^a-zA-Z0-9\s\[\],]", "", x).lower())

In [216]:
df_text_desc_embeds.head()

Unnamed: 0,book_id,current_readers,wanted_to_read,num_reviews,num_ratings,rating,awards,title_bert_0,title_bert_1,title_bert_2,...,publisher_st martins paperbacks,publisher_st martins press,publisher_tor books,publisher_university of chicago press,publisher_viking,publisher_vintage,publisher_vintage crimeblack lizard,publisher_w w norton company,publisher_william morrow,publisher_william morrow paperbacks
0,77203.The_Kite_Runner,42900.0,1000000.0,90234,2935385,4.33,[borders original voices award for fiction 200...,-0.07898,-0.442372,0.061085,...,0,0,0,0,0,0,0,0,0,0
1,929.Memoirs_of_a_Geisha,12300.0,793000.0,34102,1922540,4.14,[],-0.416897,-0.30487,-0.202104,...,0,0,0,0,0,0,0,0,0,0
2,128029.A_Thousand_Splendid_Suns,32700.0,760000.0,69431,1417260,4.42,[british book award for best read of the year ...,0.121623,0.220515,0.253245,...,0,0,0,0,0,0,0,0,0,0
3,19063.The_Book_Thief,86000.0,2000000.0,134883,2345385,4.39,[national jewish book award for childrens and ...,0.00188,-0.438034,0.189876,...,0,0,0,0,0,0,0,0,0,0
4,4214.Life_of_Pi,24900.0,726000.0,51257,1544622,3.93,"[booker prize 2002, bollinger everyman wodehou...",-0.123796,-0.479276,0.318562,...,0,0,0,0,0,0,0,0,0,0


In [217]:
# Apply ast.literal_eval() using a lambda operation to the 'awards' column
# df_text_desc_embeds['awards'] = df_text_desc_embeds['awards'].apply(lambda x: ast.literal_eval(x))

def parse_awards(awards_str):
    awards_list = awards_str.strip('][').split(', ')
    return [award.strip() for award in awards_list if award]

df_text_desc_embeds['awards'] = df_text_desc_embeds['awards'].apply(parse_awards)

# Get the number of awards for each book
df_text_desc_embeds = df_text_desc_embeds.reset_index(drop=True)
df_text_desc_embeds['num_awards'] = df_text_desc_embeds['awards'].apply(lambda x: len(x))
# Drop the original awards column
df_text_desc_embeds = df_text_desc_embeds.drop(['awards'], axis=1)

In [218]:
# Turn all float64 columns into float32
for col in df_text_desc_embeds.columns:
    if df_text_desc_embeds[col].dtype == 'float64' or df_text_desc_embeds[col].dtype == 'int64' or df_text_desc_embeds[col].dtype == 'int32':
        df_text_desc_embeds[col] = df_text_desc_embeds[col].astype('float32')


# Remove commas from the number of ratings and reviews
df_text_desc_embeds['num_ratings'] = df_text_desc_embeds['num_ratings'].apply(lambda x: x.replace(',', ''))
df_text_desc_embeds['num_reviews'] = df_text_desc_embeds['num_reviews'].apply(lambda x: x.replace(',', ''))
# Turn the number of ratings and reviews into float32
df_text_desc_embeds['num_ratings'] = df_text_desc_embeds['num_ratings'].astype('float32')
df_text_desc_embeds['num_reviews'] = df_text_desc_embeds['num_reviews'].astype('float32')

# View the types of the columns values counts
df_text_desc_embeds.dtypes.value_counts()

float32    5166
uint8       113
object        1
dtype: int64

In [219]:
# Separate the book_id column
book_ids = df_text_desc_embeds['book_id']
numeric_data = df_text_desc_embeds.drop('book_id', axis=1)

# Calculate the number of components
target_ratio = 1/5
n_components = int(numeric_data.shape[1] * target_ratio)

# Initialize the TruncatedSVD object
trunc_svd = TruncatedSVD(n_components=n_components)

# Fit the TruncatedSVD model and transform your data
reduced_data = trunc_svd.fit_transform(numeric_data)

# Convert the reduced data back to a DataFrame
reduced_data_df = pd.DataFrame(reduced_data)

# Add the book_id column back to the DataFrame
reduced_data_df['book_id'] = book_ids.values

# Reset the index
reduced_data_df = reduced_data_df.reset_index(drop=True)


In [225]:
# Assuming user_books_ids is a list of book IDs the user has interacted with
# and user_ratings is a list of the corresponding ratings (optional)
user_books_ids = ['77203.The_Kite_Runner', '929.Memoirs_of_a_Geisha', '128029.A_Thousand_Splendid_Suns', '19063.The_Book_Thief', '4214.Life_of_Pi']
user_ratings = [4, 4, 3, 5, 4]

# Filter the DataFrame to get only the rows corresponding to the books in the user_books_ids list
user_books_df = df_text_desc_embeds[df_text_desc_embeds['book_id'].isin(user_books_ids)]

# Drop the 'book_id' column and convert the DataFrame to a NumPy array
user_books_feature_vectors = user_books_df.drop(columns=['book_id']).values

# If you have user ratings, you can weight the feature vectors by the ratings
user_books_feature_vectors = user_books_feature_vectors * np.array(user_ratings)[:, np.newaxis]

user_profile = user_books_feature_vectors.mean(axis=0)


In [226]:
# Assuming you have already calculated the user_profile
# Remove the books that the user has interacted with from the DataFrame
remaining_books_df = df_text_desc_embeds[~df_text_desc_embeds['book_id'].isin(user_books_ids)]

# Drop the 'book_id' column and convert the remaining books DataFrame to a NumPy array
remaining_books_feature_vectors = remaining_books_df.drop(columns=['book_id']).values

# Calculate similarity scores between the user profile and the remaining books
similarity_scores = cosine_similarity([user_profile], remaining_books_feature_vectors)

# Get the indices of the top N most similar books
N = 10
top_indices = np.argsort(similarity_scores[0])[-N:][::-1]

# Get the top N most similar books' book_ids
recommended_book_ids = remaining_books_df.iloc[top_indices]['book_id'].values

print("Recommended books:", recommended_book_ids)


Recommended books: ['13570651-bared-to-you' '20448515-bared-to-you' '7445.The_Glass_Castle'
 '4009.Nine_Stories' '67035.Relic' '33724.Can_You_Keep_a_Secret_'
 '11.The_Hitchhiker_s_Guide_to_the_Galaxy'
 '386162.The_Hitchhiker_s_Guide_to_the_Galaxy' '5206937-a-modest-proposal'
 '21484.The_Winds_of_War']
