In [1]:
!pip install scikit-surprise




In [2]:
import pandas as pd
import gdown
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from sklearn.metrics import r2_score
#content based
import nltk
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity


In [3]:
url = "https://drive.google.com/uc?export=download&id=10IbYSLhC-4QqWnHFOUAvaAG-62BtZ14G"
output = "bookdataset.csv"
gdown.download(url, output, quiet=False)


Downloading...
From (original): https://drive.google.com/uc?export=download&id=10IbYSLhC-4QqWnHFOUAvaAG-62BtZ14G
From (redirected): https://drive.google.com/uc?export=download&id=10IbYSLhC-4QqWnHFOUAvaAG-62BtZ14G&confirm=t&uuid=15925d54-35ea-42b8-8a32-5b168aa78f82
To: /content/bookdataset.csv
100%|██████████| 88.7M/88.7M [00:00<00:00, 163MB/s]


'bookdataset.csv'

In [4]:
data = pd.read_csv("bookdataset.csv")

In [5]:
print(data.head())

             User        ISBN              Title  Price  rating  \
0  A39650P2CZUUC9  B000MCKQRS  Cruel and Unusual  22.48       4   
1     AFVQZQ8PW0L  B000MCKQRS  Cruel and Unusual  22.48       5   
2  A2E9Q3T876TQ6C  B000MCKQRS  Cruel and Unusual  22.48       4   
3  A2GBJQ9THOYDAJ  B000MCKQRS  Cruel and Unusual  22.48       4   
4   ABWF7YVZAU1QP  B000MCKQRS  Cruel and Unusual  22.48       4   

                                         description  \
0  Wanneer er in dit achtste deel in de Kay Scarp...   
1  Wanneer er in dit achtste deel in de Kay Scarp...   
2  Wanneer er in dit achtste deel in de Kay Scarp...   
3  Wanneer er in dit achtste deel in de Kay Scarp...   
4  Wanneer er in dit achtste deel in de Kay Scarp...   

                                               image          publisher  \
0  http://books.google.com/books/content?id=AgckD...  Luitingh Sijthoff   
1  http://books.google.com/books/content?id=AgckD...  Luitingh Sijthoff   
2  http://books.google.com/books/co

In [6]:
print(data.isnull().sum())

User                  0
ISBN                  0
Title                 0
Price                 0
rating                0
description           0
image                 0
publisher             0
publishedDate         0
infoLink              0
categories            0
number_of_ratings     0
authors              22
dtype: int64


In [7]:
print(data.columns)

Index(['User', 'ISBN', 'Title', 'Price', 'rating', 'description', 'image',
       'publisher', 'publishedDate', 'infoLink', 'categories',
       'number_of_ratings', 'authors'],
      dtype='object')


In [8]:
cdata = data.dropna(subset=['authors'])

In [9]:
cdata.head()

Unnamed: 0,User,ISBN,Title,Price,rating,description,image,publisher,publishedDate,infoLink,categories,number_of_ratings,authors
0,A39650P2CZUUC9,B000MCKQRS,Cruel and Unusual,22.48,4,Wanneer er in dit achtste deel in de Kay Scarp...,http://books.google.com/books/content?id=AgckD...,Luitingh Sijthoff,1970,https://play.google.com/store/books/details?id...,Fiction,10,Patricia Cornwell
1,AFVQZQ8PW0L,B000MCKQRS,Cruel and Unusual,22.48,5,Wanneer er in dit achtste deel in de Kay Scarp...,http://books.google.com/books/content?id=AgckD...,Luitingh Sijthoff,1970,https://play.google.com/store/books/details?id...,Fiction,10,Patricia Cornwell
2,A2E9Q3T876TQ6C,B000MCKQRS,Cruel and Unusual,22.48,4,Wanneer er in dit achtste deel in de Kay Scarp...,http://books.google.com/books/content?id=AgckD...,Luitingh Sijthoff,1970,https://play.google.com/store/books/details?id...,Fiction,10,Patricia Cornwell
3,A2GBJQ9THOYDAJ,B000MCKQRS,Cruel and Unusual,22.48,4,Wanneer er in dit achtste deel in de Kay Scarp...,http://books.google.com/books/content?id=AgckD...,Luitingh Sijthoff,1970,https://play.google.com/store/books/details?id...,Fiction,10,Patricia Cornwell
4,ABWF7YVZAU1QP,B000MCKQRS,Cruel and Unusual,22.48,4,Wanneer er in dit achtste deel in de Kay Scarp...,http://books.google.com/books/content?id=AgckD...,Luitingh Sijthoff,1970,https://play.google.com/store/books/details?id...,Fiction,10,Patricia Cornwell


In [10]:
# Check for remaining null values in any other column
print(cdata.isnull().sum())

User                 0
ISBN                 0
Title                0
Price                0
rating               0
description          0
image                0
publisher            0
publishedDate        0
infoLink             0
categories           0
number_of_ratings    0
authors              0
dtype: int64


In [11]:
# Load the data (use load_from_df instead of load_from_cdata)
reader = Reader(rating_scale=(1, 5))
data_df = Dataset.load_from_df(data[['User', 'ISBN', 'rating']], reader)

In [12]:
# Split the data into train and test sets
trainset, testset = train_test_split(data_df, test_size=0.25)

In [13]:
# Train the SVD model
model = SVD() #initialize the SVD model (Singular Value Decomposition)
model.fit(trainset) #train the model

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7d1efaae73d0>

In [14]:
# Predict the ratings for the test set
predictions = model.test(testset)

In [15]:
# Evaluate the accuracy of the model
accuracy.rmse(predictions)

RMSE: 0.7751


0.7751231886526112

In [16]:
# Function to predict the rating for a specific user and book
def predict_rating(user_id, book_isbn):
    return model.predict(user_id, book_isbn)

In [17]:
# Check training completion
print("Collaborative Filtering model has been trained!")

Collaborative Filtering model has been trained!


In [18]:
#Content based filtering
# Download NLTK stopwords (if not already downloaded)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# Function to tokenize the description
def tokenized_description(description):

    # Remove special characters and numbers
    description = re.sub(r'[^a-zA-Z\s]', '', description)

    #tokenize and remove stopwords
    tokens=description.split()
    stop_words = set(stopwords.words('english'))
    tokens=[word for word in tokens if word not in stop_words]

    #join the tokens back to form a cleaned string
    return ''.join(tokens)

In [20]:
#applying tokenizing to the 'description' column
cdata['tokenized_description']=cdata['description'].fillna('').apply(tokenized_description)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cdata['tokenized_description']=cdata['description'].fillna('').apply(tokenized_description)


In [21]:
#Combine 'categories', 'authors', and 'cleaned_description' to form a new 'content' column
cdata['content'] = (cdata['categories'].fillna('') + ' ' +
                       cdata['authors'].fillna('') + ' ' +
                       cdata['tokenized_description'].fillna(''))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cdata['content'] = (cdata['categories'].fillna('') + ' ' +


In [22]:
# Use TF-IDF Vectorizer to vectorize the combined 'content' column
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(cdata['content'])

In [None]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# Function to get recommendations based on a book title
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the book that matches the title
    idx = cdata[cdata['Title'] == title].index[0]

    # Get the pairwise similarity scores of all books with the given book
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the books based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the 10 most similar books
    sim_scores = sim_scores[1:11]  # Skip the first one as it's the same book
    book_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar books
    return cdata['Title'].iloc[book_indices]

In [None]:
# Test the function with a sample book title
recommended_books = get_recommendations('Some Book Title')
print("Recommended Books:")
print(recommended_books)