# GitHub Book Recommendation System

https://github.com/mujtabaali02/Book-Recommendation-System/blob/master/Book_Recommendation_Syatem.ipynb

# Load and Read Data

In [None]:
# Import pandas and numpy for loading, reading, and data preprocessing
import pandas as pd
import numpy as np

# Load and read datasets.
# ISBN is an identifier that is later used as a merge key, so it is read as
# text and never left to numeric type inference. low_memory=False makes
# pandas infer every column's dtype from the whole file instead of chunk by
# chunk, which avoids columns that mix several Python types.
books_data = pd.read_csv('Books.csv', dtype={'ISBN': str}, low_memory=False)
ratings_data = pd.read_csv('Ratings.csv', dtype={'ISBN': str}, low_memory=False)
user_data = pd.read_csv('Users.csv', low_memory=False)

# Initial Data Analysis

In [None]:
books_data.head()

In [None]:
ratings_data.head()

In [None]:
user_data.head()

In [None]:
# View how many rows and columns
print(books_data.shape)
print(ratings_data.shape)
print(user_data.shape)

In [None]:
# View how many rows and columns
print(books_data.shape)
print(ratings_data.shape)
print(user_data.shape)

In [None]:
# Check data type and if columns are correctly typed
print(books_data.dtypes) 
print()
print(ratings_data.dtypes)
print()
print(user_data.dtypes)

In [None]:
# Get the number of unique values in each column
print(books_data.nunique())  
print()
print(ratings_data.nunique())  
print()
print(user_data.nunique())

In [None]:
# View the frequency of different values
print(books_data['Book-Title'].value_counts().head(10))  # Top 10 most common book titles
print()
print(ratings_data['User-ID'].value_counts().head(10))  # Top 10 most active users
print()
print(user_data['Location'].value_counts().head(10))   # Top 10 locations among users 

In [None]:
# Check for missing values
print(books_data.isnull().sum())
print()
print(ratings_data.isnull().sum())
print()
print(user_data.isnull().sum())

In [None]:
# Check for duplicates
print(books_data.duplicated().sum())
print(ratings_data.duplicated().sum())
print(user_data.duplicated().sum())

In [None]:
# Summary statistics 
print(books_data.describe())
print()
print(ratings_data.describe())
print()
print(user_data.describe())

In [None]:
# Drop the non-numeric columns for correlation analysis
ratings_numeric = ratings_data.drop(columns=['ISBN'])

# Calculate the correlation for numeric columns only
correlation_matrix = ratings_numeric.corr()

# Print correlation matrix
print(correlation_matrix)

# Data Preprocessing

Handling Missing Values and Cleaning the Data -- Users, Books, Ratings

In [None]:
# For User Data, the missing values are in the Age column.

# Median of the known ages (Series.median ignores missing values by default)
median_age = user_data['Age'].median()

# Fill the gaps and assign the result back to the frame. Calling
# fillna(..., inplace=True) on a selected column is chained assignment:
# pandas 2.x warns about it and it does not update the frame once
# Copy-on-Write is enabled.
user_data['Age'] = user_data['Age'].fillna(median_age)

# Verify that there are no more missing values in the Age column
print(user_data['Age'].isnull().sum())

In [None]:
# Cap Age outliers: anything outside the 5th-95th percentile band is pulled
# back to the nearest band edge.
lower_lim, upper_lim = user_data['Age'].quantile([.05, .95])
user_data['Age'] = user_data['Age'].clip(lower=lower_lim, upper=upper_lim)

In [None]:
user_data['Age'].describe()

In [None]:
# For Book Data, the missing values are in Book-Author and Publisher.

# Replace missing authors/publishers with explicit placeholders. The filled
# column is assigned back rather than using fillna(..., inplace=True) on a
# column selection, which is chained assignment (pandas 2.x warns about it
# and it does not update the frame under Copy-on-Write).
books_data['Book-Author'] = books_data['Book-Author'].fillna('Unknown Author')
books_data['Publisher'] = books_data['Publisher'].fillna('Unknown Publisher')

# Verify that there are no more missing values in the relevant columns
print(books_data[['Book-Author', 'Publisher']].isnull().sum())

In [None]:
# Book Data (continued): Year-Of-Publication is stored as object and is inconsistent, which gets in the way of further EDA:
books_data['Year-Of-Publication'].unique()

In [None]:
# Convert years from objects to integers:
#   1. entries that cannot be parsed as a number become NaN (errors='coerce'),
#   2. NaN is replaced with 0, used here as the "unknown year" placeholder,
#   3. the column is cast to int so it really holds integers.
# The result is assigned back to the frame instead of using
# fillna(..., inplace=True) on a column selection (chained assignment).
books_data['Year-Of-Publication'] = (
    pd.to_numeric(books_data['Year-Of-Publication'], errors='coerce')
    .fillna(0)
    .astype(int)
)

# Verify the conversion
print(books_data['Year-Of-Publication'].dtype)

In [None]:
# Bounds for capping publication-year outliers
lower_lim = books_data['Year-Of-Publication'].quantile(0.05)  # 5th percentile
upper_lim = 2024  # fixed ceiling

# Pull every year outside [lower_lim, upper_lim] back to the nearest bound
books_data['Year-Of-Publication'] = books_data['Year-Of-Publication'].clip(
    lower=lower_lim, upper=upper_lim
)

# Verify the changes
print(books_data['Year-Of-Publication'].describe())

# Merging the Datasets 

For further, consistent EDA and for building the recommendation system

In [None]:
# Merge users and rating dataframe using the User-ID as the key identifier
users_ratings_df = pd.merge(user_data,ratings_data, on='User-ID')

In [None]:
# Mergecombined data of users and ratings with books data with ISBN as key identifier
merged_df = pd.merge(books_data,users_ratings_df, on='ISBN')

In [None]:
# Verify all 3 dataframes merged correctly
merged_df.head()

In [None]:
merged_df.info()

In [None]:
# Remove the three image-URL columns, which are not needed further on
merged_df = merged_df.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'])

merged_df.head()

# Exploratory Data Analysis

Ratings per Book

In [None]:
#Import necessary visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Number of ratings received by each title, most-rated titles first
ratings_per_book = (
    merged_df.groupby('Book-Title')['Book-Rating']
    .count()
    .rename('Num-Ratings')
    .reset_index()
    .sort_values(by='Num-Ratings', ascending=False)
)

# Histogram of how many books fall into each "number of ratings" bucket
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(ratings_per_book['Num-Ratings'], bins=50, color='skyblue', edgecolor='black')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Number of Books')
ax.set_title('Distribution of Ratings per Book')
ax.set_yscale('log')  # log scale because the counts are heavily skewed
plt.show()

Top 10 Most Frequently Occurring Books

In [None]:
# Get the top 10 most frequently occurring books in the dataset
Top10_Book = merged_df['Book-Title'].value_counts().reset_index().head(10)
Top10_Book.columns = ['Book_Title', 'Count']  # Rename columns appropriately

# Check the column names to ensure they were renamed correctly
print(Top10_Book.columns)

In [None]:
# Plot the barplot for top 10 books
plt.rcParams['figure.figsize'] = (10, 5)
sns.barplot(x='Book_Title', y='Count', data=Top10_Book)
plt.xticks(rotation=70, horizontalalignment="center")
plt.grid(axis='y', linestyle='--')
plt.title('Top 10 Occuring Books')
plt.xlabel('Book Title')
plt.ylabel('Count')
plt.show()

Top 10 authors with the most books written

In [None]:
# Create a DataFrame of the top 10 authors based on their book count.
# merged_df holds one row per rating, so counting rows per author would
# measure how often an author was rated. Counting distinct ISBNs per author
# gives the number of different books instead.
Top10_author = (
    merged_df.groupby('Book-Author')['ISBN']
    .nunique()
    .nlargest(10)
    .rename('Count')  # column names end up as 'Book-Author' and 'Count'
    .reset_index()
)

# Display the top 10 authors DataFrame
Top10_author

In [None]:
# Create a barplot for the top 10 authors
sns.barplot(x="Book-Author", y="Count", data=Top10_author)

# Rotate x-axis labels for better readability
plt.xticks(rotation=70, horizontalalignment="center")

# Set plot title and font size
plt.title("Top 10 Authors with the Most Books Written", fontsize=20)

# Display the plot
plt.show()

Top 10 Countries of Users 

In [None]:
# Import re to leverage regular expressions for extracting the last word of a location string, 
# which often represents the country
import re

# Lower-cased spellings mapped to one canonical display name per country.
# 'kingdom' is kept so single-word lookups continue to resolve.
country_mapping = {
    'usa': 'USA', 'canada': 'Canada', 'kingdom': 'United Kingdom',
    'germany': 'Germany', 'australia': 'Australia', 'spain': 'Spain',
    'france': 'France', 'portugal': 'Portugal', 'uk': 'United Kingdom',
    'united kingdom': 'United Kingdom', 'united states': 'USA',
}


def extract_country(location):
    """Return a cleaned country name taken from the end of a location string.

    The text after the last comma is treated as the country field, so
    multi-word countries stay intact ("..., new zealand" -> "New Zealand")
    instead of being cut down to their final word.

    Parameters:
    - location: location text such as "city, state, country". Any
      non-string value (for example NaN from a missing cell) is accepted.

    Returns:
    - The canonical name from ``country_mapping`` when the field is listed
      there, otherwise the field in title case; 'Unknown' when the input is
      not a string or the field does not end in letters.
    """
    # Missing cells arrive as float NaN; they have no .lower()
    if not isinstance(location, str):
        return 'Unknown'

    # Country field = everything after the last comma, without the
    # surrounding whitespace that would otherwise defeat the '$' anchor
    last_field = location.rpartition(',')[2].strip().lower()

    # Trailing run of letter-only words
    match = re.search(r'([a-z]+(?:\s+[a-z]+)*)$', last_field)
    if match is None:
        return 'Unknown'  # For locations that don't match

    # Collapse repeated inner whitespace before the dictionary lookup
    country = ' '.join(match.group(1).split())
    return country_mapping.get(country, country.title())

# Apply the function to create a clean 'Country' column
merged_df['Country'] = merged_df['Location'].apply(extract_country)

In [None]:
# Create dataframe for the Top 5 countries by number of users.
# merged_df has one row per rating, so a plain value_counts on 'Country'
# would rank countries by rating volume. Counting distinct User-IDs per
# country gives the number of users instead.
Top5_Country = (
    merged_df.groupby('Country')['User-ID']
    .nunique()
    .nlargest(5)
    .rename('Count')  # column names end up as 'Country' and 'Count'
    .reset_index()
)

In [None]:
# Check the column names to confirm they are correct
print(Top5_Country.columns)
print(Top5_Country.head(5))

In [None]:
# Pie chart for top 5 countries with most users
plt.figure(figsize=(8, 6))
plt.pie(Top5_Country['Count'], labels=Top5_Country['Country'], autopct='%1.1f%%', startangle=140)
plt.title("Top 5 Countries with Most Users")
plt.show()

Correlation Analysis - Numerical Data: Year-Of-Publication, User-ID, Age, and Book-Rating

In [None]:
# Import seaborn module for further correlation visulisation 
import seaborn as sns

# Extract relevant columns from the merged_df
correlation_data = merged_df[['Year-Of-Publication', 'User-ID', 'Age', 'Book-Rating']]

# Calculate the correlation matrix
correlation_matrix = correlation_data.corr()

# Display the correlation matrix
print("Correlation Matrix:")
print(correlation_matrix)

# Visualize the correlation matrix using a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, fmt=".2f")
plt.title('Correlation Analysis of Year-Of-Publication, User-ID, Age, and Book-Rating')
plt.show()


Distribution for each Book's Average Rating

In [None]:
# Calculate average ratings per book
average_ratings = merged_df.groupby('Book-Title')['Book-Rating'].mean()

# Plot the distribution of average ratings
plt.figure(figsize=(10, 6))
sns.histplot(average_ratings, bins=20, kde=True, color='red')
plt.xlabel('Average Rating')
plt.ylabel('Number of Books')
plt.title('Distribution of Average Book Ratings')
plt.show()

# Building the Collaborative Filtering Models

Collaborative filtering methods for recommender systems are based solely on the past interactions recorded between users and items in order to produce new recommendations. These interactions are stored in the so-called “user-item interactions matrix”.

In [None]:
# Import the libraries and modules (SciPy, scikit-learn, Surprise) needed to build the recommendation models
from scipy.stats import norm
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, confusion_matrix,accuracy_score,f1_score,roc_curve, roc_auc_score,classification_report,precision_score, recall_score
from sklearn.model_selection import train_test_split
from scipy import stats
import ast
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split
from surprise import SVDpp, accuracy
from surprise.model_selection import cross_validate
from collections import defaultdict
from surprise import SVD, SVDpp, NMF
from surprise import SlopeOne, CoClustering

# SVD++ (from Github)

Data Preparation

In [None]:
#Rating head
ratings_data.head(1)

In [None]:
# Rating data with the 0-valued ratings excluded
ratings = ratings_data[ratings_data['Book-Rating'] != 0]

# Merge the *filtered* ratings with the book details on ISBN. Merging
# ratings_data here would silently bring the excluded 0 ratings back and
# leave the filter above without any effect.
df = pd.merge(ratings, books_data, on='ISBN')

Implementing KNN - Books that are rated by at least 10 users

In [None]:
# Number of distinct users who interacted with each book
books_interactions_count_df = df.groupby('ISBN')['User-ID'].nunique()
print('# of books: %d' % len(books_interactions_count_df))

# Keep only the ISBNs that reach the 10-user threshold
popular_book_mask = books_interactions_count_df >= 10
books_with_enough_interactions_df = books_interactions_count_df[popular_book_mask].reset_index()[['ISBN']]
print('# of books with at least 10 interactions: %d' % len(books_with_enough_interactions_df))
print(books_with_enough_interactions_df.head(5))

Users who have rated at least 25 different books

In [None]:
# Number of distinct books each user interacted with
users_interactions_count_df = df.groupby('User-ID')['ISBN'].nunique()
print('# of users: %d' % len(users_interactions_count_df))

# Keep only the users that reach the 25-book threshold
active_user_mask = users_interactions_count_df >= 25
users_with_enough_interactions_df = users_interactions_count_df[active_user_mask].reset_index()[['User-ID']]
print('# of users with at least 25 interactions: %d' % len(users_with_enough_interactions_df))
print(users_with_enough_interactions_df.head(5))

In [None]:
# Users with enough interactions
print('# of interactions: %d' % len(df))
interactions_from_selected_users_df = df.merge(users_with_enough_interactions_df, 
               how = 'right',
               on = 'User-ID')
print('# of interactions from users with at least 25 interactions: %d' % len(interactions_from_selected_users_df))

Dataframe of Users and Books with enough interactions

In [None]:
# Users and Books with enough interactions
print('# of interactions: %d' % len(df))
interactions_from_selected_books_and_users_df= interactions_from_selected_users_df.merge(books_with_enough_interactions_df, on = 'ISBN')
print('# of interactions from users with at least 25 interactions and books with at least 10 interactions: %d' % len(interactions_from_selected_books_and_users_df))

In [None]:
# Interactions from selected books and users dataframe
interactions_from_selected_books_and_users_df.head(5)

In [None]:
# Shape of interactions from selected books and users dataframe
interactions_from_selected_books_and_users_df.shape

In [None]:
# Aggregate all the interactions of users and applying log transformation to rating
import math
def smooth_user_preference(x):
    """Return log2(1 + x), a dampened version of an interaction strength.

    Parameters:
    - x: summed rating for one user/item pair; must be greater than -1.

    Returns:
    - The base-2 logarithm of ``1 + x`` as a float (0.0 for ``x == 0``).

    Raises:
    - ValueError: if ``1 + x`` is not positive.
    """
    # math.log2 is the dedicated base-2 routine; it is usually more accurate
    # than math.log(value, 2), which divides two natural logarithms.
    return math.log2(1 + x)

interactions_full_df1 = interactions_from_selected_books_and_users_df.groupby(['User-ID', 'Book-Title'])['Book-Rating'].sum().apply(smooth_user_preference).reset_index()
print('# of unique user/item interactions: %d' % len(interactions_full_df1))
interactions_full_df = interactions_from_selected_books_and_users_df.groupby(['User-ID', 'ISBN'])['Book-Rating'].sum().apply(smooth_user_preference).reset_index()
print('# of unique user/item interactions: %d' % len(interactions_full_df))
interactions_full_df.head(5)

In [None]:
# Build the dense pivot tables and their sparse (CSR) counterparts.
# Pairs without a rating are filled with 0.

# Rows = ISBN, columns = user
df_user_item_matrix = interactions_full_df.pivot(
    index='ISBN', columns='User-ID', values='Book-Rating'
).fillna(0)
user_item_matrix_sparse = csr_matrix(df_user_item_matrix.values)

# Rows = book title, columns = user
df_user_item_matrix1 = interactions_full_df1.pivot(
    index='Book-Title', columns='User-ID', values='Book-Rating'
).fillna(0)
user_item_matrix_sparse1 = csr_matrix(df_user_item_matrix1.values)

Model Building

In [None]:
#Fitting Model
model = NearestNeighbors(n_neighbors=30, metric='cosine', algorithm='brute', n_jobs=-1)
 
model.fit(user_item_matrix_sparse1)

Recommendations for randomly selected book

In [None]:
# Pick a random row (book title) of the title x user matrix and query its
# 16 nearest neighbours; the first hit is skipped as the query itself.
query_index = np.random.choice(df_user_item_matrix1.shape[0])
query_vector = df_user_item_matrix1.iloc[query_index, :].values.reshape((1, -1))
distances, indices = model.kneighbors(query_vector, n_neighbors = 16)

print('Recommendations for Book {0}:\n'.format(df_user_item_matrix1.index[query_index]))
neighbour_rows = zip(indices.flatten()[1:], distances.flatten()[1:])
for rank, (row_position, row_distance) in enumerate(neighbour_rows, start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, df_user_item_matrix1.index[row_position], row_distance))

Recommendations based on a specific book

In [None]:
# Model building and recommendation for a particular book
model = NearestNeighbors(n_neighbors=30, metric='cosine', algorithm='brute', n_jobs=-1)
model.fit(user_item_matrix_sparse)

# Map every row position of the ISBN x user matrix to that book's title.
# df holds one row per rating, so it is reduced to one row per ISBN before
# the lookup; enumerating the un-deduplicated lookup result would number
# rating rows, and the positions would no longer match the matrix rows as
# soon as a book has more than one rating.
isbn_to_title = df.drop_duplicates(subset='ISBN').set_index('ISBN')['Book-Title']
index_to_book = dict(enumerate(isbn_to_title.reindex(df_user_item_matrix.index)))


def recommender(model, user_item_matrix_sparse, df_book, number_of_recommendations, book_index):
    """Print the books closest to one book according to a fitted neighbour model.

    Parameters:
    - model: fitted estimator exposing ``kneighbors``.
    - user_item_matrix_sparse: item x user matrix the model was fitted on;
      row ``i`` must correspond to ``index_to_book[i]``.
    - df_book: unused; kept so existing calls keep working.
    - number_of_recommendations: how many neighbours to list.
    - book_index: row position of the query book in the matrix.

    Returns:
    - None. The ranked list is printed, nearest neighbour first. The printed
      score is the distance reported by ``model`` (smaller = more similar).
    """
    main_title = index_to_book[book_index]

    # One extra neighbour is requested because the query book is normally
    # returned as its own nearest neighbour; never ask for more rows than
    # the matrix has.
    neighbour_count = min(number_of_recommendations + 1, user_item_matrix_sparse.shape[0])
    dist, ind = model.kneighbors(user_item_matrix_sparse[book_index], n_neighbors=neighbour_count)

    # Drop the query book by identity rather than by position: with tied
    # distances it is not guaranteed to come back first.
    neighbours = [
        (index_to_book[row], distance)
        for row, distance in zip(ind[0].tolist(), dist[0].tolist())
        if row != book_index
    ]

    # Closest books first, trimmed to the requested length
    neighbours = sorted(neighbours, key=lambda pair: pair[1])[:number_of_recommendations]

    print("Recommendations for Book {}: ".format(main_title))
    for rank, (title, distance) in enumerate(neighbours, start=1):
        print('{}. {}, recommendation score = {}'.format(rank, title, round(distance, 5)))


recommender(model, user_item_matrix_sparse, df, 10, 10)

Train Test Split And Model Building for SVD++

In [None]:
# Surprise's splitter is imported here explicitly: scikit-learn's function of
# the same name is imported earlier in the notebook, and which one the bare
# name refers to would otherwise depend on import order.
from surprise.model_selection import train_test_split

# Build the Surprise dataset that the split below needs (`data` was not
# defined anywhere before this cell).
# NOTE(review): interactions_full_df (User-ID, ISBN, smoothed Book-Rating) is
# assumed to be the intended source because the following cells inspect it
# and predict by ISBN - confirm. The rating scale is taken from the data.
rating_scale = (
    interactions_full_df['Book-Rating'].min(),
    interactions_full_df['Book-Rating'].max(),
)
reader = Reader(rating_scale=rating_scale)
# load_from_df expects the columns in the order user, item, rating
data = Dataset.load_from_df(interactions_full_df[['User-ID', 'ISBN', 'Book-Rating']], reader)

# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=0.25)

# We'll use the SVD++ algorithm
algo = SVDpp()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

# Then compute MAE
accuracy.mae(predictions)

In [None]:
interactions_full_df.head(1)

In [None]:
# Example user and ISBN number for predicted rating.
# The user id is an integer, like the other user ids used in this notebook.
# NOTE(review): a string such as '254' would presumably not match integer
# ids held by the model, and would also make the per-user filter in the
# plotting cell come back empty - confirm against the training data.
user_id = 254

isbn = '0060934700'

prediction = algo.predict(uid=user_id, iid=isbn)

print("Predicted rating of user with id {} for book with ISBN {}: {}".format(user_id, isbn, round(prediction.est,3)))

In [None]:
# Predictions- actual and estimated
predictions

In [None]:
# Scatter the true ratings against the estimated ones for the selected user
user_predictions = [pred for pred in predictions if pred.uid == user_id]
actual_ratings = [pred.r_ui for pred in user_predictions]
estimated_ratings = [pred.est for pred in user_predictions]

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=actual_ratings, y=estimated_ratings, alpha=0.6, ax=ax)
ax.set_xlabel('Actual Ratings')
ax.set_ylabel('Predicted Ratings')
ax.set_title('Actual vs Predicted Ratings for User {}'.format(user_id))
ax.grid(True)
plt.show()

# Incorporating the Diversity Factor using SVD++

In [None]:
# Generate Recommendations for a Specific User
def recommend_books_svdpp(user_id, top_n=10, diversity_factor=0.3, *,
                          model=None, interactions=None, books=None,
                          diverse_pool_size=50, random_state=None):
    """
    Generate book recommendations for a given user using SVD++ and incorporate diversity.

    Every candidate book is scored once with ``model.predict``. The best
    scores fill most of the list; the remaining slots go to the best-scored
    books among the least-rated ones, so less popular titles get exposure.

    Parameters:
    - user_id: ID of the user for which to generate recommendations.
    - top_n: Total number of books to recommend (non-negative).
    - diversity_factor: Share of the list (0..1) reserved for less popular books.
    - model: fitted predictor with ``predict(uid, iid).est``; defaults to the
      notebook-level ``algo``.
    - interactions: DataFrame with 'User-ID' and 'ISBN' columns used to list
      the candidate books and their popularity; defaults to the
      notebook-level ``interactions_full_df``.
    - books: DataFrame with 'ISBN', 'Book-Title' and 'Book-Author'; defaults
      to the notebook-level ``books_data``.
    - diverse_pool_size: how many of the least-rated books form the pool the
      diverse picks are drawn from.
    - random_state: seed forwarded to the final shuffle; None keeps it random.

    Returns:
    - A DataFrame with the columns 'Book-Title', 'Book-Author' and
      'Predicted-Rating', one row per recommended book, in shuffled order.

    Raises:
    - ValueError: if ``top_n`` is negative or ``diversity_factor`` is outside 0..1.

    NOTE(review): items are identified by ISBN on the assumption that the
    model was trained on ISBNs, as the single-prediction cell predicts by
    ISBN - confirm.
    """
    output_columns = ['Book-Title', 'Book-Author', 'Predicted-Rating']

    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    if not 0 <= diversity_factor <= 1:
        raise ValueError("diversity_factor must be between 0 and 1")

    # Fall back to the notebook-level objects when nothing is passed in
    if model is None:
        model = algo
    if interactions is None:
        interactions = interactions_full_df
    if books is None:
        books = books_data

    # Candidate books ordered from most to least rated
    popularity = interactions['ISBN'].value_counts()
    if top_n == 0 or popularity.empty:
        return pd.DataFrame(columns=output_columns)

    # Score every candidate exactly once, then rank by predicted rating
    predicted = pd.Series(
        [model.predict(user_id, isbn).est for isbn in popularity.index],
        index=popularity.index,
        dtype=float,
    )
    ranked = predicted.sort_values(ascending=False, kind='stable')

    num_diverse_books = int(diversity_factor * top_n)
    num_top_books = top_n - num_diverse_books

    top_part = ranked.head(num_top_books)

    # Diverse picks come from the least-rated books and must not repeat a
    # book that is already among the top picks
    tail_ids = popularity.tail(diverse_pool_size).index
    in_pool = ranked.index.isin(tail_ids) & ~ranked.index.isin(top_part.index)
    diverse_part = ranked[in_pool].head(num_diverse_books)

    chosen = pd.concat([top_part, diverse_part])

    # If the pool could not supply enough diverse books, top the list up
    # with the next best predictions so that top_n rows are returned
    # whenever enough candidates exist
    if len(chosen) < top_n:
        leftovers = ranked[~ranked.index.isin(chosen.index)]
        chosen = pd.concat([chosen, leftovers.head(top_n - len(chosen))])

    # Shuffle so the diverse picks are not always at the bottom of the list
    chosen = chosen.sample(frac=1, random_state=random_state)

    # pd.concat / merge replace DataFrame.append, which pandas 2.0 removed.
    # Book details are joined on ISBN with one row per ISBN so that a
    # recommendation can never be duplicated by the join.
    result = chosen.rename('Predicted-Rating').rename_axis('ISBN').reset_index()
    details = books.drop_duplicates(subset='ISBN')[['ISBN', 'Book-Title', 'Book-Author']]
    result = result.merge(details, on='ISBN', how='left')

    return result[output_columns]

# Example usage for a given user
user_id = 141430  # Replace with an actual user ID from the dataset
recommended_books_svdpp = recommend_books_svdpp(user_id, top_n=10, diversity_factor=0.3)

# Display the recommendations
print(recommended_books_svdpp)