In [1]:
# Packages imports
import pandas as pd
import numpy as np
import scipy.stats
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the semicolon-separated ratings file and keep only rows whose Book-Rating is not 0
ratings = (
    pd.read_csv('BX-Book-Ratings.csv', sep=';', encoding='cp1251')
      .loc[lambda frame: frame['Book-Rating'].ne(0)]
)

In [3]:
# Load the book metadata. Rows with an unexpected number of ';'-separated
# fields are reported and skipped instead of aborting the load.
try:
    # pandas >= 1.3 spells this option `on_bad_lines`; the legacy
    # `error_bad_lines` keyword no longer exists in pandas 2.x.
    books = pd.read_csv('BX-Books.csv', encoding='cp1251', sep=';', on_bad_lines='warn')
except TypeError:
    # Older pandas rejects the unknown `on_bad_lines` keyword with a TypeError
    # before reading anything, so fall back to the legacy spelling.
    books = pd.read_csv('BX-Books.csv', encoding='cp1251', sep=';', error_bad_lines=False)

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Preview the first five rows of the filtered ratings
ratings.head(n=5)

Unnamed: 0,User-ID,ISBN,Book-Rating
1,276726,0155061224,5
3,276729,052165615X,3
4,276729,0521795028,6
6,276736,3257224281,8
7,276737,0600570967,6


In [5]:
# Summarize the ratings frame: entry count, per-column non-null counts, dtypes and memory usage
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 433671 entries, 1 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   User-ID      433671 non-null  int64 
 1   ISBN         433671 non-null  object
 2   Book-Rating  433671 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 11.6+ MB


In [6]:
# Report how many distinct users, ISBNs and rating values the ratings frame holds
for column, label in [('User-ID', 'unique users'),
                      ('ISBN', 'unique books'),
                      ('Book-Rating', 'unique ratings')]:
    print('The ratings dataset has', ratings[column].nunique(), label)
# Show the distinct rating values in ascending order
print('The unique ratings are', sorted(ratings['Book-Rating'].unique()))

The ratings dataset has 77805 unique users
The ratings dataset has 185973 unique books
The ratings dataset has 10 unique ratings
The unique ratings are [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [7]:
# Preview the first 100 rows of the book metadata
books.head(n=100)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...
95,0671867156,Pretend You Don't See Her,Mary Higgins Clark,1998,Pocket,http://images.amazon.com/images/P/0671867156.0...,http://images.amazon.com/images/P/0671867156.0...,http://images.amazon.com/images/P/0671867156.0...
96,0312252617,Fast Women,Jennifer Crusie,2001,St. Martin's Press,http://images.amazon.com/images/P/0312252617.0...,http://images.amazon.com/images/P/0312252617.0...,http://images.amazon.com/images/P/0312252617.0...
97,0312261594,Female Intelligence,Jane Heller,2001,St. Martin's Press,http://images.amazon.com/images/P/0312261594.0...,http://images.amazon.com/images/P/0312261594.0...,http://images.amazon.com/images/P/0312261594.0...
98,0316748641,Pasquale's Nose: Idle Days in an Italian Town,Michael Rips,2002,Back Bay Books,http://images.amazon.com/images/P/0316748641.0...,http://images.amazon.com/images/P/0316748641.0...,http://images.amazon.com/images/P/0316748641.0...


In [8]:
# Inner-join ratings with book metadata on ISBN, so only ratings whose ISBN
# also appears in `books` are kept
df = ratings.merge(books, how='inner', on='ISBN')
# Preview the joined frame
df.head(n=5)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...
1,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...
2,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...
3,276744,038550120X,7,A Painted House,JOHN GRISHAM,2001,Doubleday,http://images.amazon.com/images/P/038550120X.0...,http://images.amazon.com/images/P/038550120X.0...,http://images.amazon.com/images/P/038550120X.0...
4,11676,038550120X,10,A Painted House,JOHN GRISHAM,2001,Doubleday,http://images.amazon.com/images/P/038550120X.0...,http://images.amazon.com/images/P/038550120X.0...,http://images.amazon.com/images/P/038550120X.0...


In [9]:
# Per-title summary: mean rating and the number of ratings each title received
agg_ratings = (
    df.groupby('Book-Title')['Book-Rating']
      .agg(mean_rating='mean', number_of_ratings='count')
      .reset_index()
)
# Retain only the titles with strictly more than 100 ratings
agg_ratings_100 = agg_ratings.loc[agg_ratings['number_of_ratings'].gt(100)]
# Inspect the structure of the filtered summary
agg_ratings_100.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192 entries, 441 to 135435
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Book-Title         192 non-null    object 
 1   mean_rating        192 non-null    float64
 2   number_of_ratings  192 non-null    int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 5.2+ KB


In [10]:
# Show the five titles with the most ratings, most-rated first
agg_ratings_100.sort_values('number_of_ratings', ascending=False).head(n=5)

Unnamed: 0,Book-Title,mean_rating,number_of_ratings
110229,The Lovely Bones: A Novel,8.18529,707
132241,Wild Animus,4.390706,581
102703,The Da Vinci Code,8.439271,494
116196,The Secret Life of Bees,8.477833,406
111950,The Nanny Diaries: A Novel,7.437659,393


In [12]:
# Restrict the joined ratings to the titles that passed the >100-ratings filter
df_100 = df.merge(agg_ratings_100[['Book-Title']], how='inner', on='Book-Title')
df_100.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33641 entries, 0 to 33640
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   User-ID              33641 non-null  int64 
 1   ISBN                 33641 non-null  object
 2   Book-Rating          33641 non-null  int64 
 3   Book-Title           33641 non-null  object
 4   Book-Author          33641 non-null  object
 5   Year-Of-Publication  33641 non-null  object
 6   Publisher            33641 non-null  object
 7   Image-URL-S          33641 non-null  object
 8   Image-URL-M          33641 non-null  object
 9   Image-URL-L          33641 non-null  object
dtypes: int64(2), object(8)
memory usage: 1.8+ MB


In [13]:
# Report how many distinct users, ISBNs and rating values remain after the popularity filter
for column, label in [('User-ID', 'unique users'),
                      ('ISBN', 'unique books'),
                      ('Book-Rating', 'unique ratings')]:
    print('The ratings dataset has', df_100[column].nunique(), label)
# Show the distinct rating values in ascending order
print('The unique ratings are', sorted(df_100['Book-Rating'].unique()))

The ratings dataset has 16397 unique users
The ratings dataset has 664 unique books
The ratings dataset has 10 unique ratings
The unique ratings are [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [14]:
# Build the rating matrix: one row per book title, one column per user ID.
# pivot_table's default aggregation (mean) combines repeated (title, user) pairs.
matrix = pd.pivot_table(df_100, values='Book-Rating', index='Book-Title', columns='User-ID')
matrix.head(n=5)

User-ID,16,26,32,51,91,114,125,165,183,226,...,278723,278740,278755,278773,278798,278800,278836,278843,278844,278846
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,,,,,,,,,,,...,,,,,,,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2nd Chance,,,,,,,,,,,...,,,,,,,,,,
A Bend in the Road,,,,,,,,,,,...,,,,,,,,,,
"A Child Called \It\"": One Child's Courage to Survive""",,,,,,,,,,,...,,,,,,,,,,


In [15]:
# Center each book's ratings on that book's own mean rating (row-wise mean subtraction)
matrix_norm = matrix.sub(matrix.mean(axis='columns'), axis='index')
matrix_norm.head(n=5)

User-ID,16,26,32,51,91,114,125,165,183,226,...,278723,278740,278755,278773,278798,278800,278836,278843,278844,278846
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,,,,,,,,,,,...,,,,,,,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2nd Chance,,,,,,,,,,,...,,,,,,,,,,
A Bend in the Road,,,,,,,,,,,...,,,,,,,,,,
"A Child Called \It\"": One Child's Courage to Survive""",,,,,,,,,,,...,,,,,,,,,,


In [16]:
# Pairwise Pearson correlation between books. DataFrame.corr correlates columns,
# so the matrix is transposed first to put one book in each column.
item_similarity = matrix_norm.transpose().corr(method='pearson')
item_similarity.head(n=5)

Book-Title,1984,1st to Die: A Novel,2nd Chance,A Bend in the Road,"A Child Called \It\"": One Child's Courage to Survive""",A Heartbreaking Work of Staggering Genius,A Is for Alibi (Kinsey Millhone Mysteries (Paperback)),A Map of the World,A Painted House,A Prayer for Owen Meany,...,Watership Down,We Were the Mulvaneys,When the Wind Blows,Where the Heart Is (Oprah's Book Club (Paperback)),While I Was Gone,White Oleander : A Novel,White Oleander : A Novel (Oprah's Book Club),Wicked: The Life and Times of the Wicked Witch of the West,Wild Animus,"\O\"" Is for Outlaw"""
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,1.0,-0.440225,,-1.0,1.0,0.27735,,-0.866025,0.408248,0.301511,...,0.264363,1.0,-0.033787,0.68313,,0.866025,-0.981981,0.866025,-0.456435,-1.0
1st to Die: A Novel,-0.440225,1.0,0.814767,0.495595,0.605406,-0.426562,0.469982,-0.19245,0.377321,-0.225891,...,0.944911,1.0,0.07595,0.227429,0.866025,0.717547,-0.193649,0.485071,0.866025,-0.138215
2nd Chance,,0.814767,1.0,-0.056265,,,0.944911,0.995871,0.340362,-0.463713,...,0.654654,,0.116426,-0.347224,-0.682048,-0.728208,1.0,-0.233663,0.201309,0.852279
A Bend in the Road,-1.0,0.495595,-0.056265,1.0,-0.114708,1.0,,0.608376,-0.864923,,...,0.613139,-1.0,0.240192,0.205793,-0.829156,-0.058794,,-0.503322,-0.035245,1.0
"A Child Called \It\"": One Child's Courage to Survive""",1.0,0.605406,,-0.114708,1.0,0.774597,0.29821,0.816497,1.0,-0.693375,...,,1.0,-1.0,-0.054699,,0.216777,0.395285,1.0,0.576557,


In [17]:
# Alternative similarity: cosine similarity between book rows, with missing ratings filled with 0
item_similarity_cosine = cosine_similarity(matrix_norm.fillna(value=0))
item_similarity_cosine

array([[ 1.        , -0.00684128, -0.0010643 , ...,  0.00595896,
        -0.00249361, -0.00377956],
       [-0.00684128,  1.        ,  0.11210421, ...,  0.00820611,
         0.00382314,  0.00127555],
       [-0.0010643 ,  0.11210421,  1.        , ..., -0.00204124,
         0.00380935,  0.02202256],
       ...,
       [ 0.00595896,  0.00820611, -0.00204124, ...,  1.        ,
        -0.00519692, -0.00268469],
       [-0.00249361,  0.00382314,  0.00380935, ..., -0.00519692,
         1.        , -0.00204952],
       [-0.00377956,  0.00127555,  0.02202256, ..., -0.00268469,
        -0.00204952,  1.        ]])

In [18]:
# Pick a user ID
picked_userid = 51

# Pick a book
picked_book = '1984'

# Books the target user has rated, with the user's normalized ratings, highest first.
# The rating column is named without referring to a literal user ID, so this cell
# keeps working when a different `picked_userid` is chosen above.
picked_userid_read = (
    matrix_norm[picked_userid]
    .dropna()
    .sort_values(ascending=False)
    .rename('Book-Rating')
    .reset_index()
)
picked_userid_read.head()


Unnamed: 0,Book-Title,Book-Rating
0,The Street Lawyer,1.448529


In [19]:
# Similarity score of the picked book with all the other books.
# The column is renamed via `picked_book` so a different pick still yields 'similarity_score'.
picked_book_similarity_score = item_similarity[[picked_book]].reset_index().rename(columns={picked_book:'similarity_score'})
# Rank the books the picked user rated by their similarity to the picked book
# and keep the n most similar ones.
n = 5
picked_userid_read_similarity = pd.merge(left=picked_userid_read, 
                                            right=picked_book_similarity_score, 
                                            on='Book-Title', 
                                            how='inner')\
                                     .sort_values('similarity_score', ascending=False)[:n]
# Take a look at the picked user's rated books with the highest similarity
picked_userid_read_similarity

Unnamed: 0,Book-Title,Book-Rating,similarity_score
0,The Street Lawyer,1.448529,0.161165


In [20]:
# Predict the picked user's (normalized) rating of the picked book as the
# similarity-weighted average of that user's ratings of the most similar rated books
user_ratings = picked_userid_read_similarity['Book-Rating']
similarity_weights = picked_userid_read_similarity['similarity_score']
predicted_rating = round(np.average(user_ratings, weights=similarity_weights), 6)
print(f'The predicted rating for {picked_book} by user {picked_userid} is {predicted_rating}' )

The predicted rating for 1984 by user 51 is 1.448529


In [21]:
# Item-based recommendation function
def item_based_rec(picked_userid=16, number_of_similar_items=5, number_of_recommendations =3,
                   ratings_matrix=None, similarity_matrix=None):
  """Recommend unrated books for one user with item-based collaborative filtering.

  For every book the user has not rated, the predicted score is the
  similarity-weighted average of the user's ratings of the
  ``number_of_similar_items`` rated books that are most similar to it.

  Parameters
  ----------
  picked_userid : hashable
      Column label of the target user in the rating matrix.
  number_of_similar_items : int
      How many of the user's rated books (the most similar ones) feed each prediction.
  number_of_recommendations : int
      Maximum number of (book, score) pairs to return.
  ratings_matrix : pandas.DataFrame, optional
      Books as rows, users as columns, NaN where a user has not rated a book.
      Defaults to the notebook-level ``matrix_norm``.
  similarity_matrix : pandas.DataFrame, optional
      Square book-by-book similarity frame. Defaults to the notebook-level
      ``item_similarity``.

  Returns
  -------
  list of tuple
      ``(book, predicted_rating)`` pairs sorted by predicted rating, highest
      first. Books whose prediction is undefined (no usable similarity, or
      similarity weights summing to zero) are left out.

  Raises
  ------
  KeyError
      If ``picked_userid`` is not a column of the rating matrix.
  """
  # Fall back to the matrices built earlier in the notebook
  if ratings_matrix is None:
    ratings_matrix = matrix_norm
  if similarity_matrix is None:
    similarity_matrix = item_similarity

  user_ratings = ratings_matrix[picked_userid]
  # Books that the target user has not read (no rating recorded)
  picked_userid_unread = user_ratings.index[user_ratings.isna()].tolist()
  # Books that the target user has read, with the user's ratings
  picked_userid_read = user_ratings.dropna()

  # Dictionary to save the unread book and predicted rating pair
  rating_prediction = {}
  # Loop through unread books
  for picked_book in picked_userid_unread:
    # Similarity of the unread book to every read book; read books missing from
    # the similarity matrix and undefined (NaN) similarities cannot act as weights
    similarity_to_read = similarity_matrix[picked_book].reindex(picked_userid_read.index).dropna()
    # Keep the most similar read books
    top_similar = similarity_to_read.sort_values(ascending=False).iloc[:number_of_similar_items]
    # A weighted average is undefined without weights or when they sum to zero
    if top_similar.empty or np.isclose(top_similar.sum(), 0.0):
      continue
    # Similarity-weighted average of the user's ratings of those books
    predicted_rating = round(float(np.average(picked_userid_read.loc[top_similar.index].to_numpy(),
                                              weights=top_similar.to_numpy())), 6)
    # Save the predicted rating in the dictionary
    rating_prediction[picked_book] = predicted_rating
  # Return the top recommended books
  return sorted(rating_prediction.items(), key=lambda pair: pair[1], reverse=True)[:number_of_recommendations]


In [None]:
# Get the top 3 recommendations for user 16, using up to 5 similar items per prediction
recommended_book = item_based_rec(picked_userid=16, number_of_similar_items=5, number_of_recommendations =3)
recommended_book