In [3]:
import pandas as pd
import numpy as np

In [4]:
books = pd.read_csv("Books.csv")
users = pd.read_csv("Users.csv")
ratings = pd.read_csv("Ratings.csv")

In [5]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [6]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [7]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [8]:
print(books.shape)
print(users.shape)
print(ratings.shape)

(144490, 8)
(278858, 3)
(1149780, 3)


NULL values

In [9]:
books.isna().sum()

ISBN                   0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            0
dtype: int64

In [10]:
ratings.isna().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [11]:
users.isna().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

Duplicated Values

In [12]:
books.duplicated().sum()

0

In [13]:
ratings.duplicated().sum()

0

In [14]:
users.duplicated().sum()

0

1. Popularity Based

In [15]:
# Lets merge books and ratings datasets first
books_with_ratings = ratings.merge(books,on="ISBN")

In [16]:
books_with_ratings.shape

(855671, 10)

In [17]:
# number of rating per movie
no_of_rating_df = books_with_ratings.groupby('Book-Title').count()['Book-Rating'].reset_index()
no_of_rating_df.rename(columns={"Book-Rating": "no_of_Ratings"}, inplace=True)
no_of_rating_df


Unnamed: 0,Book-Title,no_of_Ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Apple Magic (The Collector's series),1
2,Beyond IBM: Leadership Marketing and Finance ...,1
3,Dark Justice,1
4,Earth Prayers From around the World: 365 Pray...,10
...,...,...
130465,Ã?Â?berallnie. AusgewÃ?Â¤hlte Gedichte 1928 - ...,1
130466,Ã?Â?bermorgen.,1
130467,Ã?Â?rger mit Produkt X. Roman.,4
130468,Ã?Â?stlich der Berge.,3


In [18]:
avg_of_rating_df = books_with_ratings.groupby('Book-Title').mean()['Book-Rating'].reset_index()
avg_of_rating_df.rename(columns={"Book-Rating": "avg_of_Ratings"},inplace=True)
avg_of_rating_df

  avg_of_rating_df = books_with_ratings.groupby('Book-Title').mean()['Book-Rating'].reset_index()


Unnamed: 0,Book-Title,avg_of_Ratings
0,A Light in the Storm: The Civil War Diary of ...,2.250000
1,Apple Magic (The Collector's series),0.000000
2,Beyond IBM: Leadership Marketing and Finance ...,0.000000
3,Dark Justice,10.000000
4,Earth Prayers From around the World: 365 Pray...,5.000000
...,...,...
130465,Ã?Â?berallnie. AusgewÃ?Â¤hlte Gedichte 1928 - ...,10.000000
130466,Ã?Â?bermorgen.,0.000000
130467,Ã?Â?rger mit Produkt X. Roman.,5.250000
130468,Ã?Â?stlich der Berge.,2.666667


In [19]:
popular_df = no_of_rating_df.merge(avg_of_rating_df,on='Book-Title')

In [20]:
popular_df

Unnamed: 0,Book-Title,no_of_Ratings,avg_of_Ratings
0,A Light in the Storm: The Civil War Diary of ...,4,2.250000
1,Apple Magic (The Collector's series),1,0.000000
2,Beyond IBM: Leadership Marketing and Finance ...,1,0.000000
3,Dark Justice,1,10.000000
4,Earth Prayers From around the World: 365 Pray...,10,5.000000
...,...,...,...
130465,Ã?Â?berallnie. AusgewÃ?Â¤hlte Gedichte 1928 - ...,1,10.000000
130466,Ã?Â?bermorgen.,1,0.000000
130467,Ã?Â?rger mit Produkt X. Roman.,4,5.250000
130468,Ã?Â?stlich der Berge.,3,2.666667


In [21]:
# movies who have rating above 250 with highest avg rating top 50
popular_df = popular_df[popular_df['no_of_Ratings'] > 250].sort_values('avg_of_Ratings',ascending=False).head(50)

In [22]:
popular_df.merge(books,on='Book-Title').drop_duplicates('Book-Title')[['Book-Title','Book-Author','Image-URL-S','no_of_Ratings','avg_of_Ratings']]

Unnamed: 0,Book-Title,Book-Author,Image-URL-S,no_of_Ratings,avg_of_Ratings
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,http://images.amazon.com/images/P/0439136350.0...,428,5.852804
3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,http://images.amazon.com/images/P/0439139597.0...,387,5.824289
5,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,http://images.amazon.com/images/P/0590353403.0...,274,5.748175
8,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,http://images.amazon.com/images/P/043935806X.0...,346,5.488439
11,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,http://images.amazon.com/images/P/0439064872.0...,556,5.183453
14,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339681.0...,281,5.007117
15,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339703.0...,364,4.909341
22,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,http://images.amazon.com/images/P/059035342X.0...,575,4.895652
24,"The Two Towers (The Lord of the Rings, Part 2)",J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339711.0...,254,4.76378
32,To Kill a Mockingbird,Harper Lee,http://images.amazon.com/images/P/0446310786.0...,506,4.687747


## Collabarative Filtering

In [23]:
# we should filter user who gave more than 150 reviews and books which have 50+ reviews

In [24]:
x = books_with_ratings.groupby('User-ID').count()['Book-Rating'] > 150
arr = x[x].index

In [25]:
# filtering out the users which gave more than 150 reviews
filtered_rating = books_with_ratings[books_with_ratings['User-ID'].isin(arr)]

In [26]:
filtered_rating.shape

(404537, 10)

In [27]:
# filtering books with rating greater than 50
y = filtered_rating.groupby('Book-Title').count()['Book-Rating'] > 50
arr1 = y[y].index

In [28]:
filtered_rating = filtered_rating[filtered_rating['Book-Title'].isin(arr1)]

In [29]:
filtered_rating.shape

(65253, 10)

In [30]:
final_df = filtered_rating

In [31]:
pt = final_df.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating', aggfunc='mean')

In [32]:
pt.shape

(758, 921)

In [33]:
pt.fillna(0,inplace=True)

In [34]:
pt.isna().sum()

User-ID
254       0
1733      0
1903      0
2033      0
2110      0
         ..
276680    0
277427    0
277639    0
278188    0
278418    0
Length: 921, dtype: int64

In [35]:
pt.shape

(758, 921)

In [36]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_score = cosine_similarity(pt)

In [37]:
similarity_score

array([[1.        , 0.0896717 , 0.01095791, ..., 0.10958712, 0.06990374,
        0.04237156],
       [0.0896717 , 1.        , 0.25718653, ..., 0.06509352, 0.15416834,
        0.13527462],
       [0.01095791, 0.25718653, 1.        , ..., 0.04090706, 0.04659168,
        0.1051007 ],
       ...,
       [0.10958712, 0.06509352, 0.04090706, ..., 1.        , 0.06738963,
        0.01925418],
       [0.06990374, 0.15416834, 0.04659168, ..., 0.06738963, 1.        ,
        0.10941043],
       [0.04237156, 0.13527462, 0.1051007 , ..., 0.01925418, 0.10941043,
        1.        ]])

In [38]:
def recommend(book_name):
  # fetching index from book name
  index = np.where(pt.index==book_name)[0][0]
  # similar_items = sorted(list(enumerate(similarity_score[index])),key=lambda x:x[1],reverse=True)[1:10]
  similar_items = []
  for i, score in enumerate(similarity_score[index]):
      similar_items.append((i, score))

  # Sort the list of tuples based on similarity score
  similar_items.sort(key=lambda x: x[1], reverse=True)

  # Exclude the first element (index itself) and select the top 10 similar items
  similar_items = similar_items[1:11]

  for i in similar_items:
    print(pt.index[i[0]])

In [39]:
recommend('Harry Potter and the Chamber of Secrets (Book 2)')

Harry Potter and the Prisoner of Azkaban (Book 3)
Harry Potter and the Goblet of Fire (Book 4)
Harry Potter and the Sorcerer's Stone (Book 1)
Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))
Harry Potter and the Order of the Phoenix (Book 5)
The Fellowship of the Ring (The Lord of the Rings, Part 1)
Charlotte's Web (Trophy Newbery)
The Witness
The Firm
Bridget Jones's Diary
