# Import the dependencies

In [28]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import warnings
import difflib 
warnings.filterwarnings('ignore')


# Load the data and Preprocess it

In [2]:
# Load the books catalogue, read as ';'-separated ISO-8859-1 text.
# NOTE(review): the absolute Windows path below only resolves on the original
# author's machine — point it at your local copy of the data set before running.
# `header=0` together with `names` replaces the file's own header row with the
# short column names given here; every kept column is read as a string.
book_data = pd.read_csv(r"D:\Downloads D\book data set\BX-Books.csv",encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'], # selecting only relevant columns
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})


In [3]:
# Preview the first rows to confirm the three selected columns loaded as expected
book_data.head()

Unnamed: 0,isbn,title,author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [4]:
# Load the ratings table, read as ';'-separated ISO-8859-1 text.
# NOTE(review): the absolute Windows path below only resolves on the original
# author's machine — point it at your local copy of the data set before running.
# `header=0` together with `names` replaces the file's own header row with the
# short column names given here. ISBNs are kept as strings so they can be joined
# against the string ISBNs of the books table.
book_ratings =  pd.read_csv(
    r"D:\Downloads D\book data set\BX-Book-Ratings.csv",
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],# selecting only relevant columns
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [5]:
# Preview the first rows of the ratings table (one row per user/ISBN rating)
book_ratings.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


# Exploratory Data Analysis

In [6]:
# (rows, columns) of the books table
book_data.shape

(271379, 3)

In [7]:
# Number of distinct titles (a missing title, if any, counts as one value)
book_data['title'].nunique(dropna=False)

242154

In [8]:
# Number of distinct users who submitted at least one rating
book_ratings['user'].nunique(dropna=False)

105283

**The problem with this data set is that many books have received very few ratings, and we cannot really recommend a book based on a handful of ratings. So we will only consider books that have at least 30 ratings, and only users who have rated more than 200 books.**

In [9]:
# Ratings submitted per user, most active users first
book_ratings['user'].value_counts()

11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
116180        1
116166        1
116154        1
116137        1
276723        1
Name: user, Length: 105283, dtype: int64

**I will extract the users who have rated more than 200 books.**

In [10]:
 # extracting the users
# Boolean Series indexed by user id: True where the user has strictly more than 200 ratings
user_ratings = book_ratings['user'].value_counts().gt(200)
# Ids of the users that pass the activity threshold
y = user_ratings.loc[user_ratings].index
# Keep only the ratings submitted by those users
is_active_user = book_ratings['user'].isin(y)
useful_ratings = book_ratings[is_active_user]

In [11]:
# Attach title and author to every retained rating. This is an inner join on
# ISBN, so ratings whose ISBN is absent from the books table are dropped.
df = pd.merge(useful_ratings, book_data, on='isbn', how='inner')
df.head()

Unnamed: 0,user,isbn,rating,title,author
0,277427,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
1,3363,002542730X,0.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
2,11676,002542730X,6.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
3,12538,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
4,13552,002542730X,0.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner


In [12]:
# Count how many ratings each title received from the retained users.
# The result has one row per title with the count in the 'rating' column;
# no filtering happens in this cell.
number_rating = df.groupby('title', as_index=False)['rating'].count()
number_rating.head()

Unnamed: 0,title,rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1


**Here we have grouped the ratings by title and counted how many ratings each title received.**

In [13]:
# Give the count column a name that cannot be confused with an individual rating value
number_rating = number_rating.rename(columns={'rating': 'number_of_ratings'})
number_rating.head()

Unnamed: 0,title,number_of_ratings
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1


In [14]:
# Add the per-title rating count to every rating row (inner join on title)
df1 = pd.merge(df, number_rating, on='title', how='inner')
df1.head()

Unnamed: 0,user,isbn,rating,title,author,number_of_ratings
0,277427,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,82
1,3363,002542730X,0.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,82
2,11676,002542730X,6.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,82
3,12538,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,82
4,13552,002542730X,0.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,82


In [156]:
# Keep only ratings of titles that received at least 30 ratings from the retained users.
# .copy() makes df2 an independent frame, so later in-place changes to df2 cannot
# raise SettingWithCopyWarning or be confused with changes to df1.
df2 = df1[df1['number_of_ratings'] >= 30].copy()
df2.head()

Unnamed: 0,user,isbn,rating,title,author,number_of_ratings
0,277427,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,82
1,3363,002542730X,0.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,82
2,11676,002542730X,6.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,82
3,12538,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,82
4,13552,002542730X,0.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,82


In [157]:
# (rows, columns) remaining after the ratings-count filter
df2.shape

(98978, 6)

**After this filtering our data has 98,978 rating rows. Next we want to delete duplicate (user, title) pairs, because a user rating the same book more than once would leave several values competing for a single cell of the title-by-user table.**

In [158]:
# Keep a single rating per (user, title) pair; the first occurrence wins.
# Reassigning instead of using inplace=True avoids mutating a frame that was
# produced by filtering df1, and keeps the cell safe to re-run.
df2 = df2.drop_duplicates(subset=['user', 'title'])
# Verify on the same key that was de-duplicated; the expected result is 0
df2.duplicated(subset=['user', 'title']).sum()

0

In [159]:
# Count missing values per column
df2.isnull().sum()

user                 0
isbn                 0
rating               0
title                0
author               0
number_of_ratings    0
dtype: int64

# Build the model

In [160]:
# Build the title-by-user rating matrix: one row per title, one column per user.
# pandas' default aggregation (mean) is used for each cell; (title, user) pairs
# without a rating come out as NaN.
pivot_table = pd.pivot_table(df2, values='rating', index='title', columns='user')

In [161]:
# Preview the title-by-user matrix; NaN marks a (title, user) pair with no rating
pivot_table.head()

user,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,,,,,,,,,,,...,,,,0.0,,,,,,
16 Lighthouse Road,,,,,,,,,,,...,,,,,,,,,,
1984,9.0,,,,,,,,,,...,,,,,,0.0,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2010: Odyssey Two,,0.0,,,,,,,,,...,,,,,,,,,,


**As you can see, there are a lot of NaN values in our pivot table. A NaN means that the user in that column never rated the book in that row. We replace those values with zero.**

In [162]:
# Replace every missing (title, user) rating with 0
pivot_table = pivot_table.fillna(0)

In [163]:
# (number of titles, number of users) in the rating matrix.
# The former leading `pivot_table.head()` was removed: only the last expression
# of a cell is displayed, so its result was silently discarded.
pivot_table.shape

(1733, 893)

**Most entries of the pivot table are now zeros, and storing and processing all of them would increase computation time when we build our model. So we will convert the pivot table to a sparse matrix.**

In [164]:
# Store the title-by-user matrix in compressed sparse row (CSR) format
sparse_matrix = csr_matrix(pivot_table)

In [169]:
# feed this matrix to our model
# Brute-force neighbour search; every other parameter (including the distance
# metric) is left at its scikit-learn default.
model = NearestNeighbors(algorithm='brute')

In [170]:
# Fit on the sparse title-by-user matrix: each row (one book title) becomes a searchable point
model.fit(sparse_matrix)

NearestNeighbors(algorithm='brute')

**Now our model is fitted and we can get recommendations from it. Apart from `algorithm='brute'`, I am keeping the default parameters for this model.**

In [172]:
def get_recommendations(book_name, n_recommendations=5):
    """Recommend the books whose rating rows are nearest to the given book.

    The title is fuzzy-matched against the titles in ``pivot_table`` with
    ``difflib.get_close_matches``, then the fitted ``model`` is queried with
    that title's rating row.

    Parameters
    ----------
    book_name : str
        Title (or approximate title) typed by the user.
    n_recommendations : int, default 5
        Maximum number of recommended books to return. Expected to be a
        positive integer.

    Returns
    -------
    list or str
        ``[matched_title, [title, distance], ...]`` ordered from nearest to
        farthest, or the string ``'Book is not available'`` when no title is
        close enough to ``book_name``. Fewer than ``n_recommendations`` entries
        are returned when the catalogue is too small.
    """
    titles = pivot_table.index.to_list()

    # Only the single best fuzzy match is needed.
    matches = difflib.get_close_matches(book_name, titles, n=1)
    if not matches:
        return 'Book is not available'
    close_match = matches[0]
    idx = titles.index(close_match)

    # Ask for one extra neighbour because the query book is itself in the
    # fitted data; never ask for more neighbours than there are books.
    n_query = min(n_recommendations + 1, len(titles))
    query_row = pivot_table.iloc[[idx]].to_numpy()  # shape (1, n_users)
    distances, neighbours = model.kneighbors(query_row, n_neighbors=n_query)

    # Drop the query book by its row position rather than assuming it is the
    # first neighbour: when another book has an identical rating row, the tie
    # at distance 0 may be returned in either order.
    recommendations = [
        [titles[position], distance]
        for position, distance in zip(neighbours[0], distances[0])
        if position != idx
    ][:n_recommendations]

    return [close_match] + recommendations

In [173]:
# Ask for a title interactively, then print the matched title followed by its
# recommendations (or the "not available" message when nothing matches).
book_name = input('Enter your favourite book name: ')
print()  # blank line between the prompt and the result
print(get_recommendations(book_name))

Enter your favourite book name: The Queen of the Damned (Vampire Chronicles (Paperback))

['The Queen of the Damned (Vampire Chronicles (Paperback))', ['The Apocalypse Watch', 34.42383], ["Night Moves (Tom Clancy's Net Force, No. 3)", 34.42383], ['WEB OF DREAMS (Casteel Saga (Paperback))', 34.770676], ["Ruthless.Com (Tom Clancy's Power Plays (Paperback))", 34.785053], ['Escape the Night', 34.842503]]
