In [5]:
#import library
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine,correlation

In [6]:
#load Dataset
data = pd.read_csv("book.csv")

In [3]:
data

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
0,1,276726,Classical Mythology,5
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,5,276737,The Mummies of Urumchi,6
...,...,...,...,...
9995,9996,162121,American Fried: Adventures of a Happy Eater.,7
9996,9997,162121,Cannibal In Manhattan,9
9997,9998,162121,How to Flirt: A Practical Guide,7
9998,9999,162121,Twilight,8


## Data preprocessing

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   10000 non-null  int64 
 1   User.ID      10000 non-null  int64 
 2   Book.Title   10000 non-null  object
 3   Book.Rating  10000 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 312.6+ KB


In [5]:
data.duplicated().sum()

0

In [6]:
data.corr()

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Rating
Unnamed: 0,1.0,-0.406308,-0.000228
User.ID,-0.406308,1.0,-0.041523
Book.Rating,-0.000228,-0.041523,1.0


In [7]:
#droppig Unnamed: 0 column
data.drop("Unnamed: 0",axis = 1, inplace = True)

In [8]:
data.sort_values("User.ID")

Unnamed: 0,User.ID,Book.Title,Book.Rating
2401,8,Wings,5
2400,8,The Western way: A practical guide to the West...,5
2399,8,Ancient Celtic Romances,5
2402,8,Truckers,5
2405,8,The Art Of Celtia,7
...,...,...,...
2395,278854,La crónica del Perú (Crónicas de América),7
2398,278854,Celtic Mythology (Library of the World's Myths...,8
2393,278854,A corrente de Trewis Scott,7
2394,278854,As valkírias,7


#### Finding unique values and total unique values

In [9]:
print("unique:",data["User.ID"].unique())
print("nunique:",data["User.ID"].nunique())
print("value_counts:",data["User.ID"].value_counts())

unique: [276726 276729 276736 ... 162113 162121 162129]
nunique: 2182
value_counts: 3757      523
162052    214
2276      212
4017      156
277427    150
         ... 
1180        1
1172        1
161479      1
1164        1
2844        1
Name: User.ID, Length: 2182, dtype: int64


In [10]:
print("unique:",data["Book.Title"].unique())
print("nunique:",data["Book.Title"].nunique())
print("value_counts:",data["User.ID"].value_counts())

unique: ['Classical Mythology' 'Clara Callan' 'Decision in Normandy' ...
 'How to Flirt: A Practical Guide' 'Twilight'
 'Kids Say the Darndest Things']
nunique: 9659
value_counts: 3757      523
162052    214
2276      212
4017      156
277427    150
         ... 
1180        1
1172        1
161479      1
1164        1
2844        1
Name: User.ID, Length: 2182, dtype: int64


In [11]:
print("unique:",data["Book.Rating"].unique())
print("nunique:",data["Book.Rating"].nunique())
print("value_counts:",data["Book.Rating"].value_counts())

unique: [ 5  3  6  8  7 10  9  4  1  2]
nunique: 10
value_counts: 8     2283
7     2076
10    1732
9     1493
5     1007
6      920
4      237
3      146
2       63
1       43
Name: Book.Rating, dtype: int64


In [12]:
sort = data.groupby(["User.ID","Book.Title"]).count() #which user read which book and for rating purpose
sort

Unnamed: 0_level_0,Unnamed: 1_level_0,Book.Rating
User.ID,Book.Title,Unnamed: 2_level_1
8,Ancient Celtic Romances,1
8,Keepers of the Earth Teachers Guide,1
8,The Art Of Celtia,1
8,The Celts Activity Book,1
8,The Western way: A practical guide to the Western mystery tradition,1
...,...,...
278854,A corrente de Trewis Scott,1
278854,As valkírias,1
278854,Blast From the Past,1
278854,Celtic Mythology (Library of the World's Myths and Legends),1


In [13]:
sort.value_counts()

Book.Rating
1              9986
2                 7
dtype: int64

In [14]:
data['Book.Title'].value_counts()

Fahrenheit 451                                     5
Vanished                                           4
The Subtle Knife (His Dark Materials, Book 2)      4
Stardust                                           4
The Amber Spyglass (His Dark Materials, Book 3)    4
                                                  ..
The Bastard                                        1
The Cabinet of Curiosities                         1
Charlotte and Claudia keeping in touch: A novel    1
El Vendedor De Noticias (Espasa Juvenil)           1
Salammbo (World Classics)                          1
Name: Book.Title, Length: 9659, dtype: int64

In [15]:
#creating user item matrix
user_book_df = data.pivot_table(index='User.ID',columns='Book.Title',values='Book.Rating')
user_book_df

Book.Title,"Jason, Madison &amp",Other Stories;Merril;1985;McClelland &amp,Repairing PC Drives &amp,'48,'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities,...AND THE HORSE HE RODE IN ON : THE PEOPLE V. KENNETH STARR,01-01-00: A Novel of the Millennium,"1,401 More Things That P*Ss Me Off",10 Commandments Of Dating,"100 Great Fantasy Short, Short Stories",...,Zora Hurston and the Chinaberry Tree (Reading Rainbow Book),\Even Monkeys Fall from Trees\ and Other Japanese Proverbs,\I Won't Learn from You\: And Other Thoughts on Creative Maladjustment,"\More More More,\ Said the Baby",\O\ Is for Outlaw,"\Surely You're Joking, Mr. Feynman!\: Adventures of a Curious Character","\Well, there's your problem\: Cartoons",iI Paradiso Degli Orchi,stardust,�?�?bermorgen.
User.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,
14,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278846,,,,,,,,,,,...,,,,,,,,,,
278849,,,,,,,,,,,...,,,,,,,,,,
278851,,,,,,,,,,,...,,,,,,,,7.0,,
278852,,,,,,,,,,,...,,,,,,,,,,


In [16]:
#fiiling na values
user_book_df.fillna(0,inplace = True)
user_book_df

Book.Title,"Jason, Madison &amp",Other Stories;Merril;1985;McClelland &amp,Repairing PC Drives &amp,'48,'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities,...AND THE HORSE HE RODE IN ON : THE PEOPLE V. KENNETH STARR,01-01-00: A Novel of the Millennium,"1,401 More Things That P*Ss Me Off",10 Commandments Of Dating,"100 Great Fantasy Short, Short Stories",...,Zora Hurston and the Chinaberry Tree (Reading Rainbow Book),\Even Monkeys Fall from Trees\ and Other Japanese Proverbs,\I Won't Learn from You\: And Other Thoughts on Creative Maladjustment,"\More More More,\ Said the Baby",\O\ Is for Outlaw,"\Surely You're Joking, Mr. Feynman!\: Adventures of a Curious Character","\Well, there's your problem\: Cartoons",iI Paradiso Degli Orchi,stardust,�?�?bermorgen.
User.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0
278852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
user_book_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2182 entries, 8 to 278854
Columns: 9659 entries,  Jason, Madison &amp to �?�?bermorgen.
dtypes: float64(9659)
memory usage: 160.8 MB


In [18]:
user_book_df.columns

Index([' Jason, Madison &amp', ' Other Stories;Merril;1985;McClelland &amp',
       ' Repairing PC Drives &amp', ''48',
       ''O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities',
       '...AND THE HORSE HE RODE IN ON : THE PEOPLE V. KENNETH STARR',
       '01-01-00: A Novel of the Millennium',
       '1,401 More Things That P*Ss Me Off', '10 Commandments Of Dating',
       '100 Great Fantasy Short, Short Stories',
       ...
       'Zora Hurston and the Chinaberry Tree (Reading Rainbow Book)',
       '\Even Monkeys Fall from Trees\ and Other Japanese Proverbs',
       '\I Won't Learn from You\: And Other Thoughts on Creative Maladjustment',
       '\More More More,\ Said the Baby', '\O\ Is for Outlaw',
       '\Surely You're Joking, Mr. Feynman!\: Adventures of a Curious Character',
       '\Well, there's your problem\: Cartoons', 'iI Paradiso Degli Orchi',
       'stardust', '�?�?bermorgen.'],
      dtype='object', name='Book.Title', length=9659)

In [19]:
#calculating cosine similairty bewteen users
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine,correlation

In [20]:
user_sim = 1- pairwise_distances(user_book_df.values,metric = "cosine")
user_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [21]:
#other approch
user_sim_corr = 1- pairwise_distances(user_book_df.values,metric = "correlation")
user_sim_corr

array([[ 1.00000000e+00, -2.71714970e-04, -2.71714970e-04, ...,
        -9.91259776e-04, -2.71714970e-04, -6.63481070e-04],
       [-2.71714970e-04,  1.00000000e+00, -1.03541106e-04, ...,
        -3.77734555e-04, -1.03541106e-04, -2.52829514e-04],
       [-2.71714970e-04, -1.03541106e-04,  1.00000000e+00, ...,
        -3.77734555e-04, -1.03541106e-04, -2.52829514e-04],
       ...,
       [-9.91259776e-04, -3.77734555e-04, -3.77734555e-04, ...,
         1.00000000e+00, -3.77734555e-04, -9.22362604e-04],
       [-2.71714970e-04, -1.03541106e-04, -1.03541106e-04, ...,
        -3.77734555e-04,  1.00000000e+00, -2.52829514e-04],
       [-6.63481070e-04, -2.52829514e-04, -2.52829514e-04, ...,
        -9.22362604e-04, -2.52829514e-04,  1.00000000e+00]])

In [22]:
#store the results in a dataframe
user_sim_df = pd.DataFrame(user_sim)
user_sim_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2172,2173,2174,2175,2176,2177,2178,2179,2180,2181
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [23]:
#set the index and column names to user ids
user_sim_df.index = data["User.ID"].unique()
user_sim_df.columns = data["User.ID"].unique()

In [24]:
user_sim_df

Unnamed: 0,276726,276729,276736,276737,276744,276745,276747,276748,276751,276754,...,162085,162091,162092,162095,162103,162107,162109,162113,162121,162129
276726,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276729,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276736,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276737,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276744,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
162109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
162113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
162121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [25]:
user_sim_df.shape

(2182, 2182)

In [26]:
np.fill_diagonal(user_sim,0) #to remove same elements in idxmax
user_sim_df.iloc[0:5,0:5]

Unnamed: 0,276726,276729,276736,276737,276744
276726,0.0,0.0,0.0,0.0,0.0
276729,0.0,0.0,0.0,0.0,0.0
276736,0.0,0.0,0.0,0.0,0.0
276737,0.0,0.0,0.0,0.0,0.0
276744,0.0,0.0,0.0,0.0,0.0


In [27]:
#Most Similar Users
user_sim_df.idxmax(axis=1)

276726    276726
276729    276726
276736    276726
276737    276726
276744    276726
           ...  
162107    276726
162109    276726
162113    161453
162121    276726
162129    276726
Length: 2182, dtype: int64

In [28]:
# user 162107 and user 276726 have read these books
data[(data["User.ID"]==162107)|(data["User.ID"]==276726)] #slicing method

Unnamed: 0,User.ID,Book.Title,Book.Rating
0,276726,Classical Mythology,5
9987,162107,What's Bred in the Bone,7


In [29]:
#simliarly
data[(data["User.ID"]==162109)|(data["User.ID"]==276726)]

Unnamed: 0,User.ID,Book.Title,Book.Rating
0,276726,Classical Mythology,5
9988,162109,The Flower in the Skull,10


In [30]:
data[(data["User.ID"]==162121)|(data["User.ID"]==276726)]

Unnamed: 0,User.ID,Book.Title,Book.Rating
0,276726,Classical Mythology,5
9990,162121,The Cloister Walk,7
9991,162121,Open Water,5
9992,162121,The Evolution of Jane,8
9993,162121,AT PARADISE GATE,8
9994,162121,I Should Have Stayed Home: The Worst Trips of ...,8
9995,162121,American Fried: Adventures of a Happy Eater.,7
9996,162121,Cannibal In Manhattan,9
9997,162121,How to Flirt: A Practical Guide,7
9998,162121,Twilight,8


In [31]:
user_1 = data[data["User.ID"]==162121]
user_2 = data[data["User.ID"]==276726]

In [32]:
user_1["Book.Title"]

9990                                    The Cloister Walk
9991                                           Open Water
9992                                The Evolution of Jane
9993                                     AT PARADISE GATE
9994    I Should Have Stayed Home: The Worst Trips of ...
9995         American Fried: Adventures of a Happy Eater.
9996                                Cannibal In Manhattan
9997                      How to Flirt: A Practical Guide
9998                                             Twilight
Name: Book.Title, dtype: object

In [33]:
user_2["Book.Title"]

0    Classical Mythology
Name: Book.Title, dtype: object

## Joining Methods

In [34]:
pd.merge(user_1,user_2,on='Book.Title',how='outer')

Unnamed: 0,User.ID_x,Book.Title,Book.Rating_x,User.ID_y,Book.Rating_y
0,162121.0,The Cloister Walk,7.0,,
1,162121.0,Open Water,5.0,,
2,162121.0,The Evolution of Jane,8.0,,
3,162121.0,AT PARADISE GATE,8.0,,
4,162121.0,I Should Have Stayed Home: The Worst Trips of ...,8.0,,
5,162121.0,American Fried: Adventures of a Happy Eater.,7.0,,
6,162121.0,Cannibal In Manhattan,9.0,,
7,162121.0,How to Flirt: A Practical Guide,7.0,,
8,162121.0,Twilight,8.0,,
9,,Classical Mythology,,276726.0,5.0


In [35]:
pd.merge(user_1,user_2,on='Book.Title',how='inner')

Unnamed: 0,User.ID_x,Book.Title,Book.Rating_x,User.ID_y,Book.Rating_y


In [36]:
pd.merge(user_1,user_2,on='Book.Title',how='left')

Unnamed: 0,User.ID_x,Book.Title,Book.Rating_x,User.ID_y,Book.Rating_y
0,162121,The Cloister Walk,7,,
1,162121,Open Water,5,,
2,162121,The Evolution of Jane,8,,
3,162121,AT PARADISE GATE,8,,
4,162121,I Should Have Stayed Home: The Worst Trips of ...,8,,
5,162121,American Fried: Adventures of a Happy Eater.,7,,
6,162121,Cannibal In Manhattan,9,,
7,162121,How to Flirt: A Practical Guide,7,,
8,162121,Twilight,8,,


In [37]:
pd.merge(user_1,user_2,on='Book.Title',how='right')

Unnamed: 0,User.ID_x,Book.Title,Book.Rating_x,User.ID_y,Book.Rating_y
0,,Classical Mythology,,276726,5


## Conclusion :

- so we can conclude that, Classical Mythology book is more associated with other books and we can recommend this book whenever a user buys or reads other books.