## Generation of recommendations based on it

In [3]:
import pandas as pd
import random
from scipy.sparse import csr_matrix
import sklearn
import numpy as np

In [4]:
books = pd.read_csv('books.csv', sep=';', error_bad_lines=False, encoding="latin-1")
books.columns=['book_id','authors','title','average_rating','ratings_count']

books[['book_id','title']].head()

Unnamed: 0,book_id,title
0,27,A Woman of Substance (Emma Harte Saga #1)
1,21,Bergdorf Blondes
2,2,InÃ©s of My Soul
3,18,The Far Pavilions
4,24,Where the Red Fern Grows


In [5]:
books['ratings_count'].describe()

count    9.937000e+03
mean     5.995367e+04
std      1.682987e+05
min      6.823000e+03
25%      1.548200e+04
50%      2.392600e+04
75%      4.615300e+04
max      4.942365e+06
Name: ratings_count, dtype: float64

In [6]:
books = books[books['ratings_count']>13000]

In [7]:
books.head()

Unnamed: 0,book_id,authors,title,average_rating,ratings_count
0,27,Barbara Taylor Bradford,A Woman of Substance (Emma Harte Saga #1),4.16,32932
1,21,Plum Sykes,Bergdorf Blondes,3.26,25822
2,2,Isabel Allende,InÃ©s of My Soul,3.9,17411
3,18,M.M. Kaye,The Far Pavilions,4.2,34522
4,24,Wilson Rawls,Where the Red Fern Grows,4.04,280179


In [8]:
ratings = pd.read_csv('ratings.csv',sep=',',error_bad_lines=False, encoding="latin-1")
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [9]:
#get ratings only for the books present in the books dataset
ratings = ratings.loc[ratings['book_id'].isin(books['book_id'])]
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [10]:
#check for duplicate ratings(duplicate (user id,book id) combination)
ratings[ratings.duplicated(['user_id','book_id'])]

Unnamed: 0,user_id,book_id,rating


In [11]:
#analyse the ratings 
print ratings.groupby('user_id')['rating'].count().describe()
print len(ratings.groupby('user_id').filter(lambda x: len(x) >= 130).user_id.unique().tolist())

count    53424.000000
mean        94.206649
std         22.347864
min          1.000000
25%         81.000000
50%         94.000000
75%        108.000000
max        174.000000
Name: rating, dtype: float64
3241


In [12]:
ratings=ratings.groupby('user_id').filter(lambda x: len(x) >= 130)
users = ratings.user_id.unique().tolist()
print "Total users in original ratings dataset: ", len(users)

Total users in original ratings dataset:  3241


In [13]:
#choose 2000 random users
print "Reducing dataset by choosing ratings by a random sample of 2000 users"

users = [ users[i] for i in sorted(random.sample(xrange(len(users)), 2000)) ]
ratings=ratings.loc[ratings['user_id'].isin(users)]

#merge authors column to the ratings table to create author ratings matrix
ratings_2 = pd.merge(ratings,books[['book_id','authors','ratings_count']],on='book_id')

ratings = ratings_2

Reducing dataset by choosing ratings by a random sample of 2000 users


In [14]:
ratings.head()


Unnamed: 0,user_id,book_id,rating,authors,ratings_count
0,75,3254,2,Arnold Lobel,70705
1,446,3254,4,Arnold Lobel,70705
2,2276,3254,4,Arnold Lobel,70705
3,4212,3254,5,Arnold Lobel,70705
4,4606,3254,3,Arnold Lobel,70705


In [15]:
ratings_2.head()


Unnamed: 0,user_id,book_id,rating,authors,ratings_count
0,75,3254,2,Arnold Lobel,70705
1,446,3254,4,Arnold Lobel,70705
2,2276,3254,4,Arnold Lobel,70705
3,4212,3254,5,Arnold Lobel,70705
4,4606,3254,3,Arnold Lobel,70705


In [16]:
ratings_3 = ratings_2
ratings_3['user_id'] = ratings_3['user_id'].astype('category').cat.codes
ratings_3.head()

Unnamed: 0,user_id,book_id,rating,authors,ratings_count
0,0,3254,2,Arnold Lobel,70705
1,16,3254,4,Arnold Lobel,70705
2,77,3254,4,Arnold Lobel,70705
3,158,3254,5,Arnold Lobel,70705
4,169,3254,3,Arnold Lobel,70705


In [17]:
user_id_to_int = pd.merge(ratings_3[['user_id','book_id']],ratings_2[['user_id','book_id']],on='book_id')
user_id_to_int.head()

Unnamed: 0,user_id_x,book_id,user_id_y
0,0,3254,0
1,0,3254,16
2,0,3254,77
3,0,3254,158
4,0,3254,169


In [18]:
#to get contiguous IDs for books , users and authors
ratings['user_id'] = ratings['user_id'].astype('category').cat.codes
ratings['book_id'] = ratings['book_id'].astype('category').cat.codes
ratings['authors'] = ratings['authors'].astype('category').cat.codes


n_users = 1 + ratings['user_id'].max()
n_books = 1 + ratings['book_id'].max()
n_authors = 1 + ratings['authors'].max()
print n_users,n_books,n_authors

2000 8134 3148


In [19]:
n_users = ratings.user_id.unique().shape[0]
n_books = ratings.book_id.unique().shape[0]
n_authors = ratings.authors.unique().shape[0]

print "n_users" , n_users
print "n_books" , n_books
print "n_authors" , n_authors

ratings.head()

n_users 2000
n_books 8134
n_authors 3148


Unnamed: 0,user_id,book_id,rating,authors,ratings_count
0,0,2752,2,228,70705
1,16,2752,4,228,70705
2,77,2752,4,228,70705
3,158,2752,5,228,70705
4,169,2752,3,228,70705


In [20]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(ratings, test_size=0.25)

train_data[train_data['book_id']==935]


Unnamed: 0,user_id,book_id,rating,authors,ratings_count
234810,1629,935,2,1670,25434
234805,10,935,3,1670,25434
234809,1699,935,3,1670,25434
234806,100,935,4,1670,25434
234807,1297,935,2,1670,25434


In [21]:
#create train and test ratings matrices
train_data_books_matrix = np.zeros((n_users, n_books))
for line in train_data.itertuples():
    train_data_books_matrix[line[1], line[2]] = line[3]

test_data_books_matrix = np.zeros((n_users, n_books))
for line in test_data.itertuples():
    test_data_books_matrix[line[1], line[2]] = line[3]

In [22]:
#find similarities
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_books_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_books_matrix.T, metric='cosine')


print user_similarity.shape
print item_similarity.shape

(2000, 2000)
(8134, 8134)


In [23]:
#ratings prediction matrix generation function
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [24]:
item_prediction = predict(train_data_books_matrix, item_similarity, type='item')
user_prediction = predict(train_data_books_matrix, user_similarity, type='user')

item_prediction.shape

(2000, 8134)

In [25]:
test_data_books_matrix.shape

(2000, 8134)

In [26]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [27]:
print 'User-based CF RMSE: ' + str(rmse(user_prediction, test_data_books_matrix))
print 'Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_books_matrix))

User-based CF RMSE: 3.70477418269
Item-based CF RMSE: 3.94684813691


In [28]:
#obtain author ratings from book ratings by users 
author_ratings_train=pd.DataFrame(train_data.groupby(['user_id','authors'])['rating'].mean())
author_ratings_train.head()

author_ratings_test=pd.DataFrame(test_data.groupby(['user_id','authors'])['rating'].mean())
author_ratings_test.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,rating
user_id,authors,Unnamed: 2_level_1
0,29,2.0
0,50,4.0
0,156,2.0
0,215,4.0
0,362,1.0


In [29]:
train_data_authors_matrix = np.zeros((n_users, n_authors))
for line in author_ratings_train.itertuples():
    train_data_authors_matrix[line[0][0], line[0][1]] = line[1]
    
test_data_authors_matrix = np.zeros((n_users, n_authors))
for line in author_ratings_test.itertuples():
    test_data_authors_matrix[line[0][0], line[0][1]] = line[1]

In [30]:
user_similarity_authors = pairwise_distances(train_data_authors_matrix, metric='cosine')
item_similarity_authors = pairwise_distances(train_data_authors_matrix.T, metric='cosine')

print user_similarity_authors.shape
print item_similarity_authors.shape


(2000, 2000)
(3148, 3148)


In [31]:
item_prediction_2 = predict(train_data_authors_matrix, item_similarity_authors, type='item')
user_prediction_2 = predict(train_data_authors_matrix, user_similarity_authors, type='user')

In [32]:
item_prediction_2.shape

(2000, 3148)

In [33]:
print 'User-based CF RMSE: ' + str(rmse(user_prediction, test_data_authors_matrix))
print 'Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_authors_matrix))

User-based CF RMSE: 3.89278162402
Item-based CF RMSE: 3.94344310987


In [34]:
#expansion of author ratings to book ratings matrix 
author_to_books_prediction = np.zeros((n_users,n_books))
print author_to_books_prediction.shape


for line in train_data.itertuples():
    x= train_data[train_data['book_id']==line[2]]['ratings_count'].iloc[0]
    author_to_books_prediction[:,line[2]].fill( x )
        
    
author_to_books_prediction.shape
print author_to_books_prediction

(2000, 8134)
[[ 25359.  17411.  13103. ...,  33212.  27015.      0.]
 [ 25359.  17411.  13103. ...,  33212.  27015.      0.]
 [ 25359.  17411.  13103. ...,  33212.  27015.      0.]
 ..., 
 [ 25359.  17411.  13103. ...,  33212.  27015.      0.]
 [ 25359.  17411.  13103. ...,  33212.  27015.      0.]
 [ 25359.  17411.  13103. ...,  33212.  27015.      0.]]


In [35]:
author_array=np.zeros(n_books)

for line in train_data.itertuples():
    x= train_data[train_data['book_id']==line[2]]['authors'].iloc[0]
    author_array[line[2]] = x
        
    
print author_array

[  464.  1208.  2264. ...,  2314.  1580.     0.]


In [36]:
author_to_books_prediction_2 = np.zeros((n_users,n_books))

for i in xrange(n_books):
    author_id = int(author_array[i])
    author_to_books_prediction_2[:,i] = np.multiply( author_to_books_prediction[:,i], item_prediction_2[:,author_id])       

In [37]:
author_to_books_prediction_2.shape


(2000, 8134)

In [38]:
#rescaling predictions between 0 and 5
author_to_books_prediction_3 = np.zeros((n_users,n_books))

for i in xrange(n_books):
    min_val = author_to_books_prediction_2[:,i].min()
    max_val = author_to_books_prediction_2[:,i].max()
    if max_val==min_val: 
        continue
    x= np.full((1,n_users),min_val)
    y=np.subtract(author_to_books_prediction_2[:,i],x)
    z= 5*y/(max_val-min_val)
    author_to_books_prediction_3[:,i] = z

In [39]:
author_to_books_prediction_3

array([[ 2.19659095,  2.21303624,  2.35435786, ...,  2.43383404,
         2.40431882,  0.        ],
       [ 2.2341085 ,  2.23859059,  2.24540898, ...,  2.29174696,
         2.28017894,  0.        ],
       [ 1.85658455,  1.88292076,  1.84539151, ...,  1.67705488,
         1.6712282 ,  0.        ],
       ..., 
       [ 2.46196973,  2.49445483,  2.63527063, ...,  2.80045059,
         2.74869549,  0.        ],
       [ 4.39086404,  4.42501828,  4.68538925, ...,  4.78154055,
         4.75613739,  0.        ],
       [ 2.51589056,  2.46594367,  2.5849116 , ...,  2.5045315 ,
         2.49290822,  0.        ]])

In [40]:
#item similarity
book_ratings_prediction_1 = item_prediction
book_ratings_prediction_2 = author_to_books_prediction_3

print book_ratings_prediction_1.shape
print book_ratings_prediction_2.shape



(2000, 8134)
(2000, 8134)


In [41]:
#item similarity
final_prediction = np.mean([book_ratings_prediction_1,book_ratings_prediction_2],axis=0)
final_prediction[0:5,0:5]

array([[ 1.11728455,  1.12552001,  1.19689219,  1.12581343,  1.18706743],
       [ 1.13651861,  1.13894419,  1.14243619,  1.14626642,  1.174681  ],
       [ 0.94535928,  0.95882713,  0.93993805,  0.99639048,  0.97155068],
       [ 0.95642301,  0.97682148,  1.02409982,  0.97108459,  0.99334215],
       [ 1.44865173,  1.47856255,  1.51896284,  1.47900414,  1.46151406]])

In [42]:
print 'Item-based CF RMSE for hybrid book recommender system: ' + str(rmse(final_prediction, test_data_books_matrix))

Item-based CF RMSE for hybrid book recommender system: 2.71253423638


In [43]:
author_to_books_prediction_4 = np.zeros((n_users,n_books))


for i in xrange(n_books):
    author_id = int(author_array[i])
    author_to_books_prediction_4[:,i] = np.multiply( author_to_books_prediction[:,i], user_prediction_2[:,author_id])


In [44]:
author_to_books_prediction_4.shape


(2000, 8134)

In [45]:
#rescaling the predictions between 0 and 5
author_to_books_prediction_5 = np.zeros((n_users,n_books))

for i in xrange(n_books):
    min_val = author_to_books_prediction_4[:,i].min()
    max_val = author_to_books_prediction_4[:,i].max()
    if max_val==min_val: 
        continue
    x= np.full((1,n_users),min_val)
    y=np.subtract(author_to_books_prediction_4[:,i],x)
    z= 5*y/(max_val-min_val)
    author_to_books_prediction_5[:,i] = z

In [46]:
author_to_books_prediction_5

array([[ 1.29023559,  1.58980589,  2.05229325, ...,  2.43863875,
         2.42307029,  0.        ],
       [ 1.82262458,  1.90933063,  1.71236125, ...,  2.28188285,
         2.29379885,  0.        ],
       [ 1.16397694,  2.21288885,  1.70656507, ...,  1.65218526,
         1.64518422,  0.        ],
       ..., 
       [ 2.16628869,  2.59601725,  2.37168425, ...,  2.82243178,
         2.77509165,  0.        ],
       [ 3.71156769,  4.02886237,  4.66490308, ...,  4.7915809 ,
         4.78159606,  0.        ],
       [ 1.73930508,  1.91142938,  2.31348176, ...,  2.49024979,
         2.47791293,  0.        ]])

In [47]:
#user similarity
book_ratings_prediction_3 = user_prediction
book_ratings_prediction_4 = author_to_books_prediction_5

print book_ratings_prediction_3.shape
print book_ratings_prediction_4.shape



(2000, 8134)
(2000, 8134)


In [48]:
#user similarity
final_prediction_2 = np.mean([book_ratings_prediction_3,book_ratings_prediction_4],axis=0)
final_prediction_2[0:5,0:5]

array([[ 1.45129585,  1.53177002,  1.47156128,  1.56529136,  1.86201589],
       [ 1.72431939,  1.69736302,  1.29972001,  1.70252905,  1.78468841],
       [ 1.39920558,  1.86240674,  1.3013685 ,  1.89207196,  1.47167032],
       [ 1.34810718,  1.50243582,  1.24502985,  1.50318132,  1.37457921],
       [ 1.91319142,  2.36880553,  1.95951134,  2.19294824,  1.90719731]])

In [49]:
print 'User-based CF RMSE for hybrid book recommender system: ' + str(rmse(final_prediction_2, test_data_books_matrix))

User-based CF RMSE for hybrid book recommender system: 2.63787536787
