In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import sklearn as sk

books = pd.read_csv('BX-Books.csv', sep=';', error_bad_lines=False, encoding="latin-1")
books.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']
users = pd.read_csv('BX-Users.csv', sep=';', error_bad_lines=False, encoding="latin-1")
users.columns = ['userID', 'Location', 'Age']
ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding="latin-1")
ratings.columns = ['userID', 'ISBN', 'bookRating']

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
ratings.head(5)

Unnamed: 0,userID,ISBN,bookRating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [5]:
users.head(5)

Unnamed: 0,userID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [6]:
books.head(5)

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher,imageUrlS,imageUrlM,imageUrlL
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [25]:
combine_book_rating = pd.merge(ratings, books, on="ISBN")
combine_book_rating.head(5)

combine_book = combine_book_rating[['userID','ISBN','bookRating','bookTitle']]
combine_book.head(5)

Unnamed: 0,userID,ISBN,bookRating,bookTitle
0,276725,034545104X,0,Flesh Tones: A Novel
1,2313,034545104X,5,Flesh Tones: A Novel
2,6543,034545104X,0,Flesh Tones: A Novel
3,8680,034545104X,5,Flesh Tones: A Novel
4,10314,034545104X,9,Flesh Tones: A Novel


In [68]:
##exclude rating lower than 0
combine_book_with_positive_rating = combine_book[combine_book['bookRating']>=0]
combine_book_with_positive_rating.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle
0,276725,034545104X,0,Flesh Tones: A Novel
1,2313,034545104X,5,Flesh Tones: A Novel
2,6543,034545104X,0,Flesh Tones: A Novel
3,8680,034545104X,5,Flesh Tones: A Novel
4,10314,034545104X,9,Flesh Tones: A Novel


In [73]:
##create a new coloumn with number of count of rating group by ISBN
ISBN_Count = combine_book_with_positive_rating.groupby('ISBN')['bookRating'].count()
ISBN_Count = pd.DataFrame({'ISBN': ISBN_Count.index, 'Count':ISBN_Count.values})
ISBN_Count.head()

Unnamed: 0,Count,ISBN
0,1,0000913154
1,2,0001010565
2,1,0001046438
3,1,0001046713
4,1,000104687X


In [74]:
##explore how people rate the book 
ISBN_Count['Count'].quantile([.1, .3, .5,.7,.9, .95, .97])

## 5% of the book received 12 or more ratings, we will limit it to top 10% in this case

0.10     1.0
0.30     1.0
0.50     1.0
0.70     2.0
0.90     7.0
0.95    12.0
0.97    18.0
Name: Count, dtype: float64

In [75]:
popularity_threshold = 18
ISBN_Count = ISBN_Count[ISBN_Count['Count'] >= popularity_threshold]
ISBN_Count.shape

(8283, 2)

In [76]:
#merger ISBN_Count and combine_book
merge_set = combine_book.merge(ISBN_Count, left_on='ISBN', right_on='ISBN')
merge_set.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,Count
0,276725,034545104X,0,Flesh Tones: A Novel,60
1,2313,034545104X,5,Flesh Tones: A Novel,60
2,6543,034545104X,0,Flesh Tones: A Novel,60
3,8680,034545104X,5,Flesh Tones: A Novel,60
4,10314,034545104X,9,Flesh Tones: A Novel,60


In [101]:
##create the pivot 
user_rating_pivot = merge_set.pivot(index ='ISBN', columns='userID',values='bookRating').fillna(0)
user_rating_matrix = csr_matrix(user_rating_pivot.values)

In [85]:
##implement knn model 
from sklearn.neighbors import  NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm='brute')
model_knn.fit(user_rating_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [267]:
user_rating_pivot.head()

userID,8,9,10,14,16,17,19,26,32,39,...,278831,278832,278836,278838,278843,278844,278846,278849,278851,278854
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0002558122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000649840X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0006547834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0006550576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0006550789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [113]:
##test our model and make some recommendation

##random choice
query_index = np.random.choice(user_rating_pivot.shape[0])
print(query_index)

distance,indices = model_knn.kneighbors(user_rating_pivot.iloc[query_index,:].reshape(1,-1), n_neighbors=6)

for i in range(0, len(distance.flatten())):
    if i == 0:
        print("Recommendation for {0}: \n".format(user_rating_pivot.index[query_index]))
    else:
        print('{0}:{1}, with distance of {2}'.format(i, user_rating_pivot.index[indices.flatten()[i]], distance.flatten()))

7230
Recommendation for 0786889020: 

1:0449216411, with distance of [  2.22044605e-16   8.32855463e-01   8.52388725e-01   8.62107785e-01
   8.65441753e-01   8.67235480e-01]
2:1555910807, with distance of [  2.22044605e-16   8.32855463e-01   8.52388725e-01   8.62107785e-01
   8.65441753e-01   8.67235480e-01]
3:0060808934, with distance of [  2.22044605e-16   8.32855463e-01   8.52388725e-01   8.62107785e-01
   8.65441753e-01   8.67235480e-01]
4:0425179605, with distance of [  2.22044605e-16   8.32855463e-01   8.52388725e-01   8.62107785e-01
   8.65441753e-01   8.67235480e-01]
5:042518627X, with distance of [  2.22044605e-16   8.32855463e-01   8.52388725e-01   8.62107785e-01
   8.65441753e-01   8.67235480e-01]


  import sys


In [134]:
book_ISBN = user_rating_pivot.index
book_list = list(book_ISBN)

In [186]:
book_list.index('0002558122')

0

In [None]:
ISBN_bookTitle = books.set_index('ISBN')['bookTitle'].to_dict()

In [202]:
## input a ISBN
query_index = book_list.index('1555910807')
print(query_index)

distance,indices = model_knn.kneighbors(user_rating_pivot.iloc[query_index,:].reshape(1,-1), n_neighbors=6)

for i in range(0, len(distance.flatten())):
    if i == 0:
        print("Recommendation for {0},{1}: \n".format(user_rating_pivot.index[query_index],ISBN_bookTitle[user_rating_pivot.index[query_index]]))
    else:
        print('{0}:{1},{2}'.format(i, user_rating_pivot.index[indices.flatten()[i]],ISBN_bookTitle[user_rating_pivot.index[indices.flatten()[i]]] ))

7942
Recommendation for 1555910807,Biblioholism: The Literary Addiction: 

1:0515134120,Sarah's Window
2:0425179605,Some Things That Stay
3:0688180639,Vinegar Hill (Oprah's Book Club (Hardcover))
4:0671016792,SKATING FOR THE GOLD
5:0451410742,Step-Ball-Change


  """


In [170]:
##export model 
from sklearn.externals import joblib
joblib.dump(model_knn, 'knn.pkl') 

['knn.pkl']

In [199]:
##convert the pivot table to a dictionary

Index_ISBN = pd.DataFrame(book_list, columns=["ISBN"])
Index_ISBN['Index'] = Index_ISBN.index
Index_ISBN.head()
Index_ISBN_dic = Index_ISBN.set_index('ISBN')['Index'].to_dict()


In [216]:
query_index = Index_ISBN_dic['0002558122']
print(query_index)

distance,indices = model_knn.kneighbors(user_rating_pivot.iloc[query_index,:].reshape(1,-1), n_neighbors=6)

for i in range(0, len(distance.flatten())):
    if i == 0:
        print("Recommendation for {0},{1}: \n".format(user_rating_pivot.index[query_index],ISBN_bookTitle[user_rating_pivot.index[query_index]]))
    else:
        print('{0}:{1},{2}'.format(i, user_rating_pivot.index[indices.flatten()[i]],ISBN_bookTitle[user_rating_pivot.index[indices.flatten()[i]]] ))

0
Recommendation for 0002558122,Angelas Ashes: 

1:0385310161,Gai-Jin: A Novel of Japan
2:0375503943,The Greatest Generation Speaks : Letters and Reflections
3:0440407532,The River
4:0345356365,The Wishsong of Shannara
5:0812574621,Animist


  after removing the cwd from sys.path.


In [204]:
user_rating_pivot.iloc[query_index,:].reshape(1,-1)

  """Entry point for launching an IPython kernel.


array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [211]:
user_rating_pivot.indexx

Index(['0002558122', '000649840X', '0006547834', '0006550576', '0006550789',
       '0007106572', '0007110928', '0007141076', '0007154615', '000716226X',
       ...
       '880781210X', '8807813025', '8817106100', '8817106259', '8817131628',
       '8845205118', '8845247414', '884590184X', '8885989403', '950491036X'],
      dtype='object', name='ISBN', length=8283)

In [213]:
indices.flatten()

array([7942, 5007, 3187, 6858, 6042, 4647], dtype=int64)

In [217]:
type(user_rating_pivot)

pandas.core.frame.DataFrame

In [222]:
user_rating_pivot.shape

(8283, 57130)

In [220]:
type(user_rating_pivot.iloc[query_index,:])

pandas.core.series.Series

In [226]:
user_rating_pivot.iloc[query_index,:]

userID
8         0.0
9         0.0
10        0.0
14        0.0
16        0.0
17        0.0
19        0.0
26        0.0
32        0.0
39        0.0
42        0.0
44        0.0
51        0.0
53        0.0
56        0.0
67        0.0
69        0.0
75        0.0
77        0.0
78        0.0
81        0.0
91        0.0
95        0.0
97        0.0
99        0.0
107       0.0
114       0.0
125       0.0
129       0.0
132       0.0
         ... 
278760    0.0
278767    0.0
278769    0.0
278771    0.0
278773    0.0
278774    0.0
278781    0.0
278782    0.0
278784    0.0
278786    0.0
278796    0.0
278798    0.0
278800    0.0
278807    0.0
278813    0.0
278818    0.0
278819    0.0
278820    0.0
278824    0.0
278828    0.0
278831    0.0
278832    0.0
278836    0.0
278838    0.0
278843    0.0
278844    0.0
278846    0.0
278849    0.0
278851    0.0
278854    0.0
Name: 0002558122, Length: 57130, dtype: float64

In [227]:
##create the pivot2 
user_rating_pivot2 = merge_set.pivot(index ='userID', columns='ISBN',values='bookRating').fillna(0)
user_rating_pivot2.head()

ISBN,0002558122,000649840X,0006547834,0006550576,0006550789,0007106572,0007110928,0007141076,0007154615,000716226X,...,880781210X,8807813025,8817106100,8817106259,8817131628,8845205118,8845247414,884590184X,8885989403,950491036X
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [228]:
user_rating_pivot2.shape

(57130, 8283)

In [229]:
X = user_rating_pivot2.values.T
X.shape

(8283, 57130)

In [232]:
from sklearn.decomposition import TruncatedSVD

SVD = TruncatedSVD(n_components =12, random_state=17)
matrix = SVD.fit_transform(X)
matrix.shape

(8283, 12)

In [234]:
import warnings
warnings.filterwarnings("ignore",category = RuntimeWarning)
corr = np.corrcoef(matrix)
corr.shape

(8283, 8283)

In [261]:
corr[3]

array([ -4.21749918e-02,  -6.82783242e-04,   1.22677787e-02, ...,
        -1.60360946e-01,   1.20048414e-02,   7.53795191e-01])

In [254]:
list(book_ISBN[(corr[0]>0.99)])[:5]

['0002558122', '0006547834', '0020811853', '0060005424', '0060007788']

In [262]:
type(corr)

numpy.ndarray

In [259]:
np.savetxt("cor.txt", corr)

In [263]:
np.save('test.npy', corr)