## Collaborative Filtering

In [91]:
# import pandas
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

We will create an recommender engine based on Item Based Collaborative Filtering (IBCF) which searches for the most similar books based on the user ratings. We can download the data from [here](https://drive.google.com/file/d/1WvTmAfO09TCX7xp7uu06__ziic7JnrL5/view?usp=sharing).

In [6]:
book_ratings = pd.read_csv('data/BX-Book-Ratings.csv',sep=";", encoding="latin")
books = pd.read_csv('data/BX-Books.csv',sep=";", encoding="latin", on_bad_lines='warn')

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  exec(code_obj, self.user_global_ns, self.user_ns)


* Explore both datasets

In [7]:
print(book_ratings.shape)
book_ratings.head()

(1149780, 3)


Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [8]:
print(books.shape)
books.head()

(271360, 8)


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


* create dataframe with name 'df_book_features' from book_ratings that have `ISBN` as index, `User-ID` as columns and values are `Book-Rating`.
    - The data are quite big so it's OK to use a sample only in case your PC has limited RAM.


In [10]:
book_ratings_sample = book_ratings[:10000]
df_book_features = book_ratings_sample.pivot_table(index='ISBN', columns='User-ID', values='Book-Rating')

In [11]:
print(df_book_features.shape)
df_book_features.head()

(9340, 941)


User-ID,2,7,8,9,10,12,14,16,17,19,...,278832,278836,278838,278843,278844,278846,278849,278851,278852,278854
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0002005018,,,5.0,,,,,,,,...,,,,,,,,,,
0002231115,,,,,,,,,,,...,,,,,,,,,,
0002232766,,,,,,,,,,,...,,,,,,,,,,
0002240114,,,,,,,,,,,...,,,,,,,,,,
000225669X,,,,,,,,,,,...,,,,,,,,,,


* create the instance of the NearestNeighbors class

In [12]:
from sklearn.neighbors import NearestNeighbors
nn = NearestNeighbors()

* fit the NearestNeighbors using'df_book_features'

In [22]:
for idx in df_book_features.index:
    avg = df_book_features.loc[idx].mean()
    df_book_features.loc[idx].fillna(avg, inplace=True)
    df_book_features.loc[idx] = [val - avg for val in df_book_features.loc[idx]]

In [23]:
print(df_book_features.shape)
df_book_features.head()

(9340, 941)


User-ID,2,7,8,9,10,12,14,16,17,19,...,278832,278836,278838,278843,278844,278846,278849,278851,278852,278854
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0002005018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0002231115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0002232766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0002240114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000225669X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
nn.fit(df_book_features)

NearestNeighbors()

* create function that returns top 5 most similar movies (according to KNN model) for selected ISBN
    * the input will be Book-Title from the DataFrame books 
    * the output will be the Book-Titles of the top 5 most similar books.
    * for every book in the top 5 most similar books, print also the distance from the selected book (ISBN we chose as input to the function)

In [138]:
# create mapping from title to ISBN
title_isbn = books[['ISBN', 'Book-Title']]
no_dup_title_isbn = title_isbn.drop_duplicates()
title_series = pd.Series(no_dup_title_isbn['Book-Title'].values, index=no_dup_title_isbn['ISBN'])

In [139]:
title_series.head()

ISBN
0195153448                                  Classical Mythology
0002005018                                         Clara Callan
0060973129                                 Decision in Normandy
0374157065    Flu: The Story of the Great Influenza Pandemic...
0393045218                               The Mummies of Urumchi
dtype: object

In [191]:
def top_5_recommended(title):
    # using map to find the isbn series for input
    isbn = title_series[title_series == test_title].index[0]
    isbn_series = df_book_features.loc[isbn]
    
    # find the distances and indices for the top 5 neighbors
    distance, index = nn.kneighbors([isbn_series], n_neighbors=5)
    
    # reverse mapping to find the titles
    top5_isbns = [df_book_features.iloc[idx].name for idx in index[0]]
    top5_titles = [title_series.loc[isbn] for isbn in top5_isbns]
    
    results = pd.DataFrame()
    results['title'] = top5_titles
    results['distance'] = distance[0]
    
    return results

* Apply the function to book of your choice

In [193]:
top_5_recommended('Decision in Normandy')

Unnamed: 0,title,distance
0,YOU BELONG TO ME,0.0
1,The Road Less Traveled and Beyond : Spiritual ...,0.0
2,Timepiece (Christmas Box Trilogy),0.0
3,ONE DAY MY SOUL JUST OPENED UP : 40 DAYS AND 4...,0.0
4,MY STORY,0.0


In [189]:
# test only
test_title = 'Clara Callan'
test_isbn = title_series[title_series == test_title].index[0]
test_isbn_series = df_book_features.loc[test_isbn]
test_distance, test_index = nn.kneighbors([test_isbn_series], n_neighbors=5)

test_top5_isbns = [df_book_features.iloc[idx].name for idx in test_index[0]]
test_top5_titles = [title_series.loc[isbn] for isbn in test_top5_isbns]

results = pd.DataFrame()
results['title'] = test_top5_titles
results['distance'] = test_distance[0]
results

Unnamed: 0,title,distance
0,YOU BELONG TO ME,0.0
1,The Road Less Traveled and Beyond : Spiritual ...,0.0
2,Timepiece (Christmas Box Trilogy),0.0
3,ONE DAY MY SOUL JUST OPENED UP : 40 DAYS AND 4...,0.0
4,MY STORY,0.0
