In [2]:
import os
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
import pickle

In [86]:
df_ratings = pd.read_csv("dataset/ratings_drop.csv")
df_books = pd.read_csv("dataset/books_drop.csv")
users_csv = pd.read_csv("dataset/users.csv")

In [87]:
df_ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,260,5
2,2,26,4
3,2,315,3
4,2,33,4


In [88]:
df_books.head()

Unnamed: 0,book_id,title,original_title,books_count,authors,average_rating,image_url,small_image_url
0,1,"The Hunger Games (The Hunger Games, #1)",The Hunger Games,272,Suzanne Collins,4.34,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,Harry Potter and the Sorcerer's Stone (Harry P...,Harry Potter and the Philosopher's Stone,491,"J.K. Rowling, Mary GrandPré",4.44,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,"Twilight (Twilight, #1)",Twilight,226,Stephenie Meyer,3.57,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,To Kill a Mockingbird,To Kill a Mockingbird,487,Harper Lee,4.25,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,The Great Gatsby,The Great Gatsby,1356,F. Scott Fitzgerald,3.89,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [89]:
movie_user_matrix = df_ratings.pivot(index='book_id', columns='user_id', values='rating').fillna(0)
movie_user_matrix.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,53415,53416,53417,53418,53419,53420,53421,53422,53423,53424
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,4.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0
2,0.0,5.0,0.0,5.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,5.0,5.0,5.0,5.0,5.0,5.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,4.0
4,5.0,0.0,3.0,4.0,0.0,0.0,0.0,3.0,0.0,5.0,...,0.0,0.0,0.0,0.0,3.0,0.0,5.0,0.0,5.0,5.0
5,0.0,5.0,0.0,4.0,0.0,0.0,3.0,3.0,5.0,5.0,...,0.0,0.0,0.0,0.0,3.0,2.0,4.0,0.0,0.0,0.0


In [98]:

'''
movie_to_index = {
    movie:i for i,movie in 
    enumerate(list(df_books.set_index('book_id').loc[movie_user_matrix.index].original_title))
}
'''
movie_to_index = pickle.load(open('mapper_dict.pkl','rb'))

In [3]:
'''
movie_user_matrix_sparse = csr_matrix(movie_user_matrix.values)
'''
movie_user_matrix_sparse = pickle.load(open('predict/pivot_df.pkl','rb'))
print(movie_user_matrix_sparse[0])

FileNotFoundError: [Errno 2] No such file or directory: 'predict/pivot_df.pkl'

In [101]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(movie_user_matrix_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

In [102]:
def fuzzy_matching(mapper, fav_movie, verbose=True):
    match_tuple = []
    for title, idx in mapper.items():
        title = str(title)
        ratio = fuzz.ratio(title.lower(), fav_movie.lower())
        if ratio >= 50:
            match_tuple.append((title, idx, ratio))
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        print('Oops! No match is found')
        return -20
    print('Found possible matches in our database: {0}\n'.format([x[0] for x in match_tuple]))
    return match_tuple[0][1]

def make_recommendation(model_knn, data, mapper, fav_movie, n_recommendations):
    print('You have input movie:', fav_movie)
    idx = fuzzy_matching(mapper, fav_movie)
    if(idx != -20):
        print('Recommendation system start to make inference')
        print('\n')
        distances, indices = model_knn.kneighbors(data[idx], n_neighbors=n_recommendations+1)
        raw_recommends = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
        reverse_mapper = {v: k for k, v in mapper.items()}
        print('Recommendations for {}:'.format(fav_movie))
        for i, (idx, dist) in enumerate(raw_recommends):
            print('{0}: {1}, with distance of {2}'.format(i+1, reverse_mapper[idx], dist))

In [103]:
my_favorite = 'Harry Potter'

make_recommendation(
    model_knn=model_knn,
    data=movie_user_matrix_sparse,
    fav_movie=my_favorite,
    mapper=movie_to_index,
    n_recommendations=5
)

You have input movie: Harry Potter
Found possible matches in our database: ['Complete Harry Potter Boxed Set', 'The Marriage Plot', 'Dark Lover', 'The Dark Tower', 'Are You My Mother?', 'The Scarlet Letter', 'The Partner', "Charlotte's Web", 'Harry Potter and the Goblet of Fire', 'La peste', 'Hyperion', 'Harry Potter and the Deathly Hallows']

Recommendation system start to make inference


Recommendations for Harry Potter:
1: Harry Potter and the Chamber of Secrets, with distance of 0.787631163657476
2: Harry Potter and the Goblet of Fire, with distance of 0.7837809542241727
3: Harry Potter and the Order of the Phoenix, with distance of 0.7835079373758689
4: Harry Potter and the Half-Blood Prince, with distance of 0.7819729330115626
5: Harry Potter and the Deathly Hallows, with distance of 0.7815188162498835


In [64]:
filename = 'knn_model.sav'
pickle.dump(model_knn, open(filename,'wb'))

In [73]:
filename = 'pivot_df.pkl'
pickle.dump(movie_user_matrix_sparse, open(filename,'wb'))

In [97]:
filename = 'mapper_dict.pkl'
pickle.dump(movie_to_index, open(filename,'wb'))