In [45]:
import pandas as pd
import numpy as np
import os
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz

In [30]:
dataset_path = os.path.join(os.environ['PWD'],'movielens_dataset')
movies_filename = 'movies.csv'
ratings_filename = 'ratings.csv'

In [31]:
df_movies = pd.read_csv(dataset_path+'/'+movies_filename)
df_ratings = pd.read_csv(dataset_path+'/'+ratings_filename)

In [32]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [33]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [34]:
user_movie_matrix = df_ratings.pivot(index='userId',columns='movieId',values='rating').fillna(-1)
user_movie_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,-1.0,4.0,-1.0,-1.0,4.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [56]:
#Mapper from movie title to index
movie_user_matrix_temp = df_ratings.pivot(index='movieId', columns='userId', values='rating').fillna(0)
movie_to_index = {
    movie:i for i,movie in 
    enumerate(list(df_movies.set_index('movieId').loc[movie_user_matrix_temp.index].title))
}

In [35]:
user_movie_matrix = user_movie_matrix.values
print(user_movie_matrix)
print(len(user_movie_matrix))

[[ 4.  -1.   4.  ... -1.  -1.  -1. ]
 [-1.  -1.  -1.  ... -1.  -1.  -1. ]
 [-1.  -1.  -1.  ... -1.  -1.  -1. ]
 ...
 [ 2.5  2.   2.  ... -1.  -1.  -1. ]
 [ 3.  -1.  -1.  ... -1.  -1.  -1. ]
 [ 5.  -1.  -1.  ... -1.  -1.  -1. ]]
610


In [36]:
N = user_movie_matrix.shape[0]  #Number of rows (users)
M = user_movie_matrix.shape[1]  #Number of columns (movies)
K = 2  #Number of latent features
user_matrix = np.random.rand(N,K)
movie_matrix = np.random.rand(M,K)

In [37]:
def matrix_factorization(R, P, Q, K, steps=50, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = np.dot(P,Q)
        e = 0
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * ( pow(P[i][k],2) + pow(Q[k][j],2) )
        if e < 0.001:
            break
    return P, Q.T

In [38]:
mf_user_matrix, mf_movie_matrix = matrix_factorization(user_movie_matrix, user_matrix, movie_matrix, K)

In [39]:
print(mf_user_matrix)
print(mf_movie_matrix)

[[2.26465225 2.41263887]
 [1.73228875 1.02217605]
 [0.90115317 1.06004943]
 ...
 [1.86896179 1.84724447]
 [1.26371138 1.34214899]
 [2.29086655 2.24232655]]
[[0.98737446 1.12699373]
 [0.92490937 0.91515616]
 [0.92929318 0.86814302]
 ...
 [1.00172921 0.44289095]
 [0.87254278 0.96428708]
 [0.16037846 0.69206869]]


In [42]:
full_matrix = np.matmul(mf_user_matrix,mf_movie_matrix.T)
print(full_matrix)

[[4.95508866 4.30253941 4.1990415  ... 3.33710424 4.30248246 2.03291327]
 [2.86240366 2.5376608  2.49719913 ... 2.18799677 2.4971672  0.98523785]
 [2.08444468 1.80359577 1.75771001 ... 1.37219775 1.80848666 0.87815258]
 ...
 [3.92719806 3.41913742 3.34048584 ... 2.69032148 3.41202309 1.57816128]
 [2.76034983 2.39709441 2.33953565 ... 1.86032225 2.39685917 1.13153138]
 [4.78903107 4.17092289 4.07554681 ... 3.28793408 4.16112559 1.91924965]]


In [43]:
#KNN

In [46]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(full_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=-1, n_neighbors=20, p=2, radius=1.0)

In [47]:
def fuzzy_matching(mapper, fav_movie, verbose=True):
    match_tuple = []
    for title, idx in mapper.items():
        ratio = fuzz.ratio(title.lower(), fav_movie.lower())
        if ratio >= 60:
            match_tuple.append((title, idx, ratio))
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        print('Oops! No match is found')
        return -20
    print('Found possible matches in our database: {0}\n'.format([x[0] for x in match_tuple]))
    return match_tuple[0][1]

def make_recommendation(model_knn, data, mapper, fav_movie, n_recommendations):
    model_knn.fit(data)
    print('You have input movie:', fav_movie)
    idx = fuzzy_matching(mapper, fav_movie)
    if(idx != -20):
        print('Recommendation system start to make inference')
        print('\n')
        distances, indices = model_knn.kneighbors(data[idx], n_neighbors=n_recommendations+1)
        raw_recommends = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
        reverse_mapper = {v: k for k, v in mapper.items()}
        print('Recommendations for {}:'.format(fav_movie))
        for i, (idx, dist) in enumerate(raw_recommends):
            print('{0}: {1}, with distance of {2}'.format(i+1, reverse_mapper[idx], dist))


In [57]:
my_favorite = 'Iron Man'

make_recommendation(
    model_knn=model_knn,
    data=full_matrix,
    fav_movie=my_favorite,
    mapper=movie_to_index,
    n_recommendations=5)

You have input movie: Iron Man
Found possible matches in our database: ['Iron Man (1931)', 'Iron Man (2008)', 'Iron Man 3 (2013)', 'Iron Man 2 (2010)']

Recommendation system start to make inference




IndexError: index 9426 is out of bounds for axis 0 with size 610