In [372]:
import os
import pandas as pd

# Location and file names of the two CSV inputs
data_path = '../data'
movies_filename = 'movies.csv'
ratings_filename = 'ratings.csv'

# Column -> dtype for each file; the keys double as the list of columns to read
movies_dtypes = {'movieId': 'int32', 'title': 'str'}
ratings_dtypes = {'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'}

# Read each CSV into its own DataFrame, restricted to the columns above
movies_csv = os.path.join(data_path, movies_filename)
df_movies = pd.read_csv(movies_csv, usecols=list(movies_dtypes), dtype=movies_dtypes)

ratings_csv = os.path.join(data_path, ratings_filename)
df_ratings = pd.read_csv(ratings_csv, usecols=list(ratings_dtypes), dtype=ratings_dtypes)


In [378]:
# Show the first three rows of the movies table
df_movies.head(n=3)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)


In [379]:
# Show the first three rows of the ratings table
df_ratings.head(n=3)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0


In [380]:
from scipy.sparse import csr_matrix

# Reshape the long ratings table into one row per movie and one column per user
ratings_by_movie = df_ratings.pivot(index='movieId', columns='userId', values='rating')

# A (movie, user) pair without a rating becomes 0
df_movie_features = ratings_by_movie.fillna(0)

df_movie_features.head(3)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0


In [381]:
# Store the movie x user ratings as a scipy CSR (Compressed Sparse Row) matrix
dense_movie_features = df_movie_features.to_numpy()
mat_movie_features = csr_matrix(dense_movie_features)

# Expand back to a dense array just to preview the contents
mat_movie_features.toarray()

array([[4. , 0. , 0. , ..., 2.5, 3. , 5. ],
       [0. , 0. , 0. , ..., 2. , 0. , 0. ],
       [4. , 0. , 0. , ..., 2. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]], dtype=float32)

In [382]:
# Number of ratings each movie received, indexed by movieId
df_movies_cnt = df_ratings.groupby('movieId').size().to_frame('count')

# Filter to remove movies that have been rated less than 50 times
popularity_thres = 50
# The groupby index is already unique, so take the qualifying ids directly
# (keeps them in index order instead of arbitrary set order)
popular_movies = df_movies_cnt.index[df_movies_cnt['count'] >= popularity_thres].tolist()
df_ratings_drop_movies = df_ratings[df_ratings.movieId.isin(popular_movies)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping unpopular movies: ', df_ratings_drop_movies.shape)

shape of original ratings data:  (100836, 3)
shape of ratings data after dropping unpopular movies:  (41360, 3)


In [383]:
# Number of ratings each user gave (counted after the unpopular movies were dropped),
# indexed by userId
df_users_cnt = df_ratings_drop_movies.groupby('userId').size().to_frame('count')

# Filter to remove users that have rated less than 50 movies
ratings_thres = 50
# The groupby index is already unique, so take the qualifying ids directly
# (keeps them in index order instead of arbitrary set order)
active_users = df_users_cnt.index[df_users_cnt['count'] >= ratings_thres].tolist()
df_ratings_drop_users = df_ratings_drop_movies[df_ratings_drop_movies.userId.isin(active_users)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping both unpopular movies and inactive users: ', df_ratings_drop_users.shape)


shape of original ratings data:  (100836, 3)
shape of ratings data after dropping both unpopular movies and inactive users:  (32999, 3)


In [384]:
# Build the movie x user rating matrix from the filtered ratings; missing ratings become 0
movie_user_mat = (
    df_ratings_drop_users
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)

# Map each movie title to the row position of that movie in movie_user_mat
titles_in_row_order = df_movies.set_index('movieId').loc[movie_user_mat.index, 'title']
movie_to_idx = {title: row for row, title in enumerate(titles_in_row_order)}

# Same matrix in scipy CSR form, previewed as a dense array
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)
movie_user_mat_sparse.toarray()

array([[4. , 0. , 0. , ..., 4. , 2.5, 5. ],
       [0. , 0. , 4. , ..., 0. , 2. , 0. ],
       [4. , 0. , 5. , ..., 0. , 2. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 4.5],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 3. ]], dtype=float32)

In [385]:
from fuzzywuzzy import fuzz

#https://www.geeksforgeeks.org/fuzzywuzzy-python-library/?source=post_page-----c8dcd5fd89b2----------------------
def fuzzy_matching(mapper, fav_movie, verbose=True, threshold=60):
    """
    Return the data index of the title that best fuzzy-matches ``fav_movie``.

    Titles and query are compared case-insensitively with ``fuzz.ratio``.

    Parameters
    ----------
    mapper: dict, map movie title name to index of the movie in data
    fav_movie: str, name of user input movie
    verbose: bool, print the list of candidate titles if True
    threshold: int, minimum ``fuzz.ratio`` score a title needs to be kept
        as a candidate (default 60, the previously hard-coded cut-off)

    Return
    ------
    index of the closest match, or None (after printing a message) when no
    title reaches ``threshold``
    """
    # Lower-case the query once instead of once per title
    query = fav_movie.lower()

    # Collect (title, index, score) for every title that scores high enough
    candidates = []
    for title, idx in mapper.items():
        score = fuzz.ratio(title.lower(), query)
        if score >= threshold:
            candidates.append((title, idx, score))

    if not candidates:
        print('Oops! No match is found')
        return None

    # Best score first. Ascending stable sort followed by a reversal keeps the
    # historical tie order: among equal scores the title that appears later in
    # `mapper` wins.
    candidates.sort(key=lambda candidate: candidate[2])
    candidates.reverse()

    if verbose:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in candidates]))
    return candidates[0][1]

In [339]:
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour searcher: brute-force search with cosine distance, 20 neighbours by default
model_knn = NearestNeighbors(
    n_neighbors=20,
    algorithm='brute',
    metric='cosine',
    n_jobs=-1,
)

# Fit on the sparse movie x user matrix
model_knn.fit(movie_user_mat_sparse)

# Display the bound kneighbors method of the fitted model
model_knn.kneighbors

<bound method KNeighborsMixin.kneighbors of NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)>

In [387]:
# function to return a Pandas df with the recommendations sorted by KNN distance, by closest
def get_recommendation_knn(model_knn, data, mapper, fav_movie):
    """
    Rank every other movie in ``data`` by KNN distance to the movie that best
    matches ``fav_movie``.

    Parameters
    ----------
    model_knn: object with ``fit(data)`` and ``kneighbors(X, n_neighbors=...)``
        (e.g. a scikit-learn NearestNeighbors instance). It is re-fitted on
        ``data`` on every call.
    data: 2-D matrix with one row per movie; must support ``data.shape[0]``
        and row selection with ``data[idx]``
    mapper: dict, map movie title name to row index of the movie in ``data``
    fav_movie: str, name of user input movie (resolved via ``fuzzy_matching``)

    Return
    ------
    pandas DataFrame with columns ``movieId``, ``title``, ``knn_distance``,
    closest movie first. The matched movie itself is excluded. The frame is
    empty when ``fav_movie`` matches no title. ``movieId`` comes from an inner
    merge on ``title`` with the notebook-level ``df_movies`` frame, so titles
    absent from ``df_movies`` do not appear in the result.
    """
    result_columns = ['movieId', 'title', 'knn_distance']

    # fit
    model_knn.fit(data)

    # get input movie index; None means the fuzzy matcher found nothing
    idx = fuzzy_matching(mapper, fav_movie, verbose=True)
    if idx is None:
        return pd.DataFrame(columns=result_columns)

    # Ask for as many neighbours as there are rows in `data` (the argument,
    # not a notebook global), so every movie gets a distance
    n_samples = data.shape[0]
    distances, indices = model_knn.kneighbors(data[idx], n_neighbors=n_samples)

    # kneighbors returns one row per query point; take the single row by
    # position so a one-neighbour result is still a list, not a scalar
    neighbours = zip(indices[0].tolist(), distances[0].tolist())

    # Drop the query movie by its index rather than by position: another
    # movie can tie with it at distance 0 and sort ahead of it
    raw_recommends = sorted(
        ((n_idx, dist) for n_idx, dist in neighbours if n_idx != idx),
        key=lambda pair: pair[1],
    )

    # get reverse mapper (row index -> title)
    reverse_mapper = {v: k for k, v in mapper.items()}

    # Build the frame in one shot; DataFrame.append was removed in pandas 2.0
    # and growing a frame row by row is quadratic
    df = pd.DataFrame(
        [(reverse_mapper[n_idx], dist) for n_idx, dist in raw_recommends],
        columns=['title', 'knn_distance'],
    )

    # merging recommendation entries with list of movies to add movieId
    df = pd.merge(df, df_movies, on='title', how='inner')

    # changing the order of the dataframe columns
    return df[result_columns]


In [358]:
# Title to look up; it is fuzzy-matched against the titles in movie_to_idx
movie_check = "Toy Story"

knn_df = get_recommendation_knn(
    model_knn=model_knn,
    data=movie_user_mat_sparse,
    mapper=movie_to_idx,
    fav_movie=movie_check,
)

# Closeness is the complement of the KNN distance
knn_df['knn_closeness'] = 1.0 - knn_df['knn_distance']
knn_df.head(20)

#knn_df.iloc[:20, [0]]

Found possible matches in our database: ['Toy Story (1995)', 'Toy Story 3 (2010)', 'Toy Story 2 (1999)']



Unnamed: 0,movieId,title,knn_distance,knn_closeness
0,480,Jurassic Park (1993),0.279779,0.720221
1,356,Forrest Gump (1994),0.306811,0.693189
2,260,Star Wars: Episode IV - A New Hope (1977),0.325845,0.674155
3,1210,Star Wars: Episode VI - Return of the Jedi (1983),0.326706,0.673294
4,296,Pulp Fiction (1994),0.332818,0.667182
5,4306,Shrek (2001),0.333444,0.666556
6,1198,Raiders of the Lost Ark (Indiana Jones and the...,0.334862,0.665138
7,150,Apollo 13 (1995),0.337781,0.662219
8,1196,Star Wars: Episode V - The Empire Strikes Back...,0.340669,0.659331
9,1270,Back to the Future (1985),0.342331,0.657669


In [359]:
# Number of rows (movies) in the sparse movie x user matrix
movie_user_mat_sparse.shape[0]

450