In [302]:
import os
import pandas as pd

# Location and file names of the movie and rating CSV files
data_path = '../data'
movies_filename = 'movies.csv'
ratings_filename = 'ratings.csv'

# Read only the columns used further down, with explicit compact dtypes
df_movies = pd.read_csv(
    os.path.join(data_path, movies_filename),
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'})

df_ratings = pd.read_csv(
    os.path.join(data_path, ratings_filename),
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [298]:
# Preview the movies DataFrame (movieId, title); as the last expression of
# the cell it is rendered as the cell output.
df_movies

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017)
9738,193583,No Game No Life: Zero (2017)
9739,193585,Flint (2017)
9740,193587,Bungo Stray Dogs: Dead Apple (2018)


In [299]:
# Preview the ratings DataFrame (userId, movieId, rating); as the last
# expression of the cell it is rendered as the cell output.
df_ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [273]:
from scipy.sparse import csr_matrix

# Reshape the long ratings table into a wide grid: one row per movieId,
# one column per userId, each cell holding that user's rating.
# Movie/user pairs without a rating come out of the pivot as NaN and are
# stored as 0.
df_movie_features = df_ratings.pivot(index='movieId', columns='userId', values='rating').fillna(value=0)

df_movie_features

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [274]:
# Wrap the movie-by-user grid in a SciPy CSR (Compressed Sparse Row) matrix
mat_movie_features = csr_matrix(
    df_movie_features.values
)

# Show the matrix contents as a dense NumPy array
mat_movie_features.toarray()

array([[4. , 0. , 0. , ..., 2.5, 3. , 5. ],
       [0. , 0. , 0. , ..., 2. , 0. , 0. ],
       [4. , 0. , 0. , ..., 2. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]], dtype=float32)

In [275]:
# Number of ratings each movie received, indexed by movieId
df_movies_cnt = df_ratings.groupby('movieId').size().to_frame('count')

# Keep only movies that were rated at least `popularity_thres` times.
# The groupby index already holds each movieId once, so it can be used directly.
popularity_thres = 50
popular_movies = df_movies_cnt.index[df_movies_cnt['count'] >= popularity_thres].tolist()
df_ratings_drop_movies = df_ratings[df_ratings['movieId'].isin(popular_movies)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping unpopular movies: ', df_ratings_drop_movies.shape)

shape of original ratings data:  (100836, 3)
shape of ratings data after dropping unpopular movies:  (41360, 3)


In [276]:
# Number of ratings given by each user (counted on the popular-movie subset),
# indexed by userId
df_users_cnt = df_ratings_drop_movies.groupby('userId').size().to_frame('count')

# Keep only users who rated at least `ratings_thres` of the remaining movies.
# The groupby index already holds each userId once, so it can be used directly.
ratings_thres = 50
active_users = df_users_cnt.index[df_users_cnt['count'] >= ratings_thres].tolist()
df_ratings_drop_users = df_ratings_drop_movies[df_ratings_drop_movies['userId'].isin(active_users)]
print('shape of original ratings data: ', df_ratings.shape)
print('shape of ratings data after dropping both unpopular movies and inactive users: ', df_ratings_drop_users.shape)


shape of original ratings data:  (100836, 3)
shape of ratings data after dropping both unpopular movies and inactive users:  (32999, 3)


In [277]:
# Movie-by-user rating grid built from the filtered ratings (unrated -> 0)
movie_user_mat = df_ratings_drop_users.pivot(
    index='movieId',
    columns='userId',
    values='rating',
).fillna(value=0)

# Map each movie title to the row position of that movie in movie_user_mat.
# NOTE(review): the dict is keyed by title, so two movieIds sharing a title
# would collapse into one entry -- confirm titles are unique in this subset.
movie_to_idx = {
    title: row
    for row, title in enumerate(
        df_movies.set_index('movieId').loc[movie_user_mat.index, 'title'].tolist()
    )
}

# Sparse (CSR) form of the grid, previewed as a dense array
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)
movie_user_mat_sparse.toarray()

array([[4. , 0. , 0. , ..., 4. , 2.5, 5. ],
       [0. , 0. , 4. , ..., 0. , 2. , 0. ],
       [4. , 0. , 5. , ..., 0. , 2. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 4.5],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 3. ]], dtype=float32)

In [278]:
from fuzzywuzzy import fuzz

#https://www.geeksforgeeks.org/fuzzywuzzy-python-library/?source=post_page-----c8dcd5fd89b2----------------------
def fuzzy_matching(mapper, fav_movie, verbose=True):
    """
    Find the title in ``mapper`` that best matches ``fav_movie``.

    Titles and the query are lower-cased and scored with ``fuzz.ratio``;
    titles scoring below 60 are discarded.

    Parameters
    ----------
    mapper: dict, maps movie title to the index of the movie in the data
    fav_movie: str, movie name supplied by the user
    verbose: bool, print the list of candidate titles if True

    Return
    ------
    index of the best-scoring title, or None (after printing a notice)
    when no title reaches the threshold
    """
    # score every known title against the user's input
    scored = (
        (title, idx, fuzz.ratio(title.lower(), fav_movie.lower()))
        for title, idx in mapper.items()
    )
    # ascending stable sort followed by a reversal puts the best score first
    ranked = sorted((hit for hit in scored if hit[2] >= 60), key=lambda hit: hit[2])
    ranked.reverse()

    if len(ranked) == 0:
        print('Oops! No match is found')
        return None

    if verbose:
        print('Found possible matches in our database: {0}\n'.format([title for title, _, _ in ranked]))

    _, best_idx, _ = ranked[0]
    return best_idx

In [339]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour search over cosine distance, returning 20
# neighbours by default. fit() hands back the estimator itself, so the
# model is created and fitted on the sparse movie-user matrix in one go.
model_knn = NearestNeighbors(
    n_neighbors=20,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
).fit(movie_user_mat_sparse)

model_knn.kneighbors

<bound method KNeighborsMixin.kneighbors of NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)>

In [353]:
def get_recommendation_knn(model_knn, data, mapper, fav_movie):
    """
    Rank every other movie in ``data`` by KNN distance to the user's movie.

    Parameters
    ----------
    model_knn: nearest-neighbours estimator exposing fit() and kneighbors()
    data: movie-by-user rating matrix, one row per movie (row-indexable,
        e.g. a scipy CSR matrix)
    mapper: dict, maps movie title to the row index of that movie in ``data``
    fav_movie: str, movie name supplied by the user; resolved to a row of
        ``data`` through fuzzy_matching

    Return
    ------
    pandas DataFrame with columns ['movieId', 'title', 'knn_distance'],
    ordered from closest to farthest. The matched movie itself is excluded.

    Raises
    ------
    ValueError if no title in ``mapper`` matches ``fav_movie``
    """
    # resolve the user's input first so an unknown title fails with a clear
    # message instead of an obscure indexing error further down
    query_idx = fuzzy_matching(mapper, fav_movie, verbose=True)
    if query_idx is None:
        raise ValueError('No movie matching {0!r} was found'.format(fav_movie))

    # fit
    model_knn.fit(data)

    # ask for every row of the matrix that was actually passed in
    n_movies = data.shape[0]
    distances, indices = model_knn.kneighbors(data[query_idx], n_neighbors=n_movies)

    # pair each neighbour row with its distance, drop the query movie by its
    # index (not by position), and order by distance; ravel() keeps this
    # working even when the matrix has a single row
    neighbours = zip(indices.ravel().tolist(), distances.ravel().tolist())
    raw_recommends = sorted(
        ((row, dist) for row, dist in neighbours if row != query_idx),
        key=lambda pair: pair[1])

    # get reverse mapper (row index -> title)
    reverse_mapper = {v: k for k, v in mapper.items()}

    # build all rows first and create the frame once; explicit columns keep
    # the merge below valid even when there are no recommendations
    records = [
        {'title': reverse_mapper[row], 'knn_distance': dist}
        for row, dist in raw_recommends
    ]
    df = pd.DataFrame(records, columns=['title', 'knn_distance'])

    # attach movieId by title from the notebook-level df_movies frame
    # NOTE(review): a title that occurs more than once in df_movies would
    # yield one output row per occurrence -- confirm titles are unique.
    df = pd.merge(df, df_movies, on='title', how='inner')

    return df[['movieId', 'title', 'knn_distance']]


In [358]:
# Title (or partial title) to recommend for
movie_check = "Toy Story"

knn_df = get_recommendation_knn(
    model_knn,
    movie_user_mat_sparse,
    movie_to_idx,
    movie_check,
)

# Closeness is the complement of the distance, so larger means more similar
knn_df['knn_closeness'] = 1.0 - knn_df['knn_distance']
knn_df.head(20)

Found possible matches in our database: ['Toy Story (1995)', 'Toy Story 3 (2010)', 'Toy Story 2 (1999)']



Unnamed: 0,movieId,title,knn_distance,knn_closeness
0,480,Jurassic Park (1993),0.279779,0.720221
1,356,Forrest Gump (1994),0.306811,0.693189
2,260,Star Wars: Episode IV - A New Hope (1977),0.325845,0.674155
3,1210,Star Wars: Episode VI - Return of the Jedi (1983),0.326706,0.673294
4,296,Pulp Fiction (1994),0.332818,0.667182
5,4306,Shrek (2001),0.333444,0.666556
6,1198,Raiders of the Lost Ark (Indiana Jones and the...,0.334862,0.665138
7,150,Apollo 13 (1995),0.337781,0.662219
8,1196,Star Wars: Episode V - The Empire Strikes Back...,0.340669,0.659331
9,1270,Back to the Future (1985),0.342331,0.657669


In [359]:
# Number of rows (movies) in the sparse movie-user matrix
movie_user_mat_sparse.shape[0]

450

In [368]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.decomposition import TruncatedSVD
import numpy as np

# Load the raw tables used by the content-based recommender
movies = pd.read_csv('../data/movies.csv')
tags = pd.read_csv('../data/tags.csv')
ratings = pd.read_csv('../data/ratings.csv')

# --- Preprocessing and filtering ---
# Turn the '|' separators in genres into spaces. regex=False makes '|' a
# literal character rather than the regex alternation operator, independent
# of the pandas version's default.
movies['genres'] = movies['genres'].str.replace('|', ' ', regex=False)

# keep only ratings from users who rated 50 or more movies, and only the
# movies those users rated
ratings_filter = ratings.groupby('userId').filter(lambda x: len(x) >= 50)
movie_list_rating = ratings_filter.movieId.unique().tolist()
movies = movies[movies.movieId.isin(movie_list_rating)]
Mapping_file = dict(zip(movies.title.tolist(), movies.movieId.tolist()))

# one row per movie with all of its tags joined into a single string;
# movies without tags get an empty string (only the tag column is filled,
# the other merged columns are not used)
mixed = pd.merge(movies, tags, on='movieId', how='left')
mixed = mixed.assign(tag=mixed['tag'].fillna(""))
mixed = pd.DataFrame(mixed.groupby('movieId')['tag'].apply(' '.join))
final_data = pd.merge(movies, mixed, on='movieId', how='left')

# text describing each movie: its tags followed by its genres
final_data['metadata'] = final_data[['tag', 'genres']].apply(' '.join, axis=1)

# TF-IDF representation of the metadata text
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(final_data['metadata'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=final_data.index.tolist())

# Number of latent dimensions to keep.
# TODO: plot the explained variance to decide how many dimensions to use.
n = 200
# fixed random_state so the decomposition (and the scores below) are
# reproducible from run to run
svd = TruncatedSVD(n_components=n, random_state=42)
latent_matrix = svd.fit_transform(tfidf_df)
latent_matrix_1_df = pd.DataFrame(latent_matrix[:, 0:n], index=final_data.title.tolist())
ratings_f1 = pd.merge(movies[['movieId']], ratings_filter, on="movieId", how="right")

# cosine similarity between the reference movie's latent vector and every movie
a_1 = np.array(latent_matrix_1_df.loc['Toy Story (1995)']).reshape(1, -1)
score_1 = cosine_similarity(latent_matrix_1_df, a_1).reshape(-1)
dictDf = {'cos_similarity': score_1}
similar = pd.DataFrame(dictDf, index=latent_matrix_1_df.index)
similar = similar.sort_values('cos_similarity', ascending=False)
similar['title'] = similar.index

# top 10 after skipping the first row (expected to be the reference movie itself)
similar.head(11)[1:]

Unnamed: 0,cos_similarity,title
"Bug's Life, A (1998)",0.893582,"Bug's Life, A (1998)"
Toy Story 2 (1999),0.770077,Toy Story 2 (1999)
Up (2009),0.566061,Up (2009)
The Lego Movie (2014),0.494096,The Lego Movie (2014)
"Monsters, Inc. (2001)",0.442301,"Monsters, Inc. (2001)"
The Good Dinosaur (2015),0.442301,The Good Dinosaur (2015)
Moana (2016),0.442301,Moana (2016)
"Wild, The (2006)",0.442301,"Wild, The (2006)"
"Emperor's New Groove, The (2000)",0.442301,"Emperor's New Groove, The (2000)"
Shrek the Third (2007),0.442301,Shrek the Third (2007)


In [369]:
# Join the KNN scores and the cosine-similarity scores by title, average
# knn_closeness and cos_similarity into a `hybrid` column, and rank the
# result from highest to lowest hybrid score.
df_hybrid = (
    knn_df
    .merge(similar, on='title', how='inner')
    .assign(hybrid=lambda scores: (scores['knn_closeness'] + scores['cos_similarity']) / 2.0)
    .sort_values(by=['hybrid'], ascending=False)
)
df_hybrid.head(10)

Unnamed: 0,movieId,title,knn_distance,knn_closeness,cos_similarity,hybrid
32,2355,"Bug's Life, A (1998)",0.411432,0.588568,0.893582,0.741075
11,3114,Toy Story 2 (1999),0.343819,0.656181,0.770077,0.713129
5,4306,Shrek (2001),0.333444,0.666556,0.405989,0.536273
23,4886,"Monsters, Inc. (2001)",0.381508,0.618492,0.442301,0.530396
197,68954,Up (2009),0.561397,0.438603,0.566061,0.502332
15,588,Aladdin (1992),0.362907,0.637093,0.239641,0.438367
120,5218,Ice Age (2002),0.517785,0.482215,0.387411,0.434813
88,2987,Who Framed Roger Rabbit? (1988),0.488453,0.511547,0.357375,0.434461
6,1198,Raiders of the Lost Ark (Indiana Jones and the...,0.334862,0.665138,0.19247,0.428804
45,2115,Indiana Jones and the Temple of Doom (1984),0.426062,0.573938,0.27601,0.424974


In [371]:
# Keep only the identifying columns and the combined score
df_hybrid_clean = df_hybrid.loc[:, ['movieId', 'title', 'hybrid']]
df_hybrid_clean.head(10)

Unnamed: 0,movieId,title,hybrid
32,2355,"Bug's Life, A (1998)",0.741075
11,3114,Toy Story 2 (1999),0.713129
5,4306,Shrek (2001),0.536273
23,4886,"Monsters, Inc. (2001)",0.530396
197,68954,Up (2009),0.502332
15,588,Aladdin (1992),0.438367
120,5218,Ice Age (2002),0.434813
88,2987,Who Framed Roger Rabbit? (1988),0.434461
6,1198,Raiders of the Lost Ark (Indiana Jones and the...,0.428804
45,2115,Indiana Jones and the Temple of Doom (1984),0.424974
