In [1]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import KNeighborsClassifier

from fuzzywuzzy import fuzz
import pickle



In [2]:
# File Path
movies_file = "movies.csv"
#ratings_file = "ratings.csv"
# Read in the Data
movies_df = pd.read_csv(movies_file)
#ratings_df = pd.read_csv(ratings_file)

In [3]:
ratings_file = "../../../Downloads/ml-latest/ratings.csv"
ratings_df = pd.read_csv(ratings_file)

In [4]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 3 columns):
movieId    58098 non-null int64
title      58098 non-null object
genres     58098 non-null object
dtypes: int64(1), object(2)
memory usage: 1.3+ MB


In [6]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [7]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27753444 entries, 0 to 27753443
Data columns (total 4 columns):
userId       int64
movieId      int64
rating       float64
timestamp    int64
dtypes: float64(1), int64(3)
memory usage: 847.0 MB


In [8]:
number_users = len(ratings_df.userId.unique())
number_movies = len(ratings_df.movieId.unique())
print('There are {} users and {} movies in this dataset.'.format(number_users, number_movies) )

There are 283228 users and 53889 movies in this dataset.


In [9]:
# get rating frequency of movies
movie_ratings_count = pd.DataFrame(ratings_df.groupby('movieId').size(), columns=['count'])
movie_ratings_count.sort_values('count', ascending=False).head()                                               

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
318,97999
356,97040
296,92406
593,87899
2571,84545


In [10]:
movie_ratings_count['count'].quantile(np.arange(1, 0.0, -.1))

1.0    97999.0
0.9      531.0
0.8       91.0
0.7       28.0
0.6       12.0
0.5        7.0
0.4        4.0
0.3        2.0
0.2        2.0
0.1        1.0
Name: count, dtype: float64

In [11]:
# filter data by count of reviews
number_reviews = 75
popular_movies = list(set(movie_ratings_count.query('count >= @number_reviews').index))
filtered_movie_df = ratings_df[ratings_df.movieId.isin(popular_movies)]
filtered_movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27322425 entries, 0 to 27753443
Data columns (total 4 columns):
userId       int64
movieId      int64
rating       float64
timestamp    int64
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [12]:
# get ratings frequency of users
user_ratings_count = pd.DataFrame(filtered_movie_df.groupby('userId').size(), columns=['count'])
user_ratings_count.sort_values('count', ascending=False).head() 

Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
123100,8332
117490,6537
242683,6525
212343,5876
63783,5495


In [13]:
user_ratings_count['count'].quantile(np.arange(1, 0.0, -0.10))

1.0    8332.0
0.9     239.0
0.8     121.0
0.7      73.0
0.6      47.0
0.5      30.0
0.4      20.0
0.3      16.0
0.2      13.0
0.1       6.0
Name: count, dtype: float64

In [14]:
# filter data by count of user ratings
user_reviews = 210
popular_users = list(set(movie_ratings_count.query('count >= @user_reviews').index))
filtered_user_df = filtered_movie_df[filtered_movie_df.movieId.isin(popular_users)]
filtered_user_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26869134 entries, 0 to 27753443
Data columns (total 4 columns):
userId       int64
movieId      int64
rating       float64
timestamp    int64
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [None]:
filtered_user_df.head(15)

In [17]:
#Merging Dataset
movie_ratings_df = pd.merge(filtered_user_df, movies_df, on='movieId')
movie_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,307,3.5,1256677221,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
1,6,307,4.0,832059248,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
2,56,307,4.0,1383625728,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,71,307,5.0,1257795414,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
4,84,307,3.0,999055519,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama


In [None]:
# Export file as a CSV
#movie_ratings_df.to_csv("movie_filtered.csv", index=False, header=True)

In [18]:
# Determining the number of unique users and movies in the filtered down database
number_users = len(movie_ratings_df.userId.unique())
number_movies = len(movie_ratings_df.movieId.unique())
print('There are {} users and {} movies in this dataset.'.format(number_users, number_movies) )

There are 282996 users and 8024 movies in this dataset.


In [None]:
# Pivoting the dataframe 
df_movie_pivot = filtered_user_df.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)


In [None]:
# creating movie name index
movie_title_index= {
    movie: i for i, movie in 
    enumerate(list(movies_df.set_index('movieId').loc[df_movie_pivot.index].title))
}

# convert dataframe of movie features to scipy sparse matrix
movie_matrix = csr_matrix(df_movie_pivot.values)

In [None]:
print(movie_title_index)

In [None]:
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(movie_matrix)

In [None]:
type(model_knn)


In [None]:
from joblib import dump, load
dump(model_knn, 'knn_model.joblib')

In [None]:
model_knn = load('knn_model.joblib') 

In [None]:

dump(movie_title_index, 'movie_title_index.joblib')

In [None]:
movie_title_index = load('movie_title_index.joblib') 

In [None]:
dump(movie_matrix, 'movie_matrix.joblib')

In [None]:
movie_matrix = load('movie_matrix.joblib') 

In [None]:
def similar_name_search(mapper, fav_movie):  
    match_tuple = []
    # Get a match between user movie choice and titles in the database
    for title, idx in movie_title_index.items():
        ratio = fuzz.ratio(str(title).lower(), str(fav_movie).lower())
        if ratio >= 60:
            match_tuple.append((title, idx, ratio))
    # Sort the possible matches and see if they match up with the tuple
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        print('Oops! No match is found')
        return 
    else:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in match_tuple]))
        return match_tuple[0][1]

In [None]:
#Function to make recommendation based on fav_movie or movie choice
def make_recommendation(model_knn, data, fav_movie, mapper, n_recommendations):
    # Choose a movie
    model_knn.fit(data)
    print('You have input movie:', fav_movie)
    # Searching for similar movies
    print('Recommendation system is looking to find similar movies')
    print('...............\n')
    # Idx equals the function defined by the similarnamesearch above
    idx = similar_name_search(movie_title_index, fav_movie)
    distances ,indices = model_knn.kneighbors(data[idx], n_neighbors=n_recommendations+1)
    # get list of raw idx of recommendations
    raw_recommends = \
        sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:-1:0]
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    # print recommendations
    print('Recommendations for {}:'.format(fav_movie))
    for i, (idx, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance of {2}'.format(i+1, reverse_mapper[idx], dist))

In [None]:
my_favorite= 'Toy Story'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

In [None]:
my_favorite= 'Ghostbust'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

In [None]:
my_favorite= 'Back to the future'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

In [None]:
my_favorite= 'little women'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

In [None]:
my_favorite= 'pretty woman'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

In [None]:
my_favorite= 'i am sam'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

In [None]:
my_favorite= 'legally blonde'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

In [None]:
my_favorite= 'my big fat greek wedding'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

In [None]:
my_favorite= 'isle of dogs'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

In [None]:
my_favorite= 'pitch perfect'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

In [None]:
my_favorite= "ghost"
make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)