In [5]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import KNeighborsClassifier

from fuzzywuzzy import fuzz
import pickle



In [9]:
# File Path
movies_file = "movies.csv"
#ratings_file = "ratings.csv"
# Read in the Data
movies_df = pd.read_csv(movies_file)
#ratings_df = pd.read_csv(ratings_file)

In [10]:
ratings_file = "../../../Downloads/ml-latest/ratings.csv"
ratings_df = pd.read_csv(ratings_file)

In [None]:
movies_df.head()

In [None]:
movies_df.info()

In [None]:
ratings_df.head()

In [None]:
ratings_df.info()

In [11]:
number_users = len(ratings_df.userId.unique())
number_movies = len(ratings_df.movieId.unique())
print('There are {} users and {} movies in this dataset.'.format(number_users, number_movies) )

There are 283228 users and 53889 movies in this dataset.


In [49]:
# get rating frequency of movies
movie_ratings_count = pd.DataFrame(ratings_df.groupby('movieId').size(), columns=['count'])
movie_ratings_count.sort_values('count', ascending=False).head()                                               

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
318,97999
356,97040
296,92406
593,87899
2571,84545


In [50]:
movie_ratings_count['count'].quantile(np.arange(1, 0.0, -.1))

1.0    97999.0
0.9      531.0
0.8       91.0
0.7       28.0
0.6       12.0
0.5        7.0
0.4        4.0
0.3        2.0
0.2        2.0
0.1        1.0
Name: count, dtype: float64

In [51]:
# filter data by count of reviews
number_reviews = 75
popular_movies = list(set(movie_ratings_count.query('count >= @number_reviews').index))
filtered_movie_df = ratings_df[ratings_df.movieId.isin(popular_movies)]
filtered_movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27322425 entries, 0 to 27753443
Data columns (total 4 columns):
userId       int64
movieId      int64
rating       float64
timestamp    int64
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [52]:
# get ratings frequency of users
user_ratings_count = pd.DataFrame(filtered_movie_df.groupby('userId').size(), columns=['count'])
user_ratings_count.sort_values('count', ascending=False).head() 

Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
123100,8332
117490,6537
242683,6525
212343,5876
63783,5495


In [53]:
user_ratings_count['count'].quantile(np.arange(1, 0.0, -0.10))

1.0    8332.0
0.9     239.0
0.8     121.0
0.7      73.0
0.6      47.0
0.5      30.0
0.4      20.0
0.3      16.0
0.2      13.0
0.1       6.0
Name: count, dtype: float64

In [55]:
# filter data by count of user ratings
user_reviews = 210
popular_users = list(set(movie_ratings_count.query('count >= @user_reviews').index))
filtered_user_df = filtered_movie_df[filtered_movie_df.movieId.isin(popular_users)]
filtered_user_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26869134 entries, 0 to 27753443
Data columns (total 4 columns):
userId       int64
movieId      int64
rating       float64
timestamp    int64
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [56]:
filtered_user_df.head(15)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264
5,1,1590,2.5,1256677236
6,1,1591,1.5,1256677475
7,1,2134,4.5,1256677464
8,1,2478,4.0,1256677239
9,1,2840,3.0,1256677500


In [58]:
#Merging Dataset
movie_ratings_df = pd.merge(filtered_user_df, movies_df, on='movieId')
movie_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,307,3.5,1256677221,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
1,6,307,4.0,832059248,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
2,56,307,4.0,1383625728,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,71,307,5.0,1257795414,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
4,84,307,3.0,999055519,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama


In [None]:
# Export file as a CSV
#movie_ratings_df.to_csv("movie_filtered.csv", index=False, header=True)

In [59]:
# Determining the number of unique users and movies in the filtered down database
number_users = len(movie_ratings_df.userId.unique())
number_movies = len(movie_ratings_df.movieId.unique())
print('There are {} users and {} movies in this dataset.'.format(number_users, number_movies) )

There are 282996 users and 8024 movies in this dataset.


In [None]:
# Pivoting the dataframe 
df_movie_pivot = filtered_user_df.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)


In [None]:
# creating movie name index
movie_title_index= {
    movie: i for i, movie in 
    enumerate(list(movies_df.set_index('movieId').loc[df_movie_pivot.index].title))
}

# convert dataframe of movie features to scipy sparse matrix
movie_matrix = csr_matrix(df_movie_pivot.values)

In [None]:
print(movie_title_index)

In [None]:
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(movie_matrix)

In [None]:
type(model_knn)


In [10]:
from joblib import dump, load
#dump(model_knn, 'knn_model.joblib')

In [11]:
model_knn = load('knn_model.joblib') 

In [9]:

dump(movie_title_index, 'movie_title_index.joblib')

NameError: name 'movie_title_index' is not defined

In [12]:
movie_title_index = load('movie_title_index.joblib') 

In [None]:
dump(movie_matrix, 'movie_matrix.joblib')

In [13]:
movie_matrix = load('movie_matrix.joblib') 

In [14]:
def similar_name_search(mapper, fav_movie):  
    match_tuple = []
    # Get a match between user movie choice and titles in the database
    for title, idx in movie_title_index.items():
        ratio = fuzz.ratio(str(title).lower(), str(fav_movie).lower())
        if ratio >= 50:
            match_tuple.append((title, idx, ratio))
    # Sort the possible matches and see if they match up with the tuple
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        print('Oops! No match is found')
        return 
    else:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in match_tuple]))
        return match_tuple[0][1]

In [15]:
#Function to make recommendation based on fav_movie or movie choice
def make_recommendation(model_knn, data, fav_movie, mapper, n_recommendations):
    # Choose a movie
    model_knn.fit(data)
    print('You have input movie:', fav_movie)
    # Searching for similar movies
    print('Recommendation system is looking to find similar movies')
    print('...............\n')
    # Idx equals the function defined by the similarnamesearch above
    idx = similar_name_search(movie_title_index, fav_movie)
    distances ,indices = model_knn.kneighbors(data[idx], n_neighbors=n_recommendations+1)
    # a list of the index without the titles of movie
    raw_recommends = \
        sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x:x[1], reverse=True)[:0:-1] 
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    # print recommendations
    print('Recommendations for {}:'.format(fav_movie))
    for i, (idx, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance of {2}'.format(i+1, reverse_mapper[idx], dist))
   #sorted(iterable, key=None, reverse=False)    

In [16]:
my_favorite= 'Toy Story'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

You have input movie: Toy Story
Recommendation system is looking to find similar movies
...............

Found possible matches in our database: ['Toy Story (1995)', 'Toy Story 3 (2010)', 'Toy Story 2 (1999)', 'True Story (2015)', 'Love Story (1970)', 'Ghost Story (1981)', 'Toy Story of Terror (2013)', 'Toy Soldiers (1991)']

Recommendations for Toy Story:
1: Toy Story (1995), with distance of 1.0168532682541809e-12
2: Star Wars: Episode IV - A New Hope (1977), with distance of 0.46156213288171466
3: Independence Day (a.k.a. ID4) (1996), with distance of 0.4637977295557769
4: Toy Story 2 (1999), with distance of 0.47545005991798994
5: Back to the Future (1985), with distance of 0.480166297731775
6: Jurassic Park (1993), with distance of 0.4882799760521648
7: Forrest Gump (1994), with distance of 0.49156524259091383
8: Lion King, The (1994), with distance of 0.49880005679334827
9: Mission: Impossible (1996), with distance of 0.5025598594099484
10: Star Wars: Episode VI - Return of the J

In [17]:
my_favorite= 'back to the future'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

You have input movie: back to the future
Recommendation system is looking to find similar movies
...............

Found possible matches in our database: ['Back to the Future (1985)', 'Back to the Future Part II (1989)', 'Back to the Future Part III (1990)', 'Back to the Beach (1987)', 'Black Stallion, The (1979)', 'Black Hole, The (1979)', 'Bank Job, The (2008)', 'Back to School (1986)', 'Jacket, The (2005)', 'Way of the Gun, The (2000)', 'Black Cauldron, The (1985)']

Recommendations for back to the future:
1: Back to the Future (1985), with distance of 0.0
2: Star Wars: Episode V - The Empire Strikes Back (1980), with distance of 0.3918197491944426
3: Terminator, The (1984), with distance of 0.39613046281847786
4: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981), with distance of 0.40601328299346695
5: Indiana Jones and the Last Crusade (1989), with distance of 0.4145116793935628
6: Back to the Future Part II (1989), with distance of 0.41519795804464876

In [18]:
my_favorite= 'Back to the future'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

You have input movie: Back to the future
Recommendation system is looking to find similar movies
...............

Found possible matches in our database: ['Back to the Future (1985)', 'Back to the Future Part II (1989)', 'Back to the Future Part III (1990)', 'Back to the Beach (1987)', 'Black Stallion, The (1979)', 'Black Hole, The (1979)', 'Bank Job, The (2008)', 'Back to School (1986)', 'Jacket, The (2005)', 'Way of the Gun, The (2000)', 'Black Cauldron, The (1985)']

Recommendations for Back to the future:
1: Star Wars: Episode IV - A New Hope (1977), with distance of 0.43291319537576345
2: Men in Black (a.k.a. MIB) (1997), with distance of 0.4263429026027963
3: Star Wars: Episode VI - Return of the Jedi (1983), with distance of 0.4240330225239052
4: Groundhog Day (1993), with distance of 0.4205874602899877
5: Ghostbusters (a.k.a. Ghost Busters) (1984), with distance of 0.41896789051260175
6: Back to the Future Part II (1989), with distance of 0.41519795804464876
7: Indiana Jones an

In [18]:
my_favorite= 'Ghost'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

You have input movie: Ghost
Recommendation system is looking to find similar movies
...............

Found possible matches in our database: ['Ghost (1990)']

Recommendations for Ghost:
1: Ghost (1990), with distance of 0.0
2: Pretty Woman (1990), with distance of 0.332954985758119
3: Sleepless in Seattle (1993), with distance of 0.3782815055140518
4: Mrs. Doubtfire (1993), with distance of 0.3864624330916189
5: Speed (1994), with distance of 0.41046807021681153
6: Firm, The (1993), with distance of 0.4511962341521186
7: Home Alone (1990), with distance of 0.46442288469133486
8: Four Weddings and a Funeral (1994), with distance of 0.4791571411521294
9: Mask, The (1994), with distance of 0.48560786099198094
10: Fugitive, The (1993), with distance of 0.4897904200466393


In [24]:
my_favorite= 'Ghost ('

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=100)

You have input movie: Ghost (
Recommendation system is looking to find similar movies
...............

Found possible matches in our database: ['Ghost (1990)', 'Ghost Dad (1990)', '13 Ghosts (1960)', 'Hostel (2005)']

Recommendations for Ghost (:
1: Ghostbusters (a.k.a. Ghost Busters) (1984), with distance of 0.6810261411273322
2: Judge Dredd (1995), with distance of 0.680275577893255
3: Mr. Holland's Opus (1995), with distance of 0.6802102409232286
4: Nine Months (1995), with distance of 0.6794771136623183
5: Species (1995), with distance of 0.6794146669747448
6: Big (1988), with distance of 0.67854711212489
7: Terminator, The (1984), with distance of 0.6777821773317065
8: Hot Shots! Part Deux (1993), with distance of 0.6775044009429522
9: Broken Arrow (1996), with distance of 0.6734261338152843
10: Rob Roy (1995), with distance of 0.6724330161679829
11: Titanic (1997), with distance of 0.6694361874842627
12: Ace Ventura: When Nature Calls (1995), with distance of 0.6692087599192573
1

In [20]:
my_favorite= 'i am sam'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

You have input movie: i am sam
Recommendation system is looking to find similar movies
...............

Found possible matches in our database: ['I Am Sam (2001)']

Recommendations for i am sam:
1: I Am Sam (2001), with distance of 1.8207657603852567e-14
2: Pay It Forward (2000), with distance of 0.7415963551029496
3: My Big Fat Greek Wedding (2002), with distance of 0.7489706841381081
4: Panic Room (2002), with distance of 0.7582108204468905
5: Shallow Hal (2001), with distance of 0.7588520809495967
6: Meet the Parents (2000), with distance of 0.7590736247185537
7: Beautiful Mind, A (2001), with distance of 0.7593589343015823
8: Terminal, The (2004), with distance of 0.761562855568555
9: Cast Away (2000), with distance of 0.7643816505998018
10: Something's Gotta Give (2003), with distance of 0.7644390496067265


In [None]:
my_favorite= 'legally blonde'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

In [None]:
my_favorite= 'my big fat greek wedding'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

In [None]:
my_favorite= 'isle of dogs'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

In [19]:
my_favorite= 'pitch perfect'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

You have input movie: pitch perfect
Recommendation system is looking to find similar movies
...............

Found possible matches in our database: ['Pitch Perfect (2012)', 'Pitch Perfect 2 (2015)', 'Picture Perfect (1997)']

Recommendations for pitch perfect:
1: Pitch Perfect (2012), with distance of 0.0
2: Pitch Perfect 2 (2015), with distance of 0.5063372284694982
3: Easy A (2010), with distance of 0.5781909324359206
4: Bridesmaids (2011), with distance of 0.627369485910972
5: Frozen (2013), with distance of 0.650127820439771
6: 21 Jump Street (2012), with distance of 0.6528737552776134
7: Mean Girls (2004), with distance of 0.6581824613454497
8: The Hunger Games: Catching Fire (2013), with distance of 0.6643230907140077
9: The Hunger Games (2012), with distance of 0.6702019606191154
10: Proposal, The (2009), with distance of 0.6721258673739673


In [None]:
my_favorite= "ghost"
make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)