In [3]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import KNeighborsClassifier

from fuzzywuzzy import fuzz

In [4]:
# File Path
movies_file = "movies.csv"
ratings_file = "ratings.csv"
# Read in the Data
movies_df = pd.read_csv(movies_file)
ratings_df = pd.read_csv(ratings_file)

In [5]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 3 columns):
movieId    58098 non-null int64
title      58098 non-null object
genres     58098 non-null object
dtypes: int64(1), object(2)
memory usage: 1.3+ MB


In [7]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [8]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27753444 entries, 0 to 27753443
Data columns (total 4 columns):
userId       int64
movieId      int64
rating       float64
timestamp    int64
dtypes: float64(1), int64(3)
memory usage: 847.0 MB


In [9]:
number_users = len(ratings_df.userId.unique())
number_movies = len(ratings_df.movieId.unique())
print('There are {} users and {} movies in this dataset.'.format(number_users, number_movies) )

There are 283228 users and 53889 movies in this dataset.


In [10]:
# get rating frequency of movies
movie_ratings_count = pd.DataFrame(ratings_df.groupby('movieId').size(), columns=['count'])
movie_ratings_count.sort_values('count', ascending=False).head()                                               

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
318,97999
356,97040
296,92406
593,87899
2571,84545


In [11]:
movie_ratings_count['count'].quantile(np.arange(1, 0.0, -.1))

1.0    97999.0
0.9      531.0
0.8       91.0
0.7       28.0
0.6       12.0
0.5        7.0
0.4        4.0
0.3        2.0
0.2        2.0
0.1        1.0
Name: count, dtype: float64

In [12]:
# filter data by count of reviews
number_reviews = 75
popular_movies = list(set(movie_ratings_count.query('count >= @number_reviews').index))
filtered_movie_df = ratings_df[ratings_df.movieId.isin(popular_movies)]
filtered_movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27322425 entries, 0 to 27753443
Data columns (total 4 columns):
userId       int64
movieId      int64
rating       float64
timestamp    int64
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [13]:
# get ratings frequency of users
user_ratings_count = pd.DataFrame(filtered_movie_df.groupby('userId').size(), columns=['count'])
user_ratings_count.sort_values('count', ascending=False).head() 

Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
123100,8332
117490,6537
242683,6525
212343,5876
63783,5495


In [14]:
user_ratings_count['count'].quantile(np.arange(1, 0.0, -0.10))

1.0    8332.0
0.9     239.0
0.8     121.0
0.7      73.0
0.6      47.0
0.5      30.0
0.4      20.0
0.3      16.0
0.2      13.0
0.1       6.0
Name: count, dtype: float64

In [15]:
# filter data by count of user ratings
user_reviews = 210
popular_users = list(set(movie_ratings_count.query('count >= @user_reviews').index))
filtered_user_df = filtered_movie_df[filtered_movie_df.movieId.isin(popular_users)]
filtered_user_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26869134 entries, 0 to 27753443
Data columns (total 4 columns):
userId       int64
movieId      int64
rating       float64
timestamp    int64
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [16]:
filtered_user_df.head(15)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264
5,1,1590,2.5,1256677236
6,1,1591,1.5,1256677475
7,1,2134,4.5,1256677464
8,1,2478,4.0,1256677239
9,1,2840,3.0,1256677500


In [None]:
##Merging Dataset
#movie_ratings_df = pd.merge(filtered_user_df, movies_df, on='movieId')
#movie_ratings_df.head()

In [None]:
# Export file as a CSV
#movie_ratings_df.to_csv("movie_filtered.csv", index=False, header=True)

In [None]:
# Determining the number of unique users and movies in the filtered down database
#number_users = len(movie_ratings_df.userId.unique())
#number_movies = len(movie_ratings_df.movieId.unique())
#print('There are {} users and {} movies in this dataset.'.format(number_users, number_movies) )

In [17]:
# Pivoting the dataframe 
df_movie_pivot = filtered_user_df.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)


In [18]:
# creating movie name index
movie_title_index= {
    movie: i for i, movie in 
    enumerate(list(movies_df.set_index('movieId').loc[df_movie_pivot.index].title))
}

# convert dataframe of movie features to scipy sparse matrix
movie_matrix = csr_matrix(df_movie_pivot.values)

In [39]:
print(movie_title_index)

{'Toy Story (1995)': 0, 'Jumanji (1995)': 1, 'Grumpier Old Men (1995)': 2, 'Waiting to Exhale (1995)': 3, 'Father of the Bride Part II (1995)': 4, 'Heat (1995)': 5, 'Sabrina (1995)': 6, 'Tom and Huck (1995)': 7, 'Sudden Death (1995)': 8, 'GoldenEye (1995)': 9, 'American President, The (1995)': 10, 'Dracula: Dead and Loving It (1995)': 11, 'Balto (1995)': 12, 'Nixon (1995)': 13, 'Cutthroat Island (1995)': 14, 'Casino (1995)': 15, 'Sense and Sensibility (1995)': 16, 'Four Rooms (1995)': 17, 'Ace Ventura: When Nature Calls (1995)': 18, 'Money Train (1995)': 19, 'Get Shorty (1995)': 20, 'Copycat (1995)': 21, 'Assassins (1995)': 22, 'Powder (1995)': 23, 'Leaving Las Vegas (1995)': 24, 'Othello (1995)': 25, 'Now and Then (1995)': 26, 'Persuasion (1995)': 27, 'City of Lost Children, The (Cité des enfants perdus, La) (1995)': 28, 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)': 29, 'Dangerous Minds (1995)': 30, 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)': 31, 'Babe (1995)': 32, 'Carring

In [19]:
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(movie_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=-1, n_neighbors=20, p=2, radius=1.0)

In [26]:
def similar_name_search(mapper, fav_movie):  
    match_tuple = []
    # Get a match between user movie choice and titles in the database
    for title, idx in movie_title_index.items():
        ratio = fuzz.ratio(str(title).lower(), str(fav_movie).lower())
        if ratio >= 60:
            match_tuple.append((title, idx, ratio))
    # Sort the possible matches and see if they match up with the tuple
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        print('Oops! No match is found')
        return
    else:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in match_tuple]))
        return match_tuple[0][1]

In [27]:
#Function to make recommendation based on fav_movie or movie choice
def make_recommendation(model_knn, data, fav_movie, mapper, n_recommendations):
    # Choose a movie
    model_knn.fit(data)
    print('You have input movie:', fav_movie)
    # Searching for similar movies
    print('Recommendation system is looking to find similar movies')
    print('...............\n')
    # Idx equals the function defined by the similarnamesearch above
    idx = similar_name_search(movie_title_index, fav_movie)
    distances ,indices = model_knn.kneighbors(data[idx], n_neighbors=n_recommendations+1)
    # get list of raw idx of recommendations
    raw_recommends = \
        sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    # print recommendations
    print('Recommendations for {}:'.format(fav_movie))
    for i, (idx, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance of {2}'.format(i+1, reverse_mapper[idx], dist))

In [28]:
my_favorite= 'Toy Story'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

You have input movie: Toy Story
Recommendation system is looking to find similar movies
......

Found possible matches in our database: ['Toy Story (1995)', 'Toy Story 3 (2010)', 'Toy Story 2 (1999)']

Recommendations for Toy Story:
1: Aladdin (1992), with distance of 0.5084475091639866
2: Star Wars: Episode VI - Return of the Jedi (1983), with distance of 0.5032396256231308
3: Mission: Impossible (1996), with distance of 0.5025598594099484
4: Lion King, The (1994), with distance of 0.49880005679334827
5: Forrest Gump (1994), with distance of 0.49156524259091383
6: Jurassic Park (1993), with distance of 0.4882799760521648
7: Back to the Future (1985), with distance of 0.480166297731775
8: Toy Story 2 (1999), with distance of 0.47545005991798994
9: Independence Day (a.k.a. ID4) (1996), with distance of 0.4637977295557769
10: Star Wars: Episode IV - A New Hope (1977), with distance of 0.46156213288171466


In [30]:
my_favorite= 'Ghostbusters'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

You have input movie: Ghostbusters
Recommendation system is looking to find similar movies
......

Found possible matches in our database: ['Ghostbusters (2016)', 'Ghostbusters II (1989)']

Recommendations for Ghostbusters:
1: The Secret Life of Pets (2016), with distance of 0.7491072634949827
2: Finding Dory (2016), with distance of 0.7482338157274169
3: Spy (2015), with distance of 0.7465351978781363
4: Captain America: Civil War (2016), with distance of 0.745887251684789
5: Doctor Strange (2016), with distance of 0.7367237271770883
6: Wonder Woman (2017), with distance of 0.7363294460376415
7: Batman v Superman: Dawn of Justice (2016), with distance of 0.7357077380381718
8: X-Men: Apocalypse (2016), with distance of 0.73132733075304
9: Star Trek Beyond (2016), with distance of 0.7067950039068354
10: Suicide Squad (2016), with distance of 0.6957270769674829


In [31]:
my_favorite= 'Back to the future'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

You have input movie: Back to the future
Recommendation system is looking to find similar movies
......

Found possible matches in our database: ['Back to the Future (1985)', 'Back to the Future Part II (1989)', 'Back to the Future Part III (1990)', 'Back to the Beach (1987)']

Recommendations for Back to the future:
1: Star Wars: Episode IV - A New Hope (1977), with distance of 0.43291319537576345
2: Men in Black (a.k.a. MIB) (1997), with distance of 0.4263429026027963
3: Star Wars: Episode VI - Return of the Jedi (1983), with distance of 0.4240330225239052
4: Groundhog Day (1993), with distance of 0.4205874602899877
5: Ghostbusters (a.k.a. Ghost Busters) (1984), with distance of 0.41896789051260175
6: Back to the Future Part II (1989), with distance of 0.41519795804464876
7: Indiana Jones and the Last Crusade (1989), with distance of 0.4145116793935628
8: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981), with distance of 0.40601328299346695
9: Terminator

In [32]:
my_favorite= 'little women'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

You have input movie: little women
Recommendation system is looking to find similar movies
......

Found possible matches in our database: ['Little Women (1949)', 'Little Women (1933)', 'Little Women (1994)', 'Little Man (2006)']

Recommendations for little women:
1: I Was a Male War Bride (1949), with distance of 0.8177798279100017
2: Cheaper by the Dozen (1950), with distance of 0.8163727841670384
3: Jezebel (1938), with distance of 0.8129362758004749
4: Yearling, The (1946), with distance of 0.8052358959859217
5: Dark Victory (1939), with distance of 0.80284164243616
6: Bachelor and the Bobby-Soxer, The (1947), with distance of 0.7949466510488311
7: Jane Eyre (1944), with distance of 0.776672628323415
8: Anastasia (1956), with distance of 0.771318838079173
9: Little Women (1933), with distance of 0.7492498750817504
10: National Velvet (1944), with distance of 0.7364948251468996


In [33]:
my_favorite= 'pretty woman'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

You have input movie: pretty woman
Recommendation system is looking to find similar movies
......

Found possible matches in our database: ['Pretty Woman (1990)']

Recommendations for pretty woman:
1: Lion King, The (1994), with distance of 0.48873952741734883
2: Mask, The (1994), with distance of 0.4827780357507019
3: Forrest Gump (1994), with distance of 0.480833252462078
4: Firm, The (1993), with distance of 0.4755034090179937
5: Home Alone (1990), with distance of 0.4518774080465702
6: Four Weddings and a Funeral (1994), with distance of 0.4439169521200472
7: Speed (1994), with distance of 0.4200666349724289
8: Mrs. Doubtfire (1993), with distance of 0.37331748958206645
9: Sleepless in Seattle (1993), with distance of 0.35054111592464376
10: Ghost (1990), with distance of 0.332954985758119


In [34]:
my_favorite= 'i am sam'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

You have input movie: i am sam
Recommendation system is looking to find similar movies
......

Found possible matches in our database: ['I Am Sam (2001)']

Recommendations for i am sam:
1: Legally Blonde (2001), with distance of 0.7656445692092018
2: Something's Gotta Give (2003), with distance of 0.7644390496067265
3: Cast Away (2000), with distance of 0.7643816505998018
4: Terminal, The (2004), with distance of 0.761562855568555
5: Beautiful Mind, A (2001), with distance of 0.7593589343015823
6: Meet the Parents (2000), with distance of 0.7590736247185537
7: Shallow Hal (2001), with distance of 0.7588520809495967
8: Panic Room (2002), with distance of 0.7582108204468905
9: My Big Fat Greek Wedding (2002), with distance of 0.7489706841381081
10: Pay It Forward (2000), with distance of 0.7415963551029496


In [36]:
my_favorite= 'legally blonde'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

You have input movie: legally blonde
Recommendation system is looking to find similar movies
......

Found possible matches in our database: ['Legally Blonde (2001)']

Recommendations for legally blonde:
1: Bruce Almighty (2003), with distance of 0.6230531524979254
2: How to Lose a Guy in 10 Days (2003), with distance of 0.6207687447184992
3: Sweet Home Alabama (2002), with distance of 0.6200760465411453
4: Meet the Parents (2000), with distance of 0.6191770507937648
5: What Women Want (2000), with distance of 0.6078229544705066
6: Mean Girls (2004), with distance of 0.607479382094293
7: Princess Diaries, The (2001), with distance of 0.6067270797736706
8: My Big Fat Greek Wedding (2002), with distance of 0.5745837404585868
9: Bridget Jones's Diary (2001), with distance of 0.5606045674946092
10: Miss Congeniality (2000), with distance of 0.505610279199231


In [37]:
my_favorite= 'my big fat greek wedding'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

You have input movie: my big fat greek wedding
Recommendation system is looking to find similar movies
......

Found possible matches in our database: ['My Big Fat Greek Wedding (2002)', 'My Big Fat Greek Wedding 2 (2016)']

Recommendations for my big fat greek wedding:
1: Finding Nemo (2003), with distance of 0.618611331078136
2: Ocean's Eleven (2001), with distance of 0.6176843095852745
3: About a Boy (2002), with distance of 0.6060377653940389
4: Bend It Like Beckham (2002), with distance of 0.6055099237374983
5: Meet the Parents (2000), with distance of 0.604390658296253
6: Chicago (2002), with distance of 0.6032081660250368
7: Miss Congeniality (2000), with distance of 0.5988324128409972
8: Chocolat (2000), with distance of 0.5952574052563209
9: Legally Blonde (2001), with distance of 0.5745837404585868
10: Bridget Jones's Diary (2001), with distance of 0.5582759019053924


In [40]:
my_favorite= 'isle of dogs'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

You have input movie: isle of dogs
Recommendation system is looking to find similar movies
......

Found possible matches in our database: ['Isle of Dogs (2018)']

Recommendations for isle of dogs:
1: I, Tonya (2017), with distance of 0.7625290206949706
2: A Quiet Place (2018), with distance of 0.7584197922024709
3: Ready Player One, with distance of 0.7550404249516878
4: Avengers: Infinity War - Part I (2018), with distance of 0.7507151850506899
5: Deadpool 2 (2018), with distance of 0.7506158629304305
6: Coco (2017), with distance of 0.7216856053467562
7: Blade Runner 2049 (2017), with distance of 0.7151358740197407
8: Lady Bird (2017), with distance of 0.7096531781269286
9: The Shape of Water (2017), with distance of 0.6963712381370563
10: Three Billboards Outside Ebbing, Missouri (2017), with distance of 0.6746781503761939


In [41]:
my_favorite= 'pitch perfect'

make_recommendation(
    model_knn=model_knn,
    data=movie_matrix,
    fav_movie= my_favorite,
    mapper=movie_title_index,
    n_recommendations=10)

You have input movie: pitch perfect
Recommendation system is looking to find similar movies
......

Found possible matches in our database: ['Pitch Perfect (2012)', 'Pitch Perfect 2 (2015)', 'Picture Perfect (1997)']

Recommendations for pitch perfect:
1: The Hunger Games: Mockingjay - Part 1 (2014), with distance of 0.6739724815310233
2: Proposal, The (2009), with distance of 0.6721258673739673
3: The Hunger Games (2012), with distance of 0.6702019606191154
4: The Hunger Games: Catching Fire (2013), with distance of 0.6643230907140077
5: Mean Girls (2004), with distance of 0.6581824613454497
6: 21 Jump Street (2012), with distance of 0.6528737552776134
7: Frozen (2013), with distance of 0.650127820439771
8: Bridesmaids (2011), with distance of 0.627369485910972
9: Easy A (2010), with distance of 0.5781909324359206
10: Pitch Perfect 2 (2015), with distance of 0.5063372284694982
