In [1]:
import pandas as pd
# Load the movie catalogue; the preview in the next cell shows movieId, title and genres columns.
movies = pd.read_csv("movies.csv")

In [2]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
import re
def clean_tables(title):
  """Return ``title`` with everything except ASCII letters, digits and spaces removed.

  Punctuation such as parentheses, commas and hyphens is deleted outright (not
  replaced by a space), so "Wreck-It" becomes "WreckIt".
  """
  disallowed = "[^a-zA-Z0-9 ]"
  cleaned = re.sub(disallowed, "", title)
  return cleaned

In [4]:
# Add a punctuation-free copy of each title; this is the column the TF-IDF vectorizer is fitted on.
movies["new_title"]=movies["title"].apply(clean_tables)

In [5]:
movies

Unnamed: 0,movieId,title,genres,new_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2): build features from single words and from pairs of adjacent words.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Fit on the cleaned titles; the matrix is built from movies["new_title"], so its rows follow the rows of `movies`.
tfidf = vectorizer.fit_transform(movies["new_title"])

In [7]:
from warnings import simplefilter
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def search(title, top_n=5):
  """Return the movies whose cleaned titles best match ``title``, best match first.

  The query is cleaned with ``clean_tables``, vectorised with the fitted
  ``vectorizer`` and compared against every row of ``tfidf`` by cosine
  similarity. Row ``i`` of ``tfidf`` is taken to describe row ``i`` of
  ``movies`` because ``tfidf`` was fitted on ``movies["new_title"]``.

  Parameters
  ----------
  title : str
      Free-text title typed by the user.
  top_n : int, default 5
      Maximum number of matches to return. Values larger than the number of
      movies are clamped; values <= 0 yield an empty frame.

  Returns
  -------
  pandas.DataFrame
      Up to ``top_n`` rows of ``movies`` ordered by decreasing similarity, so
      ``.iloc[0]`` is the closest match.
  """
  title = clean_tables(title)
  query_vec = vectorizer.transform([title])
  similarity = cosine_similarity(query_vec, tfidf).flatten()

  # Never ask argpartition for more elements than exist (it would raise).
  top_n = min(top_n, len(similarity))
  if top_n <= 0:
    return movies.iloc[[]]

  # argpartition only guarantees that the top_n largest scores land in the last
  # top_n slots; their relative order is undefined. Rank that small candidate
  # set explicitly so the result really is best-first.
  candidates = np.argpartition(similarity, -top_n)[-top_n:]
  ranked = candidates[np.argsort(similarity[candidates])[::-1]]
  return movies.iloc[ranked]

In [8]:
import ipywidgets as widgets
from IPython.display import display
# Text box for the title query, pre-filled with an example value.
movie_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
# Output area that on_type clears and refills with search results.
movie_list = widgets.Output()
def on_type(data):
    """Observer callback: show title matches for the text currently in ``movie_input``.

    ``data`` is the change notification handed over by ``observe``; only its
    ``"new"`` entry (the updated text) is read. The output area is always
    cleared first; a search is run and displayed only when the text is longer
    than five characters.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        display(search(query))

# Call on_type every time the text box's `value` trait changes.
movie_input.observe(on_type, names='value')


# Render the text box followed by the results area.
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [9]:
# Example target movie for the walkthrough; the merged output further down
# lists movieId 89745 as "Avengers, The (2012)".
movie_id = 89745

# The following cells run the body of find_similar_movies(movie_id) one statement at a time.
# Catalogue row(s) whose movieId equals the target.
movie = movies[movies["movieId"] == movie_id]

In [11]:
# Load the user ratings table.
# NOTE(review): the dtypes printed in the next cell show movieId and timestamp as float64
# while userId is int64; pandas typically falls back to float when an integer column
# contains missing values — confirm ratings.csv was read completely.
ratings = pd.read_csv("ratings.csv")

In [12]:
ratings.dtypes

Unnamed: 0,0
userId,int64
movieId,float64
rating,float64
timestamp,float64


In [13]:
# Distinct users who rated the target movie above 4; these act as the "similar taste" audience.
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

In [14]:
# movieId of every above-4 rating made by those users (one entry per rating, target movie included).
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [15]:
# Count of above-4 ratings per movie divided by the number of similar users.
# NOTE: this reassigns similar_user_recs, so re-running this cell without first
# re-running the previous one would apply value_counts to the already-normalised series.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
# Keep only movies whose share among similar users exceeds 10%.
similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [16]:
# Above-4 ratings from any user for the movies that survived the 10% cut.
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [17]:
# Count of above-4 ratings per candidate movie divided by the number of distinct users in all_users.
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [18]:
# Put the two series side by side, aligned on their shared movieId index.
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
# "similar" = share among similar users, "all" = share among all users who liked any candidate.
rec_percentages.columns = ["similar", "all"]

In [19]:
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
89745.0,1.000000,0.040000
79132.0,0.666667,0.130000
58559.0,0.625000,0.131667
260.0,0.500000,0.205000
134130.0,0.500000,0.053333
...,...,...
159817.0,0.125000,0.011667
159093.0,0.125000,0.005000
7438.0,0.125000,0.046667
1213.0,0.125000,0.093333


In [20]:
# Ratio of the two shares: large values mean the movie is liked disproportionately by the similar users.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [21]:
# Highest score first.
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [22]:
# Attach title and genres to the ten best-scoring movies by joining the movieId index to movies["movieId"].
rec_percentages.head(10).merge(movies,left_index = True,right_on = "movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,new_title
17067,1.0,0.04,25.0,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012
14025,0.125,0.005,25.0,72641,"Blind Side, The (2009)",Drama,Blind Side The 2009
12961,0.125,0.005,25.0,65514,Ip Man (2008),Action|Drama|War,Ip Man 2008
17273,0.125,0.005,25.0,90647,Puss in Boots (2011),Adventure|Animation|Comedy|Fantasy|IMAX,Puss in Boots 2011
19722,0.125,0.005,25.0,102407,"Great Gatsby, The (2013)",Drama,Great Gatsby The 2013
10666,0.125,0.005,25.0,44022,Ice Age 2: The Meltdown (2006),Adventure|Animation|Children|Comedy,Ice Age 2 The Meltdown 2006
40295,0.125,0.005,25.0,159093,Now You See Me 2 (2016),Action|Comedy|Thriller,Now You See Me 2 2016
16523,0.208333,0.01,20.833333,87222,Kung Fu Panda 2 (2011),Action|Adventure|Animation|Children|Comedy|IMAX,Kung Fu Panda 2 2011
18747,0.166667,0.008333,20.0,97913,Wreck-It Ralph (2012),Animation|Comedy,WreckIt Ralph 2012
19615,0.166667,0.008333,20.0,101864,Oblivion (2013),Action|Adventure|Sci-Fi|IMAX,Oblivion 2013


In [23]:
def find_similar_movies(movie_id, min_rating=4, min_share=.10, top_n=10):
    """Recommend movies liked disproportionately by users who liked ``movie_id``.

    Works on the module-level ``ratings`` and ``movies`` frames. A rating counts
    as a "like" when it is strictly greater than ``min_rating``.

    Steps:
      1. Find the users who liked ``movie_id`` (the "similar" users).
      2. For every movie, compute the share of similar users who liked it and
         keep movies whose share exceeds ``min_share``.
      3. For those candidates, compute the same kind of share among all users
         who liked at least one candidate.
      4. Score each candidate as ``similar share / overall share`` and return
         the ``top_n`` highest scores joined with title and genres.

    Parameters
    ----------
    movie_id : int or float
        ``movieId`` of the movie to find recommendations for.
    min_rating : float, default 4
        Ratings must be strictly above this value to count as a like.
    min_share : float, default 0.10
        Minimum share of similar users (exclusive) for a movie to be a candidate.
    top_n : int, default 10
        Number of top-scoring candidates taken before joining with ``movies``.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first. Empty
        when no user liked ``movie_id``.
    """
    # Every step below only looks at liked ratings, so build that subset once
    # instead of re-evaluating the comparison over the whole table each time.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # No audience to learn from; return early rather than dividing by zero users.
        return pd.DataFrame(columns=["score", "title", "genres"])

    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Align both shares on movieId so they can be divided row by row.
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # The index holds movieId, so join it against movies["movieId"] for display columns.
    top = rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]

In [24]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the title query, pre-filled with an example value.
movie_name_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
# Output area that on_type clears and refills with recommendations.
recommendation_list = widgets.Output()

def on_type(data):
    """Observer callback: show recommendations for the title typed into ``movie_name_input``.

    ``data`` is the change notification handed over by ``observe``; only its
    ``"new"`` entry (the updated text) is read. The output area is always
    cleared first. When the text is longer than five characters, the first row
    returned by ``search`` supplies the ``movieId`` passed to
    ``find_similar_movies``, whose result is displayed.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        matches = search(query)
        first_match_id = matches.iloc[0]["movieId"]
        display(find_similar_movies(first_match_id))

# Call on_type every time the text box's `value` trait changes.
movie_name_input.observe(on_type, names='value')

# Render the text box followed by the recommendations area.
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()