In [2]:
import pandas as pd

# Load the movie metadata table; the path is relative to the notebook's working directory.
movies = pd.read_csv("ml-25m/movies.csv")

In [3]:
# Preview the movies DataFrame (movieId, title, genres).
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [4]:
#Clean the movie titles with a regular expression (regex)
import re
import unicodedata
def clean_title(title):
    """Return ``title`` reduced to ASCII letters, digits and spaces.

    Parameters
    ----------
    title : str
        Raw movie title, e.g. ``"Toy Story (1995)"``.

    Returns
    -------
    str
        The title with punctuation and other symbols removed, e.g.
        ``"Toy Story 1995"``. Accented letters are folded to their base
        letter (``"Amélie"`` -> ``"Amelie"``) instead of being dropped.
    """
    # Split accented characters into base letter + combining mark so the
    # base letter survives the ASCII filter below; without this step
    # "Amélie" would become "Amlie" and never match a query typed as "Amelie".
    decomposed = unicodedata.normalize("NFKD", title)
    # Remove everything that is not an ASCII letter, a digit or a space.
    return re.sub("[^a-zA-Z0-9 ]", "", decomposed)

In [5]:
# Run every raw title through clean_title and keep the result in a new
# "clean_title" column next to the original title.
movies["clean_title"] = movies["title"].map(clean_title)

In [10]:
# Display the frame again to check the newly added clean_title column.
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [6]:
#getting the Term Frequency and Inverse document frequency matrix
#using scikit-learn to get TFIDF matrix

from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2) indexes single words and adjacent word pairs, so a
# two-word phrase in a title can match as a unit instead of word by word.
vectorizer=TfidfVectorizer(ngram_range=(1,2))
# Fit on the cleaned titles and keep the resulting TF-IDF matrix; search()
# compares each query against it.
tfidf= vectorizer.fit_transform(movies["clean_title"])


In [32]:
#creating a search function
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose cleaned titles best match ``title``, best first.

    Parameters
    ----------
    title : str
        Free-text query. It is passed through ``clean_title`` before being
        vectorized, the same cleaning applied to the indexed titles.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered by descending cosine
        similarity between the query and each title's TF-IDF vector.
    """
    title = clean_title(title)

    # Project the query into the TF-IDF space fitted on the movie titles.
    query_vec = vectorizer.transform([title])
    # One similarity score per row of the TF-IDF matrix.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition raises if asked for more elements than exist.
    top_n = min(top_n, len(similarity))
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the top_n highest scores end up in
    # the last top_n slots; their relative order is unspecified. Sort that
    # small slice explicitly so row 0 really is the closest match.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
    return movies.iloc[ranked]



In [34]:
#build jupyter notebook interactive widget to type and see the search results
import ipywidgets as widgets
from IPython.display import display

# Text box the user types a movie title into; starts out empty.
movie_input = widgets.Text(
    description="Movie Title:",
    value="",
    disabled=False,
)

# Output area that on_type renders the search results into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the search results whenever the text box content changes.

    ``data`` is the change payload handed over by ``observe``; its ``"new"``
    entry is read as the current text. Queries of five characters or fewer
    only clear the output and are not searched.
    """
    with movie_list:
        # Drop whatever the previous keystroke rendered.
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        display(search(query))

# Call on_type for changes to the text box's "value" trait.
movie_input.observe(on_type, names='value')

# Render the text box together with the output area the results go into.
display(movie_input,movie_list)

Text(value='', description='Movie Title:')

Output()

In [35]:
# Load the ratings table; the path is relative to the notebook's working directory.
ratings = pd.read_csv("ml-25m/ratings.csv")

In [36]:
# Preview the ratings table (userId, movieId, rating, timestamp).
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [37]:
# Check the column dtypes of the ratings table.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [41]:
# Find the users who liked the chosen movie.
movie_id = 1
# A "like" is a rating strictly above 4, the same cutoff the later filtering
# cells use. With a looser ">= 4" here, the chosen movie's own share among
# the similar users could not come out as 1.0.
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
similar_users

array([    36,     75,     86, ..., 162518, 162519, 162530], dtype=int64)

In [51]:
# For the users who liked the same movie as us, collect the id of every
# movie they rated above 4 stars (one entry per such rating).
similar_users_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & ratings["rating"].gt(4),
    "movieId",
]

In [52]:
# Inspect the movie ids collected from the similar users' high ratings.
similar_users_recs

5101           1
5105          34
5111         110
5114         150
5127         260
            ... 
24998388    3706
24998389    3735
24998391    3763
24998392    4187
24998393    4321
Name: movieId, Length: 912084, dtype: int64

In [53]:
# Turn the raw like counts into the share of similar users who liked each
# movie, then keep only the movies liked by more than 10% of them.
similar_users_recs = (
    similar_users_recs.value_counts()
    .div(len(similar_users))
    .loc[lambda share: share > .1]
)

In [54]:
# Inspect the per-movie share of similar users, after the 10% cut.
similar_users_recs

1       1.000000
318     0.414556
260     0.404561
356     0.347253
296     0.342663
          ...   
1259    0.102991
7361    0.101881
1206    0.101362
1307    0.101066
1208    0.100918
Name: movieId, Length: 92, dtype: float64

In [56]:
# Baseline for comparison: every rating above 4, from any user, on the
# shortlisted movies. This lets us separate movies tied to the queried
# movie from movies that are simply liked by everyone.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_users_recs.index) & ratings["rating"].gt(4)
]

In [57]:
# Share of the baseline users who liked each shortlisted movie.
baseline_user_count = all_users["userId"].nunique()
all_users_recs = all_users["movieId"].value_counts().div(baseline_user_count)

In [58]:
# Inspect the per-movie share among the baseline users.
all_users_recs

318      0.345282
296      0.287220
2571     0.246217
356      0.237370
593      0.227930
           ...   
1387     0.047886
1307     0.046195
745      0.037362
78499    0.035445
2355     0.025316
Name: movieId, Length: 92, dtype: float64

In [59]:
# Put the two shares side by side, aligned on movie id, using pandas concat;
# the keys become the column names.
rec_percentages = pd.concat(
    [similar_users_recs, all_users_recs],
    axis=1,
    keys=["similar", "all"],
)


In [64]:
# Inspect the "similar" and "all" shares side by side.
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.125844
32,0.157856,0.101190
34,0.158226,0.052696
47,0.203613,0.145762
50,0.262180,0.202306
...,...,...
58559,0.160743,0.147779
60069,0.137272,0.076990
68954,0.136310,0.065525
78499,0.138161,0.035445


In [80]:
# Score = share among similar users relative to share among all users;
# values above 1 mean the movie is liked more often by the similar users.
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])

In [82]:
# Order the candidates from highest to lowest score.
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)

In [83]:
# Inspect the candidates, now ordered by score.
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.125844,7.946323
3114,0.295498,0.054186,5.453383
2355,0.124685,0.025316,4.925186
78499,0.138161,0.035445,3.897906
588,0.233674,0.068117,3.430480
...,...,...,...
58559,0.160743,0.147779,1.087725
79132,0.129424,0.132559,0.976349
7361,0.101881,0.105172,0.968704
2959,0.205020,0.218656,0.937638


In [84]:
# Keep the ten best-scoring movie ids and join them with the movies table
# (index of the scores against the movieId column) to get titles and genres.
pd.merge(rec_percentages.head(10), movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.125844,7.946323,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.295498,0.054186,5.453383,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.124685,0.025316,4.925186,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.138161,0.035445,3.897906,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
580,0.233674,0.068117,3.43048,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
587,0.198949,0.060514,3.287671,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
33,0.158226,0.052696,3.002602,34,Babe (1995),Children|Drama,Babe 1995
4780,0.210647,0.071444,2.94841,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
1047,0.143418,0.049202,2.914882,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
729,0.108322,0.037362,2.899227,745,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy,Wallace Gromit A Close Shave 1995


In [93]:
#build the recommendation function

def find_similar_movies(movie_id, rating_threshold=4, min_share=.1, top_n=10):
    """Recommend movies that fans of ``movie_id`` like unusually often.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    rating_threshold : float, default 4
        A rating strictly above this value counts as a "like". The same
        cutoff is used for picking the similar users and for counting
        their likes.
    min_share : float, default 0.1
        A movie is only considered if more than this fraction of the
        similar users liked it.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows with the columns ``score``, ``title`` and
        ``genres``, ordered by descending score. ``score`` is the share of
        similar users who liked a movie divided by the share of all users
        who liked it.
    """
    # Filter once so every step below applies the identical definition of a
    # "like". Mixing ">= 4" with "> 4" would admit similar users whose own
    # rating of the query movie is then excluded from the like counts.
    liked = ratings[ratings["rating"] > rating_threshold]

    # Users who liked the query movie.
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()

    # Share of those users who liked each other movie, cut at min_share.
    similar_users_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_users_recs = similar_users_recs.value_counts() / len(similar_users)
    similar_users_recs = similar_users_recs[similar_users_recs > min_share]

    # Baseline: share of everyone who liked any shortlisted movie.
    all_users = liked[liked["movieId"].isin(similar_users_recs.index)]
    all_users_recs = all_users["movieId"].value_counts() / all_users["userId"].nunique()

    rec_percentages = pd.concat([similar_users_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A high score means the movie is liked more often by the similar users
    # than by users in general.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    top = rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]

In [94]:
# Text box for the title to base recommendations on, pre-filled with "Avengers".
movie_name_input = widgets.Text(
    description="Movie Title:",
    value="Avengers",
    disabled=False,
)

# Output area that on_type renders the recommendations into.
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh the recommendations whenever the text box content changes.

    ``data`` is the change payload handed over by ``observe``; its ``"new"``
    entry is read as the current text. Queries of five characters or fewer
    only clear the output. Otherwise the first row returned by ``search``
    supplies the movie id passed to ``find_similar_movies``.
    """
    with recommendation_list:
        # Clear the previous output whenever the user types.
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        matches = search(query)
        first_match_id = matches.iloc[0]["movieId"]
        display(find_similar_movies(first_match_id))
            
# Call on_type for changes to the text box's "value" trait, then render the
# text box together with the output area the recommendations go into.
movie_name_input.observe(on_type, names="value")
display(movie_name_input, recommendation_list)
            



Text(value='Avengers', description='Movie Title:')

Output()