# Importing the required libraries

In [6]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Importing and Loading the dataset

In [7]:
# Read the movie catalogue and the user ratings from CSV files in the working directory.
df = pd.read_csv('movies.csv')
rating = pd.read_csv('ratings.csv')
# Last expression of the cell: render the movies frame.
df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [9]:
#cleaning the title for good and efficient searching
import re

def clean_title(title):
    """Return `title` with every character other than ASCII letters, digits and spaces removed."""
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)
# Store the cleaned titles in a separate column; the original 'title' column is left untouched.
df['new_title'] = df['title'].apply(clean_title)

# Creating a TFIDF matrix

In [10]:
# ngram_range=(1, 2): use both single words and two-word sequences as features
vector = TfidfVectorizer(ngram_range=(1,2))
# Learn the vocabulary from the cleaned titles and build the TF-IDF matrix the search compares against
tfidf = vector.fit_transform(df['new_title'])

# Creating a search function

In [11]:
def search(title, top_n=5):
    """Return the movies whose cleaned titles are most similar to `title`.

    The query is cleaned with `clean_title`, vectorised with the fitted
    `vector`, and compared against the title matrix `tfidf` using cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text query.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows of `df`, ordered from most to least similar, so
        `.iloc[0]` is the best match.
    """
    title = clean_title(title)
    q_vec = vector.transform([title])
    simi = cosine_similarity(q_vec, tfidf).flatten()

    # Clamp k so a catalogue with fewer than `top_n` rows does not make argpartition raise.
    top_n = min(top_n, len(simi))
    if top_n <= 0:
        return df.iloc[0:0]

    # argpartition only guarantees that the last `top_n` positions hold the
    # highest scores; it does not order them. Sort that small slice explicitly.
    indi = np.argpartition(simi, -top_n)[-top_n:]
    indi = indi[np.argsort(-simi[indi], kind="stable")]
    return df.iloc[indi]

In [12]:
# Interactive title search: typing in the text box re-runs search() and shows the matches
import ipywidgets as widgets
from IPython.display import display

# Text box the user types the movie title into
input_movie = widgets.Text(value='', description="Movie title", disabled=False)

# Output area that holds the current search results
list_movie = widgets.Output()

def on_type(data):
    """Refresh the result list from the new text box value carried in `data['new']`."""
    with list_movie:
        list_movie.clear_output()
        title = data['new']
        if len(title) <= 5:
            # Queries of five characters or fewer are not searched
            return
        display(search(title))

# Register on_type for changes of the text box's 'value'
input_movie.observe(on_type, names='value')
# Render the text box followed by the result area
display(input_movie, list_movie)

Text(value='', description='Movie title')

Output()

# Ratings

In [13]:
# Render the ratings frame loaded from ratings.csv
rating

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [14]:
# Example movie to build recommendations for
movie_id = 1
# Distinct users who rated this movie above 4
simi_users = rating[(rating['movieId'] == movie_id) & (rating['rating'] > 4)]['userId'].unique()
simi_users

array([    36,     75,     86, ..., 162527, 162530, 162533], dtype=int64)

In [15]:
# Movie id of every rating above 4 given by those users (one entry per rating, so ids repeat)
simi_users_rec = rating[(rating['userId'].isin(simi_users)) & (rating['rating'] > 4)]['movieId']
simi_users_rec

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [16]:
# Share of the similar users who rated each movie above 4
# NOTE: this rebinds simi_users_rec, so the cell expects the Series of movie ids
# built from `rating` in the cell above; re-run that cell before re-running this one.
simi_users_rec = simi_users_rec.value_counts() / len(simi_users)

# Keep only the movies whose share is greater than 10%
simi_users_rec = simi_users_rec[simi_users_rec > .1]
simi_users_rec

1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: movieId, Length: 113, dtype: float64

# Finding how much all users like movies

In [17]:
# Ratings above 4, from any user, for the candidate movies kept in simi_users_rec
all_user = rating[(rating['movieId'].isin(simi_users_rec.index) & (rating['rating'] > 4))]

In [18]:
# Render the filtered ratings
all_user

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


In [19]:
# Per movie: number of ratings above 4 divided by the number of distinct users present in all_user
all_user_rec = all_user['movieId'].value_counts() / len(all_user['userId'].unique())
all_user_rec

318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: movieId, Length: 113, dtype: float64

# Creating a recommendation score

In [20]:
# Put both shares side by side as two columns
rec_percentage = pd.concat([simi_users_rec , all_user_rec], axis=1)
# 'simi': share among the similar users, 'all': share among the users in all_user
rec_percentage.columns = ['simi','all']
rec_percentage

Unnamed: 0,simi,all
1,1.000000,0.124728
318,0.445607,0.342220
260,0.403770,0.222207
356,0.370215,0.235266
296,0.367295,0.284674
...,...,...
953,0.103053,0.045792
551,0.101195,0.040918
1222,0.100876,0.066877
745,0.100345,0.037031


In [21]:
# score = simi / all; a value above 1 means the similar users like the movie
# proportionally more often than the users in all_user do
rec_percentage['score'] = rec_percentage['simi'] / rec_percentage['all']

In [22]:
# Highest score first
rec_percentage = rec_percentage.sort_values('score',ascending = False)
rec_percentage

Unnamed: 0,simi,all,score
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


In [23]:
# Attach title and genres to the ten highest-scoring movies by matching the
# index of rec_percentage against df['movieId']
rec_percentage.head(10).merge(df , left_index = True , right_on = "movieId")

Unnamed: 0,simi,all,score,movieId,title,genres,new_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


# Building a recommendation function

In [24]:
def find_simi_movies(movie_id, min_rating=4, min_share=.1, top_n=10, ratings=None, movies=None):
    """Recommend movies liked by the users who also liked `movie_id`.

    A rating counts as a "like" when it is strictly greater than `min_rating`.
    For every movie liked by more than `min_share` of the users who liked
    `movie_id`, the score is

        (share of those similar users who liked the movie)
        / (share of all users in the comparison set who liked the movie)

    where the comparison set is every user who liked at least one of the
    candidate movies.

    Parameters
    ----------
    movie_id : int
        Id of the movie to base the recommendations on.
    min_rating : float, default 4
        Ratings strictly above this value count as a like.
    min_share : float, default 0.1
        Minimum share of similar users (exclusive) for a movie to be a candidate.
    top_n : int, default 10
        Number of recommendations to return.
    ratings : pandas.DataFrame, optional
        Frame with 'userId', 'movieId' and 'rating' columns. Defaults to the
        notebook-level `rating` frame.
    movies : pandas.DataFrame, optional
        Frame with 'movieId', 'title' and 'genres' columns. Defaults to the
        notebook-level `df` frame.

    Returns
    -------
    pandas.DataFrame
        Columns 'score', 'title', 'genres', highest score first. Empty when
        nobody liked `movie_id`.
    """
    ratings = rating if ratings is None else ratings
    movies = df if movies is None else movies

    # Evaluate the "liked" filter once; it is reused by all three selections below.
    liked = ratings['rating'] > min_rating

    # Users similar to us: everyone who liked the given movie.
    simi_users = ratings[(ratings['movieId'] == movie_id) & liked]['userId'].unique()
    if len(simi_users) == 0:
        # Nothing to base a recommendation on; avoid dividing by zero below.
        return pd.DataFrame(columns=['score', 'title', 'genres'])

    # Share of similar users who liked each movie, keeping only the common ones.
    simi_users_rec = ratings[ratings['userId'].isin(simi_users) & liked]['movieId']
    simi_users_rec = simi_users_rec.value_counts() / len(simi_users)
    simi_users_rec = simi_users_rec[simi_users_rec > min_share]

    # How much users in general like those same candidate movies.
    all_user = ratings[ratings['movieId'].isin(simi_users_rec.index) & liked]
    all_user_rec = all_user['movieId'].value_counts() / len(all_user['userId'].unique())

    rec_percentage = pd.concat([simi_users_rec, all_user_rec], axis=1)
    rec_percentage.columns = ['simi', 'all']
    rec_percentage['score'] = rec_percentage['simi'] / rec_percentage['all']
    rec_percentage = rec_percentage.sort_values('score', ascending=False)

    # Attach title and genres by matching the movie-id index against movies['movieId'].
    top = rec_percentage.head(top_n)
    return top.merge(movies, left_index=True, right_on="movieId")[['score', 'title', 'genres']]

# Creating an interactive recommendation widget

In [25]:
# Text box for the title of the movie to base the recommendations on
input_movie_name = widgets.Text(value='', description="Movie title", disabled=False)

# Output area that holds the current recommendations
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh the recommendations from the new text box value carried in `data['new']`."""
    with recommendation_list:
        recommendation_list.clear_output()
        title = data['new']
        if len(title) <= 5:
            # Queries of five characters or fewer are ignored
            return
        # NOTE(review): the first row returned by search() is taken as the movie
        # the user means; that is only the best match if search() orders its
        # rows by similarity - confirm.
        first_hit = search(title).iloc[0]
        display(find_simi_movies(first_hit['movieId']))

# Register on_type for changes of the text box's 'value'
input_movie_name.observe(on_type, names='value')
display(input_movie_name, recommendation_list)

Text(value='', description='Movie title')

Output()