## Recommending movies using Collaborative Filtering 

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import hamming

In [2]:
import warnings
# Silence every warning category for the rest of the notebook.
# NOTE(review): this also hides pandas/numpy warnings raised by the cells below.
warnings.simplefilter('ignore', category=Warning)

In [3]:
# Load the ratings and the movie metadata from CSV files in the working directory
df_ratings = pd.read_csv("recent_ratings.csv")
df_movies = pd.read_csv("recent_movies.csv")

In [4]:
# (rows, columns) of the ratings table and of the movies table
(df_ratings.shape, df_movies.shape)

((552, 4), (188, 4))

In [5]:
df_ratings["userId"].nunique(dropna=False)  # No. of unique users

63

In [6]:
df_ratings["movieId"].nunique(dropna=False)  # No. of unique movies

188

In [7]:
# Preview five random rows; the fixed seed keeps the displayed sample reproducible across runs
df_ratings.sample(5, random_state=42)

Unnamed: 0,userId,movieId,rating,timestamp
485,586,122926,5.0,1529899317
62,89,167380,4.5,1520409150
295,318,173291,2.0,1511907895
198,212,168252,4.5,1523217270
269,305,122912,4.5,1534613120


In [8]:
# Remove timestamp column.
# Reassign instead of mutating in place, and tolerate the column already being
# gone, so that re-running this cell is a no-op rather than a KeyError.
df_ratings = df_ratings.drop(columns='timestamp', errors='ignore')

In [9]:
# Preview five random rows; the fixed seed keeps the displayed sample reproducible across runs
df_ratings.sample(5, random_state=42)

Unnamed: 0,userId,movieId,rating
129,125,171763,5.0
436,514,188797,4.0
51,62,179401,3.5
173,184,183911,1.5
299,318,182715,3.5


In [10]:
# Build the user x movie rating matrix: one row per userId, one column per
# movieId, each cell holding that user's rating for that movie
ratings = df_ratings.pivot(index='userId', columns='movieId', values='rating')

In [11]:
# Preview five random users; the fixed seed keeps the displayed sample reproducible across runs
ratings.sample(5, random_state=42)

movieId,122896,122898,122906,122912,122916,122918,122926,143355,166534,167064,...,189381,189713,190183,190209,190215,191005,193581,193583,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
596,,3.5,5.0,4.0,5.0,4.0,4.0,4.5,,,...,,,,,,,,,,
433,,,,,,,,,,,...,,,,,,,,,,
515,,,,,5.0,5.0,,,,,...,,,,,,,,,,
548,,,,,,,,,,,...,,,,,,,,,,
567,,2.0,1.5,3.0,3.0,3.5,4.0,1.5,2.5,,...,,,,,,,,,,


In [12]:
ratings.loc[233].dropna()  # Ratings given by user 233 (unrated movies dropped)

movieId
122912    2.0
168266    3.5
174055    2.5
177593    5.0
178061    3.0
180031    3.5
183011    1.5
183897    3.5
187593    2.5
Name: 233, dtype: float64

### Hamming Distance
Hamming distance measures how different two equal-length sequences are: it is the <b>fraction of positions at which they disagree</b>. A value of 1 means the sequences differ at every position; a value of 0 means they are identical.

In [20]:
# l2 differs from l1 in one of three positions, l3 in two of three
l1 = (1, 2, 4)
l2 = (1, 2, 3)
l3 = (1, 4, 5)
for other in (l2, l3):
    print(hamming(l1, other))

0.3333333333333333
0.6666666666666666


In [14]:
# Find out hamming distance between ratings of two users
def hamming_distance(user1, user2):
    """Return the Hamming distance between the rating rows of two users.

    Both rows are read from the module-level ``ratings`` matrix and compared
    element by element with ``scipy.spatial.distance.hamming``.

    NOTE(review): unrated movies are NaN, and NaN never compares equal to
    NaN, so a movie presumably counts as agreement only when both users
    gave it the same rating -- confirm this is the intended metric.

    Parameters
    ----------
    user1, user2
        Row labels of ``ratings``.

    Returns
    -------
    float
        The value returned by ``hamming`` for the two rows, or ``np.nan``
        when a user is not present in ``ratings`` or the rows cannot be
        compared.
    """
    try:
        user1_ratings = ratings.loc[user1, :]
        user2_ratings = ratings.loc[user2, :]
        return hamming(user1_ratings, user2_ratings)
    except (KeyError, ValueError):
        # Unknown user id or incomparable rows: report "no distance" instead
        # of aborting the caller. Only lookup/shape errors are caught, so
        # KeyboardInterrupt and genuine bugs still surface.
        # np.nan (lowercase) is used because the np.NaN alias no longer
        # exists in NumPy 2.0.
        return np.nan

In [15]:
# Get neighbours of the given user
def get_nearest_users(active_user, k = 10):
    """Return the ids of the ``k`` users closest to ``active_user``.

    Closeness is the value of ``hamming_distance`` between ``active_user``
    and every other row label of the module-level ``ratings`` matrix.

    Parameters
    ----------
    active_user
        Row label of the user whose neighbours are wanted; it is excluded
        from the result.
    k : int, default 10
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.Series
        The ``userId`` values of up to ``k`` users, ordered by ascending
        distance. Users whose distance is NaN are sorted last.
    """
    all_users = pd.DataFrame(ratings.index)  # UserIds

    # Take an explicit copy: adding a column to a filtered slice of
    # ``all_users`` is chained assignment and raises SettingWithCopyWarning.
    other_users = all_users[all_users.userId != active_user].copy()

    other_users['distance'] = other_users['userId'].apply(
        lambda x: hamming_distance(active_user, x))

    # Lowest hamming distance first
    return other_users.sort_values(['distance'], ascending=True).userId[:k]

In [16]:
def get_recommended_movies(ratings, movies, user, top=5, k=10):
    """Recommend unseen movies to ``user`` from the ratings of similar users.

    Parameters
    ----------
    ratings : pandas.DataFrame
        User x movie rating matrix (row labels are user ids, column labels
        are movie ids, NaN where no rating exists).
    movies : pandas.DataFrame
        Movie metadata with at least ``movieId`` and ``title`` columns.
    user
        Row label of the user to recommend for; a label missing from
        ``ratings`` raises ``KeyError``.
    top : int, default 5
        Maximum number of movies to return.
    k : int, default 10
        Number of nearest neighbours whose ratings are averaged.

    Returns
    -------
    pandas.Series
        Titles of the selected movies. Rows keep the order they have in
        ``movies``; they are not sorted by average rating.

    Notes
    -----
    ``get_nearest_users`` reads the module-level ``ratings`` matrix, not
    the ``ratings`` argument passed here.
    """
    # Find out nearest neighbours based on hamming distance
    nn_users = get_nearest_users(user, k)

    # Get ratings of nearest neighbours (users)
    nn_ratings = ratings[ratings.index.isin(nn_users)]

    # Average rating per movie over the neighbours. DataFrame.mean skips NaN
    # and yields NaN for movies none of them rated (dropped right after),
    # without the "mean of empty slice" RuntimeWarning of np.nanmean.
    avg_ratings = nn_ratings.mean(axis=0, skipna=True).dropna()

    # Find out movies that user had already watched
    movies_watched = ratings.loc[user].dropna().index

    # Remove movies that user already watched
    avg_ratings = avg_ratings[~avg_ratings.index.isin(movies_watched)]

    # Find out top n movies based on avg ratings given by nearest neighbours
    top_movies_ids = avg_ratings.sort_values(ascending=False).index[:top]

    # Return recommended movies
    return movies[movies.movieId.isin(top_movies_ids)].title

In [17]:
# Top 5 recommendations for user 249
get_recommended_movies(ratings, df_movies, user=249, top=5)

22                                 The Boss Baby (2017)
56                               Tickling Giants (2017)
97     Three Billboards Outside Ebbing, Missouri (2017)
105                                 Paddington 2 (2017)
145                                 Isle of Dogs (2018)
Name: title, dtype: object

In [18]:
# Top 5 recommendations for user 433
get_recommended_movies(ratings, df_movies, user=433, top=5)

3     Avengers: Infinity War - Part I (2018)
12              The Lego Batman Movie (2017)
15             John Wick: Chapter Two (2017)
20                       The Big Sick (2017)
23               Call Me by Your Name (2017)
Name: title, dtype: object

In [19]:
# Top 10 recommendations for user 125
get_recommended_movies(ratings, df_movies, user=125, top=10)

1                                 Justice League (2017)
3                Avengers: Infinity War - Part I (2018)
6                     Untitled Spider-Man Reboot (2017)
8                                          Split (2017)
58                                    The Square (2017)
65                                 Seven Sisters (2017)
97     Three Billboards Outside Ebbing, Missouri (2017)
157                                A Quiet Place (2018)
158                                        Alpha (2018)
166                                   Deadpool 2 (2018)
Name: title, dtype: object