This project aims to recommend movies to the user by item based and user based collaborative filtering. 

In the item based filtering, the item is recommended based on the voting the user had done for other items. 

In the user based filtering, the user have the same preferences and same likes with the other users. The item which other users liked is recommended to the user. 



In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movielense20m/rating.csv
/kaggle/input/movielense20m/movie.csv


# Preparing User - Movie Matrix

In [2]:
pd.set_option('display.max_columns', 20)


In [3]:
# Two datasets are merged so that both ratings, user ids and movie names take place in the same dataset.

movie = pd.read_csv('../input/movielense20m/movie.csv')
rating = pd.read_csv('../input/movielense20m/rating.csv')
df = movie.merge(rating, how="left", on="movieId")
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41


In [4]:
# The number of comments for each movie. 

comment_counts = pd.DataFrame(df["title"].value_counts())
comment_counts.head()

Unnamed: 0,title
Pulp Fiction (1994),67310
Forrest Gump (1994),66172
"Shawshank Redemption, The (1994)",63366
"Silence of the Lambs, The (1991)",63299
Jurassic Park (1993),59715


In [5]:
df.shape

(20000797, 6)

In [6]:
# Movies which have comments less than 1000 are rare, considering the size of the dataset.

rare_movies = comment_counts[comment_counts["title"] <= 1000].index


In [7]:
# The rare movies are left out. 

common_movies = df[~df["title"].isin(rare_movies)]


In [8]:
# A pivot table where user id is the index, movie id is the column and ratings are the values is constructed.

user_movie_df = common_movies.pivot_table(index = ["userId"], columns = ["title"], values = "rating")
user_movie_df.head()

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zulu (1964),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,


# Detecting the movies that the user watched

In [9]:
# An arbitrary user is picked. The user's id is chosen from the pivot table and assigned to a new dataframe.

user = 108170
user_df = user_movie_df[user_movie_df.index == 108170]
user_df.head()

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zulu (1964),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
108170.0,,,,,,,,,,,...,,,,,,,,,,


In [10]:
# NaN values are left out. So the remaining indexes are the movies that the user watched.

movies_watched = user_df.columns[user_df.notna().any()].tolist()


# Reaching the other users who watched the same movies as the user

In [11]:
# A new dataframe is constructed with the same columns as the pivot table filtering watched movies of the user.

movies_watched_df = user_movie_df[movies_watched]

movies_watched_df.head()

title,2001: A Space Odyssey (1968),"Adventures of Priscilla, Queen of the Desert, The (1994)",Akira (1988),Aladdin (1992),Aliens (1986),Almost Famous (2000),Along Came Polly (2004),Amadeus (1984),American History X (1998),Any Given Sunday (1999),...,"Untouchables, The (1987)","Usual Suspects, The (1995)",Wallace & Gromit: A Close Shave (1995),Wallace & Gromit: The Best of Aardman Animation (1996),Wallace & Gromit: The Wrong Trousers (1993),"Wedding Singer, The (1998)",Who Framed Roger Rabbit? (1988),Wild at Heart (1990),Willow (1988),X2: X-Men United (2003)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,3.5,,,,4.0,,,,,,...,3.5,3.5,,,,,,,4.0,4.0
2.0,5.0,,,,,,,,,4.0,...,,,,,,,,,,
3.0,5.0,,,,4.0,,,3.0,4.0,,...,,5.0,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,5.0,,,,,,,...,,,,5.0,,,,,,


In [12]:
user_movie_count = movies_watched_df.T.notnull().sum()

In [13]:
user_movie_count

userId
1.0         53
2.0         11
3.0         47
4.0          5
5.0         16
            ..
138489.0    11
138490.0    10
138491.0     3
138492.0    20
138493.0    57
Length: 138493, dtype: int64

In [14]:
user_movie_count = user_movie_count.reset_index()

In [15]:
user_movie_count 

Unnamed: 0,userId,0
0,1.0,53
1,2.0,11
2,3.0,47
3,4.0,5
4,5.0,16
...,...,...
138488,138489.0,11
138489,138490.0,10
138490,138491.0,3
138491,138492.0,20


In [16]:
# The number of movies each user watched in the movies_watched dataframe, hence the number of movies each user watched commonly with the user.

user_movie_count = movies_watched_df.T.notnull().sum()
user_movie_count = user_movie_count.reset_index()
user_movie_count.columns = ["userId", "movie_count"]
user_movie_count.head()


Unnamed: 0,userId,movie_count
0,1.0,53
1,2.0,11
2,3.0,47
3,4.0,5
4,5.0,16


# Finding the most similar users with the user

In [17]:
# Watching 60% or over movies with the user is a step to be considered as similar. These similar tasted users are gathered. 
    
perc = len(movies_watched) * 60 / 100
users_same_movies = user_movie_count[user_movie_count["movie_count"] > perc]["userId"]
users_same_movies.head()
    

90      91.0
115    116.0
155    156.0
293    294.0
297    298.0
Name: userId, dtype: float64

In [18]:
movies_watched_df[movies_watched_df.index.isin(users_same_movies)].head()

title,2001: A Space Odyssey (1968),"Adventures of Priscilla, Queen of the Desert, The (1994)",Akira (1988),Aladdin (1992),Aliens (1986),Almost Famous (2000),Along Came Polly (2004),Amadeus (1984),American History X (1998),Any Given Sunday (1999),...,"Untouchables, The (1987)","Usual Suspects, The (1995)",Wallace & Gromit: A Close Shave (1995),Wallace & Gromit: The Best of Aardman Animation (1996),Wallace & Gromit: The Wrong Trousers (1993),"Wedding Singer, The (1998)",Who Framed Roger Rabbit? (1988),Wild at Heart (1990),Willow (1988),X2: X-Men United (2003)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
91.0,2.5,4.0,2.5,,4.0,3.0,,3.5,3.5,,...,4.0,3.5,4.0,4.0,4.0,3.5,4.0,,3.5,3.5
116.0,,,,3.0,,3.0,,,4.5,3.5,...,,4.5,,,3.5,2.0,3.0,,1.0,4.5
156.0,,3.0,,,5.0,5.0,,4.0,,,...,5.0,5.0,,,,4.0,5.0,,,
294.0,4.5,4.0,4.5,4.0,4.5,3.5,,2.5,2.0,,...,,,,,,1.0,5.0,,4.5,3.5
298.0,,,,3.0,5.0,5.0,,,4.0,5.0,...,3.0,5.0,,,,4.0,3.0,,4.0,


In [19]:
user_df[movies_watched]

title,2001: A Space Odyssey (1968),"Adventures of Priscilla, Queen of the Desert, The (1994)",Akira (1988),Aladdin (1992),Aliens (1986),Almost Famous (2000),Along Came Polly (2004),Amadeus (1984),American History X (1998),Any Given Sunday (1999),...,"Untouchables, The (1987)","Usual Suspects, The (1995)",Wallace & Gromit: A Close Shave (1995),Wallace & Gromit: The Best of Aardman Animation (1996),Wallace & Gromit: The Wrong Trousers (1993),"Wedding Singer, The (1998)",Who Framed Roger Rabbit? (1988),Wild at Heart (1990),Willow (1988),X2: X-Men United (2003)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
108170.0,5.0,3.5,5.0,3.0,5.0,4.5,1.0,3.0,4.5,0.5,...,4.0,5.0,4.0,4.5,5.0,4.5,3.5,5.0,4.5,3.5


In [20]:
# Watching the same movies is not enough to be similar. They should both give same ratings for the movies. 

final_df = pd.concat([movies_watched_df[movies_watched_df.index.isin(users_same_movies)],
                      user_df[movies_watched]])
final_df.head()

title,2001: A Space Odyssey (1968),"Adventures of Priscilla, Queen of the Desert, The (1994)",Akira (1988),Aladdin (1992),Aliens (1986),Almost Famous (2000),Along Came Polly (2004),Amadeus (1984),American History X (1998),Any Given Sunday (1999),...,"Untouchables, The (1987)","Usual Suspects, The (1995)",Wallace & Gromit: A Close Shave (1995),Wallace & Gromit: The Best of Aardman Animation (1996),Wallace & Gromit: The Wrong Trousers (1993),"Wedding Singer, The (1998)",Who Framed Roger Rabbit? (1988),Wild at Heart (1990),Willow (1988),X2: X-Men United (2003)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
91.0,2.5,4.0,2.5,,4.0,3.0,,3.5,3.5,,...,4.0,3.5,4.0,4.0,4.0,3.5,4.0,,3.5,3.5
116.0,,,,3.0,,3.0,,,4.5,3.5,...,,4.5,,,3.5,2.0,3.0,,1.0,4.5
156.0,,3.0,,,5.0,5.0,,4.0,,,...,5.0,5.0,,,,4.0,5.0,,,
294.0,4.5,4.0,4.5,4.0,4.5,3.5,,2.5,2.0,,...,,,,,,1.0,5.0,,4.5,3.5
298.0,,,,3.0,5.0,5.0,,,4.0,5.0,...,3.0,5.0,,,,4.0,3.0,,4.0,


In [21]:
final_df.T.corr().head()

userId,91.0,116.0,156.0,294.0,298.0,359.0,367.0,388.0,586.0,614.0,...,137686.0,137839.0,137854.0,137885.0,138134.0,138208.0,138254.0,138325.0,138411.0,108170.0
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
91.0,1.0,0.454166,0.322393,0.338482,0.152716,0.245899,0.505629,0.438347,0.188644,0.572365,...,0.437469,0.311582,0.52988,0.419165,0.267141,0.548877,0.199193,0.37153,0.521917,0.328344
116.0,0.454166,1.0,0.460489,0.173071,0.347831,0.18601,0.539608,0.639885,0.594176,0.445155,...,0.540097,0.476253,0.444682,0.517867,0.419395,0.420043,0.568327,0.48151,0.477922,0.497016
156.0,0.322393,0.460489,1.0,0.219779,0.339833,0.201932,0.499251,0.416804,0.369159,0.317214,...,0.477,0.363992,0.411427,0.34784,0.041885,0.34649,0.43903,0.342354,0.441789,0.514391
294.0,0.338482,0.173071,0.219779,1.0,0.276074,-0.046278,0.422012,0.239812,0.224724,0.151781,...,0.28339,0.21088,0.233735,0.282503,0.137793,0.212778,0.286854,0.322522,0.307772,0.237754
298.0,0.152716,0.347831,0.339833,0.276074,1.0,0.02942,0.40085,0.17061,0.330291,-0.050431,...,0.337318,0.129187,0.204721,0.322179,0.043679,0.215007,0.333704,0.03616,0.160045,0.214872


In [22]:
corr_df = final_df.T.corr().stack().sort_values().drop_duplicates()

In [23]:
corr_df.head()

userId    userId  
72838.0   110130.0   -0.581570
100618.0  33581.0    -0.481252
89242.0   126121.0   -0.476220
72838.0   7232.0     -0.472418
20011.0   104973.0   -0.458733
dtype: float64

In [24]:
corr_df = pd.DataFrame(corr_df, columns=["corr"])

In [25]:
corr_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,corr
userId,userId,Unnamed: 2_level_1
72838.0,110130.0,-0.58157
100618.0,33581.0,-0.481252
89242.0,126121.0,-0.47622
72838.0,7232.0,-0.472418
20011.0,104973.0,-0.458733


In [26]:
corr_df.index.names = ['user_id_1', 'user_id_2']
corr_df = corr_df.reset_index()

In [27]:
# The correlation between each user ratings are specified and sorted in descending order. 
corr_df.head()

Unnamed: 0,user_id_1,user_id_2,corr
0,72838.0,110130.0,-0.58157
1,100618.0,33581.0,-0.481252
2,89242.0,126121.0,-0.47622
3,72838.0,7232.0,-0.472418
4,20011.0,104973.0,-0.458733


In [28]:
# One of the users in the correlation table should be the first user. To be considered as similar, the correlation threshold value is taken 0.65.

top_users = corr_df[(corr_df["user_id_1"] == user) & (corr_df["corr"] >= 0.65)][["user_id_2", "corr"]].reset_index(drop=True)
top_users = top_users.sort_values(by='corr', ascending=False)

In [29]:
top_users.rename(columns={"user_id_2": "userId"}, inplace=True)
top_users.head()

Unnamed: 0,userId,corr
3,108170.0,1.0
2,5155.0,0.716406
1,121747.0,0.673295
0,121747.0,0.673295


In [30]:
# The movie id and the rating data are merged with the correlation data. The highest correlation belongs to the user himself, so it is removed.

rating = pd.read_csv('../input/movielense20m/rating.csv')
top_users_ratings = top_users.merge(rating[["userId", "movieId", "rating"]], how='inner')

In [31]:
top_users_ratings = top_users_ratings[top_users_ratings["userId"] != user]
top_users_ratings.head()

Unnamed: 0,userId,corr,movieId,rating
194,5155.0,0.716406,1,3.5
195,5155.0,0.716406,2,3.0
196,5155.0,0.716406,5,3.0
197,5155.0,0.716406,9,3.0
198,5155.0,0.716406,10,4.0


# The calculation of Weighted Average Recommendation Score

In [32]:
# It is wanted to have the suggestion from the most similar ones, however rating may differ among the most similar ones. So a scale is 
# arranged with correlation and rating.

top_users_ratings['weighted_rating'] = top_users_ratings['corr'] * top_users_ratings['rating']
top_users_ratings.head()


Unnamed: 0,userId,corr,movieId,rating,weighted_rating
194,5155.0,0.716406,1,3.5,2.507423
195,5155.0,0.716406,2,3.0,2.149219
196,5155.0,0.716406,5,3.0,2.149219
197,5155.0,0.716406,9,3.0,2.149219
198,5155.0,0.716406,10,4.0,2.865626


# User Based Recommendation

In [33]:
# The weighted average scores are sorted according to movie id.

recommendation_df = top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"})
recommendation_df = recommendation_df.reset_index()
recommendation_df.head()

Unnamed: 0,movieId,weighted_rating
0,1,2.40683
1,2,1.614134
2,5,2.149219
3,6,2.693181
4,9,2.149219


In [34]:
# The movies having score greater than 3.5 are selected.

movies_to_be_recommend = recommendation_df[recommendation_df["weighted_rating"] > 3.5].sort_values("weighted_rating", ascending=False)

In [35]:
# The movie names are needed so the datasets are merged. So here is the list of the movies to recommend for the specified user. 

movie = pd.read_csv('../input/movielense20m/movie.csv')
movies_to_be_recommend.merge(movie[["movieId", "title"]])["title"]

0                        Natural Born Killers (1994)
1     Mystery Science Theater 3000: The Movie (1996)
2             One Flew Over the Cuckoo's Nest (1975)
3                         Princess Bride, The (1987)
4          Butch Cassidy and the Sundance Kid (1969)
5                                  Birds, The (1963)
6              Fear and Loathing in Las Vegas (1998)
7                               Exorcist, The (1973)
8                          Christmas Story, A (1983)
9                                Natural, The (1984)
10             Jay and Silent Bob Strike Back (2001)
11                             Super Troopers (2001)
12      Anchorman: The Legend of Ron Burgundy (2004)
Name: title, dtype: object

# Item Based Recommendation

In [36]:
# The movie id which the user last watched and rated high

movie_id = rating[(rating["userId"] == user) & (rating["rating"] == 5.0)].sort_values(by = "timestamp", ascending = False)["movieId"][0:1].values[0]

In [37]:
# A pivot table where user id is the index, movie name is the column and ratings are the values is constructed.

user_moviename_df = common_movies.pivot_table(index = ["userId"], columns = ["title"], values = "rating")

In [38]:
# The function gets the movie name from the pivot table, and brigns the most correlated movies with it in descending order.

def item_based_recommender(movie_name, user_moviename_df):
    movie = user_moviename_df[movie_name]
    return user_moviename_df.corrwith(movie).sort_values(ascending=False).head(10)


movies_from_item_based = item_based_recommender(movie[movie["movieId"] == movie_id]["title"].values[0], user_moviename_df)

In [39]:
# These are the 5 movies to suggest to the user based on his taste.

movies_from_item_based[1:6].index

Index(['My Science Project (1985)', 'Mediterraneo (1991)',
       'Old Man and the Sea, The (1958)',
       'National Lampoon's Senior Trip (1995)', 'Clockwatchers (1997)'],
      dtype='object', name='title')