This project recommends movies to a user with item-based and user-based collaborative filtering. 

In item-based filtering, an item is recommended based on the ratings the user has given to other items. 

In user-based filtering, the user is assumed to share preferences with other users who rated movies similarly. The items which those users liked are recommended to the user. 



In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Preparing User - Movie Matrix

In [2]:
import pandas as pd
# Show up to 20 columns when a dataframe is displayed.
pd.options.display.max_columns = 20


In [3]:
# The movie table and the rating table are loaded and left-joined on movieId, so that
# the ratings, the user ids and the movie titles sit in the same dataset.

movie = pd.read_csv('../input/movielense20m/movie.csv')
rating = pd.read_csv('../input/movielense20m/rating.csv')
df = pd.merge(movie, rating, how="left", on="movieId")
df.head()

In [4]:
# How many rows of the merged table belong to each movie title.

comment_counts = df["title"].value_counts().to_frame()
comment_counts.head()

In [5]:
# (number of rows, number of columns) of the merged movie/rating table.
df.shape

In [6]:
# Movies with at most 1000 rows in the merged table are treated as rare, considering the size of the dataset.
# The single count column is addressed by position: value_counts() labels it with the source
# column name ("title") before pandas 2.0 and with "count" from 2.0 on, so a fixed label
# raises KeyError on one of the two.

rare_movies = comment_counts[comment_counts.iloc[:, 0] <= 1000].index


In [7]:
# Rows that belong to a rare movie are filtered out.

is_rare = df["title"].isin(rare_movies)
common_movies = df[~is_rare]


In [8]:
# User - movie matrix: one row per userId, one column per movieId, the cells hold the ratings.

user_movie_df = pd.pivot_table(common_movies, values="rating", index=["userId"], columns=["movieId"])
user_movie_df.head()

# Detecting the movies that the user watched

In [9]:
# An arbitrary user is picked and the user's row of the pivot table is kept in its own dataframe.
# The row is selected through `user`, so changing the id on the next line is enough to analyse another user.

user = 108170
user_df = user_movie_df[user_movie_df.index == user]
user_df.head()

In [10]:
# A column that is not NaN in the user's row is a movie the user rated, i.e. watched.

watched_mask = user_df.notna().any(axis=0)
movies_watched = user_df.columns[watched_mask].tolist()


# Reaching the other users who watched the same movies as the user

In [11]:
# The pivot table is restricted to the columns of the movies the user watched; all users are kept as rows.

movies_watched_df = user_movie_df.loc[:, movies_watched]

movies_watched_df.head()

In [12]:
# For every user, count the non-NaN cells in movies_watched_df, i.e. how many of the
# user's movies that other user has rated as well.

user_movie_count = movies_watched_df.notnull().sum(axis=1).reset_index()
user_movie_count.columns = ["userId", "movie_count"]
user_movie_count.head()


# Finding the users most similar to the user

In [13]:
# Users who watched more than 60% of the user's movies are gathered as candidates for similarity.
# NOTE(review): the comparison is strict (>), so a user at exactly 60% is left out — confirm this is intended.

perc = len(movies_watched) * 60 / 100
users_same_movies = user_movie_count[user_movie_count["movie_count"] > perc]["userId"]
users_same_movies.head()
    

In [14]:
# Watching the same movies is not enough to be similar: the ratings have to agree as well,
# so the ratings of the candidate users and of the user are gathered in one table.
# The user's own row is appended only when it is missing. Appending it unconditionally
# repeats the userId label whenever the user is already among users_same_movies.

final_df = movies_watched_df[movies_watched_df.index.isin(users_same_movies)]
if user not in final_df.index:
    final_df = pd.concat([final_df, user_df[movies_watched]])
final_df.head()

In [15]:
# The correlation between the ratings of every pair of users is computed and sorted in ascending order.
# Every ordered pair (user_id_1, user_id_2) is kept. Dropping duplicated correlation *values*
# would discard unrelated pairs that merely share a coefficient and would keep an arbitrary
# orientation of each pair, so a later filter on user_id_1 could miss similar users.

# A userId that appears more than once in final_df is reduced to its first row, so that
# no pair is reported several times.
unique_users_df = final_df[~final_df.index.duplicated(keep="first")]
corr_df = unique_users_df.T.corr().unstack().sort_values()
corr_df = pd.DataFrame(corr_df, columns=["corr"])
corr_df.index.names = ['user_id_1', 'user_id_2']
corr_df = corr_df.reset_index()
corr_df.head()

In [16]:
# Only the pairs whose first member is the user are kept, and only when the correlation is
# at least 0.65, the threshold chosen for being considered similar. Highest correlation first.

is_similar = (corr_df["user_id_1"] == user) & (corr_df["corr"] >= 0.65)
top_users = (
    corr_df.loc[is_similar, ["user_id_2", "corr"]]
    .reset_index(drop=True)
    .sort_values(by='corr', ascending=False)
    .rename(columns={"user_id_2": "userId"})
)
top_users.head()


In [17]:
# The movie ids and ratings of the similar users are merged with their correlation to the user.
# `rating` is the table loaded at the start of the notebook; it is reused instead of parsing the CSV a second time.
# The highest correlation belongs to the user himself, so his own rows are removed.

top_users_ratings = top_users.merge(rating[["userId", "movieId", "rating"]], how='inner', on="userId")
top_users_ratings = top_users_ratings[top_users_ratings["userId"] != user]
top_users_ratings.head()

# The calculation of Weighted Average Recommendation Score

In [18]:
# Suggestions should come from the most similar users, but ratings differ even among them,
# so every rating is scaled by the correlation of the user who gave it.
# assign() builds a new table instead of writing a column into the filtered selection made
# above, which pandas may flag with SettingWithCopyWarning.

top_users_ratings = top_users_ratings.assign(
    weighted_rating=top_users_ratings['corr'] * top_users_ratings['rating']
)
top_users_ratings.head()


# User Based Recommendation

In [19]:
# The weighted ratings are averaged per movie id.

recommendation_df = top_users_ratings.groupby('movieId', as_index=False)['weighted_rating'].mean()
recommendation_df.head()

In [20]:
# Movies with a score above 3.5 are kept, best score first.

high_scores = recommendation_df["weighted_rating"] > 3.5
movies_to_be_recommend = recommendation_df.loc[high_scores].sort_values("weighted_rating", ascending=False)

In [21]:
# The movie titles are attached to the recommended movie ids and the 5 best-scoring titles are shown.
# `movie` is the table loaded at the start of the notebook; it is reused instead of re-reading the CSV.

movies_to_be_recommend.merge(movie[["movieId", "title"]], on="movieId")["title"].head(5)

# Item Based Recommendation

In [22]:
# The id of the movie to which the user most recently gave the top rating (5.0).

five_star_ratings = rating[(rating["userId"] == user) & (rating["rating"] == 5.0)]
movie_id = five_star_ratings.sort_values(by="timestamp", ascending=False)["movieId"].iloc[0]

In [None]:
# User - title matrix: one row per userId, one column per movie title, the cells hold the ratings.

user_moviename_df = pd.pivot_table(common_movies, values="rating", index=["userId"], columns=["title"])

In [None]:
# The function takes a movie name from the pivot table and brings the movies most correlated with it, in descending order.

def item_based_recommender(movie_name, user_moviename_df, top_n=10):
    """Return the movies whose ratings correlate most with those of *movie_name*.

    Args:
        movie_name: Column label of the reference movie in ``user_moviename_df``.
        user_moviename_df: Table holding one column of ratings per movie.
        top_n: Number of entries to return; defaults to 10.

    Returns:
        A Series of correlation coefficients indexed by movie, sorted in
        descending order and cut to ``top_n`` entries. The reference movie is
        one of the columns itself, so it takes part in the ranking.

    Raises:
        KeyError: If ``movie_name`` is not a column of ``user_moviename_df``.
    """
    # Named so that it does not hide the notebook-level `movie` table.
    movie_ratings = user_moviename_df[movie_name]
    correlations = user_moviename_df.corrwith(movie_ratings)
    return correlations.sort_values(ascending=False).head(top_n)


# Look up the title of the chosen movie id, then rank the other movies against it.
last_liked_title = movie.loc[movie["movieId"] == movie_id, "title"].values[0]
movies_from_item_based = item_based_recommender(last_liked_title, user_moviename_df)

In [None]:
# The 5 movies suggested to the user based on his taste: positions 1 to 5 of the ranking
# (position 0 is skipped).

movies_from_item_based.iloc[1:6].index