In [1]:
import pandas as pd

In [2]:
import os
from google.colab import drive
# Mount Google Drive at /content/gdrive so the project files are reachable from Colab.
drive.mount('/content/gdrive')
# Make the dataset folder the working directory: the later cells read
# 'movies.csv' and 'ratings.csv' by relative path.
os.chdir('/content/gdrive/My Drive/AML Project Group 20/ml-25m')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Data Loading and Preprocessing

In [3]:
# Load the two tables; both paths are relative to the current working directory.
movie = pd.read_csv('movies.csv')
rating = pd.read_csv('ratings.csv')

# Attach every rating to its movie. The left join keeps movies without any
# rating; for those rows userId / rating / timestamp are NaN.
df = movie.merge(rating, how="left", on="movieId")

# Print column dtypes and memory usage of the merged frame.
df.info()

# Number of rows per title (a movie without ratings still contributes one row
# because of the left join).
comment_counts = pd.DataFrame(df["title"].value_counts())

# Exclude rarely rated titles (3000 rows or fewer).
# The single column is selected by position because its name depends on the
# pandas version: "title" before 2.0, "count" from 2.0 on.
rare_movies = comment_counts[comment_counts.iloc[:, 0] <= 3000].index
common_movies = df[~df["title"].isin(rare_movies)]

# User x title matrix of ratings: one row per userId, one column per title,
# NaN where the user has not rated the title.
user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")

# The user to recommend for. A random draw would look like this:
# random_user = int(pd.Series(user_movie_df.index).sample(1, random_state=45).values)
# print(random_user)
# The id is fixed here so that the rest of the notebook is reproducible.
random_user = 108170

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25003471 entries, 0 to 25003470
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   title      object 
 2   genres     object 
 3   userId     float64
 4   rating     float64
 5   timestamp  float64
dtypes: float64(3), int64(1), object(2)
memory usage: 1.3+ GB


# User-based Recommendation


In [4]:
# Work out which movies the target user has rated.

# Slice of the user x title matrix that belongs to the target user.
random_user_df = user_movie_df.loc[user_movie_df.index == random_user]

# Titles for which that slice holds at least one rating (all-NaN columns are dropped).
movies_watched_random_user = random_user_df.dropna(axis=1, how="all").columns.tolist()

# How many titles the target user has rated.
len(movies_watched_random_user)

44

In [5]:
# Find the other users who rated the same movies as the target user.

# Keep only the columns (titles) the target user has rated; rows are still all users.
movies_watched_df = user_movie_df[movies_watched_random_user]

# Per user: how many of those titles they have rated as well.
user_movie_count = movies_watched_df.notnull().sum(axis=1).reset_index()
user_movie_count.columns = ["userId", "movie_count"]

# Keep users who rated strictly more than 60 percent of the target user's titles.
perc = len(movies_watched_random_user) * 60 / 100
users_same_movies = user_movie_count.loc[user_movie_count["movie_count"] > perc, "userId"]

# Shows the row labels of the selected entries. These are positions in
# user_movie_count (it was re-indexed above), not the user ids themselves;
# the ids are the values of users_same_movies.
users_same_movies.index

Int64Index([   170,    320,    425,    430,    439,    540,    547,    647,
               756,    845,
            ...
            161963, 162034, 162158, 162258, 162283, 162374, 162381, 162471,
            162482, 162503],
           dtype='int64', length=3898)

In [6]:
# Find the users whose ratings are most similar to the target user's.

# Ratings of the overlapping users, restricted to the target user's titles.
final_df = movies_watched_df.loc[movies_watched_df.index.isin(users_same_movies)]

# Pairwise user-user correlation, flattened to one row per (user, user) pair.
# Both levels of the flattened index carry the same name, so they are renamed
# before being turned into columns.
corr_df = (
    final_df.T.corr()
    .unstack()
    .sort_values()
    .to_frame(name="corr")
    .rename_axis(index=['user_id_1', 'user_id_2'])
    .reset_index()
)

# Pairs that start at the target user and reach a correlation of at least 0.65,
# strongest first. The partner id is exposed as "userId" for the later merge.
top_users = (
    corr_df.loc[
        (corr_df["user_id_1"] == random_user) & (corr_df["corr"] >= 0.65),
        ["user_id_2", "corr"],
    ]
    .reset_index(drop=True)
    .sort_values(by='corr', ascending=False)
    .rename(columns={"user_id_2": "userId"})
)

In [7]:
# Score candidate movies with the ratings of the most similar users.

# All ratings given by the similar users, without the target user's own rows
# (top_users can contain the target user, who correlates perfectly with themself).
# weighted_rating = correlation with the target user * the rating given.
# Built as one chain with .assign so no column is written onto a filtered
# slice (which would raise SettingWithCopyWarning); merge keys are explicit.
top_users_ratings = (
    top_users
    .merge(rating[["userId", "movieId", "rating"]], how='inner', on="userId")
    .loc[lambda frame: frame["userId"] != random_user]
    .assign(weighted_rating=lambda frame: frame['corr'] * frame['rating'])
)

# Recommendation_df: mean weighted rating per movie.
# NOTE: this is a plain mean of corr * rating; it is not divided by the sum of
# the correlations, and a movie rated by a single similar user is scored on
# that one rating alone.
recommendation_df = (
    top_users_ratings
    .groupby('movieId')
    .agg({"weighted_rating": "mean"})
    .reset_index()
)

# User-based recommendation: the five best-scoring movies above 3.5, with titles.
movies_to_be_recommend = (
    recommendation_df[recommendation_df["weighted_rating"] > 3.5]
    .sort_values("weighted_rating", ascending=False)
    .merge(movie[["movieId", "title"]], on="movieId")
    .head(5)
)
print(movies_to_be_recommend)

   movieId  weighted_rating                                              title
0      501         3.944607                                       Naked (1993)
1     3823         3.944607                                  Wonderland (1999)
2    27317         3.944607                          Audition (Ôdishon) (1999)
3     7043         3.944607  Vivre sa vie: Film en douze tableaux (My Life ...
4     5951         3.944607                              Morvern Callar (2002)


In [8]:
# Selection of the user's most recent highest rated movie.
# The user is the same one the user-based section recommends for, so the id is
# taken from random_user instead of repeating the literal.
user = random_user

# "Highest rated" is the user's own maximum rating rather than a fixed 5.0, so
# a user who never gave 5.0 still yields a movie. An IndexError is raised only
# when the user has no ratings at all.
user_ratings = rating[rating["userId"] == user]
top_rated = user_ratings[user_ratings["rating"] == user_ratings["rating"].max()]

# Newest first by timestamp; the first row is the movie to use.
movie_id = top_rated.sort_values(by="timestamp", ascending=False)["movieId"].iloc[0]
print(movie_id)

4979


# Item-based recommendation

In [32]:
def item_based_recommender(movie_name, user_movie_df, top_n=10):
    """Rank titles by how closely their ratings follow those of ``movie_name``.

    Parameters
    ----------
    movie_name : hashable
        Column label of the reference movie in ``user_movie_df``.
    user_movie_df : pandas.DataFrame
        Rating matrix with one column per title.
    top_n : int, default 10
        Number of entries to return.

    Returns
    -------
    pandas.Series
        Correlation of every column with the reference column, indexed by
        column label and sorted from highest to lowest. The reference movie is
        part of the result (it correlates perfectly with itself), so callers
        that want only *other* titles have to skip it.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of ``user_movie_df``.
    """
    if movie_name not in user_movie_df.columns:
        raise KeyError(f"{movie_name!r} is not a column of user_movie_df")

    # Named so that it does not shadow the notebook-level `movie` DataFrame.
    reference_ratings = user_movie_df[movie_name]
    correlations = user_movie_df.corrwith(reference_ratings)
    return correlations.sort_values(ascending=False).head(top_n)

In [33]:
# Translate the selected movieId into its title, then rank all titles against it.
movies_from_item_based = item_based_recommender(
    movie_name=movie.loc[movie["movieId"] == movie_id, "title"].iloc[0],
    user_movie_df=user_movie_df,
)

In [34]:
# Positions 1 to 5 of the ranking. Position 0 is expected to be the query
# movie itself (it correlates perfectly with itself), so it is left out.
movies_from_item_based.iloc[1:6]

title
Life Aquatic with Steve Zissou, The (2004)    0.548138
Rushmore (1998)                               0.544014
Darjeeling Limited, The (2007)                0.499189
Grand Budapest Hotel, The (2014)              0.444453
Moonrise Kingdom (2012)                       0.419742
dtype: float64