In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [170]:
# Movie metadata table; the 'movieId', 'title' and 'genres' columns are read
# later when printing suggestions.
movie_details = pd.read_csv(
    "/kaggle/input/movielens-20m-dataset/movie.csv",
)
movie_details.head(2)

In [2]:
# Load the full ratings file, keep only rows whose userId AND movieId are both
# below 1000 (presumably to keep the pairwise-correlation step tractable), and
# release the full frame straight away.
df = pd.read_csv("/kaggle/input/movielens-20m-dataset/rating.csv")
df_subset = df.loc[(df.userId < 1000) & (df.movieId < 1000)]
del df
df_subset.head()

In [3]:
# The rating timestamp is not used by anything below, so drop the column.
df_subset = df_subset.drop(columns=['timestamp'])

In [5]:
# Report how many distinct users and movies remain after filtering.
unique_user_count = df_subset['userId'].nunique()
unique_movie_count = df_subset['movieId'].nunique()
print("Unique users: {}".format(unique_user_count))
print("Unique movies: {}".format(unique_movie_count))

In [6]:
# Sanity check: (rows, columns) of the filtered ratings table.
df_subset.shape

In [7]:
# Per-user mean rating, used as each user's baseline ("bias").
# The string alias 'mean' is used instead of np.mean: passing NumPy callables to
# groupby aggregation is deprecated in recent pandas versions. Named aggregation
# also yields the final column name directly, so no manual rename is needed.
user_bias = (
    df_subset
    .groupby('userId', as_index=False)
    .agg(meanUserRating=('rating', 'mean'))
)
user_bias.head()

In [8]:
# Attach each user's mean rating to every rating row and compute the deviation
# of the rating from that mean.
# Previously derived columns are dropped first so that re-running this cell does
# not produce 'meanUserRating_x' / 'meanUserRating_y' from a second merge (which
# would then make the 'dev' computation fail with a KeyError).
df_subset = (
    df_subset
    .drop(columns=['meanUserRating', 'dev'], errors='ignore')
    .merge(user_bias, on='userId', how='left')
    .assign(dev=lambda ratings: ratings['rating'] - ratings['meanUserRating'])
)

In [9]:
# Preview the ratings, which now carry the 'meanUserRating' and 'dev' columns.
df_subset.head()

In [23]:
# `scipy.stats.stats` is a deprecated private namespace; import from the public one.
from scipy.stats import pearsonr

def user_user_corellation(df_subset, userId1, userId2):
    """Pearson correlation between two users over the movies both have rated.

    Parameters
    ----------
    df_subset : pandas.DataFrame
        Ratings table with at least 'userId', 'movieId' and 'rating' columns.
    userId1, userId2 :
        The two user ids to compare.

    Returns
    -------
    tuple
        ``(n_common_movies, user_correlation)``. When fewer than two movies are
        shared, the correlation is the sentinel ``-2`` (outside the valid
        [-1, 1] range). The correlation may be NaN when one user's ratings on
        the shared movies are constant; callers are expected to handle that.
    """
    columns = ['movieId', 'rating']
    # If a user rated the same movie more than once, keep the last row so the
    # join below stays one-to-one.
    user1 = df_subset.loc[df_subset.userId == userId1, columns].drop_duplicates('movieId', keep='last')
    user2 = df_subset.loc[df_subset.userId == userId2, columns].drop_duplicates('movieId', keep='last')

    # Inner-join on movieId so every row pairs both users' ratings of the SAME
    # movie. Pairing by row position (as two independently filtered lists would)
    # silently mis-pairs ratings whenever the users' rows are ordered differently.
    common = user1.merge(user2, on='movieId', suffixes=('_1', '_2'))
    n_common_movies = len(common)

    if n_common_movies < 2:
        # Pearson correlation is undefined for fewer than two paired points.
        return (n_common_movies, -2)

    user_correlation = pearsonr(common['rating_1'].to_numpy(),
                                common['rating_2'].to_numpy())[0]
    return (n_common_movies, user_correlation)

In [24]:
# Spot check: (number of co-rated movies, Pearson correlation) for users 10 and 251.
user_user_corellation(df_subset, 10, 251)

In [25]:
# Build the symmetric user-user correlation matrix. Row/column k corresponds to
# all_user_ids[k]; the diagonal is 1 and only the upper triangle is computed,
# then mirrored into the lower triangle.
all_user_ids = list(df_subset['userId'].unique())
n_user_ids = len(all_user_ids)

correlation_matrix = np.eye(n_user_ids)

for i, first_user in enumerate(all_user_ids):
    for j in range(i + 1, n_user_ids):
        second_user = all_user_ids[j]
        user_correlation = user_user_corellation(df_subset, first_user, second_user)[1]
        correlation_matrix[i, j] = correlation_matrix[j, i] = user_correlation
    # Lightweight progress indicator: one line every 25 users.
    if i % 25 == 0:
        print("Current user: {}".format(i))

In [45]:
# Work on a copy in which NaN correlations are replaced by -2, the same
# sentinel user_user_corellation returns for pairs with too few common movies.
correlation_matrix_ = np.nan_to_num(correlation_matrix, nan=-2)

In [133]:
# Two-way lookup between a row/column position in the correlation matrix and
# the userId stored at that position in all_user_ids.
index_to_user_mapping = dict(enumerate(all_user_ids))
user_to_index_mapping = {user: position for position, user in enumerate(all_user_ids)}

In [154]:
def prediction(user_id, movie_ids, limit_nearest_neigh = 10,
               ratings = None, correlations = None, user_ids = None):
    """Predict ratings of `movie_ids` for `user_id` from the most correlated users.

    For each movie the prediction is the user's mean rating plus the
    correlation-weighted average of the neighbours' deviations ('dev') for that
    movie. Only movies with a positive weight sum and a positive weighted
    deviation (i.e. predicted above the user's own mean) are returned; results
    are clipped to the [0.5, 5] rating range.

    Parameters
    ----------
    user_id :
        User to predict for; must be present in `user_ids` (KeyError otherwise).
    movie_ids : iterable
        Movie ids to score.
    limit_nearest_neigh : int
        Maximum number of neighbours to use.
    ratings : pandas.DataFrame, optional
        Table with 'userId', 'movieId', 'dev' and 'meanUserRating' columns.
        Defaults to the notebook-level `df_subset`.
    correlations : 2-D array, optional
        User-user correlation matrix where -2 marks "no usable correlation".
        Defaults to the notebook-level `correlation_matrix_`.
    user_ids : sequence, optional
        User id for each row/column of `correlations`. Defaults to the
        notebook-level `all_user_ids`.

    Returns
    -------
    dict
        ``{movie_id: predicted_rating}`` for the movies that could be scored.
    """
    if ratings is None:
        ratings = df_subset
    if correlations is None:
        correlations = correlation_matrix_
    if user_ids is None:
        user_ids = all_user_ids
    user_ids = list(user_ids)

    position_of = {user: position for position, user in enumerate(user_ids)}
    i = position_of[user_id]

    row = np.asarray(correlations[i], dtype=float)
    ranked = np.argsort(row)[::-1]
    # Skip the user themselves (diagonal == 1 would always make them their own
    # top neighbour) and pairs carrying the -2 "no correlation" sentinel, which
    # is a marker rather than a similarity and must never be used as a weight.
    neighbours = [int(k) for k in ranked if k != i and row[k] > -2][:limit_nearest_neigh]
    neighbour_ids = [user_ids[k] for k in neighbours]

    # Build the (user, movie) -> dev lookup once instead of scanning the whole
    # ratings frame twice per neighbour per movie.
    neighbour_rows = ratings[ratings['userId'].isin(neighbour_ids)]
    dev_of = dict(zip(zip(neighbour_rows['userId'], neighbour_rows['movieId']),
                      neighbour_rows['dev']))

    user_mean = float(ratings.loc[ratings['userId'] == user_id, 'meanUserRating'].iloc[0])

    predictions = {}
    for movie_id in movie_ids:
        num = 0.0
        den = 0.0
        for k, neighbour_id in zip(neighbours, neighbour_ids):
            dev = dev_of.get((neighbour_id, movie_id))
            if dev is None:
                continue  # this neighbour has not rated the movie
            num += row[k] * dev
            den += row[k]

        if den > 0 and num > 0:
            predicted_rating = user_mean + (num / den)
            # Clamp to the valid rating scale.
            predictions[movie_id] = float(min(5.0, max(0.5, predicted_rating)))
    return predictions

In [175]:
def _print_movie_genres(movie_ids):
    """Print one tab-separated 'movieId, genres' line per id, using movie_details."""
    wanted = movie_details[movie_details['movieId'].isin(movie_ids)].drop_duplicates('movieId')
    genres_of = dict(zip(wanted['movieId'], wanted['genres']))
    for movie_id in movie_ids:
        # Ids missing from movie_details are reported as "unknown" rather than
        # aborting the whole listing.
        print("{}\t{}".format(movie_id, str(genres_of.get(movie_id, "unknown"))))


def suggest_movies(user_id, max_movies = 10, limit_nearest_neigh = 10):
    """Print movie suggestions for a user, followed by their own top-rated movies.

    Parameters
    ----------
    user_id :
        User to produce suggestions for.
    max_movies : int
        Maximum number of rows printed in each of the two listings.
    limit_nearest_neigh : int
        Number of neighbours passed on to `prediction`.

    The first listing contains the movies the user has not rated, ordered by
    predicted rating (highest first). The second contains the movies the user
    has rated, ordered by their own rating (highest first). Nothing is returned.
    """
    limit = max(max_movies, 0)

    user_rows = df_subset[df_subset.userId == user_id]
    movies_watched = list(user_rows['movieId'])
    watched = set(movies_watched)  # set membership instead of scanning a list
    movies_not_watched = [m for m in df_subset['movieId'].unique() if m not in watched]

    predictions = prediction(user_id, movies_not_watched, limit_nearest_neigh)
    best_predicted = sorted(predictions.items(), key=lambda item: item[1], reverse=True)[:limit]

    print("Suggestions:")
    print("Movie ID\tGenres")
    _print_movie_genres([movie_id for movie_id, _ in best_predicted])

    own_ratings = dict(zip(movies_watched, user_rows['rating']))
    best_rated = sorted(own_ratings.items(), key=lambda item: item[1], reverse=True)[:limit]

    print("Previously watched movies:")
    print("Movie ID\tGenres")
    _print_movie_genres([movie_id for movie_id, _ in best_rated])
    
# Demo: print suggestions and previously watched movies for user 11 (default limits of 10).
suggest_movies(11)

In [130]:
def compute_mse(limit_nearest_neigh = 10, verbose=False,
                ratings=None, correlations=None, user_ids=None):
    """Mean squared error of neighbour-based predictions on the known ratings.

    For every (user, movie) rating, the rating is predicted as the user's mean
    plus the correlation-weighted average of the neighbours' deviations for
    that movie, and compared with the true rating. Each (user, movie) pair is
    scored at most once, after all neighbours have been accumulated, and only
    when the weight sum is positive.

    Parameters
    ----------
    limit_nearest_neigh : int
        Maximum number of neighbours per user.
    verbose : bool
        Print a progress line every 25 users.
    ratings : pandas.DataFrame, optional
        Table with 'userId', 'movieId', 'rating', 'dev' and 'meanUserRating'
        columns. Defaults to the notebook-level `df_subset`.
    correlations : 2-D array, optional
        User-user correlation matrix where -2 marks "no usable correlation".
        Defaults to the notebook-level `correlation_matrix_`.
    user_ids : sequence, optional
        User id for each row/column of `correlations`. Defaults to the
        notebook-level `all_user_ids`.

    Returns
    -------
    float
        The mean squared error, or NaN when no rating could be predicted.
    """
    if ratings is None:
        ratings = df_subset
    if correlations is None:
        correlations = correlation_matrix_
    if user_ids is None:
        user_ids = all_user_ids
    user_ids = list(user_ids)

    # One pass over the table: user -> {movie: (rating, dev)} and user -> mean,
    # so the nested loops below never rescan the DataFrame.
    rated_by = {}
    for user, movie, rating, dev in zip(ratings['userId'], ratings['movieId'],
                                        ratings['rating'], ratings['dev']):
        rated_by.setdefault(user, {})[movie] = (rating, dev)
    mean_of = dict(zip(ratings['userId'], ratings['meanUserRating']))

    squared_error = 0.0
    counter = 0

    for i, current_user_id in enumerate(user_ids):

        if verbose and i % 25 == 0:
            print("Current userId: {}".format(i))

        own = rated_by.get(current_user_id)
        if not own:
            continue

        row = np.asarray(correlations[i], dtype=float)
        ranked = np.argsort(row)[::-1]
        # Exclude the user themselves: the movies scored here are ones the user
        # rated, so a self-neighbour would leak the target into the prediction.
        # Also exclude pairs carrying the -2 "no correlation" sentinel.
        neighbours = [int(k) for k in ranked if k != i and row[k] > -2][:limit_nearest_neigh]
        user_mean = mean_of[current_user_id]

        for movie_id, (true_rating, _own_dev) in own.items():
            num = 0.0
            den = 0.0
            for k in neighbours:
                entry = rated_by.get(user_ids[k], {}).get(movie_id)
                if entry is None:
                    continue  # this neighbour has not rated the movie
                num += row[k] * entry[1]
                den += row[k]

            # Score once per movie, after every neighbour has contributed.
            if den > 0:
                predicted_rating = user_mean + (num / den)
                squared_error += (true_rating - predicted_rating) ** 2
                counter += 1

    if counter == 0:
        # Nothing could be predicted; avoid a ZeroDivisionError.
        return float('nan')
    return squared_error / counter

In [131]:
import math
# Root-mean-squared error with 10 neighbours, printing progress while it runs.
mse = compute_mse(limit_nearest_neigh=10, verbose=True)
rmse = math.sqrt(mse)

In [132]:
# Report the root-mean-squared error stored in `rmse`.
print("RMSE: {}".format(rmse))