Authors: Gabriel Vaz, Luiz Henrique Mosmann, Marcelo Drumm, Márcio Góes
Dataset from: [MovieLens 100K](https://grouplens.org/datasets/movielens/100k/)

In [1]:
import numpy as np
import pandas as pd
import random as rnd

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt


In [2]:
column_data = ['user_id', 'item_id', 'rating', 'timestamp']
items = pd.read_csv('ml-100k/u.data', sep='\t', names=column_data)
items.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
column_movies = ['item_id', 'movie_title']
movies = pd.read_csv("ml-100k/u.item", sep='|', names=column_movies, encoding='ISO-8859-1', usecols=[0,1])
movies.head()

Unnamed: 0,item_id,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
df = pd.merge(items, movies, on='item_id')
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie_title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [5]:
n_users = df.user_id.nunique()
n_movies = df.item_id.nunique()

In [6]:
train, test = train_test_split(df, test_size=0.20)

In [7]:
user_test = np.zeros((1, n_movies))
random_movies = movies.sample(300)

watched_movies = []
for movie in random_movies.itertuples():
    random_rating = rnd.randint(1,5)
    user_test[0, movie[1]-1] = random_rating
    watched_movies.append(movie[1]-1)
    # print('has watched {} with rating {} line {}'.format(movie[2], random_rating, movie[1]-1))

In [8]:
n_users = n_users + 1
matrix = np.zeros((n_users, n_movies))
id_to_movie = {}

for movie_info in train.itertuples():
    id_to_movie[movie_info[2]-1] = movie_info[5]
    matrix[movie_info[1]-1, movie_info[2]-1] = movie_info[3]  

matrix[n_users-1] = user_test

In [9]:
similarity = pairwise_distances(matrix, metric='cosine')

In [10]:
users_mean = matrix.mean(axis=1)[:, np.newaxis]
normalized_matrix = (matrix - users_mean)
predictions = users_mean + similarity.dot(normalized_matrix) / np.array([np.abs(similarity).sum(axis=1)]).T

In [11]:
print('Recommended movies for our user:')
print('')
user_predictions = predictions[n_users-1]

for watched in watched_movies:
    user_predictions[watched] = 0.0

recommended_movies = np.argpartition(user_predictions, -10)[-10:]

for movie_index in recommended_movies:
  print(id_to_movie[movie_index])



Recommended movies for our user:

Pulp Fiction (1994)
Air Force One (1997)
Silence of the Lambs, The (1991)
Godfather, The (1972)
Toy Story (1995)
Fargo (1996)
Return of the Jedi (1983)
English Patient, The (1996)
Contact (1997)
Star Wars (1977)


In [12]:
n_users = n_users - 1

In [13]:
matrix_test = np.zeros((n_users, n_movies))

for movie_info in test.itertuples():
    matrix_test[movie_info[1]-1, movie_info[2]-1] = movie_info[3]  

In [14]:
prediction = predictions[matrix_test.nonzero()].flatten() 
ground_truth = matrix_test[matrix_test.nonzero()].flatten()
sqrt(mean_squared_error(prediction, ground_truth))

3.0851638230224476