In [1]:
# !wget 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
# !unzip 'ml-1m.zip'

--2018-12-27 13:56:55--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.34.235
Connecting to files.grouplens.org (files.grouplens.org)|128.101.34.235|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: 'ml-1m.zip'


2018-12-27 13:57:05 (613 KB/s) - 'ml-1m.zip' saved [5917549/5917549]

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [2]:
import pandas as pd
import numpy as np

In [77]:
ratings = pd.read_csv('ml-1m/ratings.dat', 
                    sep='::', 
                    engine='python', 
                    encoding='latin-1',
                    names=['userId', 'movieId', 'rating', 'timestamp']).drop(['timestamp'], axis=1)
movies = pd.read_csv('ml-1m/movies.dat', 
                    sep='::', 
                    engine='python', 
                    encoding='latin-1',
                    names=['movieId', 'title', 'genres'])

ratings.shape, movies.shape

((1000209, 3), (3883, 3))

In [4]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating
0,1,1193,5
1,1,661,3


In [5]:
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy


In [6]:
num_movies = ratings['movieId'].unique().shape[0]
num_users = ratings['userId'].unique().shape[0]
print('Number of movies: ',num_movies)
print('Number of users: ',num_users)

Number of movies:  3706
Number of users:  6040


In [7]:
# users have at least rated 20 movies
ratings.groupby(['userId']).size().reset_index().sort_values(0)[:5]


Unnamed: 0,userId,0
946,947,20
4067,4068,20
2529,2530,20
340,341,20
5257,5258,20


In [9]:
def train_test_split(ratings, seed = 0):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    np.random.seed(seed)
    for user in range(ratings.shape[0]):
        test_ratings = np.random.choice(ratings[user, :].nonzero()[0], 
                                        size=5, 
                                        replace=False)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]

    return train, test

In [10]:
sparse_ratings = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0).values

train, test = train_test_split(sparse_ratings)

train.shape, test.shape

((6040, 3706), (6040, 3706))

In [66]:
def predict(ratings):
    global_mean = np.true_divide(ratings.sum(), (ratings!=0).sum())
    # user bias
    mean_user_rating = (np.true_divide(ratings.sum(axis=1), (ratings!=0).sum(axis=1)) - global_mean).reshape(-1,1)
    # movie bias
    mean_movie_rating = (np.true_divide(ratings.sum(axis=0), (ratings!=0).sum(axis=0)) - global_mean).reshape(1,-1)

    pred = np.zeros(ratings.shape) + global_mean + mean_user_rating + mean_movie_rating
    return pred

In [67]:
pred = predict(sparse_ratings)
pred.shape

(6040, 3706)

In [68]:
from sklearn.metrics import mean_squared_error

def get_rmse(pred, actual):
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return np.sqrt(mean_squared_error(pred, actual))

In [69]:
print('Baseline Prediction RMSE: ' + str(get_rmse(pred, test)))

Baseline Prediction RMSE: 1.4769495786486977


In [70]:
def recommend_movies(userId, ratings, predictions, n=5):
    userId = userId-1
    to_watch_movies = np.where(ratings[userId] == 0)[0]
    
    pred_sort = to_watch_movies[predictions[userId, to_watch_movies].argsort()[:-n-1:-1]]
    pred_values = predictions[userId][pred_sort]
    
    pred_movies = [movies.loc[x+1]['title'] for x in pred_sort]
    
    return pd.DataFrame({'movies': pred_movies, 'predictions': pred_values})


In [84]:
recommend_movies(1, train, pred, 10)

Unnamed: 0,movies,predictions
0,Deconstructing Harry (1997),5.607115
1,Carmen (1984),5.607115
2,"Reluctant Debutante, The (1958)",5.607115
3,Bird on a Wire (1990),5.607115
4,Cool as Ice (1991),5.607115
5,"Skulls, The (2000)",5.607115
6,Agnes Browne (1999),5.607115
7,"Goodbye, 20th Century (Zbogum na dvadesetiot v...",5.607115
8,Rough Night in Jericho (1967),5.607115
9,Kim (1950),5.607115


In [64]:
np.sort(pred, axis=1)

array([[1.60711479, 1.60711479, 1.60711479, ..., 5.60711479, 5.60711479,
        5.60711479],
       [1.13161384, 1.13161384, 1.13161384, ..., 5.13161384, 5.13161384,
        5.13161384],
       [1.32039633, 1.32039633, 1.32039633, ..., 5.32039633, 5.32039633,
        5.32039633],
       ...,
       [1.21843555, 1.21843555, 1.21843555, ..., 5.21843555, 5.21843555,
        5.21843555],
       [1.29648433, 1.29648433, 1.29648433, ..., 5.29648433, 5.29648433,
        5.29648433],
       [0.99614816, 0.99614816, 0.99614816, ..., 4.99614816, 4.99614816,
        4.99614816]])

In [65]:
pred

array([[4.75396121, 3.80825602, 3.62385119, ..., 4.27378146, 4.50711479,
        4.38804263],
       [4.27846025, 3.33275507, 3.14835024, ..., 3.79828051, 4.03161384,
        3.91254168],
       [4.46724274, 3.52153756, 3.33713273, ..., 3.987063  , 4.22039633,
        4.10132417],
       ...,
       [4.36528196, 3.41957677, 3.23517195, ..., 3.88510221, 4.11843555,
        3.99936338],
       [4.44333074, 3.49762555, 3.31322073, ..., 3.96315099, 4.19648433,
        4.07741216],
       [4.14299457, 3.19728938, 3.01288456, ..., 3.66281482, 3.89614816,
        3.77707599]])

In [85]:
movies[movies['title'] == 'Carmen (1984)']

Unnamed: 0,movieId,title,genres
3153,3222,Carmen (1984),Drama


In [86]:
ratings[ratings['movieId'] == 3222].describe()

Unnamed: 0,userId,movieId,rating
count,30.0,30.0,30.0
mean,2472.933333,3222.0,4.166667
std,1501.635291,0.0,0.791478
min,238.0,3222.0,2.0
25%,1333.5,3222.0,4.0
50%,1983.0,3222.0,4.0
75%,3651.75,3222.0,5.0
max,5185.0,3222.0,5.0
