## Testing Algorithm

In [2]:
import numpy as np
import pandas as pd
from scipy import sparse
from mf import *

## Encoding rating data
Here is a very small subset of fake data to get us started.

In [3]:
# Peek at the raw training file: a header line, then one rating per row
# in the form userId,movieId,rating.
# The first data row says that user 11 rated movie 1 with a score of 4.
!cat tiny_training.csv 

userId,movieId,rating
11,1,4
11,23,5
2,23,5
2,4,3
31,1,4
31,23,4
4,1,5
4,3,2
52,1,1
52,3,4
61,3,5
7,23,1
7,3,3


In [None]:
# Read the tiny ratings file and pass it straight to encode_data, which hands
# back the encoded frame together with the number of users and movies.
df, num_users, num_movies = encode_data(pd.read_csv("tiny_training.csv"))

In [4]:
# Display df to inspect the ids after encoding.
df

Unnamed: 0,userId,movieId,rating
0,0,0,4
1,0,1,5
2,1,1,5
3,1,2,3
4,2,0,4
5,2,1,4
6,3,0,5
7,3,3,2
8,4,0,1
9,4,3,4


In [5]:
# tiny_training.csv contains 7 distinct user ids.
assert num_users == 7

In [6]:
# tiny_training.csv contains 4 distinct movie ids (1, 23, 4 and 3).
assert num_movies == 4

In [7]:
# The encoded user ids are expected to be 0..6, numbered in order of first
# appearance in the file (11 -> 0, 2 -> 1, 31 -> 2, ...).
np.testing.assert_equal(
    df["userId"].values,
    np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 6, 6]),
)

In [8]:
# The encoded movie ids are expected to be 0..3, numbered in order of first
# appearance in the file (1 -> 0, 23 -> 1, 4 -> 2, 3 -> 3).
np.testing.assert_equal(
    df["movieId"].values,
    np.array([0, 1, 1, 2, 0, 1, 0, 3, 0, 3, 3, 1, 3]),
)

## Initializing parameters

In [9]:
# Example of what the prediction matrix looks like for 7 users and 5 movies
# with embeddings of size 3: user embeddings times transposed movie embeddings.
create_embedings(7, 3) @ create_embedings(5, 3).T

array([[3.55790894, 4.69774849, 0.92361109, 1.58739544, 3.00593239],
       [4.69774849, 7.44656163, 1.18135616, 2.64524868, 4.74559066],
       [0.92361109, 1.18135616, 0.24548062, 0.34025121, 0.69616965],
       [1.58739544, 2.64524868, 0.34025121, 1.61561   , 2.41361975],
       [3.00593239, 4.74559066, 0.69616965, 2.41361975, 3.82505541],
       [2.02000808, 3.29656257, 0.43174569, 2.065911  , 3.07264619],
       [2.07691001, 3.02887291, 0.53270924, 1.02482544, 1.90251125]])

## Encoding Y as a sparse matrix
This code encodes $Y$ as a sparse matrix built from the dataframe.

In [10]:
# Re-load and encode the tiny ratings file, then build the sparse
# ratings matrix Y from the encoded frame.
df, num_users, num_movies = encode_data(pd.read_csv("tiny_training.csv"))
Y = df2matrix(df, num_users, num_movies)

In [11]:
# Print Y; the output lists each stored entry as an index pair followed by its rating.
print(Y)

  (0, 0)	4
  (2, 0)	4
  (3, 0)	5
  (4, 0)	1
  (0, 1)	5
  (1, 1)	5
  (2, 1)	4
  (6, 1)	1
  (1, 2)	3
  (3, 3)	2
  (4, 3)	4
  (5, 3)	5
  (6, 3)	3


## Calculating the cost function

In [12]:
# All-ones embeddings of size 3 give a hand-checkable cost: a dot-product
# prediction is then 3 for every pair, and the mean squared error against
# the 13 tiny ratings is 29/13, i.e. about 2.23.
emb_user, emb_movie = np.ones((num_users, 3)), np.ones((num_movies, 3))
error = cost(df, emb_user, emb_movie)
assert np.around(error, decimals=2) == 2.23

## Calculating gradient

In [13]:
# Fresh embeddings of size K, the matching sparse ratings matrix, and the
# analytic gradients that the next cells compare with finite differences.
K = 3
emb_user, emb_movie = create_embedings(num_users, K), create_embedings(num_movies, K)
Y = df2matrix(df, len(emb_user), len(emb_movie))
grad_user, grad_movie = gradient(df, Y, emb_user, emb_movie)

In [14]:
# Gradient check for one user: every component of the analytic gradient
# must agree with its finite-difference estimate to within 1e-4.
user = 1
approx = np.array([
    finite_difference(df, emb_user, emb_movie, ind_u=user, k=component)
    for component in range(K)
])
assert np.all(np.abs(grad_user[user] - approx) < 0.0001)

In [15]:
# Gradient check for one movie: every component of the analytic gradient
# must agree with its finite-difference estimate to within 1e-4.
movie = 1
approx = np.array([
    finite_difference(df, emb_user, emb_movie, ind_m=movie, k=component)
    for component in range(K)
])
assert np.all(np.abs(grad_movie[movie] - approx) < 0.0001)

## Gradient descent with momentum

In [16]:
# Train size-3 embeddings on the tiny data set for 200 iterations.
emb_user, emb_movie = gradient_descent(
    df,
    create_embedings(num_users, 3),
    create_embedings(num_movies, 3),
    iterations=200,
    learning_rate=0.01,
)

0 4.75400712709481 None
50 1.9576510642217757 None
100 1.0458015647845733 None
150 0.7333410655387986 None


In [17]:
# Training cost after the 200 iterations above, checked to two decimals.
train_mse = cost(df, emb_user, emb_movie)
assert np.around(train_mse, decimals=2) == 0.55

## Predicting on new data
Now we should write a function that, given new data, is able to predict ratings. First we write a function that encodes new data. If a new user or item is present, that row should be removed, because collaborative filtering is not good at handling new users or new items. To help with this task, you could write an auxiliary function similar to `proc_col`.

In [18]:
# Load the training and validation files, then encode the validation rows
# against the training data.
df_t, df_v = pd.read_csv("tiny_training.csv"), pd.read_csv("tiny_val.csv")
df_v = encode_new_data(df_v, df_t)

In [19]:
# Exactly two distinct users should remain in the encoded validation data.
assert len(df_v["userId"].unique()) == 2

In [20]:
# Exactly two rows should remain in the encoded validation data.
assert df_v.shape[0] == 2

## Putting it all together
For this part, download the MovieLens small dataset and unzip it next to this notebook (the code below reads from `ml-latest-small/`):
`wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip`

In [22]:
# Load the MovieLens ratings and order them by timestamp (oldest first),
# so that a positional split separates earlier ratings from later ones.
path = "ml-latest-small/"
data = pd.read_csv(path + "ratings.csv").sort_values(by="timestamp")

In [23]:
# Preview the five earliest ratings after sorting by timestamp.
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
66719,429,595,5.0,828124615
66716,429,588,5.0,828124615
66717,429,590,5.0,828124615
66718,429,592,5.0,828124615
66712,429,432,3.0,828124615


In [24]:
# m is the number of rows in the first 80% of the time-ordered ratings.
m = int(0.8 * len(data))
data.shape, m

((100836, 4), 80668)

In [25]:
# Positional split: the first m (earliest) rows train, the remainder validate.
train, val = data.iloc[:m].copy(), data.iloc[m:].copy()
# Copies are passed in so the helpers never see the frames kept here.
df_train, num_users, num_movies = encode_data(train.copy())
df_val = encode_new_data(val.copy(), train.copy())
# Validation rows before and after encoding against the training data.
print(len(val), len(df_val))

20168 1311


In [26]:
# Train size-50 embeddings on the MovieLens training split for 2000
# iterations, passing the validation frame along via df_val.
K = 50
emb_user, emb_movie = gradient_descent(
    df_train,
    create_embedings(num_users, K),
    create_embedings(num_movies, K),
    iterations=2000,
    learning_rate=1,
    df_val=df_val,
)

0 12.136404414670121 12.310601296043616
50 9.613903397530699 10.353878141282681
100 6.483520773751978 8.351624227410637
150 4.519311775145557 6.969253043021952
200 3.487453177675796 6.135265489703384
250 2.8460807656602087 5.541986970150607
300 2.4112356461406046 5.063727564992296
350 2.1010335242073626 4.659691742103504
400 1.8708720440690993 4.3092037974859965
450 1.694504158855836 4.000288133654592
500 1.5556622025742277 3.725332389437499
550 1.4438504233658025 3.479028498334427
600 1.3520657516237249 3.257392917031448
650 1.2754943751352938 3.057277550962372
700 1.2107358665675338 2.8761081894092895
750 1.1553268723124004 2.71173145402287
800 1.107441123716535 2.562317043034037
850 1.0656960462401999 2.426290683415797
900 1.0290252375567204 2.302285660993972
950 0.9965924750010866 2.1891064402915155
1000 0.9677324372083855 2.085700606675237
1050 0.9419089653646692 1.9911367772431057
1100 0.9186850812325752 1.9045869210038149
1150 0.8977010487286815 1.8253120086008572
1200 0.8786580

In [28]:
# Final training and validation cost after the 2000-iteration run.
train_mse, val_mse = cost(df_train, emb_user, emb_movie), cost(df_val, emb_user, emb_movie)
print(train_mse, val_mse)

0.7113513270477632 1.140599450059145


In [29]:
# Training cost after 2000 iterations, checked to two decimals.
train_mse = cost(df_train, emb_user, emb_movie)
assert np.around(train_mse, decimals=2) == 0.71

In [30]:
# Validation cost after 2000 iterations, checked to two decimals.
val_mse = cost(df_val, emb_user, emb_movie)
assert np.around(val_mse, decimals=2) == 1.14

In [31]:
# Same setup as the long run, but stopped after only 200 iterations.
K = 50
emb_user, emb_movie = gradient_descent(
    df_train,
    create_embedings(num_users, K),
    create_embedings(num_movies, K),
    iterations=200,
    learning_rate=1,
    df_val=df_val,
)

0 12.136404414670121 12.310601296043616
50 9.613903397530699 10.353878141282681
100 6.483520773751978 8.351624227410637
150 4.519311775145557 6.969253043021952


In [32]:
# Training and validation cost after the 200-iteration run.
train_mse, val_mse = cost(df_train, emb_user, emb_movie), cost(df_val, emb_user, emb_movie)
print(train_mse, val_mse)

3.5033212064585033 6.148866866928117


In [33]:
# Training cost after 200 iterations, checked to one decimal.
train_mse = cost(df_train, emb_user, emb_movie)
assert np.around(train_mse, decimals=1) == 3.5

In [34]:
# Validation cost after 200 iterations, checked to one decimal.
val_mse = cost(df_val, emb_user, emb_movie)
assert np.around(val_mse, decimals=1) == 6.1