In [8]:
import numpy as np
import tensorflow as tf
import pandas as pd

In [22]:
# Preparing the data
# Import the data, data is stacked. This dataset contains 100000 movie ratings (movieLens datasets).
# All movieLens datasets can be found at https://grouplens.org/datasets/movielens/
ratings_df = pd.read_csv('u.data', sep='\t')
ratings_pt = ratings_df.pivot(index = 'user_id', columns ='item_id', values = 'rating').fillna(0)

# Break data into a training and test set
train_ratings_df = ratings_df[0:80000]
test_ratings_df = ratings_df[80000:100000]

user_indecies_train = [i-1 for i in train_ratings_df.user_id.values]
item_indecies_train = [i-1 for i in train_ratings_df.item_id.values]
R_known_train = train_ratings_df.rating.values

user_indecies_test = [x-1 for x in test_ratings_df.user_id.values]
item_indecies_test = [x-1 for x in test_ratings_df.item_id.values]
R_known_test = test_ratings_df.rating.values
ratings_pt

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,5.0,0.0,0.0,5.0,5.0,5.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Prepare data
R = ratings_pt
N = 943      # number of users
M = 1682     # number of items
K = 10       # number of hidden features
p = np.random.rand(N,K)
q = np.random.rand(K,M)

## Prediction: $\hat{r} = pq$

In [24]:
# Prediction
P = tf.Variable(p, dtype=tf.float32, name='P')
Q = tf.Variable(q, dtype=tf.float32, name='Q')
R_hat = tf.matmul(P, Q)

# For cost function, we want to compare only the known values
R_hat_flatten = tf.reshape(R_hat, [-1])
R_hat_train_known = tf.gather(R_hat_flatten, user_indecies_train * tf.shape(R_hat)[1] + 
              item_indecies_train, name='extracting_user_rate')

## Cost: $\left \|  \hat{r}-r\right \|$

In [25]:
# Cost function
diff_ratings = tf.subtract(R_hat_train_known, R_known_train, name='diff_ratings')
diff_ratings_abs = tf.abs(diff_ratings, name="diff_ratings_abs")
base_cost = tf.reduce_sum(diff_ratings_abs, name="sum_abs_error")



## Regularization: $\sum \lambda(\left \| p \right \|+\left \| q \right \|)$

In [26]:
# Regularization
lda = tf.constant(.001, name='lambda')
norm_sums = tf.add(tf.reduce_sum(tf.abs(P, name='user_abs'), name='user_norm'),
                   tf.reduce_sum(tf.abs(Q, name='item_abs'), name='item_norm'))
regularizer = tf.multiply(norm_sums, lda, 'regularizer')

## Total Cost: $\sum \left \|  \hat{r}-r\right \|+\lambda(\left \| p \right \|+\left \| q \right \|)$

In [27]:
# Total Cost
cost = tf.add(base_cost, regularizer)

In [28]:
# Create an optimizer to minimize the loss
optimizer = tf.train.GradientDescentOptimizer(0.001)
train = optimizer.minimize(cost)

In [30]:
# Execute the TF session
sess = tf.Session();
init = tf.global_variables_initializer()
sess.run(init)

for i in range(500):
    sess.run(train)

In [31]:
# Accuracy for known R ratings vs predicted R_hat ratings of testset
R_hat_test_known = tf.gather(R_hat_flatten, user_indecies_test * tf.shape(R_hat)[1] + item_indecies_test, name='extracting_user_rate_test')
diff_ratings_test = tf.subtract(R_hat_test_known, R_known_test, name='diff_ratings_test')
diff_ratings_abs_test = tf.abs(diff_ratings_test, name="diff_ratings_abs_test")
sum_diff_ratings_abs_test = tf.reduce_sum(diff_ratings_abs_test, name="sum_diff_ratings_abs_test")
accuracy_test = tf.div(sum_diff_ratings_abs_test, R_known_test.size, name="accuracy_test")
print(sess.run(accuracy_test))

0.82848406


In [32]:
# Visualize known R ratings, to compare with R_hat.
R.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Visualize R_hat rating predictions, to compare with known R.
pred = sess.run(R_hat)
pred_df = pd.DataFrame(pred)
pred_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,4.005579,2.836279,2.874639,3.158922,2.542316,4.29801,4.112962,4.738904,4.40008,4.249037,...,3.582376,4.343962,2.000542,3.399863,2.675956,2.306795,3.903484,1.905586,3.59809,1.798704
1,3.650688,3.904401,3.459788,2.803754,3.21429,4.298946,3.32837,4.105541,3.553438,3.646142,...,3.098334,4.320636,1.887151,4.421002,3.551914,2.25674,4.003697,2.771195,3.257403,2.434872
2,3.096481,2.367609,1.20903,2.832684,3.26721,3.871538,3.692363,3.209928,3.002473,2.799113,...,2.722111,3.734202,2.312139,2.759692,2.741664,2.086207,2.904494,2.55744,3.118649,1.927404
3,4.260223,3.741255,3.005303,3.954841,3.543999,4.940014,4.629095,4.985224,4.760236,4.733638,...,3.358366,4.760239,2.55551,3.84667,3.531389,2.623989,4.25388,2.198349,4.078449,2.247434
4,3.88076,2.943702,1.302033,2.556416,2.246641,4.838901,3.258488,4.170143,2.591933,2.484431,...,2.639563,3.723637,1.349156,2.467576,2.626587,1.370068,3.064966,1.807597,3.531862,1.66978
