In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [5]:
# Preparing the data
# Load the ratings in long ("stacked") form: each row carries a user_id,
# an item_id and the rating that user gave that item.
movie_ratings_raw = pd.read_csv('movie_smallset.csv')

# Shift the ids down by one so they can serve as zero-based row / column
# positions in the rating matrix.
# NOTE(review): presumes user_id / item_id start at 1 and are contiguous — confirm against the CSV.
user_indices = list(movie_ratings_raw.user_id.values - 1)
item_indices = list(movie_ratings_raw.item_id.values - 1)

# Observed ratings, in the same row order as the two index lists above.
R_known = movie_ratings_raw.rating.values

In [7]:
# Pivot the long-form ratings into a user x item matrix so it is easier to read.
# (user, item) pairs without a rating are filled with 0.
movie_ratings_pt = movie_ratings_raw.pivot(index='user_id', columns='item_id', values='rating').fillna(0)

# Human-readable labels for the columns (items) and rows (users).
movie_titles = ['Toy Story', 'Terminator', 'Terminator 2', 'Lion King', 'Despicable Me', 'Despicable Me 2',
                'Die Hard', 'Die Hard 2', 'Toy Story 2', 'Die Hard 3']
user_names = ['Justin', 'Mike', 'Stef', 'Jim', 'Claire', 'Joe', 'Amie', 'Charles', 'Katie', 'Chuck']

# Make sure every expected user row and item column is present, even one with
# no ratings at all. Rows and columns are aligned independently against their
# own 1-based id range; user ids and item ids are unrelated id spaces, so they
# must not be merged into a single shared index.
# NOTE(review): presumes ids run 1..len(labels), matching the `id - 1` indexing used when loading — confirm.
movie_ratings = movie_ratings_pt.reindex(index=range(1, len(user_names) + 1),
                                         columns=range(1, len(movie_titles) + 1),
                                         fill_value=0)

# Replace the numeric ids with names, purely for display.
movie_ratings.columns = movie_titles
movie_ratings.index = user_names
movie_ratings

Unnamed: 0,Toy Story,Terminator,Terminator 2,Lion King,Despicable Me,Despicable Me 2,Die Hard,Die Hard 2,Toy Story 2,Die Hard 3
Justin,1.0,5.0,0.0,1.0,2.0,1.0,5.0,5.0,0.0,4.0
Mike,2.0,4.0,5.0,0.0,1.0,1.0,5.0,0.0,1.0,5.0
Stef,0.0,5.0,4.0,1.0,0.0,1.0,5.0,4.0,1.0,5.0
Jim,1.0,5.0,0.0,1.0,1.0,1.0,5.0,0.0,1.0,5.0
Claire,1.0,0.0,5.0,2.0,1.0,2.0,4.0,5.0,0.0,5.0
Joe,5.0,1.0,1.0,0.0,5.0,5.0,1.0,1.0,5.0,1.0
Amie,5.0,0.0,1.0,5.0,5.0,5.0,1.0,1.0,5.0,0.0
Charles,0.0,1.0,1.0,5.0,5.0,5.0,1.0,1.0,5.0,1.0
Katie,5.0,1.0,1.0,0.0,5.0,5.0,1.0,1.0,5.0,1.0
Chuck,5.0,1.0,0.0,5.0,5.0,5.0,1.0,1.0,5.0,1.0


In [8]:
# Prepare data
R = np.array(movie_ratings)                 # dense user x item matrix (0 where the pivot had no rating)
R_known = movie_ratings_raw.rating.values   # observed ratings only, in CSV row order

# Take the problem size from the data instead of hard-coding it, so the
# factor matrices always match the rating matrix.
N, M = R.shape   # number of users, number of items
K = 2            # number of hidden features

# Seed the initial factors so Restart & Run All gives the same starting point.
SEED = 0
rng = np.random.RandomState(SEED)
p = rng.rand(N, K)   # initial user factors, uniform in [0, 1)
q = rng.rand(K, M)   # initial item factors, uniform in [0, 1)

## Prediction: $\hat{r} = pq$

In [9]:
# Prediction
# Trainable factor matrices, initialised from the arrays p and q; their
# product is the full matrix of predicted ratings.
P = tf.Variable(initial_value=p, dtype=tf.float32, name='P')
Q = tf.Variable(initial_value=q, dtype=tf.float32, name='Q')
R_hat = tf.matmul(a=P, b=Q)

# The cost should only compare entries that have a known rating, so pick those
# out of the flattened prediction matrix via flat position = row * n_cols + col.
R_hat_flat = tf.reshape(R_hat, shape=[-1])
n_cols = tf.shape(R_hat)[1]
# NOTE(review): user_indices / item_indices are Python lists; this relies on the
# tensor operand's overloaded arithmetic giving an element-wise result — confirm.
known_positions = user_indices * n_cols + item_indices
R_hat_known = tf.gather(R_hat_flat, known_positions, name='extracting_user_rate')

## Cost: $\sum \left \|  \hat{r}-r\right \|$ (summed over the known ratings)

In [10]:
# Cost
# Data-fit term: absolute difference between each predicted and observed
# rating (known entries only), summed into a single scalar.
diff_ratings = tf.subtract(x=R_hat_known, y=R_known, name='diff_ratings')
diff_ratings_abs = tf.abs(x=diff_ratings, name="diff_ratings_abs")
base_cost = tf.reduce_sum(input_tensor=diff_ratings_abs, name="sum_abs_error")

## Regularization: $\lambda(\left \| p \right \|+\left \| q \right \|)$

In [11]:
# Regularization
# Penalty term: lambda times the summed absolute values of all entries of P and Q.
lda = tf.constant(0.001, name='lambda')
user_norm = tf.reduce_sum(tf.abs(P, name='user_abs'), name='user_norm')
item_norm = tf.reduce_sum(tf.abs(Q, name='item_abs'), name='item_norm')
norm_sums = tf.add(user_norm, item_norm)
regularizer = tf.multiply(norm_sums, lda, name='regularizer')

## Total Cost: $\sum \left \|  \hat{r}-r\right \|+\lambda(\left \| p \right \|+\left \| q \right \|)$

In [12]:
# Total Cost
# Objective to minimise: the data-fit term plus the weighted penalty term.
cost = tf.add(x=base_cost, y=regularizer)

In [13]:
# Create an optimizer to minimize the loss
# Plain gradient descent with a fixed learning rate of 0.001; `train` is the
# op that applies one update when run.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
train = optimizer.minimize(loss=cost)

In [15]:
# Execute the TF session
# The session is deliberately left open: later cells use it to evaluate tensors.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

# Run 5000 gradient-descent updates.
for step in range(5000):
    sess.run(train)

In [16]:
# Mean absolute error (MAE) between the known ratings R and their predictions R_hat.
# Lower is better. The result is still bound to `accuracy` so existing
# references to that name keep working, but it is an error measure.
diff_ratings = tf.subtract(R_hat_known, R_known, name='diff_ratings')
diff_ratings_abs = tf.abs(diff_ratings, name="diff_ratings_abs")
sum_diff_ratings_abs = tf.reduce_sum(diff_ratings_abs, name="sum_diff_ratings_abs")

# tf.div is deprecated; use true division with an explicit float32 count so
# both operands share a dtype.
num_known = tf.constant(R_known.size, dtype=tf.float32, name="num_known")
accuracy = tf.truediv(sum_diff_ratings_abs, num_known, name="accuracy")
print(sess.run(accuracy))

0.12226123


In [17]:
# Visualize known R ratings, to compare with R_hat.
# Entries shown as 0.0 come from the fillna(0) applied after the pivot,
# i.e. (user, item) pairs that had no rating in the pivoted data.
movie_ratings

Unnamed: 0,Toy Story,Terminator,Terminator 2,Lion King,Despicable Me,Despicable Me 2,Die Hard,Die Hard 2,Toy Story 2,Die Hard 3
Justin,1.0,5.0,0.0,1.0,2.0,1.0,5.0,5.0,0.0,4.0
Mike,2.0,4.0,5.0,0.0,1.0,1.0,5.0,0.0,1.0,5.0
Stef,0.0,5.0,4.0,1.0,0.0,1.0,5.0,4.0,1.0,5.0
Jim,1.0,5.0,0.0,1.0,1.0,1.0,5.0,0.0,1.0,5.0
Claire,1.0,0.0,5.0,2.0,1.0,2.0,4.0,5.0,0.0,5.0
Joe,5.0,1.0,1.0,0.0,5.0,5.0,1.0,1.0,5.0,1.0
Amie,5.0,0.0,1.0,5.0,5.0,5.0,1.0,1.0,5.0,0.0
Charles,0.0,1.0,1.0,5.0,5.0,5.0,1.0,1.0,5.0,1.0
Katie,5.0,1.0,1.0,0.0,5.0,5.0,1.0,1.0,5.0,1.0
Chuck,5.0,1.0,0.0,5.0,5.0,5.0,1.0,1.0,5.0,1.0


In [18]:
# Visualize R_hat rating predictions, to compare with known R.
# Evaluate the full prediction matrix in the open session and wrap it in a
# DataFrame so it renders as a table.
pred = sess.run(fetches=R_hat)
pred_df = pd.DataFrame(data=pred)
pred_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.020535,4.999187,4.998703,1.029619,1.00876,1.032444,4.996092,4.985912,1.018021,5.002262
1,1.041266,4.998856,4.998513,1.050455,1.029528,1.053287,4.995762,4.985748,1.03879,5.002046
2,1.006333,4.984047,4.983489,1.015335,0.994575,1.018146,4.98096,4.970724,1.003807,4.987051
3,1.023359,5.014816,5.014327,1.03247,1.011547,1.035304,5.011711,5.001496,1.020837,5.017898
4,1.974957,4.998468,5.004513,1.988835,1.964852,1.992034,4.995479,4.992834,1.974186,5.006884
5,4.999937,1.0045,1.037201,5.025848,5.005953,5.0283,1.004408,1.0402,5.008187,1.031948
6,5.00784,1.005111,1.037865,5.033792,5.013868,5.036246,1.005019,1.040871,5.016104,1.032602
7,4.99718,1.006773,1.039452,5.023078,5.003184,5.02553,1.006679,1.042441,5.005422,1.034204
8,4.999645,1.005859,1.038556,5.025556,5.005656,5.028008,1.005766,1.04155,5.007893,1.033304
9,5.005337,1.01081,1.043539,5.03128,5.011345,5.033737,1.010715,1.046526,5.013591,1.038285
