In [1]:
import tensorflow as tf
from tensorflow.python.framework import graph_util
import os
os.chdir("/Users/sweaterr/PycharmProjects/TF-recomm")
import dataio
import numpy as np
from collections import deque
from six import next

np.random.seed(13575)

BATCH_SIZE = 1000
USER_NUM = 6040
ITEM_NUM = 3952
DIM = 15
EPOCH_MAX = 100
DEVICE = "/cpu:0"

import time
def get_data():
    df = dataio.read_process("/tmp/movielens/ml-1m/ratings.dat", sep="::")
    rows = len(df)
    df = df.iloc[np.random.permutation(rows)].reset_index(drop=True)
    split_index = int(rows * 0.9)
    df_train = df[0:split_index]
    df_test = df[split_index:].reset_index(drop=True)
    return df_train, df_test

In [2]:
df_train, df_test = get_data()

In [4]:
print(df_train.first)

<bound method DataFrame.first of         user  item  rate          st
0       1893  1692   4.0   974695176
1       5947  2312   4.0   957190990
2        162   365   2.0   977323187
3       5117   456   3.0   962294766
4       2029   315   1.0   974929369
5       2220  1844   5.0   974603135
6       4385  3385   3.0   965172804
7       5779   282   4.0   958156569
8       3617  3740   2.0   966600773
9       3640   584   4.0   966482594
10       515   110   4.0   976205508
11      5138   160   3.0   962060976
12      3032  1290   5.0   970291018
13       521  1199   3.0   976196943
14      3409  1273   5.0   967416389
15      3733  2430   5.0   966194170
16        76  2540   2.0   977813753
17      1299  1320   5.0   974786901
18      5874  3750   4.0   965274403
19      5538   925   5.0   986573601
20      3361   140   4.0   967672860
21      3649   378   4.0   966460630
22      1828  1196   3.0   974696861
23      4276  2659   5.0   983696191
24      5538  2301   4.0  1027814481
25   

In [5]:
train = df_train
test = df_test
samples_per_batch = len(train) // BATCH_SIZE

iter_train = dataio.ShuffleIterator([train["user"],
                                     train["item"],
                                     train["rate"]],
                                    batch_size=BATCH_SIZE)

iter_test = dataio.OneEpochIterator([test["user"],
                                     test["item"],
                                     test["rate"]],
                                    batch_size=-1)

user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
rate_batch = tf.placeholder(tf.float32, shape=[None])

In [6]:
def inference_svd(user_batch, item_batch, user_num, item_num, dim=5, device="/cpu:0"):
    with tf.device("/cpu:0"):
        bias_global = tf.get_variable("bias_global", shape=[])
        w_bias_user = tf.get_variable("embd_bias_user", shape=[user_num])
        w_bias_item = tf.get_variable("embd_bias_item", shape=[item_num])
        bias_user = tf.nn.embedding_lookup(w_bias_user, user_batch, name="bias_user")
        bias_item = tf.nn.embedding_lookup(w_bias_item, item_batch, name="bias_item")
        w_user = tf.get_variable("embd_user", shape=[user_num, dim],
                                 initializer=tf.truncated_normal_initializer(stddev=0.02))
        w_item = tf.get_variable("embd_item", shape=[item_num, dim],
                                 initializer=tf.truncated_normal_initializer(stddev=0.02))
        embd_user = tf.nn.embedding_lookup(w_user, user_batch, name="embedding_user")
        embd_item = tf.nn.embedding_lookup(w_item, item_batch, name="embedding_item")
    with tf.device(device):
        infer = tf.reduce_sum(tf.mul(embd_user, embd_item), 1)
        infer = tf.add(infer, bias_global)
        infer = tf.add(infer, bias_user)
        infer = tf.add(infer, bias_item, name="svd_inference")
        regularizer = tf.add(tf.nn.l2_loss(embd_user), tf.nn.l2_loss(embd_item), name="svd_regularizer")
    return infer, regularizer


def optimiaztion(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.1, device="/cpu:0"):
    with tf.device(device):
        cost_l2 = tf.nn.l2_loss(tf.sub(infer, rate_batch))
        panelty = tf.constant(reg, dtype=tf.float32, shape=[], name="l2")
        cost = tf.add(cost_l2, tf.mul(regularizer, panelty))
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    return cost, train_op

In [7]:
infer, regularizer = inference_svd(user_batch, item_batch, user_num=USER_NUM, item_num=ITEM_NUM, dim=DIM,device=DEVICE)

In [10]:
_, train_op = optimiaztion(infer, regularizer, rate_batch, learning_rate=0.001, reg=0.05, device=DEVICE)

In [14]:
init_op = tf.initialize_all_variables()

In [13]:
def clip(x):
    return np.clip(x, 1.0, 5.0)

with tf.Session() as sess:
    sess.run(init_op)
    print("{} {} {} {}".format("epoch", "train_error", "val_error", "elapsed_time"))
    errors = deque(maxlen=samples_per_batch)
    start = time.time()
    for i in range(EPOCH_MAX * samples_per_batch):
        users, items, rates = next(iter_train)
        _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                               item_batch: items,
                                                               rate_batch: rates})
        pred_batch = clip(pred_batch)
        errors.append(np.power(pred_batch - rates, 2))
        if i % samples_per_batch == 0:
            train_err = np.sqrt(np.mean(errors))
            test_err2 = np.array([])
            for users, items, rates in iter_test:
                pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                        item_batch: items})
                pred_batch = clip(pred_batch)
                test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
            end = time.time()
            print("{:3d} {:f} {:f} {:f}(s)".format(i // samples_per_batch, train_err, np.sqrt(np.mean(test_err2)),
                                                   end - start))
            start = end

    output_graph_def = graph_util.extract_sub_graph(sess.graph.as_graph_def(),
                                                                     ["svd_inference", "svd_regularizer"])
    tf.train.SummaryWriter(logdir="/tmp/svd", graph_def=output_graph_def)