In [1]:
import numpy as np
import pandas as pd

In [2]:
import tensorflow as tf

In [3]:
# Row counts of the two embedding tables and the width of every embedding
# vector (used when the tables are created below).
# NOTE(review): 17770 / 2649429 look like the Netflix Prize movie count and
# largest customer id, which would mean the "users"/"items" names are swapped
# relative to common usage -- confirm against the column order of the data files.
n_users, n_items = 17770, 2649429
n_features = 10

In [4]:
# Trainable embedding tables: one row of n_features values per user / per item,
# initialised with Gaussian noise of standard deviation 0.01.
users_embedding = tf.Variable(tf.random_normal([n_users, n_features], stddev=0.01), name="users_embedding")
items_embedding = tf.Variable(tf.random_normal([n_items, n_features], stddev=0.01), name="items_embedding")

In [5]:
# Batch inputs: three 1-D int32 vectors of unspecified length.
# users_indexes / items_indexes are row numbers into the embedding tables;
# ranks_indexes carries the target value that the loss compares with the
# network output (despite the "_indexes" suffix it is not used as an index).
users_indexes = tf.placeholder(dtype=tf.int32, shape=[None], name="users_indexes")
items_indexes = tf.placeholder(dtype=tf.int32, shape=[None], name="items_indexes")
ranks_indexes = tf.placeholder(dtype=tf.int32, shape=[None], name="ranks")

In [6]:
# Pick the embedding row of every user / item referenced in the current batch.
users_embedding_chosen = tf.gather(users_embedding, users_indexes, name="users_embedding_chosen")
items_embedding_chosen = tf.gather(items_embedding, items_indexes, name="items_embedding_chosen")

In [7]:
# Network input: user and item embeddings joined along the last axis, so each
# example is a vector of 2 * n_features values.
layer_0 = tf.concat([users_embedding_chosen, items_embedding_chosen], axis=-1, name="layer_0")

In [8]:
def seq_layer(name, prev, dim_out, activation="sigmoid"):
    """Stack one fully connected layer on top of ``prev``.

    The layer computes ``prev . W + b`` and then applies the requested
    activation. All ops and variables are created inside the name scope
    ``name``.

    Args:
        name: name scope for the layer.
        prev: input tensor; the size of its second dimension is read from its
            static shape and used as the input width.
        dim_out: number of output units.
        activation: ``"linear"`` (no activation), ``"sigmoid"``, ``"relu"``
            or ``"tanh"``.

    Returns:
        The output tensor of the layer.

    Raises:
        KeyError: if ``activation`` is not one of the supported names (raised
            after the layer's variables have already been created).
    """
    dim_in = int(prev.get_shape()[1])

    with tf.name_scope(name):
        # Both parameters start as small Gaussian noise.
        kernel = tf.Variable(tf.random_normal([dim_in, dim_out], stddev=0.01), name="W")
        offset = tf.Variable(tf.random_normal([dim_out], stddev=0.01), name="b")

        affine = tf.nn.bias_add(tf.matmul(prev, kernel, name="weights"), offset, name="bias")
        if activation == "linear":
            return affine

        activations = {
            "sigmoid":  tf.nn.sigmoid,
            "relu":     tf.nn.relu,
            "tanh":     tf.nn.tanh,
        }
        # The activation op is named after the activation itself.
        return activations[activation](affine, name=activation)

In [9]:
# Hidden ReLU layer with 10 units followed by a single linear output unit.
layer_1 = seq_layer("layer_1", layer_0, 10, "relu")
layer_2 = seq_layer("layer_2", layer_1, 1, "linear")
# Drop only the trailing unit axis. An unqualified squeeze would also remove
# the batch axis when a batch holds exactly one example (yielding a scalar)
# and would discard the static shape of the predictions.
layer_3 = tf.squeeze(layer_2, axis=1)

In [10]:
# Training objective: mean squared error between the fed target values
# (ranks_indexes) and the network predictions (layer_3).
t_loss = tf.losses.mean_squared_error(ranks_indexes, layer_3)

In [11]:
from IPython.display import clear_output, Image, display, HTML

def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def.

    Builds a new ``tf.GraphDef`` holding a copy of every node of
    ``graph_def``. For ``Const`` nodes whose serialized ``tensor_content`` is
    longer than ``max_const_size`` bytes, the content is replaced by a short
    ``<stripped N bytes>`` marker. The input ``graph_def`` is not modified.

    Args:
        graph_def: graph definition whose ``node`` entries are copied.
        max_const_size: largest ``tensor_content`` length (in bytes) that is
            kept as-is.

    Returns:
        The new, stripped ``tf.GraphDef``.
    """
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add()
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                # tensor_content must be a byte string. ``bytes(str)`` without
                # an encoding raises TypeError on Python 3, so encode
                # explicitly; this yields the same value on Python 2.
                tensor.tensor_content = "<stripped {} bytes>".format(size).encode("utf-8")
    return strip_def

  
def rename_nodes(graph_def, rename_func):
    """Return a copy of ``graph_def`` with every node name mapped by ``rename_func``.

    Both the node names and the names listed in each node's ``input`` are
    rewritten. An input that starts with ``'^'`` keeps that prefix; only the
    part after it is passed to ``rename_func``. The input ``graph_def`` is not
    modified.

    Args:
        graph_def: graph definition whose ``node`` entries are copied.
        rename_func: callable taking a node name and returning the new name.

    Returns:
        A new ``tf.GraphDef`` with the renamed nodes.
    """
    def _rename_input(ref):
        # Preserve a leading '^' and rename only what follows it.
        if ref[0] == '^':
            return '^' + rename_func(ref[1:])
        return rename_func(ref)

    renamed = tf.GraphDef()
    for source_node in graph_def.node:
        node = renamed.node.add()
        node.MergeFrom(source_node)
        node.name = rename_func(node.name)
        for idx in range(len(node.input)):
            node.input[idx] = _rename_input(node.input[idx])
    return renamed


def show_graph(graph_def, max_const_size=32):
    """Display a TensorFlow graph inside the notebook.

    The graph definition is passed through ``strip_consts``, embedded as text
    in an HTML page that loads the ``tf-graph-basic`` element, and that page
    is shown inside an iframe.

    Args:
        graph_def: a graph definition, or any object with an
            ``as_graph_def()`` method (which is then called to obtain one).
        max_const_size: forwarded to ``strip_consts``.
    """
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    stripped = strip_consts(graph_def, max_const_size=max_const_size)

    page_template = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """
    frame_template = """
        <iframe seamless style="width:950px;height:620px;border:0" srcdoc="{}"></iframe>
    """

    # The text form of the graph is inserted as a quoted literal via repr().
    pbtxt_literal = repr(str(stripped))
    # A random suffix is appended to the element id used by the page.
    element_id = 'graph' + str(np.random.rand())
    page = page_template.format(data=pbtxt_literal, id=element_id)

    # The page sits inside the srcdoc="..." attribute, so its double quotes
    # are replaced by &quot; before it is inserted.
    display(HTML(frame_template.format(page.replace('"', '&quot;'))))

In [12]:
# Render the graph built so far (embeddings, layers and loss) inline.
show_graph(tf.get_default_graph())

In [13]:
# One optimisation step: Adam with learning rate 0.001 minimising t_loss.
t_train = tf.train.AdamOptimizer(learning_rate=0.001).minimize(t_loss)

In [14]:
import operator

def get_entry(line):
    """Parse one ``user,item,time,rank`` line into a tuple of four ints.

    Args:
        line: one text line; surrounding whitespace is ignored.

    Returns:
        ``(user, item, time, rank)`` as ints, or ``None`` when the line does
        not consist of exactly four comma-separated integers (this includes
        blank lines).
    """
    try:
        user, item, time, rank = map(int, line.strip().split(','))
        return user, item, time, rank
    except ValueError:
        # Raised both for non-integer fields and for a wrong field count.
        return None

NAME_TRAIN = "./data/train.timestamp.txt"
NAME_TEST  = "./data/test.timestamp.txt"

def batch_iterator(file_name, size=100, cache_size=int(1e7)):
    """Yield the entries of ``file_name`` as lists of ``(user-1, item-1, rank)``.

    Lines are parsed with ``get_entry``; lines it rejects (blank or malformed)
    are skipped instead of aborting the iteration. User and item ids are
    shifted down by one, and the time field is dropped.

    Entries are collected in chunks of ``cache_size``; each full chunk is cut
    into consecutive batches of ``size`` entries. The last batch of a chunk
    may therefore be shorter than ``size``.

    Args:
        file_name: path of the text file to read.
        size: maximum number of entries per yielded batch (converted to int,
            so values such as ``1e6`` are accepted).
        cache_size: number of entries buffered before batches are emitted.

    Yields:
        Non-empty lists of ``(user - 1, item - 1, rank)`` tuples.

    Raises:
        ValueError: if ``size`` or ``cache_size`` is not positive.
    """
    size = int(size)
    cache_size = int(cache_size)
    if size <= 0 or cache_size <= 0:
        raise ValueError("size and cache_size must be positive")

    cache = []

    with open(file_name) as f_name:
        for line in f_name:
            entry = get_entry(line)
            if entry is None:
                # Malformed / blank line: nothing to unpack, skip it.
                continue

            user, item, _, rank = entry
            cache.append((user - 1, item - 1, rank))

            if len(cache) >= cache_size:
                for i in range(0, len(cache), size):
                    yield cache[i:i + size]
                cache = []

    # Flush whatever is left after the last full chunk.
    for i in range(0, len(cache), size):
        yield cache[i:i + size]

In [21]:
# Upper bound of the epoch counter in the training loop, which iterates over
# range(len(scores), n_epochs).
# NOTE(review): this cell's execution count (21) is out of sequence with its
# neighbours, so the value was probably changed and re-run between training
# runs -- confirm that a fresh top-to-bottom run reproduces the output below.
n_epochs = 12

In [16]:
# Open a session and give every variable its initial value.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [17]:
# Checkpoint helper, used to restore a previous model and to save after each epoch.
saver = tf.train.Saver()

In [18]:
# Load the variables stored in the epoch-5 checkpoint into the session.
# NOTE(review): `scores` is reset to [] right after this, so the training loop
# numbers its epochs from 1 again and saves to model_epoch_001.ckpt onwards,
# overwriting earlier checkpoints (including the one restored here) -- confirm
# that this is intended.
saver.restore(sess, "./models/model_epoch_{:03}.ckpt".format(5))

In [19]:
# Per-epoch loss history; its length decides at which epoch the training loop resumes.
scores = []

In [22]:
for epoch_i, score in enumerate(scores):
    print "\r{:>02} {}".format(epoch_i + 1, score)

for epoch_i in range(len(scores), n_epochs):
    n_processed = 0
    
    for batch in batch_iterator(NAME_TRAIN, size=1000):
        users_b = map(operator.itemgetter(0), batch)
        items_b = map(operator.itemgetter(1), batch)
        ranks_b = map(operator.itemgetter(2), batch)
        
        sess.run(t_train, feed_dict={
            users_indexes: np.asarray(users_b),
            items_indexes: np.asarray(items_b),
            ranks_indexes: np.asarray(ranks_b)
        })
            
        n_processed += len(batch)
        
        if n_processed % 1000 == 0:
            print "\r{:>6.5f}".format(n_processed / 80384529.0),
        
    # look up train error....
    costs = []
    for batch in batch_iterator(NAME_TRAIN, size=1e6):
        users_b = map(operator.itemgetter(0), batch)
        items_b = map(operator.itemgetter(1), batch)
        ranks_b = map(operator.itemgetter(2), batch)
        
        cost = sess.run(t_loss, feed_dict={
            users_indexes: np.asarray(users_b),
            items_indexes: np.asarray(items_b),
            ranks_indexes: np.asarray(ranks_b)
        })
        
        costs.append(cost)        
        n_processed += len(batch)
        
        if n_processed % 1000 == 0:
            print "\r~{:>6.5f}".format(n_processed / 80384529.0),
    
    scores.append(np.mean(costs))
    print "\r{:>02} {}".format(epoch_i + 1, scores[-1])
    
    saver.save(sess, "./models/model_epoch_{:03}.ckpt".format(epoch_i + 1))

01 0.968505740166
02 0.896692156792
03 0.822329699993
04 0.805112123489
05 0.796422719955
06 0.789402842522
07 0.791018247604
08 0.778770923615
09 0.772395074368
10 0.768692016602
11 0.765019118786
12 0.764975488186
