In [1]:
import sys

import numpy as np
import pandas as pd

In [2]:
import tensorflow as tf

In [3]:
# Count the rows of each data split by shelling out to `wc -l`.  IPython's
# `name = !cmd` form captures the command's output as a list of lines.
n_train = !wc -l ./data/train.timestamp.txt
n_valid = !wc -l ./data/valid.timestamp.txt
n_test  = !wc -l ./data/test.timestamp.txt

# Keep only the first whitespace-separated token of the first output line
# (the count that `wc -l` prints before the path) and convert it to int.
n_train, n_valid, n_test = map(lambda x: int(x[0].split()[0]), [n_train, n_valid, n_test])
n_train, n_valid, n_test

(62603802, 17780727, 20095978)

In [4]:
# Widths of the user / item axes: they size the two embedding tables and the
# dense shapes of the sparse inputs fed at prediction time.
# NOTE(review): 17770 and 2649429 look like the Netflix Prize movie count and
# largest customer id, which would make "users" the movie axis here -- confirm
# against the column order of the data files.
n_users, n_items = 17770, 2649429
# Number of latent features per embedding row.
n_features = 10

In [5]:
# Sparse inputs for users and items.  The prediction cell feeds each of them
# as an (indices, values, dense_shape) triple with a single 1.0 per row.
users_indexes = tf.sparse_placeholder(tf.float32, name="users_indexes")
items_indexes = tf.sparse_placeholder(tf.float32, name="items_indexes")
# Integer targets for the loss, one per example (graph name is "ranks").
ranks_indexes = tf.placeholder(dtype=tf.int32, shape=[None], name="ranks")

In [6]:
# Embedding tables.  Multiplying the sparse input by the dense weight matrix
# yields one n_features-wide row per example.
# NOTE(review): the scope name "users_emebedding" is misspelled, but it is
# part of the variable's graph name.  The checkpoint restored later in this
# notebook was presumably saved with this exact name, so do not correct the
# spelling without re-saving the model.
with tf.name_scope("users_emebedding"):
    users_weights = tf.Variable(tf.random_normal([n_users, n_features], stddev=0.01), name="users_weights")
    users_embedding = tf.sparse_tensor_dense_matmul(users_indexes, users_weights, name="users_embedding")
    
with tf.name_scope("items_embedding"):
    items_weights = tf.Variable(tf.random_normal([n_items, n_features], stddev=0.01), name="items_weights")
    items_embedding = tf.sparse_tensor_dense_matmul(items_indexes, items_weights, name="items_embedding")

In [7]:
# Join the user and item embeddings along the last axis; this is the input of
# the dense layers built below.
with tf.name_scope("concatenation"):
    layer_0 = tf.concat([users_embedding, items_embedding], axis=-1, name="layer_0")

In [8]:
def seq_layer(name, prev, dim_out, activation="sigmoid"):
    """Append one fully connected layer to the graph and return its output.

    Args:
        name: Name scope under which the layer's variables and ops are created.
        prev: Input tensor.  Its second dimension must be statically known
            because it is used as the row count of the weight matrix.
        dim_out: Number of output units.
        activation: One of "sigmoid", "relu", "tanh" or "linear".

    Returns:
        The tensor obtained by applying the chosen activation to
        ``prev x W + b``.

    Raises:
        KeyError: If ``activation`` is not one of the supported names.
    """
    # Resolve the activation before touching the graph: an unknown name now
    # fails without leaving orphan W / b variables behind (which would also
    # make a retry under the same scope name get a uniquified scope).
    active_func = {
        "sigmoid":  tf.nn.sigmoid,
        "relu":     tf.nn.relu,
        "tanh":     tf.nn.tanh,
        "linear":   tf.identity
    }[activation]

    dim_in = int(prev.get_shape()[1])

    with tf.name_scope(name):
        # Variable names "W" and "b" are kept as-is: they are part of the
        # names a checkpoint stores the weights under.
        weights = tf.Variable(tf.random_normal([dim_in, dim_out], stddev=0.01), name="W")
        bias = tf.Variable(tf.random_normal([dim_out], stddev=0.01), name="b")

        prev = tf.nn.bias_add(tf.matmul(prev, weights, name="weights"), bias, name="bias")
        return active_func(prev, name=activation)

In [9]:
# Two dense layers on top of the concatenated embeddings: 10 hidden units,
# then a single output unit, each followed by ReLU.
layer_1 = seq_layer("layer_1", layer_0, 10, "relu")
layer_2 = seq_layer("layer_2", layer_1, 1,  "relu")
# Drop only the trailing unit axis.  A bare tf.squeeze() would also remove the
# batch axis for a single-row batch and turn the prediction into a scalar.
layer_3 = tf.squeeze(layer_2, axis=[1])

In [10]:
# Mean squared error between the fed targets and the network output.
t_loss = tf.losses.mean_squared_error(ranks_indexes, layer_3)

In [11]:
# Register the loss as a scalar summary named "loss".
# NOTE(review): no merge / add_summary call is visible in this notebook view;
# confirm the summary is actually evaluated and written somewhere.
tf.summary.scalar('loss', t_loss)

<tf.Tensor 'loss:0' shape=() dtype=string>

In [12]:
# Event-file writer for TensorBoard, handed the default graph; the log
# directory is the hard-coded /tmp/tensorflow.
writer = tf.summary.FileWriter("/tmp/tensorflow", tf.get_default_graph())

**Run:** tensorboard --logdir=run1:/tmp/tensorflow/

In [13]:
# Training op: Adam with learning rate 0.001 minimising t_loss.  It is not run
# in any of the visible cells; the weights used below come from a checkpoint.
t_train = tf.train.AdamOptimizer(learning_rate=0.001).minimize(t_loss)

In [14]:
import operator

def get_entry(line):
    """Parse one ``user,item,time,rank`` line into a tuple of four ints.

    Surrounding whitespace is stripped before splitting on commas.  Returns
    ``None`` when the line does not consist of exactly four integer fields.
    """
    fields = line.strip().split(',')
    try:
        parsed = tuple(int(field) for field in fields)
    except ValueError:
        # At least one field is not an integer literal.
        return None
    if len(parsed) != 4:
        # Too few or too many fields.
        return None
    return parsed

# Paths of the three data splits (the same files whose lines are counted with
# `wc -l` near the top of the notebook).  Only NAME_TEST is used in the
# visible cells.
NAME_TRAIN = "./data/train.timestamp.txt"
NAME_VALID = "./data/valid.timestamp.txt"
NAME_TEST  = "./data/test.timestamp.txt"

def batch_iterator(file_name, size=100, cache_size=10**7):
    """Yield batches of ``(user - 1, item - 1, rank)`` tuples read from a file.

    Lines are parsed with ``get_entry`` and buffered; every time the buffer
    holds ``cache_size`` entries it is emitted in slices of ``size`` and
    emptied.  Whatever is left at end of file is emitted the same way, so the
    last batch of a flush may be shorter than ``size``.

    Args:
        file_name: Path of a text file with one ``user,item,time,rank`` line
            per example.
        size: Maximum number of examples per yielded batch (coerced to int).
        cache_size: Number of parsed examples to buffer before flushing.
            Keep it a multiple of ``size`` to avoid short batches in the
            middle of the file.

    Yields:
        Lists of ``(user - 1, item - 1, rank)`` tuples; user and item ids are
        shifted to be zero-based.

    Raises:
        ValueError: If ``size`` is not positive.

    Note:
        Lines that ``get_entry`` cannot parse are skipped.  When the output
        must stay aligned with the input rows (e.g. a submission file), make
        sure the input contains no such lines.
    """
    size = int(size)
    if size <= 0:
        raise ValueError("size must be a positive integer, got {}".format(size))

    cache = []

    with open(file_name) as f_name:
        for line in f_name:
            entry = get_entry(line)
            if entry is None:
                # get_entry signals a malformed line with None; unpacking it
                # directly would raise TypeError, so skip the line instead.
                continue

            user, item, _time, rank = entry
            cache.append((user - 1, item - 1, rank))

            if len(cache) >= cache_size:
                for start in range(0, len(cache), size):
                    yield cache[start:start + size]
                cache = []

        # Flush the remainder (a no-op when the buffer is empty).
        for start in range(0, len(cache), size):
            yield cache[start:start + size]

In [15]:
# Open a session and run the initializer for all global variables; the
# checkpoint restore further down replaces the values it restores.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [16]:
# Saver created with default arguments, after the whole graph (including the
# optimizer) has been built.
saver = tf.train.Saver()

In [17]:
# Load the weights stored in ./models/model_epoch_014.ckpt into the session
# (the epoch number 14 is zero-padded to three digits by the format spec).
saver.restore(sess, "./models/model_epoch_{:03}.ckpt".format(14))

In [18]:
# Score the test split batch by batch with the restored model and write one
# prediction per line (five decimals) to the submission file.
with open("./submission_03_05_preformat.csv", "w") as f_submission:
    n_processed = 0
    # Progress denominator: the test row count measured with `wc -l` earlier
    # in the notebook, rather than a hard-coded copy of that number.
    n_total = float(n_test)

    for batch in batch_iterator(NAME_TEST, size=10000):
        n_rows = len(batch)

        # Sparse indices: one (row, column) pair per example, i.e. a single
        # non-zero per row.  Built with list comprehensions and column_stack
        # so the result is an (n_rows, 2) integer array on Python 2 and 3
        # alike (np.asarray(zip(...)) only works where zip returns a list).
        rows = np.arange(n_rows, dtype=np.int64)
        users_b = np.column_stack((rows, [entry[0] for entry in batch]))
        items_b = np.column_stack((rows, [entry[1] for entry in batch]))
        ranks_b = np.asarray([entry[2] for entry in batch])

        values = np.ones(n_rows)
        users_shape = (n_rows, n_users)
        items_shape = (n_rows, n_items)

        prediction = sess.run(layer_3, feed_dict={
            users_indexes: (users_b, values, users_shape),
            items_indexes: (items_b, values, items_shape),
            ranks_indexes: ranks_b
        })

        n_processed += n_rows

        if n_processed % 1000 == 0:
            # "\r" rewinds to the start of the line so the progress figure is
            # overwritten in place.
            sys.stdout.write("\r~{:>6.5f}".format(n_processed / n_total))
            sys.stdout.flush()

        # atleast_1d guards against a single-row batch whose squeezed
        # prediction comes back as a 0-d array, which cannot be iterated.
        for pred in np.atleast_1d(prediction):
            f_submission.write("{:.5f}".format(pred))
            f_submission.write("\n")

    sys.stdout.write("\r~{:>6.5f}".format(1.0))
    sys.stdout.flush()

~1.00000
