In [1]:
# Use LearnBPR user ranking

In [2]:
import tensorflow as tf
import numpy as np
import os
import time
import datetime
from tensorflow.contrib import learn
import re
import itertools
from collections import Counter
from collections import defaultdict
import gzip
import struct
import smtplib
from email.mime.text import MIMEText
from IPython.display import Audio
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Root directory that holds the datasets. The default is the original author's
# location; set the SUBTITLES_BASEDIR environment variable to run the notebook
# on another machine without editing this cell.
basedir = os.environ.get('SUBTITLES_BASEDIR', '/home/tvromen/research/subtitles')

class Flags(object):
    """
    Plain attribute container for the run configuration: input file paths
    (resolved against `basedir`), model hyperparameters and training settings.
    """
    def __init__(self):
        # Data loading params
        self.val_sample_percentage = .1 # Percentage of the training data to use for validation
        self.ratings_file = os.path.join(basedir, 'ml-20m/ratings.csv') # Data source for the ratings
        self.text_data_file = os.path.join(basedir, 'movielens-subtitles-1024.txt') # Data source

        # Maximum number of rating lines read from ratings_file
        self.max_lines = 100000

        # Model Hyperparameters
        self.embedding_dim = 64 # Dimensionality of user & movie vectors (default: 128)

        #self.max_vocab_size = 100000
        #self.vocab_embedding_dim = 300 # Dimensionality of character embedding (default: 128)
        #self.filter_sizes = "3,4,5" # Comma-separated filter sizes (default: '3,4,5')
        #self.num_filters = 128 # Number of filters per filter size (default: 128)
        #self.dropout_keep_prob = 0.5 # Dropout keep probability (default: 0.5)

        #self.words_in_scene = 64
        #self.num_scenes = 16

        # Training parameters
        self.batch_size = 128 # Batch Size (default: 64)
        self.num_epochs = 10 # Number of training epochs (default: 8)
        self.summary_every = 100 # Write train summaries / print progress every this many steps
        self.evaluate_every = 1000 # Evaluate model on val set after this many steps (default: 100)
        self.checkpoint_every = 2000 # Save model after this many steps (default: 100)
        self.num_checkpoints = 3 # Number of checkpoints to store (default: 5)
        # Misc Parameters
        self.allow_soft_placement = True # Allow device soft device placement
        self.log_device_placement = True # Log placement of ops on devices

FLAGS = Flags()

In [12]:
# Data helpers

class IdAssigner:
    """
    Hands out consecutive integer ids (0, 1, 2, ...) to arbitrary hashable keys
    and remembers the mapping in both directions.
    """
    def __init__(self):
        self.next_id = 0       # id that the next unseen key will receive
        self.forward = {}      # key -> assigned id
        self.reverse = {}      # assigned id -> key

    def get_id(self, x):
        """Return the id of x, assigning the next free id on first sight."""
        try:
            return self.forward[x]
        except KeyError:
            pass
        assigned = self.next_id
        self.forward[x] = assigned
        self.reverse[assigned] = x
        self.next_id = assigned + 1
        return assigned

    def get_reverse_id(self, id_):
        """Return the key that was assigned id_ (KeyError if id_ was never assigned)."""
        return self.reverse[id_]

    def get_next_id(self):
        """Return the next unassigned id, which equals the number of ids handed out."""
        return self.next_id

class Subtitles:
    """
    Class that is in charge of subtitles.

    Reads a text file in which every non-blank line is
    `<movie_id> <word> <word> ...` and exposes the result as `self.subs`,
    a dict mapping the integer movie id to its text (words re-joined with
    single spaces). If an id appears more than once, the last line wins.
    """
    def __init__(self, data_file):
        """
        data_file: path of the subtitles text file.
        Raises ValueError if the first token of a non-blank line is not an integer.
        """
        self.subs = dict()
        # Context manager so the file handle is released even if parsing fails
        with open(data_file, 'r') as f:
            for line in f:
                tokens = line.split()
                if not tokens:
                    # blank line (e.g. trailing newline at end of file) - nothing to parse
                    continue
                self.subs[int(tokens[0])] = ' '.join(tokens[1:])


class RatingsData:
    """
    Loads the ratings from the file and splits them into a training and a validation set.
    Both sets are structured arrays with the fields (user_id, movie_id, rating).
    The movie_id is not the original movie_id, but rather a new id which is allocated densely (no skips).
    Every rating of a movie that has subtitles counts as "watched"; the
    `rating >= 3.0` filter is currently disabled (TODO).
    """

    def __init__(self, data_file, max_lines, subs):
        """
        data_file: path of the ratings CSV (one header line, then user_id,movie_id,rating,timestamp)
        max_lines: maximum number of data lines to read from data_file
        subs: object whose `subs` attribute supports `movie_id in subs.subs`;
              ratings of movies that are not in it are dropped
        """
        self.id_assigner = IdAssigner()
        self.movies_per_user = defaultdict(list) # map user_id -> [movie_id_1, movie_id_2, ...]
        self.movie_watch_count = defaultdict(int)   # number of times watched per new_movie_id
        # load file
        x = self._load_file(data_file, max_lines, subs)
        x = self._sort_dataset(x)
        x = x[['user_id', 'movie_id', 'rating']]
        # split training/validation:
        # the training set is all movies for each user except the last (in chronological order)
        # validation set is the last movie for each user
        user_ids = x['user_id']
        # a row is the user's last one when the next row belongs to another user (-1 terminates the array)
        is_last = (user_ids != np.append(user_ids[1:], -1))
        self.train = x[~is_last]
        self.val = x[is_last]

    def _load_file(self, data_file, max_lines, subs):
        """
        Parse up to max_lines data lines and return a structured array with the
        fields (user_id, movie_id, rating, timestamp) holding only the kept rows.
        Also fills self.id_assigner, self.movies_per_user and self.movie_watch_count.
        """
        print("Loading data...")
        # Builtin `bool` instead of the `np.bool` alias, which NumPy 1.24 removed
        x = np.zeros(
            max_lines,
            dtype=[('valid', bool), ('user_id',np.int32), ('movie_id', np.int32), ('rating', np.float32), ('timestamp', np.int32)]
        )
        with open(data_file) as f:
            _ = f.readline() # skip first line
            # Iterate lazily: the ratings file can be far longer than max_lines
            for i,line in enumerate(f):
                if i % 1000000 == 0:
                    print('{}...'.format(i))
                if i == max_lines:
                    break
                if not line.strip():
                    # blank line - row i stays marked invalid
                    continue
                words = line.split(',')
                user_id  = int(words[0])
                movie_id = int(words[1])
                rating   = float(words[2])
                timestamp = int(words[3])
                # if rating < 3.0:
                #     # we count this as not-watched (for now...)
                #     continue
                if movie_id not in subs.subs:
                    # movie doesn't have subtitles
                    continue
                new_movie_id = self.id_assigner.get_id(movie_id)
                x[i] = (True, user_id, new_movie_id, rating, timestamp)
                self.movies_per_user[user_id].append(new_movie_id)
                self.movie_watch_count[new_movie_id] += 1
        # skipped and never-reached rows still have valid == False; drop them
        valid = (x['valid'] != 0)
        x = x[valid]
        return x[['user_id', 'movie_id', 'rating', 'timestamp']]

    def _sort_dataset(self, x):
        """Return x ordered by user_id and, within each user, by timestamp."""
        # sort by user and timestamp - use stable sorting algorithm
        x = x[x['timestamp'].argsort(kind='mergesort')]  # secondary sort key
        x = x[x['user_id'].argsort(kind='mergesort')]  # primary sort key
        return x

    def get_num_users(self):
        """Return the largest loaded user_id plus one (0 when nothing was loaded)."""
        return max(self.movies_per_user.keys(), default=-1) + 1 # starts from 1

    def get_num_movies(self):
        """Return the number of distinct movies that received a dense id."""
        return self.id_assigner.get_next_id()

    def get_train(self, shuffle=True):
        """Return the training rows, randomly permuted when shuffle is True."""
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(len(self.train)))
            return self.train[shuffle_indices]
        else:
            return self.train

    def get_val(self, shuffle=True):
        """Return the validation rows, randomly permuted when shuffle is True."""
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(len(self.val)))
            return self.val[shuffle_indices]
        else:
            return self.val

    ## TODO: generate negative with equal probability
    ## TODO: this doesn't really use movie_id...
    def _batch_iter_generate_neg(self, user_id, movie_id):
        """
        Generates a batch res of same length as the inputs, such that for each tuple in (user_id, movie_id, res),
        the item movie_id is ranked same or higher (by the user) than the corresponding item in res.
        Each res entry is a movie the user has no recorded rating for, drawn with
        probability proportional to the movie's overall watch count.
        inputs:
            user_id is array of length n, sorted in ascending order
            movie_id is array of length n
        output:
            res is array of length n
        raises:
            ValueError if some user has already watched every known movie
        """
        n = len(user_id)
        assert len(movie_id) == n, len(movie_id)
        # count number of watches
        num_movies = self.id_assigner.get_next_id()
        watch_counts = np.array([self.movie_watch_count[i] for i in range(num_movies)], dtype=np.float32)
        total_watches = np.sum(watch_counts)
        assert(total_watches > 0)
        res = np.zeros([n], dtype=np.int32)
        # do it per user to speed up things
        i = 0
        while i < n:
            curr_user_id = user_id[i]
            user_start_idx = i
            while i < n and user_id[i] == curr_user_id:
                i += 1
                if i % 1000000 == 0:
                    print('{}...'.format(i))
            if i < n:
                assert user_id[i] > curr_user_id, 'input needs to be sorted by user, otherwise this code it really inefficient'
            user_end_idx = i # after-last
            # update watch counts: subtract this user's watches
            to_mask = np.zeros([num_movies], dtype=bool)
            # .get() so that an unknown user does not get inserted into the defaultdict
            for j in self.movies_per_user.get(curr_user_id, ()):
                to_mask[j] = True
            total_masked = np.sum(watch_counts[to_mask])
            remaining = total_watches - total_masked
            if remaining <= 0:
                # nothing left to sample from; fail clearly instead of with NaN probabilities
                raise ValueError(
                    'user {} has watched every movie, cannot sample a negative'.format(curr_user_id))
            p = (watch_counts * (1-to_mask)) / remaining
            res[user_start_idx:user_end_idx] = \
                np.random.choice(num_movies, size=[user_end_idx-user_start_idx], p=p)
        return res

    def batch_iter(self, x, batch_size, num_epochs, shuffle=True):
        """
        Generates the pair for each datapoint, and then
        generates a batch iterator.
        x must be a structured ndarray sorted by user_id. Negatives are re-drawn
        at the start of every epoch. Each yielded item is the tuple
        (batch_user_id, batch_pos_neg, batch_rating), where batch_pos_neg has
        shape (batch, 2) holding [positive movie_id, negative movie_id].
        """
        assert type(x) == np.ndarray, type(x)
        data_size = len(x)
        # split to batches
        num_batches_per_epoch = ((data_size - 1) // batch_size) + 1
        for epoch in range(num_epochs):
            # generate the pair
            neg = self._batch_iter_generate_neg(x['user_id'], x['movie_id'])
            # Shuffle the data
            shuffle_indices = np.arange(data_size)
            if shuffle:
                shuffle_indices = np.random.permutation(shuffle_indices)
            x_shuffled = x[shuffle_indices]
            neg_shuffled = neg[shuffle_indices]
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                x_batch = x_shuffled[start_index:end_index]
                batch_neg = neg_shuffled[start_index:end_index]
                batch_user_id = x_batch['user_id']
                batch_pos = x_batch['movie_id']
                batch_pos_neg = np.stack([batch_pos,batch_neg], axis=-1)
                batch_rating = x_batch['rating']
                yield (batch_user_id,batch_pos_neg,batch_rating)


In [13]:
# Data Preparation
# ==================================================
# Builds the module-level objects that the training and evaluation cells read:
# subs, ratings, x_train, x_val, num_users and num_movies.

# Load data
subs = Subtitles(FLAGS.text_data_file)
# Only ratings of movies that appear in `subs` are kept
ratings = RatingsData(FLAGS.ratings_file, FLAGS.max_lines, subs)
#ratings = RatingsData(FLAGS.ratings_file, 10000, subs)

# Seed NumPy's global RNG: negative sampling and batch shuffling in RatingsData
# draw from np.random. No TensorFlow seed is set in the code shown here.
np.random.seed(1234)

# Keep both sets in user order (shuffle=False): the negative sampler asserts
# that its input is sorted by user.
x_train = ratings.get_train(shuffle=False)
x_val = ratings.get_val(shuffle=False)

print("Train/Val split: {:d}/{:d}".format(len(x_train), len(x_val)))

# num_users is the largest user id + 1; num_movies counts the densely re-assigned movie ids
num_users = ratings.get_num_users()
num_movies = ratings.get_num_movies()

print('Num users: {}'.format(num_users))
print('Num movies: {}'.format(num_movies))

Loading data...
0...
Train/Val split: 97629/702
Num users: 703
Num movies: 7494


In [14]:
def get_dynamic_tensor_shape(x):
    """
    Return the shape of tensor x as a list with one entry per dimension.
    A dimension whose size is known statically is given as a plain number;
    a dimension that is only known at run time is given as a scalar tensor
    taken from tf.shape(x).
    """
    runtime_shape = tf.shape(x)
    dims = []
    for axis, dim in enumerate(x.get_shape()):
        static_size = dim.value
        if static_size is None:
            # size not known at graph-construction time: fall back to the run-time value
            dims.append(runtime_shape[axis])
        else:
            dims.append(static_size)
    return dims


def embedding_lookup_layer(x, vocab_size, embedding_dim, variable_scope, reuse=False):
    """
    Map integer ids to rows of a trainable [vocab_size, embedding_dim] table.
    The table is the variable 'W' inside `variable_scope`; it is Xavier-initialised
    and registers an L2 regularization loss with scale 1.
    x is tensor of shape (d_1, d_2, ..., d_n) and type int32
    result is tensor of shape (d_1, d_2, ..., d_n, embedding_dim) of n+1 dimensions and type DT_FLOAT
    Pass reuse=True to look up in a table created by an earlier call with the same scope.
    """
    table_shape = [vocab_size, embedding_dim]
    with tf.variable_scope(variable_scope, reuse=reuse):
        table = tf.get_variable(
            'W',
            shape=table_shape,
            initializer=tf.contrib.layers.xavier_initializer(),
            regularizer=tf.contrib.layers.l2_regularizer(1.)
        )
    return tf.nn.embedding_lookup(table, x)

def bias_lookup_layer(x, vocab_size, variable_scope, reuse=False):
    """
    Look up one trainable scalar bias per integer id.
    The biases are the variable 'b' of shape [vocab_size, 1] inside `variable_scope`;
    it starts at zero and registers an L2 regularization loss with scale 1.
    x is tensor of shape (d_1, d_2, ..., d_n) and type int32
    result is tensor of same shape in x and type DT_FLOAT
    Pass reuse=True to look up in a variable created by an earlier call with the same scope.
    """
    bias_shape = [vocab_size, 1]
    with tf.variable_scope(variable_scope, reuse=reuse):
        bias_table = tf.get_variable(
            'b',
            shape=bias_shape,
            initializer=tf.zeros_initializer(),
            regularizer=tf.contrib.layers.l2_regularizer(1.)
        )
    looked_up = tf.nn.embedding_lookup(bias_table, x)
    # drop the trailing size-1 axis so the result has the same shape as x
    return tf.squeeze(looked_up, -1)

def fc_layer(x, output_size, variable_scope, reuse=False):
    """
    Fully-connected (affine) layer: x * W + b, with no activation.
    W and b live inside `variable_scope` and are both Xavier-initialised.
    x has shape (batch_size, d_2)
    result has shape (batch_size, output_size)
    """
    dims = get_dynamic_tensor_shape(x)
    assert len(dims) == 2
    input_size = dims[1]
    ## TODO: regularization
    with tf.variable_scope(variable_scope, reuse=reuse):
        weights = tf.get_variable(
            "W",
            shape=[input_size, output_size],
            initializer=tf.contrib.layers.xavier_initializer())
        biases = tf.get_variable(
            "b",
            shape=[output_size],
            initializer=tf.contrib.layers.xavier_initializer())
    return tf.nn.xw_plus_b(x, weights, biases, name="fc")


class PredictionModel(object):
    """
    Pairwise ranking model over user and movie embeddings.
    For training, the inputs are a user_id and a (positive, negative) movie pair;
    the model scores the pair as
        user_embedding . (pos_embedding - neg_embedding) + (pos_bias - neg_bias)
    and a positive score means the positive movie is ranked above the negative one.
    For inference, get_predictions() scores a batch of movies for a single user.
    """
    def __init__(self, num_users, num_movies, embedding_dim, l2_reg_lambda):
        """
        num_users: number of rows of the user embedding table
        num_movies: number of rows of the movie embedding and movie bias tables
        embedding_dim: dimensionality of the user and movie vectors
        l2_reg_lambda: weight of the summed L2 regularization losses in self.loss
        """

        assert num_users >= 1
        self.num_users = num_users
        assert num_movies >= 1
        self.num_movies = num_movies
        assert embedding_dim >= 1
        self.embedding_dim = embedding_dim
        assert l2_reg_lambda >= 0

        # Prediction tensor, built on the first get_predictions() call and then reused
        self._predictions = None

        # Placeholders for input, output and dropout
        self.input_user_id = tf.placeholder(tf.int32, [None], name="input_user_id")
        self.input_pos_neg = tf.placeholder(tf.int32, [None, 2], name="input_pos_neg")
        self.input_movie_id = tf.placeholder(tf.int32, [None], name="input_movie_id")
        #self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") # TODO: do we need dropout?

        # the user batch and the pair batch must have the same length
        asrt1 = tf.assert_equal(tf.shape(self.input_user_id)[0], tf.shape(self.input_pos_neg)[0])

        # embedding lookup layer
        with tf.device('/cpu:0'), tf.name_scope('embedding_lookup'), tf.control_dependencies([asrt1]):
            user_embedding = embedding_lookup_layer(self.input_user_id, num_users, embedding_dim, 'user_embedding')
            movie_embedding = embedding_lookup_layer(self.input_pos_neg, num_movies, embedding_dim, 'movie_embedding')
            movie_bias = bias_lookup_layer(self.input_pos_neg, num_movies, 'movie_embedding')

        # column 0 of input_pos_neg is the positive movie, column 1 the negative one
        pos_embedding = movie_embedding[:,0,:]
        neg_embedding = movie_embedding[:,1,:]
        pos_bias = movie_bias[:,0]
        neg_bias = movie_bias[:,1]

        delta_embedding = pos_embedding - neg_embedding
        delta_bias = pos_bias - neg_bias

        # collaborative prediction layer
        with tf.name_scope('collab_prediction'):
            collab_prediction = tf.reduce_sum(user_embedding * delta_embedding, axis=1) + delta_bias

        # Calculate loss
        with tf.name_scope('collab_loss'):
            #losses = tf.log(tf.sigmoid(-collab_prediction) + 0.01)
            losses = tf.sigmoid(-collab_prediction) ## TODO use log of sigmoid
            self.collab_loss = tf.reduce_mean(losses) # TODO use built-in l2_loss functionality

        with tf.name_scope('loss'):
            # losses registered by the `regularizer=` arguments of the lookup layers
            reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            self.loss = self.collab_loss + l2_reg_lambda * sum(reg_losses)

        # Collab accuracy = ranking accuracy
        with tf.name_scope('collab_accuracy'):
            self.collab_accuracy = tf.reduce_mean(tf.cast(collab_prediction > 0, tf.float32))

    def get_predictions(self):
        """
        Return a tensor with one score per entry of input_movie_id for the single
        user fed through input_user_id (the graph asserts that exactly one
        user id is fed). The score is user_embedding . movie_embedding + movie_bias,
        using the variables created in __init__.

        The ops are built once and cached: repeated calls return the same tensor
        instead of adding a new copy of the ops to the graph on every call.
        """
        if self._predictions is not None:
            return self._predictions
        asrt1 = tf.assert_equal(tf.shape(self.input_user_id)[0], 1)
        with tf.device('/cpu:0'), tf.name_scope('embedding_lookup'), tf.control_dependencies([asrt1]):
            # reuse=True: share the tables trained through __init__'s graph
            user_embedding = embedding_lookup_layer(self.input_user_id, self.num_users, self.embedding_dim, 'user_embedding', True)
            movie_embedding = embedding_lookup_layer(self.input_movie_id, self.num_movies, self.embedding_dim, 'movie_embedding', True)
            movie_bias = bias_lookup_layer(self.input_movie_id, self.num_movies, 'movie_embedding', True)
            prediction = tf.reduce_sum(user_embedding * movie_embedding, axis=1) + movie_bias
            # the two summands of the score, exposed separately for inspection
            self.tmp1 = tf.reduce_sum(user_embedding * movie_embedding, axis=1)
            self.tmp2 = movie_bias
        self._predictions = prediction
        return prediction


In [28]:
# Training
# ==================================================

def train(
    cnn, sess, starter_learning_rate, learning_rate_decay_every, learning_rate_decay_by
):
    """
    Train the model with Adam and a staircase exponentially-decayed learning rate.

    cnn: model exposing loss, collab_loss, collab_accuracy, input_user_id and input_pos_neg
    sess: TensorFlow session in which the model's graph lives
    starter_learning_rate: initial learning rate
    learning_rate_decay_every: number of steps between learning-rate decays
    learning_rate_decay_by: factor the learning rate is multiplied by at every decay

    Reads the module-level FLAGS, ratings, x_train and x_val. Summaries and
    checkpoints are written under ./runs/<unix timestamp>/.

    Returns (accuracy of the last training batch, accuracy of the last validation run).
    Either element is None when the corresponding step never ran.
    """
    # Define Training procedure
    global_step = tf.Variable(0, name="global_step", trainable=False)
    #optimizer = tf.train.AdamOptimizer(1e-3)
    learning_rate = tf.train.exponential_decay(
        starter_learning_rate, global_step, learning_rate_decay_every,
        learning_rate_decay_by, staircase=True)
    optimizer = tf.train.AdamOptimizer(learning_rate)

    grads_and_vars = optimizer.compute_gradients(cnn.loss)
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    # Output directory for models and summaries
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))

    # Summaries for loss and accuracy
    collab_loss_summary = tf.summary.scalar("collab_loss", cnn.collab_loss)
    loss_summary = tf.summary.scalar("loss", cnn.loss)
    acc_summary = tf.summary.scalar("accuracy", cnn.collab_accuracy)
    learning_rate_summary = tf.summary.scalar("learning_rate", learning_rate)

    # Train Summaries
    train_summary_op = tf.summary.merge([collab_loss_summary, loss_summary, acc_summary, learning_rate_summary])
    train_summary_dir = os.path.join(out_dir, "summaries", "train")
    train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

    # Val summaries
    val_summary_op = tf.summary.merge([collab_loss_summary, loss_summary, acc_summary, learning_rate_summary])
    val_summary_dir = os.path.join(out_dir, "summaries", "val")
    val_summary_writer = tf.summary.FileWriter(val_summary_dir, sess.graph)

    # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
    checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

    # Initialize all variables
    sess.run(tf.global_variables_initializer())

    def train_step(batch_user_id, batch_pos_neg, batch_rating):
        """
        A single training step. Returns the batch accuracy measured after the update.
        batch_rating is accepted but not fed to the model.
        """
        feed_dict = {
            cnn.input_user_id: batch_user_id,
            cnn.input_pos_neg: batch_pos_neg,
            #cnn.input_rating: batch_rating,
            #cnn.input_train_rating: 0.1,
            #cnn.dropout_keep_prob: FLAGS.dropout_keep_prob,
        }
        # accuracy on this batch before the update, reported as "before->after"
        (pretrain_collab_accuracy,) = sess.run([cnn.collab_accuracy], feed_dict)
        sess.run(train_op, feed_dict)
        step, loss, collab_accuracy, rate = sess.run(
            [global_step, cnn.loss, cnn.collab_accuracy, learning_rate],
            feed_dict)
        if step % FLAGS.summary_every == 0:
            summaries = sess.run(train_summary_op, feed_dict)
            train_summary_writer.add_summary(summaries, step)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, collab_acc {:g}->{:g}, rate {:g}".format(
                time_str, step, loss, pretrain_collab_accuracy, collab_accuracy, rate))
        return collab_accuracy

    def val_step(batch_user_id, batch_pos_neg, batch_rating, writer=None):
        """
        Evaluates model on a val set and returns its accuracy.
        batch_rating is accepted but not fed to the model.
        """
        feed_dict = {
            cnn.input_user_id: batch_user_id,
            cnn.input_pos_neg: batch_pos_neg,
            #cnn.input_rating: batch_rating,
            #cnn.input_train_rating: 0.1,
        }
        step, summaries, loss, collab_accuracy = sess.run(
            [global_step, val_summary_op, cnn.loss, cnn.collab_accuracy],
            feed_dict)
        time_str = datetime.datetime.now().isoformat()
        print("{}: step {}, loss {:g}, collab_acc {:g}".format(
            time_str, step, loss, collab_accuracy))
        if writer:
            writer.add_summary(summaries, step)
        return collab_accuracy

    # Generate batches
    batches = ratings.batch_iter(x_train, FLAGS.batch_size, FLAGS.num_epochs)
    # Both results start as None so the return below is defined even when
    # the batch iterator yields nothing.
    last_collab_accuracy = None
    last_test_collab_accuracy = None
    try:
        # Training loop. For each batch...
        for batch_user_id, batch_pos_neg, batch_rating in batches:
            last_collab_accuracy = train_step(batch_user_id, batch_pos_neg, batch_rating)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % FLAGS.evaluate_every == 0:
                print("\nEvaluation:")
                # one batch holding the whole validation set (with freshly drawn negatives)
                ((val_user_id, val_pos_neg, val_rating),) = ratings.batch_iter(x_val, len(x_val), 1)
                if len(x_val) > 1024:
                    val_user_id, val_pos_neg, val_rating = val_user_id[:1024], val_pos_neg[:1024], val_rating[:1024]
                last_test_collab_accuracy = \
                    val_step(val_user_id, val_pos_neg, val_rating, writer=val_summary_writer)
                print("")
            if current_step % FLAGS.checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))
    finally:
        # flush pending events and release the event files, also on error/interrupt
        train_summary_writer.close()
        val_summary_writer.close()
    return (last_collab_accuracy, last_test_collab_accuracy)


In [29]:
def calc_precision(model, sess, n=50):
    """
    Rank each validation user's held-out movie among all movies the user has
    not rated in the training set, and aggregate the ranks.

    model: model exposing input_user_id, input_movie_id and get_predictions()
    sess: TensorFlow session holding the trained variables
    n: evaluate only the first n rows of the module-level x_val (capped at len(x_val))

    Reads the module-level ratings, x_train and x_val.

    Returns (mrr, mrr_at_10, precision_at_10), each averaged over the evaluated users:
        mrr             - mean of 1/rank
        mrr_at_10       - mean of 1/rank, counting only ranks <= 10
        precision_at_10 - fraction of users whose held-out movie is ranked <= 10
    Returns (0., 0., 0.) when there is nothing to evaluate.
    """
    num_eval = min(n, len(x_val))
    if num_eval <= 0:
        return 0., 0., 0.

    # Build the prediction op once; creating it inside the loop would add a
    # new copy of the ops to the graph for every user.
    predictions = model.get_predictions()
    num_movies = ratings.id_assigner.get_next_id()
    batch_movie_id = np.arange(num_movies)

    mrr = 0.
    mrr_at_10 = 0
    precision_at_10 = 0
    for i in range(num_eval):
        user_id, movie_id, rating = x_val[i]
        if i % 50 == 0:
            print('{}...'.format(i))
        feed_dict = {
            model.input_user_id: [user_id],
            model.input_movie_id: batch_movie_id,
        }
        scores = sess.run(predictions, feed_dict=feed_dict)
        s = scores[movie_id] # the score for the correct movie
        # movies from the user's training set do not compete for a rank
        train_movies = x_train['movie_id'][x_train['user_id'] == user_id]
        not_watched = np.ones(scores.shape, dtype=bool)
        not_watched[train_movies] = False
        # rank = 1 + number of competing movies with a strictly higher score
        rank = np.sum((scores > s) & not_watched) + 1
        mrr += 1. / rank
        if rank <= 10:
            mrr_at_10 += 1. / rank
            precision_at_10 += 1
    mrr /= num_eval
    mrr_at_10 /= num_eval
    precision_at_10 /= num_eval
    return mrr, mrr_at_10, precision_at_10

In [30]:
def runall():
    """
    Train and evaluate one model per regularization weight and collect the results.

    For every l2_reg_lambda a fresh graph and session are created, the model is
    trained with train() and ranked with calc_precision(); a short log of each
    run is appended to results.txt in the current directory.

    Reads the module-level FLAGS, num_users and num_movies.

    Returns the 4-tuple (res, res_mrr, res_mrr10, res_p10) of dicts keyed by l2_reg_lambda:
        res       - list of train() results, (train accuracy, val accuracy)
        res_mrr   - MRR
        res_mrr10 - MRR@10
        res_p10   - precision@10 as computed by calc_precision()
    """
    res = defaultdict(list)
    res_mrr = defaultdict(list)
    res_mrr10 = defaultdict(list)
    res_p10 = defaultdict(list)
    with open('results.txt', 'a') as f:
        for l2_reg_lambda in [0.]:
            with tf.Graph().as_default():
                session_conf = tf.ConfigProto(
                    allow_soft_placement=FLAGS.allow_soft_placement,
                    log_device_placement=FLAGS.log_device_placement)
                session_conf.gpu_options.allow_growth=True
                # `with tf.Session(...)` makes the session the default one AND closes it
                # on exit; `sess.as_default()` alone would leave it (and its resources) open.
                with tf.Session(config=session_conf) as sess:
                    model = PredictionModel(
                        num_users=num_users,
                        num_movies=num_movies,
                        embedding_dim=FLAGS.embedding_dim,
                        l2_reg_lambda=l2_reg_lambda)
                    for i in range(1):
                        f.write('lambda: {}\n'.format(l2_reg_lambda))
                        last_accuracy = train(model, sess, 3e-3, 20000, 0.5)
                        f.write('accuracy: {}\n'.format(last_accuracy))
                        res[l2_reg_lambda].append(last_accuracy)
                        mrr, mrr10, p10 = calc_precision(model, sess)
                        f.write(repr((mrr, p10)) + '\n')
                        res_mrr[l2_reg_lambda] = mrr
                        res_p10[l2_reg_lambda] = p10
                        res_mrr10[l2_reg_lambda] = mrr10
                        f.write('\n')
                        f.flush()
    return res, res_mrr, res_mrr10, res_p10

In [31]:
# Train and evaluate; res is the 4-tuple (res, res_mrr, res_mrr10, res_p10) returned by runall()
res = runall()

Writing to /home/tvromen/research/subtitles2/runs/1510229389

2017-11-09T14:09:51.013176: step 100, loss 0.496787, collab_acc 0.492188->0.648438, rate 0.003
2017-11-09T14:09:52.030435: step 200, loss 0.488271, collab_acc 0.585938->0.679688, rate 0.003
2017-11-09T14:09:53.028962: step 300, loss 0.473816, collab_acc 0.625->0.71875, rate 0.003
2017-11-09T14:09:54.042243: step 400, loss 0.43884, collab_acc 0.726562->0.78125, rate 0.003
2017-11-09T14:09:55.038706: step 500, loss 0.383724, collab_acc 0.695312->0.726562, rate 0.003
2017-11-09T14:09:55.923697: step 600, loss 0.333434, collab_acc 0.78125->0.820312, rate 0.003
2017-11-09T14:09:56.925698: step 700, loss 0.310399, collab_acc 0.757812->0.78125, rate 0.003
2017-11-09T14:09:58.129416: step 800, loss 0.275027, collab_acc 0.796875->0.8125, rate 0.003
2017-11-09T14:09:59.029466: step 900, loss 0.262865, collab_acc 0.796875->0.8125, rate 0.003
2017-11-09T14:09:59.984243: step 1000, loss 0.233985, collab_acc 0.828125->0.828125, rate 0.003

In [32]:
# 100K lines
# all star ratings are considered watched
# with regularization
# code updated to also compute MRR, MRR@10 and precision@10
# Printed tuple: (train/val accuracy per lambda, MRR, MRR@10, precision@10)
print(res)

(defaultdict(<class 'list'>, {0.0: [(0.94623655, 0.8062678)]}), defaultdict(<class 'list'>, {0.0: 0.054445933646771388}), defaultdict(<class 'list'>, {0.0: 0.044857142857142859}), defaultdict(<class 'list'>, {0.0: 0.14}))


In [70]:
# 1M lines
# all star ratings are considered watched
# with regularization
# code updated to also compute MRR, MRR@10 and precision@10
# NOTE(review): FLAGS.max_lines is 100000 in the config cell, so this output is
# presumably from a run with max_lines raised to 1M - confirm before comparing.
print(res)

(defaultdict(<class 'list'>, {0.0: [(0.83673465, 0.77148438)]}), defaultdict(<class 'list'>, {0.0: 0.048712822395037852}), defaultdict(<class 'list'>, {0.0: 0.035190476190476189}), defaultdict(<class 'list'>, {0.0: 0.12}))


In [71]:
# 1M lines
# all star ratings are considered watched
# with regularization
# NOTE(review): out-of-order execution count, and the output below is a single
# dict rather than the 4-tuple the current runall() returns - presumably produced
# by an earlier version of the code; confirm before relying on it.
res

defaultdict(list,
            {0.0: [(0.97192383, 0.75740814)], 1e-06: [(0.9397583, 0.7156477)]})

In [64]:
# 1M lines
# only star ratings of 3.0 and above are considered watched
# added regularization
# NOTE(review): the rating < 3.0 filter is commented out in RatingsData._load_file
# and the output below is a single dict, not the current 4-tuple - presumably from
# an earlier version of the code; confirm before relying on it.
res

defaultdict(list,
            {2e-06: [(0.824646, 0.71223354)], 2e-05: [(0.546875, 0.46242237)]})

In [61]:
# 1M lines
# only star ratings of 3.0 and above are considered watched
# added regularization
# NOTE(review): the rating < 3.0 filter is commented out in RatingsData._load_file
# and the output below is a single dict, not the current 4-tuple - presumably from
# an earlier version of the code; confirm before relying on it.
res

defaultdict(list,
            {0.0: [(0.86505127, 0.7810421)],
             0.0001: [(0.53125, 0.48535061)],
             0.0002: [(0.5234375, 0.4638834)],
             0.0005: [(0.5703125, 0.48535156)],
             0.001: [(0.53106689, 0.50292969)],
             0.002: [(0.5234375, 0.46480083)],
             0.005: [(0.5, 0.46386719)],
             0.01: [(0.5234375, 0.48632812)],
             0.02: [(0.5546875, 0.50780296)],
             0.05: [(0.59375, 0.50976562)],
             0.1: [(0.5703125, 0.5092907)]})

In [31]:
# 1M lines
# only star ratings of 3.0 and above are considered watched
# NOTE(review): the rating < 3.0 filter is commented out in RatingsData._load_file
# and the output below is a single dict, not the current 4-tuple - presumably from
# an earlier version of the code; confirm before relying on it.
res

defaultdict(list, {0: [(0.8729248, 0.78455317)]})

In [23]:
# 1M lines
# all star ratings are considered watched
# NOTE(review): out-of-order execution count, and the output below is a single
# dict rather than the 4-tuple the current runall() returns - presumably produced
# by an earlier version of the code; confirm before relying on it.
res

defaultdict(list, {0: [(0.96618652, 0.77198738)]})