In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cPickle
%matplotlib inline 

In [2]:
def reader(addr,Data,userDict):
    """Parse one ``combined_data_*.txt`` file and append its ratings to ``Data``.

    The file alternates header lines of the form ``<movieID>:`` with rating
    lines of the form ``<userID>,<rating>,<YYYY-MM-DD>``.

    Parameters
    ----------
    addr : str
        Path of the text file to read.
    Data : list
        Accumulator, extended in place with one
        ``[rating, movie, user, year, month, day]`` row per rating line.
    userDict : dict
        Maps the raw user-ID string to a dense 0-based integer. Raw user IDs
        range from 1 to 2649429 with gaps; unseen users are added in place so
        the remapped IDs have no gaps (better for the embeddings used later).

    Returns
    -------
    tuple
        ``(Data, userDict)`` -- the same two objects that were passed in.

    Raises
    ------
    ValueError
        If a rating line appears before any ``<movieID>:`` header.
    """
    movie = None
    with open(addr) as f:
        for line in f:
            line = line.strip()
            if not line:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            if ':' in line:
                # header line: movie IDs are 1-based in the file, stored 0-based
                movie = np.int32(int(line.rsplit(':', 1)[0])) - 1
            else:
                if movie is None:
                    raise ValueError(
                        "rating line found before any movie header in {}".format(addr))
                user,rating,date = line.split(',')
                # first time a user is seen it receives the next free dense index
                user = userDict.setdefault(user, len(userDict))
                # explicit list (not map) so the concatenation also works on Python 3
                Data.append([np.int32(int(rating)),movie,user]
                            + [np.int32(int(part)) for part in date.split('-')])
    return Data,userDict

In [3]:
# Accumulators shared by every reader() call: the rating rows and the raw-userID -> dense-index map.
Data,userDict = [],{}

In [4]:
# Parse the four raw rating files in order, growing the shared accumulators.
for part in (1, 2, 3, 4):
    path = 'Netflix Data/combined_data_{}.txt'.format(part)
    Data,userDict = reader(path,Data,userDict)

In [5]:
# number of distinct users seen across all files
len(userDict)

480189

In [6]:
# total number of rating rows loaded
len(Data)

100480507

<br>

As the competition is closed, I have to make my own test dataset. Since there is no other information about users or movies, every user and movie in the test set must also appear in the training set. Given that there are many more users than movies, I sample movies (ratings) for each user. Note that in reality the test distribution is not necessarily the same as the training distribution, because users and movies shift over time.

In [7]:
# fix the global NumPy RNG so the random draws below (and hence the train/test split) are repeatable
np.random.seed(7)

In [8]:
# NOTE: rebinds Data from a list of rows to a DataFrame with named columns
Data = pd.DataFrame(Data,columns=['rating','movie','user','year','month','day'])

In [9]:
# one uniform random draw per rating; filter_ splits each user's rows on this column
Data['seed']=np.random.rand(Data.shape[0])

In [10]:
def filter_(x,p,lessThan):
    """Split the rows of ``x`` at the ``p``-quantile of its ``seed`` column.

    Returns the rows whose seed is strictly below the quantile when
    ``lessThan`` is true, otherwise the rows at or above it. A single-row
    frame therefore always lands on the "at or above" side.
    """
    threshold = x.seed.quantile(p)
    if lessThan:
        return x[x.seed < threshold]
    return x[x.seed >= threshold]

In [11]:
# Test split: within each user, keep the rows whose seed is below that user's 0.2 seed-quantile.
dataTest=Data.groupby('user',sort=False).apply(lambda x:filter_(x,0.2,True))

In [12]:
# Train split: the complementary rows (seed at or above the per-user 0.2 quantile).
dataTrain=Data.groupby('user',sort=False).apply(lambda x:filter_(x,0.2,False))

In [13]:
# row counts of the train and test splits
print("{} {}".format(dataTrain.shape[0], dataTest.shape[0]))

80287772 20192735


Check that the test user/movie sets are contained within the train user/movie sets

In [14]:
# every test user must also occur in the training split
set(dataTest.user).issubset(set(dataTrain.user))

True

In [15]:
# every test movie must also occur in the training split
set(dataTest.movie).issubset(set(dataTrain.movie))

True

In [16]:
# Convert both splits to plain arrays: five integer feature columns and a float32 rating target.
feature_cols = ['movie','user','year','month','day']
Xtrain, ytrain = dataTrain[feature_cols].values, dataTrain['rating'].values.astype(np.float32)
Xtest, ytest = dataTest[feature_cols].values, dataTest['rating'].values.astype(np.float32)

In [17]:
# drop the DataFrames; the cells below only use the NumPy arrays
del dataTrain,dataTest,Data

In [18]:
# Persist the four arrays so later sessions can skip the parsing and splitting above.
for fname, arr in ((r"Xtrain.pickle", Xtrain),
                   (r"ytrain.pickle", ytrain),
                   (r"Xtest.pickle", Xtest),
                   (r"ytest.pickle", ytest)):
    with open(fname, "wb") as output_file:
        cPickle.dump(arr, output_file)

In [2]:
# Reload the cached arrays written by the dump cell (only unpickle files you created yourself).
with open(r"Xtrain.pickle", "rb") as pickled:
    Xtrain = cPickle.load(pickled)
with open(r"ytrain.pickle", "rb") as pickled:
    ytrain = cPickle.load(pickled)
with open(r"Xtest.pickle", "rb") as pickled:
    Xtest = cPickle.load(pickled)
with open(r"ytest.pickle", "rb") as pickled:
    ytest = cPickle.load(pickled)

In [71]:
# baseline MSE (not RMSE: no square root is taken) of always predicting the mean training rating
np.mean((ytest - ytrain.mean())**2)

1.1776123

<br>

inner product Model

In [3]:
batch_size = 2000
cells_dim = 51 # first dim is intercept for user/movie
learning_rate = 1e-3
epoch = 5
# Column 1 of Xtrain is the user index, column 0 the movie index. Both are
# 0-based, so a lookup table needs max_index + 1 rows; using the bare maximum
# leaves the highest user and movie index outside the embedding table.
V_user = int(np.max(Xtrain[:,1])) + 1
V_movie = int(np.max(Xtrain[:,0])) + 1

In [4]:
# start from an empty default graph before building the model
tf.reset_default_graph()

In [5]:
# X carries (movie, user) index pairs and Y the matching targets; the batch dimension is fixed.
X = tf.placeholder(tf.int32, shape=[batch_size, 2], name='X')
Y = tf.placeholder(tf.float32, shape=[batch_size], name='Y')

In [6]:
# Trainable lookup tables with V_user / V_movie rows of width cells_dim, Xavier-initialised.
embedding_user = tf.get_variable(
    "embedding_user",
    shape=[V_user, cells_dim],
    initializer=tf.contrib.layers.xavier_initializer())
embedding_movie = tf.get_variable(
    "embedding_movie",
    shape=[V_movie, cells_dim],
    initializer=tf.contrib.layers.xavier_initializer())

In [7]:
# column 1 of X selects rows of the user table, column 0 rows of the movie table
X_user = tf.nn.embedding_lookup(embedding_user,X[:,1])
X_movie = tf.nn.embedding_lookup(embedding_movie,X[:,0])

In [9]:
# Global bias, initialised at the mean training rating.
b0 = np.mean(ytrain,dtype=np.float32)
b = tf.get_variable("b", initializer=tf.constant(b0))

In [10]:
# (disabled alternative: squash the score with a sigmoid and rescale it into the range (1, 5))
# yhat = tf.nn.sigmoid(b+X_user[:,0]+X_movie[:,0]+tf.einsum('nd,nd->n',X_user,X_movie)) * 4 + 1
# Prediction = global bias + user intercept + movie intercept + inner product of the remaining
# embedding dimensions (column 0 of each embedding serves as the intercept).
yhat = b+X_user[:,0]+X_movie[:,0]+tf.einsum('nd,nd->n',X_user[:,1:],X_movie[:,1:])

In [11]:
# mean squared error over the batch
cost = tf.reduce_mean((Y-yhat)**2)

In [12]:
# Adam update step that minimises the batch cost
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(cost)

Training

In [3]:
def batch_iterator(X,y,batch_size,shuffle=True):
    """Yield ``(X_batch, y_batch)`` mini-batches of exactly ``batch_size`` rows.

    Parameters
    ----------
    X, y : numpy arrays indexed along their first axis; rows of ``X`` and
        entries of ``y`` at the same position belong together.
    batch_size : int
        Number of rows per batch.
    shuffle : bool
        If true, visit the rows in a fresh random order (global NumPy RNG);
        otherwise in their stored order.

    Only ``X.shape[0] // batch_size`` full batches are produced; a trailing
    partial batch is dropped because the graph placeholders have a fixed
    batch dimension.
    """
    n = X.shape[0]
    # np.arange instead of range(n): avoids materialising an n-element Python
    # list (Python 2) just to slice it for the sequential case
    index = np.random.permutation(n) if shuffle else np.arange(n)
    for start in range(0, n - batch_size + 1, batch_size):
        rows = index[start:start+batch_size]
        yield X[rows],y[rows]

In [15]:
# open a session and initialise every variable defined in the graph so far
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [16]:
# number of full batches per pass; used to average the summed batch losses
n_train = Xtrain.shape[0]//batch_size
n_test = Xtest.shape[0]//batch_size

In [67]:
for epoch_idx in range(epoch):
    # optimisation pass over shuffled training batches
    train_total = 0
    for batch_X,batch_Y in batch_iterator(Xtrain[:,:2],ytrain,batch_size):
        train_total += sess.run([train_op,cost],{X:batch_X,Y:batch_Y})[1]

    # evaluation pass (no update op) over the held-out batches in stored order
    test_total = 0
    for batch_X,batch_Y in batch_iterator(Xtest[:,:2],ytest,batch_size,False):
        test_total += sess.run(cost,{X:batch_X,Y:batch_Y})

    print("Train loss:{}, Val loss:{}".format(train_total/n_train,test_total/n_test))

Train loss:0.834914844385, Val loss:0.715698077909
Train loss:0.658083238892, Val loss:0.686548642687
Train loss:0.593707999817, Val loss:0.692291251615
Train loss:0.561786555653, Val loss:0.702673189681
Train loss:0.542969985705, Val loss:0.712520237091


In [17]:
for epoch_idx in range(epoch):
    # optimisation pass over shuffled training batches
    train_total = 0
    for batch_X,batch_Y in batch_iterator(Xtrain[:,:2],ytrain,batch_size):
        train_total += sess.run([train_op,cost],{X:batch_X,Y:batch_Y})[1]

    # evaluation pass (no update op) over the held-out batches in stored order
    test_total = 0
    for batch_X,batch_Y in batch_iterator(Xtest[:,:2],ytest,batch_size,False):
        test_total += sess.run(cost,{X:batch_X,Y:batch_Y})

    print("Train loss:{}, Val loss:{}".format(train_total/n_train,test_total/n_test))

Train loss:0.774577310152, Val loss:0.704337591423
Train loss:0.643941695542, Val loss:0.688827670718
Train loss:0.592241509148, Val loss:0.696242182505
Train loss:0.566813218013, Val loss:0.705526035515
Train loss:0.551550970261, Val loss:0.714033491024


MLP instead of inner product

In [184]:
batch_size = 2000
cells_dim = 50
learning_rate = 1e-3
epoch = 5
# Column 1 of Xtrain is the user index, column 0 the movie index. Both are
# 0-based, so a lookup table needs max_index + 1 rows; using the bare maximum
# leaves the highest user and movie index outside the embedding table.
V_user = int(np.max(Xtrain[:,1])) + 1
V_movie = int(np.max(Xtrain[:,0])) + 1

In [185]:
# discard the previous model's graph so the variable names below can be reused
tf.reset_default_graph()

In [186]:
# X carries (movie, user) index pairs and Y the matching targets; the batch dimension is fixed.
X = tf.placeholder(tf.int32, shape=[batch_size, 2], name='X')
Y = tf.placeholder(tf.float32, shape=[batch_size], name='Y')

In [187]:
# Trainable lookup tables with V_user / V_movie rows of width 2*cells_dim, Xavier-initialised.
embedding_user = tf.get_variable(
    "embedding_user",
    shape=[V_user, cells_dim*2],
    initializer=tf.contrib.layers.xavier_initializer())
embedding_movie = tf.get_variable(
    "embedding_movie",
    shape=[V_movie, cells_dim*2],
    initializer=tf.contrib.layers.xavier_initializer())

In [188]:
# column 1 of X selects rows of the user table, column 0 rows of the movie table
X_user = tf.nn.embedding_lookup(embedding_user,X[:,1])
X_movie = tf.nn.embedding_lookup(embedding_movie,X[:,0])
# network input: user and movie embeddings side by side (width 4*cells_dim)
X0 = tf.concat([X_user,X_movie],1)

In [189]:
# Hidden layer: 4*cells_dim inputs -> cells_dim ReLU units.
w0_init = tf.truncated_normal([cells_dim*4,cells_dim], stddev=1.0 / np.sqrt(cells_dim*4))
weights0 = tf.Variable(w0_init,name='weights0')
biases0 = tf.Variable(tf.zeros([cells_dim]),name='biases0')
X1 = tf.nn.relu(tf.matmul(X0, weights0) + biases0)

In [190]:
# Output layer: cells_dim units -> one score per row; its bias starts at the mean training rating.
w1_init = tf.truncated_normal([cells_dim,1], stddev=1.0 / np.sqrt(cells_dim))
weights1 = tf.Variable(w1_init,name='weights1')
b0 = np.mean(ytrain,dtype=np.float32)
biases1 = tf.get_variable("biases1", initializer=tf.constant(b0))
yhat = tf.squeeze(tf.matmul(X1, weights1) + biases1)

In [191]:
# mean squared error over the batch
cost = tf.reduce_mean((Y-yhat)**2)

In [192]:
# Adam optimiser with the learning rate set in the hyper-parameter cell
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

In [193]:
# update step that minimises the batch cost
train_op = optimizer.minimize(cost)

In [194]:
# open a session and initialise every variable defined in the graph so far
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [195]:
for epoch_idx in range(epoch):
    # optimisation pass over shuffled training batches
    train_total = 0
    for batch_X,batch_Y in batch_iterator(Xtrain[:,:2],ytrain,batch_size):
        train_total += sess.run([train_op,cost],{X:batch_X,Y:batch_Y})[1]

    # evaluation pass (no update op) over the held-out batches in stored order
    test_total = 0
    for batch_X,batch_Y in batch_iterator(Xtest[:,:2],ytest,batch_size,False):
        test_total += sess.run(cost,{X:batch_X,Y:batch_Y})

    print("Train loss:{}, Val loss:{}".format(train_total/n_train,test_total/n_test))

Train loss:0.800466296374, Val loss:0.749603450331
Train loss:0.716299728379, Val loss:0.719147275345
Train loss:0.676469943251, Val loss:0.710465194867
Train loss:0.6483862691, Val loss:0.706165498421
Train loss:0.62693416324, Val loss:0.708447006775


Missing information feature extraction

In [4]:
batch_size = 2000
cells_dim = 50
learning_rate = 1e-3
epoch = 5
# Column 1 of Xtrain is the user index, column 0 the movie index. Both are
# 0-based, so a lookup table needs max_index + 1 rows; using the bare maximum
# leaves the highest user and movie index outside the embedding table (and
# outside the range the uniform negative sampler draws from).
V_user = int(np.max(Xtrain[:,1])) + 1
V_movie = int(np.max(Xtrain[:,0])) + 1
p = .5 # positive sampling prob
d = 1 # negatives per positive for batch_iterator_missing2

In [5]:
# discard the previous model's graph before building the two-stage model
tf.reset_default_graph()

In [6]:
# X carries (movie, user) index pairs; Y is the 0/1 "pair was observed" label for stage one.
X = tf.placeholder(tf.int32, shape=[batch_size, 2], name='X')
Y = tf.placeholder(tf.float32, shape=[batch_size], name='Y')

In [7]:
# Stage-one lookup tables (trained on the observed-vs-sampled task), Xavier-initialised.
embedding_user0 = tf.get_variable(
    "embedding_user0",
    shape=[V_user, cells_dim],
    initializer=tf.contrib.layers.xavier_initializer())
embedding_movie0 = tf.get_variable(
    "embedding_movie0",
    shape=[V_movie, cells_dim],
    initializer=tf.contrib.layers.xavier_initializer())

In [8]:
# column 1 of X selects rows of the user table, column 0 rows of the movie table
X_user = tf.nn.embedding_lookup(embedding_user0,X[:,1])
X_movie = tf.nn.embedding_lookup(embedding_movie0,X[:,0])

In [9]:
# Bias of the logit, initialised at the log-odds of the positive-sampling probability p.
b0 = np.log(p/(1-p)).astype(np.float32)
b = tf.get_variable("b", initializer=tf.constant(b0))

In [10]:
# logit = bias + user intercept + movie intercept + inner product of the remaining dimensions
yhat = b + X_user[:,0]+X_movie[:,0]+tf.einsum('nd,nd->n',X_user[:,1:],X_movie[:,1:])

In [11]:
# binary cross-entropy between the 0/1 labels and the logits, averaged over the batch
cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=Y,logits=yhat))

In [12]:
# Adam update step for the stage-one (observed-vs-sampled) cost
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(cost)

In [13]:
def batch_iterator_missing(X,batch_size,V_user,V_movie, p=0.5,shuffle=True):
    """Yield ``(pairs, labels)`` batches for the observed-vs-sampled task.

    Each batch stacks ``k = int(batch_size*p)`` real rows of ``X`` (label 1)
    on top of ``batch_size - k`` synthetic rows (label 0) whose first column
    is drawn uniformly from ``[0, V_movie)`` and second column uniformly from
    ``[0, V_user)``. A synthetic pair may coincide with a real one; no
    rejection is done.

    Parameters
    ----------
    X : array of shape (n, 2) whose columns match the (movie, user) order of
        the synthetic rows -- assumed from the callers, which pass
        ``Xtrain[:,:2]`` / ``Xtest[:,:2]``.
    batch_size : int, total rows per yielded batch.
    V_user, V_movie : exclusive upper bounds for the sampled indices.
    p : fraction of each batch taken from ``X``.
    shuffle : visit the rows of ``X`` in random order when true.

    Yields ``n // k`` batches. The label array is the same object on every
    iteration, so consumers must not modify it in place.
    """
    n = X.shape[0]
    k = int(batch_size*p)
    m = batch_size - k
    y = np.zeros(batch_size,dtype=np.float32)
    y[:k] = 1.0
    # np.arange instead of range(n): avoids an n-element Python list on Python 2
    index = np.random.permutation(n) if shuffle else np.arange(n)

    for batch_no in range(n//k):
        start = batch_no*k
        positives = X[index[start:start+k]]
        negatives = np.c_[np.random.randint(0,high=V_movie,size=m),
                          np.random.randint(0,high=V_user,size=m)]
        yield np.r_[positives,negatives],y

In [14]:
def batch_iterator_missing2(X,batch_size,unigram,d=1,shuffle=True):
    """Yield ``(pairs, labels)`` batches with popularity-weighted negatives.

    Each batch stacks ``k = batch_size // (d+1)`` real rows of ``X`` (label 1)
    on top of ``batch_size - k`` synthetic rows (label 0). A synthetic row
    reuses the user of one of the batch's real rows (each user repeated ``d``
    times) and pairs it with a movie index drawn from ``unigram``.

    Parameters
    ----------
    X : array of shape (n, 2); column 1 is reused as the user of the
        negatives, column 0 is assumed to be the movie index.
    batch_size : int, total rows per yielded batch.
    unigram : 1-D probability vector. The sampled position is used directly
        as the movie index, so ``unigram[i]`` must be the probability of
        movie index ``i`` -- NOTE(review): confirm the caller builds it so.
    d : number of negatives per positive.
    shuffle : visit the rows of ``X`` in random order when true.

    Yields ``n // k`` batches. The label array is the same object on every
    iteration, so consumers must not modify it in place.
    """
    n = X.shape[0]
    # floor division: keeps k an int on Python 3 as well
    k = batch_size//(d+1)
    m = batch_size - k
    y = np.zeros(batch_size,dtype=np.float32)
    y[:k] = 1.0
    index = np.random.permutation(n) if shuffle else np.arange(n)

    for batch_no in range(n//k):
        start = batch_no*k
        rows = index[start:start+k]
        neg_movies = np.random.choice(unigram.shape[0],m,p=unigram)
        # k*d users are available but m are needed; the two differ when
        # batch_size is not a multiple of d+1, so cycle through the repeated
        # users to fill the gap (a no-op when m == k*d)
        neg_users = np.resize(np.repeat(X[rows,1],d),m)
        yield np.r_[X[rows],np.c_[neg_movies,neg_users]],y

In [15]:
# Rating count per movie index. np.bincount places the count of movie i at
# position i (0 for an index that never occurs), whereas the counts returned by
# np.unique are positioned by rank and would shift if any index were missing.
unigram = np.bincount(Xtrain[:,0])

In [26]:
# turn the raw counts into a probability vector (sums to 1) for np.random.choice
unigram = 1.0*unigram/np.sum(unigram)

<br>

In [16]:
# Stage-two lookup tables, trained together with the MLP on the rating task.
embedding_user1 = tf.get_variable(
    "embedding_user1",
    shape=[V_user, cells_dim],
    initializer=tf.contrib.layers.xavier_initializer())
embedding_movie1 = tf.get_variable(
    "embedding_movie1",
    shape=[V_movie, cells_dim],
    initializer=tf.contrib.layers.xavier_initializer())

In [17]:
# column 1 of X selects rows of the user table, column 0 rows of the movie table
X_user1 = tf.nn.embedding_lookup(embedding_user1,X[:,1])
X_movie1 = tf.nn.embedding_lookup(embedding_movie1,X[:,0])
# network input: stage-one and stage-two embeddings side by side (width 4*cells_dim)
X0 = tf.concat([X_user,X_movie,X_user1,X_movie1],1)

In [18]:
# Hidden layer: 4*cells_dim inputs -> cells_dim ReLU units.
w0_init = tf.truncated_normal([cells_dim*4,cells_dim], stddev=1.0 / np.sqrt(cells_dim*4))
weights0 = tf.Variable(w0_init,name='weights0')
biases0 = tf.Variable(tf.zeros([cells_dim]),name='biases0')
X1 = tf.nn.relu(tf.matmul(X0, weights0) + biases0)

In [19]:
# Output layer: cells_dim units -> one score per row; its bias starts at the mean training rating.
w1_init = tf.truncated_normal([cells_dim,1], stddev=1.0 / np.sqrt(cells_dim))
weights1 = tf.Variable(w1_init,name='weights1')
b0 = np.mean(ytrain,dtype=np.float32)
biases1 = tf.get_variable("biases1", initializer=tf.constant(b0))
yhat2 = tf.squeeze(tf.matmul(X1, weights1) + biases1)

In [20]:
# Y2 holds the ratings for stage two; cost2 is the batch mean squared error of yhat2
Y2 = tf.placeholder(tf.float32, [batch_size, ], name='Y2')
cost2 = tf.reduce_mean((Y2-yhat2)**2)

In [21]:
# Stage-two update step: only the variables listed here are trained, so the
# stage-one tables (embedding_user0 / embedding_movie0) and b stay fixed.
stage2_vars = [embedding_user1,embedding_movie1,weights0,biases0,weights1,biases1]
optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op2 = optimizer2.minimize(cost2,var_list=stage2_vars)

pre-training

In [179]:
# open a session and initialise every variable defined in the graph so far
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [180]:
# batches per pass of batch_iterator_missing: each batch consumes int(batch_size*p) real rows
n_train = Xtrain.shape[0]//int(batch_size*p)
n_test = Xtest.shape[0]//int(batch_size*p)

In [181]:
for epoch_idx in range(epoch):
    # optimisation pass: real pairs mixed with uniformly sampled pairs
    train_total = 0
    for batch_X,batch_Y in batch_iterator_missing(Xtrain[:,:2],batch_size,V_user,V_movie,p=p):
        train_total += sess.run([train_op,cost],{X:batch_X,Y:batch_Y})[1]

    # evaluation pass (no update op); real rows in stored order, sampled rows still random
    test_total = 0
    for batch_X,batch_Y in batch_iterator_missing(Xtest[:,:2],batch_size,V_user,V_movie,p=p,shuffle=False):
        test_total += sess.run(cost,{X:batch_X,Y:batch_Y})

    print("Train loss:{}, Val loss:{}".format(train_total/n_train,test_total/n_test))

Train loss:0.252721631242, Val loss:0.21166219235
Train loss:0.196951907906, Val loss:0.192961629696
Train loss:0.180811077684, Val loss:0.188070224141
Train loss:0.172494275106, Val loss:0.188060602753
Train loss:0.16777400367, Val loss:0.189531833657


pre-training with unigram instead of uniform

In [22]:
# open a session and (re-)initialise every variable in the graph
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [23]:
# batches per pass of batch_iterator_missing2: each batch consumes
# batch_size // (d+1) real rows (floor division keeps this an int on Python 3 too)
n_train = Xtrain.shape[0]//(batch_size//(d+1))
n_test = Xtest.shape[0]//(batch_size//(d+1))

In [29]:
for epoch_idx in range(epoch):
    # optimisation pass: real pairs mixed with unigram-sampled movies for the same users
    train_total = 0
    for batch_X,batch_Y in batch_iterator_missing2(Xtrain[:,:2],batch_size,unigram,d=d):
        train_total += sess.run([train_op,cost],{X:batch_X,Y:batch_Y})[1]

    # evaluation pass (no update op); real rows in stored order, sampled movies still random
    test_total = 0
    for batch_X,batch_Y in batch_iterator_missing2(Xtest[:,:2],batch_size,unigram,d=d,shuffle=False):
        test_total += sess.run(cost,{X:batch_X,Y:batch_Y})

    print("Train loss:{}, Val loss:{}".format(train_total/n_train,test_total/n_test))

Train loss:0.605316479445, Val loss:0.57045159266
Train loss:0.555575274415, Val loss:0.556116987547
Train loss:0.539932697089, Val loss:0.55208317317
Train loss:0.531780930668, Val loss:0.551386037964
Train loss:0.527009378423, Val loss:0.551640862912


Training with first stage embedding fixed

In [30]:
# back to plain rating batches: number of full batches per pass
n_train = Xtrain.shape[0]//batch_size
n_test = Xtest.shape[0]//batch_size

In [31]:
for epoch_idx in range(epoch):
    # optimisation pass on the rating task using the stage-two update step
    train_total = 0
    for batch_X,batch_Y in batch_iterator(Xtrain[:,:2],ytrain,batch_size):
        train_total += sess.run([train_op2,cost2],{X:batch_X,Y2:batch_Y})[1]

    # evaluation pass (no update op) over the held-out batches in stored order
    test_total = 0
    for batch_X,batch_Y in batch_iterator(Xtest[:,:2],ytest,batch_size,False):
        test_total += sess.run(cost2,{X:batch_X,Y2:batch_Y})

    print("Train loss:{}, Val loss:{}".format(train_total/n_train,test_total/n_test))

Train loss:0.735813758503, Val loss:0.701166857846
Train loss:0.667574365635, Val loss:0.688900958614
Train loss:0.636612164666, Val loss:0.685067618407
Train loss:0.616155376975, Val loss:0.685883329163
Train loss:0.60090292539, Val loss:0.687098913057
