In [66]:
import tensorflow as tf
import sys
print(sys.version)
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

3.5.2 |Continuum Analytics, Inc.| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]


In [67]:
movieratings = pd.read_csv('ratings.csv', nrows = 50000)

users = movieratings.ix[:,0].values
songs = movieratings.ix[:,1].values
ratings = movieratings.ix[:,2].values

#unique users / songs
uni_users = movieratings['userId'].unique()
uni_songs = movieratings['movieId'].unique()

#dict mapping the id to an index
user_map = dict(zip(uni_users,range(len(uni_users))))
song_map = dict(zip(uni_songs,range(len(uni_songs))))

user_idx =  np.array([ user_map[user] for user in users])
song_idx =  np.array([ song_map[song] for song in songs])

print(len(uni_users),len(uni_songs))

perm = np.random.permutation(len(users))
trn_idx = perm[:(len(users)*9)//10]
val_idx = perm[(len(users)*9)//10:]
user_idx_trn, song_idx_trn, ratings_trn = user_idx[trn_idx], song_idx[trn_idx], ratings[trn_idx]
user_idx_val, song_idx_val, ratings_val = user_idx[val_idx], song_idx[val_idx], ratings[val_idx]

def getDfSummary(input_data):
    output_data = input_data.describe(include = 'all').T
    var = pd.DataFrame(data = {'nanvals': pd.Series(), 'number_distinct': pd.Series()})
    for i in range(len(input_data.columns)):
        nanvals = input_data.ix[:,i].isnull().sum()
        number_distinct = len(input_data.ix[:,i].value_counts())
        var = var.append(pd.DataFrame([[nanvals, number_distinct]], columns = ['nanvals', 'number_distinct']))
    var.index = output_data.index.values
    output_data['nanvals'] = var['nanvals']
    output_data['number_distinct'] = var['number_distinct']
    return output_data

output_data = getDfSummary(movieratings)
output_data

367 6618


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,nanvals,number_distinct
userId,50000.0,176.9997,107.0286,1.0,78.0,182.0,270.0,367.0,0.0,367.0
movieId,50000.0,12115.57,25925.44,1.0,969.0,2355.0,5333.25,162376.0,0.0,6618.0
rating,50000.0,3.54793,1.06245,0.5,3.0,4.0,4.0,5.0,0.0,10.0
timestamp,50000.0,1135655000.0,192265300.0,828213150.0,960838695.0,1111981000.0,1298923000.0,1476641000.0,0.0,39699.0


In [95]:
def error_usr(yhat, ratings):
    ratings_mean = np.mean(ratings)
    ratings_std = np.std(ratings)
    sq_err = np.sum(np.square(yhat - ratings))
    
    return 

class MF_RS():
    def __init__(self, numUsers, numSongs, embedding_dim, reg_lambda = 0.01):
        
        #hyper parameters
        self.batch_size = np.min([1004, numUsers, numSongs]);
        self.numUsers = numUsers
        self.numSongs = numSongs
        self.epochs = 50
        self.reg_lambda = reg_lambda
        
        #embedding matricies for users and songs
        self.userMat = tf.Variable(tf.random_normal([numUsers, embedding_dim]))
        self.songMat = tf.Variable(tf.random_normal([numSongs, embedding_dim]))
        self.userBias = tf.Variable(tf.random_normal([numUsers]))
        self.songBias = tf.Variable(tf.random_normal([numSongs]))
        self.overallBias = tf.Variable(tf.random_normal([1]))
        
        #input tensors for songs, usres, ratings
        self.users = tf.placeholder(tf.int32, shape =(self.batch_size))
        self.songs = tf.placeholder(tf.int32, shape =(self.batch_size))
        self.rating = tf.placeholder(tf.float32, shape = (self.batch_size))
        
        #map each user/song to its feature vector
        self.U = tf.nn.embedding_lookup(self.userMat, self.users)
        self.W = tf.nn.embedding_lookup(self.songMat, self.songs)
        #map each user/song bias to its bias vector
        self.U_bias = tf.nn.embedding_lookup(self.userBias, self.users)
        self.W_bias = tf.nn.embedding_lookup(self.songBias, self.songs)
        
        #predicted rating is dot product of user and song
        bias = self.U_bias+self.W_bias+self.overallBias
        pq = tf.reduce_sum(tf.mul(self.U, self.W), 1)
        self.yhat = pq + bias
        
        self.reg = self.reg_lambda * ( tf.reduce_sum((tf.square(self.U) + tf.square(self.W))) + 
                                 tf.reduce_sum(tf.square(self.U_bias) + tf.square(self.W_bias)))
        self.error = tf.reduce_mean(tf.nn.l2_loss(self.yhat - self.rating))
        self.cost = (self.error + self.reg)/1e4
        self.optimizer = tf.train.AdamOptimizer(learning_rate = .01).minimize(self.cost)
        
        self.session = tf.Session()
        self.session.run(tf.initialize_all_variables())   
        
        
    def train(self, users, songs, ratings, verb = 0):
        
        for i in range(self.epochs):
            
            avg_cost = 0
            perm = np.random.permutation(len(ratings))
            num_batches = len(ratings) // self.batch_size
            
            for b_idx in range(num_batches):
                
                batch = perm[self.batch_size * b_idx:self.batch_size * (b_idx + 1)]
                users_batch = users[batch]
                songs_batch = songs[batch]
                ratings_batch = ratings[batch]
                                
                avg_cost += self.session.run([self.cost, self.optimizer],
                          {self.users:users_batch, self.songs:songs_batch, self.rating:ratings_batch})[0]
                
            if verb > 0:
                print(avg_cost/num_batches)
                
    def test(self, users, songs):
        
        yhat = np.zeros(len(users))
        num_batches = len(users) // self.batch_size
        b_idx = 0
        for b_idx in range(num_batches):
            batch = range(self.batch_size * b_idx,self.batch_size * (b_idx + 1))
            users_batch = users[batch]
            songs_batch = songs[batch]
            yhat[batch] = self.session.run([self.yhat],
                      {self.users:users_batch, self.songs:songs_batch})[0]
        if self.batch_size * (b_idx + 1) < len(users):
            batch = range(self.batch_size * (b_idx + 1), len(users))
            leftover_size = len(batch)
            buffer = self.batch_size - leftover_size
            users_batch = np.concatenate([users[batch], np.array([1]*buffer)])
            songs_batch = np.concatenate([songs[batch], np.array([1]*buffer)])
            y_hat_leftover = self.session.run([self.yhat],
                          {self.users:users_batch, self.songs:songs_batch})[0]
            yhat[batch] = y_hat_leftover[:leftover_size]
        return yhat
    def testdebug(self, users, songs):
        
        yhat = np.zeros(len(users))
        num_batches = len(users) // self.batch_size
        b_idx = -1
        for b_idx in range(num_batches):
            batch = range(self.batch_size * b_idx,self.batch_size * (b_idx + 1))
            users_batch = users[batch]
            songs_batch = songs[batch]
            yhat[batch] = self.session.run([self.yhat],
                      {self.users:users_batch, self.songs:songs_batch})[0]
        if self.batch_size * (b_idx + 1) < len(users):
            batch = range(self.batch_size * (b_idx + 1), len(users))
            leftover_size = len(batch)
            buffer = self.batch_size - leftover_size
            users_batch = np.concatenate([users[batch], np.array([1]*buffer)])
            songs_batch = np.concatenate([songs[batch], np.array([1]*buffer)])
            y_hat_leftover = self.session.run([self.yhat],
                          {self.users:users_batch, self.songs:songs_batch})[0]
            yhat[batch] = y_hat_leftover[:leftover_size]
            #print(yhat)
        #print(yhat)
        return yhat
    def evaluate(self, users, songs, ratings, fcn = 'MSE'):
        if fcn == 'MSE':
            yhat = self.test(users, songs)
            return np.mean(np.square(yhat - ratings))
        if fcn == 'Business':
            num_ratings = len(ratings)
            err_total = 0
            uni_users = np.unique(users)
            for usr in uni_users:
                usr_idxes = users == usr
                usr_idxes = np.where(usr_idxes)
                usr_u = users[usr_idxes]
                sng_u = songs[usr_idxes]
                rtg_u = ratings[usr_idxes]
                yhat = self.testdebug(usr_u, sng_u)
                err_total += error_usr(yhat = yhat, ratings = rtg_u)/num_ratings
            return err_total
        
                
        

In [96]:
run = 'Val'
songmodel = MF_RS(len (uni_users), len(uni_songs), 11, reg_lambda = .01)
print(songmodel.evaluate(user_idx_val, song_idx_val, ratings_val))
songmodel.epochs = 10
songmodel.train(user_idx_trn, song_idx_trn, ratings_trn, verb = 1)
if run == 'Val':
    a = songmodel.evaluate(user_idx_val, song_idx_val, ratings_val, fcn = 'Business')
if run == 'Trn':
    a = songmodel.evaluate(user_idx_trn, song_idx_trn, ratings_trn, fcn = 'MSE')
print(a)

25.8016151243
0.296700912909
0.10752621061
0.0591643697231
0.0400319475925
0.0303217428813
0.0247340011792
0.0211853658414
0.0187844951622
0.0171033243236
0.0158262982216
[ 3.43520308  1.3493911 ]
[-2.83198667  2.83491945  3.30841684  2.8273499  -1.72233963  4.0674324
  2.49056244]
[ 3.62533188  3.26454425  1.37148559]
[ 4.18482208  4.62901497  3.94779277  3.5657711   4.48951817  4.47271872
  4.8476572   4.17301512  4.90506983  4.8524518   4.64968681  3.45166826
  4.97416782  5.52512932  3.99608111]
[ 4.44381666  4.07990217  4.25683594  3.13159394  3.81670642  4.34021187
  3.27693272  3.02211618  2.91438293  4.85092354  3.28125095  4.10213375]
[ 4.26268482  5.24452782  2.28414822  3.7165885   3.52291894  3.97634602]
[ 4.04621649  3.89486837  3.25988817]
[ 0.74501562  3.56811476  2.83477044  6.46443558  5.10963392  2.24895859
  5.80358028  3.17923188  2.17878485]
[ 3.96196103  3.66356301]
[ 2.89106464  2.8303833   3.81761956  3.31759095  3.07960701]
[ 4.85606003]
[ 5.1343708   2.4193491

In [66]:
c = [True, False, False]
d = np.array([1, 2, 3])
d[c==True]

  app.launch_new_instance()


1

In [36]:
a = np.array([1, 2, 3, 4, 5])
b = np.array([1, 2, 3, 4, 5])
c = np.array([4, 3, 2, 5, 1])
#unique users / songs
uni_a = np.unique(a)
uni_b = np.unique(b)

#dict mapping the id to an index
a_map = dict(zip(uni_a,range(len(uni_a))))
b_map = dict(zip(uni_b,range(len(uni_b))))

user_idx =  np.array([ a_map[user] for user in a])
song_idx =  np.array([ b_map[song] for song in b])
model = MF_RS(len(uni_a), len(uni_b), 7)
np.random.seed(2)
model.train(user_idx, song_idx, c)


KeyboardInterrupt: 

In [16]:
edims = [10, 30, 50]
lambdas = [10**i for i in range(-4, -1)]
errmat = np.zeros([len(edims), len(lambdas)])
for eidx, edim in enumerate(edims):
    for lidx, lamb in enumerate(lambdas):
        songmodel = MF_RS(len (uni_users), len(uni_songs), edim, reg_lambda=lamb)
        print("accuracy before training", songmodel.evaluate(user_idx_val, song_idx_val, ratings_val))
        np.random.seed(1)
        songmodel.train(user_idx_trn, song_idx_trn, ratings_trn)
        err = songmodel.evaluate(user_idx_val, song_idx_val, ratings_val)
        print("MSE after training with edim: ", edim, " and lambda: ", lamb, ": ", err)
        errmat[eidx, lidx] = err
errmat


accuracy before training 22.8770905418
MSE after training with edim:  10  and lambda:  0.0001 :  1.69646169488
accuracy before training 29.9246706894
MSE after training with edim:  10  and lambda:  0.001 :  1.65306456763
accuracy before training 19.0567726014
MSE after training with edim:  10  and lambda:  0.01 :  1.22921916365
accuracy before training 45.5816602463


KeyboardInterrupt: 