This file modifies the current popular approach, using stochastic gradient descent for SVD, and introduce bias term. We want to apply the techniques on our large dataset and compare the result. Note that the SVD object strutures are inspired by the current papers.

In [28]:
from __future__ import division
import numpy as np
import scipy as sp
from numpy.random import random
import random
import pandas


data_dir = "data/ml-1m/"
data_shape = (6040, 3952)

# ratings = pandas.read_csv(data_dir + "ratings.dat", sep="::", engine = 'python',header=None)
# ratings.shape

In [57]:
class SVD_Bias:
    def __init__(self, X, k = 30):
        self.X = np.array(X)
        self.k = k
        #baseline average
        self.ave = np.mean(self.X[:,2])
        print "The input data size is ", self.X.shape
        #initialize...
        self.q_movie = {}
        self.p_user = {}
        self.movie_bias = {}
        self.user_bias = {}
        
        #self.movie_user = {} #record movie-user rating into the dictionary
        self.user_movie = {}
        for i in range(self.X.shape[0]):
            user_id = self.X[i][0]
            movie_id = self.X[i][1]
            rat = self.X[i][2]

            # initialize movie_bias, user_bias as 0, and q_movie,p_user to be 30 random numbers
            #print self.q_movie
            self.movie_bias.setdefault(movie_id,0)
            self.user_bias.setdefault(user_id,0)

            r = random.random()
            self.q_movie.setdefault(movie_id,np.random.rand(self.k))  
            self.p_user.setdefault(user_id,np.random.rand(self.k)) 
        
    def pred(self,user_id,movie_id):
        self.movie_bias.setdefault(movie_id,0)
        self.user_bias.setdefault(user_id,0)
        self.q_movie.setdefault(movie_id,np.zeros((self.k,1)))
        self.p_user.setdefault(user_id,np.zeros((self.k,1)))
        
        
        if(self.q_movie[movie_id]==None):
            # if q_movie vector has no previous info, set as 0s (no deviation from the baseline)
            self.q_movie[movie_id] = np.zeros(self.k,1)
        if(self.p_user[user_id]==None):
            self.p_user[movie_id] = np.zeros(self.k,1)
        ans = self.ave + self.movie_bias[movie_id]+self.user_bias[user_id]+np.sum(self.q_movie[movie_id]*self.p_user[user_id])
        if ans > 5:
            return 5
        elif ans < 1:
            return 1
        return ans
    def train(self,steps=30,eta = 0.05,Lambda = 0.15):
        for step in range(steps):
            print 'The ',step,'-th step is running...'
            rmse_sum = 0.0
            kk = np.random.permutation(self.X.shape[0])
            for j in range(self.X.shape[0]):
                i = kk[j]
                user_id = self.X[i][0]
                movie_id = self.X[i][1]
                rat = self.X[i][2]
                error = rat-self.pred(user_id,movie_id)
                rmse_sum += error**2
                #update
                self.user_bias[user_id] += eta*(error-Lambda*self.user_bias[movie_id])
                self.movie_bias[movie_id] += eta*(error-Lambda*self.movie_bias[movie_id])
                temp = self.q_movie[movie_id]
                self.q_movie[movie_id] += eta*(error*self.p_user[user_id]-Lambda*self.q_movie[movie_id])
                self.p_user[user_id] += eta*(error*temp-Lambda*self.p_user[user_id])
            eta *= 0.93
            print "The rmse of this step on training data is ", np.sqrt(rmse_sum/self.X.shape[0])
    def test(self,test_X):
        res = []
        sums = 0
        test_X = np.array(test_X)
        for i in range(test_X.shape[0]):
            #print X.shape
            prdi=self.pred(test_X[i][0],test_X[i][1])
            output.append(prdi)
            sums += (pre-test_X[i][2])**2
        rmse = np.sqrt(sums/test_X.shape[0])
        print "the rmse on test data is ",rmse
        #return output
def read_data():
    train = pandas.read_csv(data_dir + "train.dat", sep=",",header=None)
    test = pandas.read_csv(data_dir + "test.dat", sep=",",header=None)
    return train,test
    
random.seed(1)    
train,test = read_data()
print np.array(train).shape, np.array(test).shape
a = SVD_Bias(train,30)
a.train()
a.test(test)
        


(800167, 4) (200042, 4)
The input data size is  (800167, 4)
The  0 -th step is running...




The rmse of this step on training data is  1.02722259555
The  1 -th step is running...
The rmse of this step on training data is  0.920266127925
The  2 -th step is running...
The rmse of this step on training data is  0.907151642431
The  3 -th step is running...
The rmse of this step on training data is  0.899414193284
The  4 -th step is running...
The rmse of this step on training data is  0.894120345508
The  5 -th step is running...
The rmse of this step on training data is  0.890444270902
The  6 -th step is running...
The rmse of this step on training data is  0.887759700282
The  7 -th step is running...
The rmse of this step on training data is  0.88552939629
The  8 -th step is running...
The rmse of this step on training data is  0.883434263491
The  9 -th step is running...
The rmse of this step on training data is  0.881901188759
The  10 -th step is running...
The rmse of this step on training data is  0.880293005128
The  11 -th step is running...
The rmse of this step on trainin