In [1]:
import graphlab as gl
from graphlab import SFrame
from graphlab import SGraph
import numpy as np
from random import sample
import random
import scipy.io as sio
from itertools import izip
from datetime import datetime
# Seed Python's `random` module with a fixed value so that any later use of
# `random` / `sample` produces the same sequence on every run.
random.seed(123)

[INFO] This trial license of GraphLab Create is assigned to kanitw@gmail.com and will expire on June 26, 2015. Please contact trial@dato.com for licensing options or to request a free non-commercial license for personal or academic use.

[INFO] Start server at: ipc:///tmp/graphlab_server-52625 - Server binary: /usr/local/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1433109606.log
[INFO] GraphLab Server Version: 1.4.0


In [18]:
def load(name):
    """Load the three rating splits for `name` plus the movies matrix.

    Reads 'data/<name>_train.sframe', 'data/<name>_validate.sframe' and
    'data/<name>_test.sframe' with GraphLab, and 'data/movies.mtx' with
    scipy's Matrix Market reader.

    Returns a 4-tuple: (train, validate, test, movies).
    """
    train = gl.load_sframe('data/%s_train.sframe' % name)
    validate = gl.load_sframe('data/%s_validate.sframe' % name)
    test = gl.load_sframe('data/%s_test.sframe' % name)
    movies = sio.mmread('data/movies.mtx')
    return train, validate, test, movies

In [56]:
def prefix(p):
    """Return a one-argument function that prepends `p` to its argument.

    Both `p` and the argument are rendered with '%s', so the result is
    always a string (e.g. prefix('u')(5) gives 'u5').
    """
    def add_prefix(value):
        return "%s%s" % (p, value)
    return add_prefix

def remove(c):
    """Return a function that deletes entry `c` from its argument.

    The returned function mutates the container it is given (via `del`)
    and hands back that same object.
    """
    def drop_entry(container):
        del container[c]
        return container
    return drop_entry

In [58]:
def get_vertices(n, m, k, movies):
    """Build the vertex SFrame: `m` movie vertices followed by `n` user vertices.

    Each row has:
      '__id'     -- 'm<i>' for movies, 'u<i>' for users (i counts from 0)
      'factors'  -- a length-`k` vector of ones
      'features' -- an empty dict
      'user'     -- 0 for movie rows, 1 for user rows

    `movies` is accepted but not used here.
    """
    def vertex_block(tag, count, is_user):
        # One homogeneous block of vertices sharing the same id prefix.
        ids = range(count)
        make_id = prefix(tag)
        return SFrame({
            '__id': [make_id(i) for i in ids],
            'factors': [np.ones(k) for _ in ids],
            'features': [{} for _ in ids],
            'user': [is_user for _ in ids]
        })

    # FIXME (carried over from the original): the movie 'factors' and
    # 'features' columns are placeholders.
    movie_rows = vertex_block('m', m, 0)
    user_rows = vertex_block('u', n, 1)
    return movie_rows.append(user_rows)

In [85]:
def get_graph(X_train, k, movies):
    """Build the SGraph used for training from the rating rows in `X_train`.

    Vertices come from get_vertices() using the module-level counts `n`
    (users) and `m` (movies); every row of `X_train` becomes an edge from
    its user vertex to its movie vertex.

    Parameters:
      X_train -- frame with 'userId' and 'movieId' columns. NOTE: it is
                 modified in place; string id columns 'uid' and 'mid' are
                 added to it (or overwritten if already present).
      k       -- length of the factor vector stored on every vertex.
      movies  -- forwarded to get_vertices().

    Returns the SGraph with vertices and edges added.
    """
    vertices = get_vertices(n, m, k, movies)

    # Vertex ids are prefixed strings ('u<id>' / 'm<id>'), so the edge
    # endpoints need the same prefixed form.
    X_train['uid'] = X_train['userId'].apply(prefix('u'))
    X_train['mid'] = X_train['movieId'].apply(prefix('m'))

    # get_vertices() stores the vertex ids in the '__id' column; the frame has
    # no 'id' column, so '__id' is the name that must be passed as vid_field.
    graph = SGraph().add_vertices(vertices, vid_field='__id')
    return graph.add_edges(X_train, src_field='uid', dst_field='mid')

In [None]:
# Vertex counts read as globals by get_graph() and sgd_gl_edge():
# n is passed to get_vertices() as the number of user vertices,
# m as the number of movie vertices.
n = 138493
m = 27278

In [None]:
# def rmse_mtx(X, L, R):
#     se = 0.0
#     for u, m, x in izip(X.row, X.col, X.data):
#         d = x - L[u].dot(R[:,m])
#         se += d**2
#     return np.sqrt(se/ X.nnz)

In [47]:
def rmse(sf, L, R):
    """Root-mean-square error of the factorisation L.R on the ratings in `sf`.

    Parameters:
      sf -- frame whose rows expose 'rating', 'userId' and 'movieId'.
      L  -- array indexed by userId along its first axis (one factor row per user).
      R  -- array indexed by movieId along its second axis (one factor column per movie).

    The prediction for a row is L[userId] . R[:, movieId].
    """
    def squared_error(row):
        predicted = L[row['userId']].dot(R[:, row['movieId']])
        return (row['rating'] - predicted) ** 2

    total = sf.apply(squared_error).sum()
    return np.sqrt(total / len(sf))

In [34]:
# k = 5
# L = np.ones((n + 1, k))
# R = np.ones((k, m + 1))
# rmse_train = [rmse(X_train, L, R)]

In [88]:
def sgd_triple_updater(eta, lambda_u, lambda_v):
    """Create the per-edge SGD step passed to triple_apply.

    Parameters:
      eta      -- step size.
      lambda_u -- regularisation weight applied to the source vertex factors.
      lambda_v -- regularisation weight applied to the destination vertex factors.

    The returned function takes (src, edge, dst), reads src['factors'],
    dst['factors'] and edge['rating'], overwrites both 'factors' entries
    and returns the same (src, edge, dst) triple.
    """
    decay_src = 1 - eta * lambda_u
    decay_dst = 1 - eta * lambda_v

    def updater(src, edge, dst):
        src_vec = np.array(src['factors'])
        dst_vec = np.array(dst['factors'])

        # Prediction error for this edge.
        err = src_vec.dot(dst_vec) - edge['rating']

        # Both updates are computed from the pre-update vectors.
        src['factors'] = decay_src * src_vec - eta * err * dst_vec
        dst['factors'] = decay_dst * dst_vec - eta * err * src_vec
        return (src, edge, dst)

    return updater

In [104]:
def sgd_gl_edge(g, X_train, X_valid, X_test,
                lambduh, k, eta=0.1, Niter=100, e_rmse=0.0001):
    """Train the factorisation with edge-wise SGD on graph `g` and report RMSE.

    Each iteration runs one triple_apply pass of sgd_triple_updater over
    the graph, copies the vertex 'factors' back into dense arrays and
    measures the training RMSE. Iteration stops after `Niter` passes or as
    soon as the training RMSE changes by less than `e_rmse`.

    Parameters:
      g        -- graph whose vertices carry '__id', 'factors' and 'user'.
      X_train, X_valid, X_test -- rating frames scored with rmse().
      lambduh  -- regularisation weight, used for both lambda_u and lambda_v.
      k        -- factor length (sizes the dense arrays).
      eta      -- SGD step size.
      Niter    -- maximum number of passes.
      e_rmse   -- convergence threshold on the training-RMSE change.

    Reads the module-level vertex counts `n` and `m`.

    Returns (training RMSE history including the initial value,
    validation RMSE, test RMSE).
    """
    # Dense factor arrays indexed by the numeric part of the vertex id.
    L = np.ones((n + 1, k))
    R = np.ones((k, m + 1))

    history = [rmse(X_train, L, R)]
    print("%s: %.4f" % (0, history[-1]))
    started = datetime.now()

    for it in xrange(1, Niter + 1):
        step = sgd_triple_updater(eta=eta, lambda_u=lambduh, lambda_v=lambduh)
        g = g.triple_apply(step, 'factors')

        # Users: strip the one-character id prefix to get the row index.
        user_vs = g.get_vertices(fields={'user': 1})
        user_idx = user_vs['__id'].apply(lambda vid: vid[1:])
        L[np.array(user_idx, dtype=int)] = np.array(user_vs['factors'])

        # Movies: same, but factors are stored column-wise in R.
        movie_vs = g.get_vertices(fields={'user': 0})
        movie_idx = movie_vs['__id'].apply(lambda vid: vid[1:])
        R[:, np.array(movie_idx, dtype=int)] = np.array(movie_vs['factors']).T

        history.append(rmse(X_train, L, R))
        elapsed = datetime.now() - started
        print("%s : %.4f (time:%s)" % (it, history[-1], elapsed))

        delta = abs(history[-1] - history[-2])
        if delta < e_rmse:
            break

    rmse_valid = rmse(X_valid, L, R)
    rmse_test = rmse(X_test, L, R)
    print("valid=%.4f, test=%.4f" % (rmse_valid, rmse_test))
    return history, rmse_valid, rmse_test
        

In [102]:
def run_full():
    """Run the experiment on the 'ratings' data set.

    Loads the splits, builds the graph with 5 factors per vertex and trains
    with regularisation 0.01. Returns whatever sgd_gl_edge returns:
    (training RMSE history, validation RMSE, test RMSE).
    """
    rank = 5
    reg = 0.01
    X_train, X_valid, X_test, movies = load('ratings')
    graph = get_graph(X_train, rank, movies)
    return sgd_gl_edge(graph, X_train, X_valid, X_test, reg, rank)

In [None]:
def run_debug():
    """Run the experiment on the 'ratings_debug' data set.

    Same pipeline and settings as the full run (5 factors per vertex,
    regularisation 0.01), only the data set name differs. Returns
    (training RMSE history, validation RMSE, test RMSE).
    """
    rank = 5
    reg = 0.01
    X_train, X_valid, X_test, movies = load('ratings_debug')
    graph = get_graph(X_train, rank, movies)
    return sgd_gl_edge(graph, X_train, X_valid, X_test, reg, rank)

In [None]:
# Run the experiment on the 'ratings' data set and keep the three values it
# returns: the training RMSE history, the validation RMSE and the test RMSE.
rmse_train, rmse_valid, rmse_test = run_full()