In [1]:
import graphlab as gl
from graphlab import SFrame
from graphlab import SGraph
import numpy as np
from random import sample
import random
import scipy.io as sio
from itertools import izip
from datetime import datetime
from numpy.random import rand
# Seed both generators: the latent-factor initialisation in this notebook
# draws from numpy.random (via `rand`), which random.seed() does not control.
random.seed(123)
np.random.seed(123)

[INFO] This trial license of GraphLab Create is assigned to kanitw@gmail.com and will expire on June 26, 2015. Please contact trial@dato.com for licensing options or to request a free non-commercial license for personal or academic use.

[INFO] Start server at: ipc:///tmp/graphlab_server-52625 - Server binary: /usr/local/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1433109606.log
[INFO] GraphLab Server Version: 1.4.0


In [128]:
def load(name):
    """Load the train/test SFrame pair saved under ``data/`` for ``name``.

    Reads ``data/<name>_train.sframe`` first, then ``data/<name>_test.sframe``,
    and returns them as a ``(train, test)`` tuple.
    """
    train_path = 'data/%s_train.sframe' % name
    test_path = 'data/%s_test.sframe' % name
    train = gl.load_sframe(train_path)
    test = gl.load_sframe(test_path)
    return train, test

In [None]:
# Sparse movie matrix read from a Matrix Market file and converted to CSR;
# get_vertices indexes it as movies[i] and get_features reads the row's
# `indices` / `data`.
# NOTE(review): presumably one row per movie, one column per feature -- confirm.
movies = sio.mmread('data/movies.mtx').tocsr()

In [56]:
def prefix(p):
    """Return a one-argument function that formats ``p`` followed by its argument."""
    def add_prefix(x):
        return "%s%s" % (p, x)
    return add_prefix

def remove(c):
    """Return a function that deletes key ``c`` from its argument in place.

    The returned function hands back the very same (now mutated) object and
    lets the ``del`` error propagate if ``c`` is absent.
    """
    def drop_key(record):
        del record[c]
        return record
    return drop_key

In [None]:
# n and m size the user / movie id ranges in get_vertices and the L / R
# factor matrices allocated in sgd_gl_edge.
n, m = (138493, 27278)
# ng + nht is the length of every vertex's weight vector `w` (see get_vertices).
# NOTE(review): presumably ng = number of genre features and nht = number of
# tag features -- confirm against the columns of the movie matrix.
ng, nht = 19, 40 #

In [None]:
def get_features(m):
    """Turn a sparse row into an ``{index: value}`` dict.

    ``m`` must expose ``indices`` and ``data``; the two sequences are paired
    positionally, and a repeated index keeps the last value seen.
    """
    features = {}
    for index, value in zip(m.indices, m.data):
        features[index] = value
    return features

def get_vertices(n, m, k, movies, factor0=1):
    """Build the vertex SFrame: ``m`` movie vertices followed by ``n`` user vertices.

    Every vertex carries:
      * ``__id``     -- 'm<i>' for movies, 'u<i>' for users
      * ``factors``  -- ``rand(k) * factor0``
      * ``w``        -- zero vector of length ``ng + nht`` (module-level sizes)
      * ``b``        -- 0
      * ``features`` -- ``get_features(movies[i])`` for movies, ``{}`` for users
      * ``user``     -- 0 for movies, 1 for users
    """
    movie_ids = range(m)
    user_ids = range(n)
    movie_label = prefix('m')
    user_label = prefix('u')

    # Movies are built first so their random factors are drawn before the users'.
    movie_vertices = SFrame({
        '__id': [movie_label(i) for i in movie_ids],
        'factors': [rand(k) * factor0 for _ in movie_ids],
        'w': [np.zeros(ng + nht) for _ in movie_ids],
        'b': [0] * len(movie_ids),
        # FIXME (marker carried over from the original; what needs fixing is not recorded)
        'features': [get_features(movies[i]) for i in movie_ids],
        'user': [0] * len(movie_ids),
    })

    user_vertices = SFrame({
        '__id': [user_label(i) for i in user_ids],
        'factors': [rand(k) * factor0 for _ in user_ids],
        'w': [np.zeros(ng + nht) for _ in user_ids],
        'b': [0] * len(user_ids),
        'features': [{} for _ in user_ids],
        'user': [1] * len(user_ids),
    })

    return movie_vertices.append(user_vertices)

In [137]:
def get_graph(X_train, k, movies):
    """Build the bipartite user -> movie SGraph for the ratings in ``X_train``.

    Vertices come from ``get_vertices`` (using the module-level ``n`` and
    ``m``); each row of ``X_train`` becomes an edge from 'u<userId>' to
    'm<movieId>'.

    Side effect: adds ``uid`` and ``mid`` columns to ``X_train`` in place.
    """
    # rand() averages 0.5, so the expected dot product of two initial factor
    # vectors is k * 0.25 * factor0**2; this choice makes that equal the
    # mean rating.
    mean_rating = X_train['rating'].mean()
    factor0 = (mean_rating / k / 0.25) ** 0.5
    vertices = get_vertices(n, m, k, movies, factor0)

    X_train['uid'] = X_train['userId'].apply(prefix('u'))
    X_train['mid'] = X_train['movieId'].apply(prefix('m'))

    graph = SGraph()
    graph = graph.add_vertices(vertices, vid_field='__id')
    graph = graph.add_edges(X_train, src_field='uid', dst_field='mid')
    return graph

In [None]:
# def rmse_mtx(X, L, R):
#     se = 0.0
#     for u, m, x in izip(X.row, X.col, X.data):
#         d = x - L[u].dot(R[:,m])
#         se += d**2
#     return np.sqrt(se/ X.nnz)

In [47]:
def rmse(sf, L, R):
    """Root-mean-square error of the prediction ``L[u] . R[:, v]`` over ``sf``.

    ``sf`` must support ``len()`` and ``apply(fn)`` returning something with
    ``.sum()``; each row must provide ``rating``, ``userId`` and ``movieId``.
    ``L`` is indexed by user row, ``R`` by movie column.
    """
    def squared_error(row):
        predicted = L[row['userId']].dot(R[:, row['movieId']])
        return (row['rating'] - predicted) ** 2

    n_rows = len(sf)
    total = sf.apply(squared_error).sum()
    return np.sqrt(total / n_rows)

In [None]:
def sgd_triple_updater(eta, lambda_u, lambda_v, unified, lambda_w):
    """Build the per-edge SGD step used with ``SGraph.triple_apply``.

    Parameters
    ----------
    eta : learning rate.
    lambda_u, lambda_v : L2 penalties on the source / destination factors.
    unified : when true, the prediction also includes both vertices' biases
        ``b`` and the feature term ``sum((w_src[i] + w_dst[i]) * x)`` over the
        destination's ``features`` dict, and those parameters are updated too.
    lambda_w : L2 penalty on the weight vectors ``w`` (only used if unified).

    Returns
    -------
    ``updater(src, edge, dst)``, which mutates and returns the triple.
    """
    def updater(src, edge, dst):
        Lu = np.array(src['factors'])
        Rv = np.array(dst['factors'])
        ruv = edge['rating']
        rhat = Lu.dot(Rv)
        if unified:
            features = dst['features']
            # Copy into float ndarrays, mirroring how 'factors' is handled, so
            # the scaling below works even if 'w' arrives as a list/array.array.
            w_src = np.array(src['w'], dtype=float)
            w_dst = np.array(dst['w'], dtype=float)
            # .items() behaves the same on Python 2 and 3 dicts.
            rhat += src['b'] + dst['b'] + \
                sum((w_src[i] + w_dst[i]) * x for i, x in features.items())

        eps = rhat - ruv
        # Both factor updates use the pre-update Lu / Rv captured above.
        src['factors'] = (1 - eta * lambda_u) * Lu - eta * eps * Rv
        dst['factors'] = (1 - eta * lambda_v) * Rv - eta * eps * Lu
        if unified:
            decay = 1 - eta * lambda_w
            w_src *= decay
            w_dst *= decay
            for i, x in features.items():
                step = eta * eps * x
                w_src[i] -= step
                # The original applied this step to src['w'] twice and never
                # to dst['w']; both weight vectors enter the prediction
                # symmetrically, so each gets one step.
                w_dst[i] -= step
            src['w'] = w_src
            dst['w'] = w_dst

            src['b'] -= eta * eps
            dst['b'] -= eta * eps

        return (src, edge, dst)
    return updater

In [139]:
def sgd_gl_edge(g, X_train, X_test, \
                lambduh, k, eta=0.1, unified=False, lambduh_w=0, Niter=100, e_rmse=0.001):
    L = np.ones((n + 1, k))
    R = np.ones((k, m + 1))
    rmse_train = [rmse(X_train, L, R)]
    print "%s: %.4f" % (0,rmse_train[-1])
    start = datetime.now()
    
    for i in xrange(1, Niter+1): 
        g = g.triple_apply(sgd_triple_updater(\
            eta, lambduh, lambduh, unified, lambduh_w), 'factors')
        
        U = g.get_vertices(fields={'user':1})
        uids = U['__id'].apply(lambda x: x[1:])
        L[np.array(uids, dtype=int)] = np.array(U['factors'])
        
        M = g.get_vertices(fields={'user':0})
        mids = M['__id'].apply(lambda x: x[1:])
        R[:,np.array(mids, dtype=int)] = np.array(M['factors']).T
        
        rmse_train.append(rmse(X_train, L, R))
        print "%s : %.4f (time:%s)" % (i, rmse_train[-1], datetime.now()-start)
        if abs(rmse_train[-1] - rmse_train[-2]) < e_rmse:
            break
    
    rmse_test = rmse(X_test, L, R)
    print "test=%.4f" % (rmse_test)
    return rmse_train, rmse_test, L, R
        

In [102]:
def run_full():
    """Train on the 'ratings' split with 5 latent factors and lambda = 0.01.

    Returns whatever ``sgd_gl_edge`` returns for that run.
    """
    k = 5
    lambduh = 0.01
    X_train, X_test = load('ratings')
    graph = get_graph(X_train, k, movies)
    return sgd_gl_edge(graph, X_train, X_test, lambduh, k)

In [143]:
def run_debug(Niter=1):
    """Train on the 'ratings_debug' split with 5 latent factors and lambda = 0.01.

    ``Niter`` caps the number of SGD passes. Returns whatever
    ``sgd_gl_edge`` returns for that run.
    """
    k = 5
    lambduh = 0.01
    X_train, X_test = load('ratings_debug')
    graph = get_graph(X_train, k, movies)
    return sgd_gl_edge(graph, X_train, X_test, lambduh, k, Niter=Niter)

In [144]:
# Smoke test: a single SGD pass (run_debug's default Niter=1) on the
# 'ratings_debug' split; keeps the RMSE history and factor matrices as globals.
rmse_train, rmse_test, L, R = run_debug()

0: 1.8112
1 : 1.0017 (time:0:01:10.040569)
test=1.0803


In [140]:
def run_pure_mf(eta=0.05):0 # FIXME
    X_train_debug, X_test_debug = load('ratings_debug')
    min_rmse_test = float('inf')
    min_k, min_lambduh = None, None
    rmse_map = {}
    for lambduh in [0, 0.001, 0.01, 0.1]:
        for k in [5, 10, 20]:
            g = get_graph(X_train_debug, 5, movies)
            rmse_trainunified, rmse_test, L, R = \
                sgd_gl_edge(g, X_train_debug, X_test_debug, lambduh, k, eta, Niter=20)
            rmse_map.get(lambduh, {})[k] = rmse_test
            print "l=%s, k=%s, rmse=%.4f" % (lambduh, k, rmse_test)
            if rmse_test < min_rmse_test:
                min_rmse_test = rmse_test
                min_k = k
                min_lambduh = lambduh
    
    X_train, X_test = load('ratings')
    g = get_graph(X_train, 5, movies)
    rmse_train, rmse_test, L, R = \
                sgd_gl_edge(g, X_train, X_test, min_lambduh, min_k)
    print rmse_test
    return rmse_map, rmse_train, rmse_test, min_lambduh, min_k

In [162]:
def eta_search(): # FIXME
    X_train_debug, X_test_debug = load('ratings_debug')
    min_rmse_test = float('inf')
    min_k, min_lambduh = None, None
    rmse_map = {}
    for eta in [0.01]: # 0.005, 0.01, 0.05,
        print 'eta %s'%eta
        for lambduh in [0.01]: #[0, 0.001, 0.01, 0.1, 1]:
            for k in [5]: #, 10, 20]:
                g = get_graph(X_train_debug, 5, movies)
                rmse_train, rmse_test, L, R = \
                    sgd_gl_edge(g, X_train_debug, X_test_debug, lambduh, k, eta, Niter=3)
                rmse_map.get(lambduh, {}).get(k,{})[eta] = rmse_test
                print "l=%s, k=%s, rmse=%.4f" % (lambduh, k, rmse_test)
                if rmse_test < min_rmse_test:
                    min_rmse_test = rmse_test
                    min_k = k
                    min_eta = eta
                    min_lambduh = lambduh
    print min_eta
    return rmse_map, min_lambduh, min_k, min_eta


In [None]:
# eta 0.005
# 0: 1.8112
# 1 : 1.2069 (time:0:02:50.931521)
# 2 : 1.0992 (time:0:04:23.580317)
# 3 : 1.0346 (time:0:06:03.407423)
# test=1.0803
# l=0.01, k=5, rmse=1.0803
# eta 0.01
# 0: 1.8112
# 1 : 1.1017 (time:0:01:29.450320)
# 2 : 0.9914 (time:0:03:06.359180)
# 3 : 0.9343 (time:0:04:30.713422)
# test=1.0803
# l=0.01, k=5, rmse=1.0803
# eta 0.05
# 0: 1.8112
# 1 : 0.9312 (time:0:01:29.850168)
# 2 : 0.8715 (time:0:02:45.898214)
# 3 : 0.8472 (time:0:04:16.494505)
# test=1.0803
# l=0.01, k=5, rmse=1.0803
# 0.005
# Out[161]:
# ({}, 0.01, 5, 0.005)

In [163]:
# Run the learning-rate search; the cell output is the returned
# (rmse_map, min_lambduh, min_k, min_eta) tuple.
eta_search()

eta 0.01
0: 1.8112
1 : 1.1008 (time:0:01:09.639385)
2 : 0.9918 (time:0:02:28.738751)
3 : 0.9347 (time:0:03:50.585330)
test=1.0803
l=0.01, k=5, rmse=1.0803
0.01


({}, 0.01, 5, 0.01)

In [164]:
# Inspect the training frame; the uid / mid columns are the ones get_graph adds.
# NOTE(review): X_train is not assigned at top level in any visible cell, so
# this relies on earlier kernel state and will fail on Restart & Run All.
X_train

userId,movieId,rating,timestamp,uid,mid
53450,2495,4.0,940687312,u53450,m2495
109744,2382,4.0,1047851282,u109744,m2382
138199,14866,3.5,1272834224,u138199,m14866
122318,1672,4.0,1242330925,u122318,m1672
33220,46,4.5,1295461781,u33220,m46
41453,1248,4.5,1062545982,u41453,m1248
112761,2403,4.0,1251964985,u112761,m2403
28142,108,3.0,860277711,u28142,m108
115002,9582,3.0,1303327349,u115002,m9582
15824,3332,4.0,1221151468,u15824,m3332


In [None]:
# Full (lambda, k) grid search followed by a retrain on the 'ratings' split.
# The captured log below shows roughly 1-2 minutes per iteration and stops
# partway through the second configuration.
rmse_map, rmse_train, rmse_test, min_lambduh, min_k = run_pure_mf()

0: 1.8112
1 : 1.0117 (time:0:00:58.099448)
2 : 0.9697 (time:0:02:05.973085)
3 : 0.9579 (time:0:03:17.667039)
4 : 0.9325 (time:0:04:44.986584)
5 : 0.9264 (time:0:06:21.786802)
6 : 0.9165 (time:0:07:56.377591)
7 : 0.9067 (time:0:09:25.674062)
8 : 0.9020 (time:0:10:42.723533)
9 : 0.8974 (time:0:12:29.716956)
10 : 0.8939 (time:0:13:57.979749)
11 : 0.8910 (time:0:15:39.241020)
12 : 0.8847 (time:0:17:21.321695)
13 : 0.8824 (time:0:18:49.046950)
14 : 0.8799 (time:0:20:27.517569)
15 : 0.8790 (time:0:22:07.366880)
test=1.0803
l=0, k=5, rmse=1.0803
0: 1.8112
1 : 1.0114 (time:0:01:42.227319)
2 : 0.9721 (time:0:03:17.125196)
3 : 0.9545 (time:0:04:45.694033)
4 : 0.9403 (time:0:06:32.914066)
5 : 0.9312 (time:0:08:09.092949)
6 : 0.9149 (time:0:09:42.033428)
7 : 0.9071 (time:0:11:37.178244)

In [None]:
# FIXME: need to loop over more parameters

def run_cf(eta=0.05):
    X_train_debug, X_test_debug = load('ratings_debug')
    min_rmse_test = float('inf')
    min_k, min_lambduh, min_lambduh_w = None, None, None
    rmse_map = {}
    for lambduh in [0, 0.001, 0.01, 0.1]:
        for k in [5, 10, 20]:
            for lambduh_w in [0, 0.001, 0.01, 0.1]:
                g = get_graph(X_train_debug, 5, movies)
                rmse_train, rmse_test, L, R = \
                    sgd_gl_edge(g, X_train_debug, X_test_debug, lambduh, k, eta, \
                                unified=true, lambduh_w=lambduh_w)
                rmse_map.get(lambduh, {}).get(k,{})[lambduh_w] = rmse_test
                print "l=%s, k=%s, l_w=%s, rmse=%.4f" % (lambduh, k, lambduh_w, rmse_test)
                if rmse_test < min_rmse_test:
                    min_rmse_test = rmse_test
                    min_k = k
                    min_lambduh = lambduh
                    min_lambduh_w = lambduh_w
    
    X_train, X_test = load('ratings_debug')
    g = get_graph(X_train, min_k, movies)
    rmse_train, rmse_test, L, R = \
                sgd_gl_edge(g, X_train, X_test, min_lambduh, min_k, min_eta, \
                                unified=true, lambduh_w=min_lambduh_w)
    print rmse_test
    return rmse_map, rmse_train, rmse_test, min_lambduh, min_k