This notebook assumes the MovieLens 20M dataset has been downloaded from
http://files.grouplens.org/datasets/movielens/ml-20m.zip and unpacked into data/ml-20m.

In [1]:
import scipy.io as sio
import pandas as pd
from scipy.sparse import dok_matrix
from random import sample
import random
random.seed(123)

In [2]:
# Read the movies, tags and ratings tables (in that order) from the unpacked
# ml-20m directory; every file is parsed with '"' as the quote character.
movies, tags, ratings = (
    pd.read_csv(csv_path, quotechar='"')
    for csv_path in (
        "data/ml-20m/movies.csv",
        "data/ml-20m/tags.csv",
        "data/ml-20m/ratings.csv",
    )
)

In [3]:
### ratings.mtx
# movieId => 0-based position of that movie in the `movies` table
movies_dict = {
    movie_id: row for row, movie_id in enumerate(movies['movieId'])
}
# userId => 0-based position, in order of first appearance in `ratings`
user_dict = {
    user_id: row for row, user_id in enumerate(ratings['userId'].unique())
}

In [4]:
# nm: number of movies, nr: number of rating rows, nu: number of distinct users
nm = len(movies)
nr = len(ratings)
nu = len(user_dict)

In [19]:
def get_rating_matrix(ratings, samples, save_to=None):
    """
        Build (and optionally write) a sparse user x movie rating matrix
        from the long-format ratings table.

        ratings: table with 'userId', 'movieId' and 'rating' columns.
        samples: sized sequence of index labels into `ratings`; only these
                 rows are copied into the matrix.
        save_to: if not None, path the matrix is written to with
                 scipy.io.mmwrite.

        Returns a dok_matrix of shape (nu, nm), with rows given by user_dict
        and columns by movies_dict. If a (user, movie) pair appears more than
        once in `samples`, the rating seen last is the one kept.
    """
    mat = dok_matrix((nu, nm))
    # Single-argument print(...) produces the same text under Python 2 and 3.
    print("%s samples length = %s" % (save_to, len(samples)))

    # Fetch each column once instead of on every loop iteration.
    user_col = ratings['userId']
    movie_col = ratings['movieId']
    rating_col = ratings['rating']

    for t, i in enumerate(samples):
        if t % 500000 == 0:
            print(t)  # coarse progress indicator
        uid = user_dict[user_col[i]]
        mid = movies_dict[movie_col[i]]
        mat[uid, mid] = rating_col[i]
    if save_to is not None:
        sio.mmwrite(save_to, mat)
        print("done writing %s" % (save_to,))
    return mat


In [20]:
def save_ratings_splits(ratings, train_ids, valid_ids, test_ids, name):
    """
        Write the validation, test and training rating matrices (in that
        order) to data/<name>_validate.mtx, data/<name>_test.mtx and
        data/<name>_train.mtx. Nothing is returned.
    """
    outputs = (
        (valid_ids, "data/%s_validate.mtx"),
        (test_ids, "data/%s_test.mtx"),
        (train_ids, "data/%s_train.mtx"),
    )
    for split_ids, path_template in outputs:
        get_rating_matrix(ratings, split_ids, save_to=path_template % name)

In [25]:
# randomly split the indices 0..n-1 into training, validation and test sets
def sample_split(n, n_test=None, n_validate=None):
    """
        Partition range(n) at random into train / validation / test ids.

        n:          number of items to split.
        n_test:     size of the test set (default: n // 5).
        n_validate: size of the validation set (default: n // 5).

        Returns (train_ids, validate_ids, test_ids). The three lists are
        disjoint and together cover range(n). test_ids and validate_ids are
        in random order; train_ids is in ascending order.
        random.sample raises ValueError if n_test + n_validate > n.
    """
    # Floor division keeps the sizes integral on Python 3 as well as Python 2.
    n_test = n // 5 if n_test is None else n_test
    n_validate = n // 5 if n_validate is None else n_validate

    samples = sample(range(n), n_test + n_validate)
    test_ids = samples[:n_test]
    validate_ids = samples[n_test:]

    # Everything that was not drawn is training data. Filtering range(n)
    # gives a deterministic ascending order and needs only one lookup set.
    held_out = set(samples)
    train_ids = [i for i in range(n) if i not in held_out]

    return train_ids, validate_ids, test_ids

In [27]:
def debug_split():
    """
        Build and save a small train/validation/test split for debugging.

        Draws nr // 20 rating rows at random, splits that subsample with
        sample_split, writes the three matrices under the name
        'ratings_debug' and returns (train_ids, valid_ids, test_ids) as
        row ids into `ratings`.
    """
    # smaller split for debugging
    debug_size = nr // 20
    debug_sample = sample(range(nr), debug_size)

    def get_debug(ids):
        # sample_split returns positions within debug_sample; translate them
        # back to row ids of the full ratings table. Must be defined before
        # it is used below.
        return [debug_sample[i] for i in ids]

    train_ids, valid_ids, test_ids = sample_split(debug_size)
    train_ids = get_debug(train_ids)
    valid_ids = get_debug(valid_ids)
    test_ids = get_debug(test_ids)

    save_ratings_splits(ratings, train_ids, valid_ids, test_ids, 'ratings_debug')

    return train_ids, valid_ids, test_ids

debug size 1000013 valid size 200002 test size 200002


In [28]:
# Build the debugging split; this also writes the data/ratings_debug_*.mtx files.
debug_train_ids, debug_valid_ids, debug_test_ids = debug_split()

data/ratings_debug_validate.mtx samples length = 200002
0
done writing data/ratings_debug_validate.mtx
data/ratings_debug_test.mtx samples length = 200002
0
done writing data/ratings_debug_test.mtx
data/ratings_debug_train.mtx samples length = 600009
0
500000
done writing data/ratings_debug_train.mtx


In [31]:
def normal_split():
    """
        Split every rating row with sample_split's default sizes, save the
        three matrices under the name 'ratings' and return
        (train_ids, valid_ids, test_ids).
    """
    splits = sample_split(nr)
    train_ids, valid_ids, test_ids = splits
    save_ratings_splits(ratings, train_ids, valid_ids, test_ids, 'ratings')
    return train_ids, valid_ids, test_ids

In [None]:
# Split the full ratings table; this also writes the data/ratings_*.mtx files.
train_ids, valid_ids, test_ids = normal_split()

In [None]:
# Matrix over every rating row; save_to keeps its default of None, so nothing
# is written to disk here.
rating_matrix = get_rating_matrix(ratings, range(len(ratings)))

In [3]:
# One set of genre names per movie, in the same row order as `movies`.
genres = movies['genres'].map(lambda x: set(x.split('|')))
# All distinct genre names except the '(no genres listed)' placeholder.
# Sorted so that each genre gets the same column index on every run instead
# of depending on set iteration order; a list, so len() and enumerate() work
# on Python 2 and 3 alike.
unique_genres = sorted(
    {g for gs in genres for g in gs} - {'(no genres listed)'}
)
ng = len(unique_genres)

# genre name => genre column index
genres_dict = dict((t, i) for (i, t) in enumerate(unique_genres))
# tag text => tag index (get_movie_matrix adds ng to get the column)
tags_id_dict = dict((t, i) for (i, t) in enumerate(tags['tag'].unique()))
nt = len(tags_id_dict)

In [6]:
#FIXME refactor this
def get_movie_matrix():
    """
        Build the sparse movie feature matrix and write it to
        'data/movies_tags.mtx'.

        The result is a dok_matrix of shape (nm, ng + nt). Columns [0, ng)
        are genre indicators (1 if the movie lists that genre); columns
        [ng, ng + nt) count how many rows of `tags` attach that tag to the
        movie. The matrix is also returned.
    """
    mat = dok_matrix((nm, ng + nt))

    # `genres` is derived from `movies` row by row, so position i here is the
    # same row that movies_dict assigns to that movie.
    for i, gs in enumerate(genres):
        for g in gs:
            # '(no genres listed)' is not in genres_dict and is skipped
            if g in genres_dict:
                mat[i, genres_dict[g]] = 1

    # Walk the two columns in lockstep rather than indexing each by row label
    # on every iteration (also avoids the Python 2-only xrange).
    for movie_id, tag in zip(tags['movieId'], tags['tag']):
        mid = movies_dict[movie_id]
        ti = ng + tags_id_dict[tag]
        mat[mid, ti] += 1
    sio.mmwrite('data/movies_tags.mtx', mat)
    return mat

In [32]:
import graphlab as gl
from graphlab import SFrame
from graphlab import SGraph

In [33]:
def prefix(p):
    """Return a one-argument function that formats its argument with `p` in front."""
    def add_prefix(value):
        return "%s%s" % (p, value)
    return add_prefix

def remove(c):
    """
    Return a function that deletes entry `c` from the container it is given
    (in place, via `del`) and hands that same container back.
    """
    def drop_entry(container):
        del container[c]
        return container
    return drop_entry

In [37]:
# Vertex ids are the row indices from movies_dict / user_dict, tagged with
# 'm' or 'u' so that movie and user vertices cannot collide.
user_ids = range(len(user_dict))
movies_ids = range(len(movies_dict))
vertices = SFrame({
        'id': map(prefix('m'), movies_ids),
        'factor': [1] * len(movies_ids), #FIXME
        'features': list({} for _ in movies_ids),  #FIXME
        'user': [0] * len(movies_ids)  # 0 marks a movie vertex
    }).append(SFrame({
        'id': map(prefix('u'), user_ids),
        'factor': [1] * len(user_ids),
        'features': list({} for _ in user_ids),
        'user': [1] * len(user_ids)  # 1 marks a user vertex
    }))

In [44]:
def create_graph(vertices, ratings, suffix):
    """
        Build the user -> movie rating graph and save it to
        'data/ratings<suffix>_train.sgraph'.

        vertices: SFrame of vertices keyed by its 'id' column.
        ratings:  rating rows (with 'userId' and 'movieId' columns) that
                  become the edges, directed from user to movie.
        suffix:   inserted verbatim into the output file name.

        Nothing is returned.
    """
    ratings_sf = SFrame(ratings)
    # The vertex table built in this notebook is keyed by row index
    # ('u<row>' / 'm<row>'), so translate the raw userId / movieId through
    # user_dict / movies_dict before prefixing. Prefixing the raw ids, as was
    # done before, produces edge endpoints that do not match those vertices.
    # NOTE(review): confirm that row-index ids (not raw MovieLens ids) are
    # what consumers of the saved graph expect.
    as_user_vertex = prefix('u')
    as_movie_vertex = prefix('m')
    ratings_sf['userId'] = ratings_sf['userId'].apply(
        lambda user_id: as_user_vertex(user_dict[user_id]))
    ratings_sf['movieId'] = ratings_sf['movieId'].apply(
        lambda movie_id: as_movie_vertex(movies_dict[movie_id]))
    s = SGraph().add_vertices(vertices, vid_field='id')\
        .add_edges(ratings_sf, src_field='userId', dst_field='movieId')
    s.save('data/ratings%s_train.sgraph' % suffix)

In [46]:
# Graph for the debugging training split. The ids are index labels of
# `ratings`, so use label-based .loc (.ix is deprecated in pandas).
create_graph(vertices, ratings.loc[debug_train_ids], '_debug')

In [47]:
# Graph for the full training split. The ids are index labels of `ratings`,
# so use label-based .loc (.ix is deprecated in pandas).
create_graph(vertices, ratings.loc[train_ids], '')

In [None]:
# FIXME finish this
# split for cold start
# NOTE(review): ratings_train_cs and ratings_train_cs_debug are not defined
# anywhere in the visible part of this notebook, so this cell cannot run yet.
# NOTE(review): create_graph inserts the suffix verbatim, so 'cs' yields
# 'data/ratingscs_train.sgraph'; the other calls pass a leading underscore
# ('_debug') -- confirm whether '_cs' / '_cs_debug' was intended.
create_graph(vertices, ratings_train_cs, 'cs')
create_graph(vertices, ratings_train_cs_debug, 'cs_debug')