Assume the data has been loaded from 
http://files.grouplens.org/datasets/movielens/ml-20m.zip and unpacked in data/ml-20m

In [3]:
import scipy.io as sio
import pandas as pd
from scipy.sparse import dok_matrix
from random import sample
import random
random.seed(123)

In [4]:
#read movies and tags
movies = pd.read_csv("data/ml-20m/movies.csv", quotechar='"')
tags = pd.read_csv("data/ml-20m/tags.csv", quotechar='"')

In [5]:
movies_dict = dict((m, i) for (i, m) in enumerate(movies['movieId']))
genres = movies['genres'].map(lambda x: set(x.split('|')))
unique_genres = reduce(lambda a,b: a|b, genres, set())
unique_genres = filter(lambda x: x!='(no genres listed)', unique_genres)
genres_dict = dict((t, i) for (i, t) in enumerate(unique_genres))

tags_id_dict = dict((t, i) for (i, t) in enumerate(tags['tag'].unique()))

In [6]:
nm = len(movies)
nug = len(unique_genres)
nut = len(tags_id_dict)
nt = len(tags)
mat = dok_matrix((nm, nug + nt))

for i, gs in enumerate(genres):
    for g in gs:
        if g in genres_dict:
            mat[i,genres_dict[g]] = 1
            
for i in xrange(nt):
    mid = movies_dict[tags['movieId'][i]]
    t = tags['tag'][i]
    ti = nug + tags_id_dict[t]
    mat[mid, ti] += 1    

In [7]:
sio.mmwrite('movies_tags.mtx', mat)

In [8]:
# sample ratings
ratings = pd.read_csv("data/ml-20m/ratings.csv", quotechar='"')
nr = len(ratings)

In [9]:
users = ratings['userId'].unique()
nu = len(users)
user_dict = dict((u, i) for (i, u) in enumerate(users))

In [10]:
nr = len(ratings)
sample(range(nr), nr/5)

[1047285,
 1743756,
 8144942,
 2154033,
 18024214,
 763083,
 10724181,
 6644041,
 17041956,
 3193289,
 6744421,
 6676015,
 4903331,
 33411,
 8725630,
 1752293,
 11952146,
 1397557,
 6309262,
 8966552,
 18108211,
 1855315,
 2843538,
 15801644,
 424725,
 18223250,
 11468312,
 5306503,
 16756973,
 15376097,
 6869333,
 16030142,
 4137303,
 12178210,
 10541637,
 16186983,
 6305345,
 7964867,
 15385784,
 10255177,
 13696924,
 12767233,
 13333327,
 17802205,
 9722857,
 13369580,
 7490531,
 636834,
 15083661,
 3629346,
 16404019,
 18273158,
 13161656,
 3402582,
 6738411,
 7823969,
 1465834,
 9135461,
 11463521,
 6722210,
 3781287,
 1763169,
 12566515,
 256585,
 7038366,
 4582655,
 10640089,
 14473457,
 18183304,
 12776184,
 19836122,
 19281436,
 12599979,
 4667525,
 9676356,
 3551841,
 19839416,
 8890115,
 15033200,
 882608,
 2694587,
 3863590,
 5127314,
 7041043,
 5438668,
 8685485,
 8407256,
 2172445,
 1252191,
 7101322,
 1775801,
 8664120,
 171226,
 10716161,
 1469699,
 13884300,
 8084251,


In [11]:
def write_ratings(ratings, samples, name):
    mat = dok_matrix((nu, nm))
    for t, i in enumerate(samples):
        if t % 100000 == 0: print t
        uid = user_dict[ratings['userId'][i]]
        mid = movies_dict[ratings['movieId'][i]]
        rating = ratings['rating'][i]
        mat[uid,mid] = rating
    sio.mmwrite(name, mat)
    print "done writing", name
    return mat

In [12]:
# small debug set
n_debug = nr/5
debug_sample = sample(range(nr), n_debug)
debug_train_sample = debug_sample[0: -n_debug/5]
debug_test_sample = debug_sample[-n_debug/5:-1]

In [13]:
write_ratings(ratings, debug_train_sample, "ratings_debug_train.mtx")
write_ratings(ratings, debug_test_sample, "ratings_debug_test.mtx")

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
done writing ratings_debug_train.mtx
0
100000
200000
300000
400000
500000
600000
700000
800000
done writing ratings_debug_test.mtx


<138493x27278 sparse matrix of type '<type 'numpy.float64'>'
	with 800010 stored elements in Dictionary Of Keys format>

In [None]:
# full train / test set
n_test = nr/5
n_validate = nr/5
test_sample = sample(range(nr), n_test)
validate_sample = sample(range(nr), n_validate)
train_sample = list(set(range(nr)) - set(test_sample) - set(validate_sample))

In [None]:
write_ratings(ratings, train_sample, "ratings_train.mtx")
write_ratings(ratings, validate_sample, "ratings_validate.mtx")
write_ratings(ratings, test_sample, "ratings_test.mtx")

0
100000
200000
300000
400000
500000
600000
700000