In [2]:
import numpy as np
from scipy import sparse as spsp

Load and construct user-movie sparse matrix.

In [15]:
f = open("ml-1m/ratings.dat", "r")
user_ids = []
movie_ids = []
seq_nums = []
for line in f.readlines():
    line = line.rstrip()
    user_id, movie_id, _, seq_num = line.split('::')
    user_ids.append(int(user_id))
    movie_ids.append(int(movie_id))
    seq_nums.append(int(seq_num))
num_movies = int(np.max(movie_ids) + 1)
print(np.min(user_ids), np.max(user_ids), len(user_ids), len(np.unique(user_ids)))
print(np.min(movie_ids), np.max(movie_ids), len(movie_ids), len(np.unique(movie_ids)))

1 6040 1000209 6040
1 3952 1000209 3706


Construct user activity sequences.

In [12]:
spm = spsp.csr_matrix((seq_nums, (user_ids, movie_ids)))
is_sorted = lambda a: np.all(a[:-1] <= a[1:])
user_seqs = {}
for i in range(spm.shape[0]):
    start = spm.indptr[i]
    end = spm.indptr[i + 1]
    if end > start:
        movies = spm.indices[start:end]
        seqs = spm.data[start:end]
        idx = np.argsort(seqs)
        movies = movies[idx]
        seqs = seqs[idx]
        assert is_sorted(seqs)
        user_seqs[i] = (movies, seqs)

Construct item sparse matrix (two items are connected if an item always appears before the other one).

In [76]:
def get_item_transit_matrix(user_seqs, k, permute):
    item_pre = []
    item_post = []
    for key in user_seqs:
        movies, _ = user_seqs[key]
        if permute:
            movies = np.random.permutation(movies)
        for i in range(len(movies) - k):
            item_pre.append(movies[i])
            item_post.append(movies[i + k])
    item_spm = spsp.csr_matrix((np.ones(shape=(len(item_pre),)), (item_pre, item_post)), shape=(num_movies, num_movies))
    
    s = np.sum(item_spm, axis=1)
    s[s == 0] = 1
    transit_spm = item_spm / s
    return transit_spm, item_spm

In [77]:
transit_spm, item_spm = get_item_transit_matrix(user_seqs, 1, False)
rand_transit_spms = []
for i in range(2):
    rand_transit_spm, rand_item_spm = get_item_transit_matrix(user_seqs, 1, True)
    rand_transit_spms.append(rand_transit_spm)

In [78]:
print(np.sum(transit_spm > 0), np.sum(transit_spm))
print(np.sum(rand_transit_spm > 0), np.sum(rand_transit_spm))

108043 3702.0000000000014
759504 3706.0000000000014


In [79]:
np.sum(np.abs(transit_spm - rand_transit_spm))

7259.487802886655

In [80]:
np.sum(np.abs(rand_transit_spms[0] - rand_transit_spms[1]))

6435.064699425198

In [81]:
print(rand_item_spm.data)

[2. 1. 3. ... 1. 1. 1.]


In [83]:
print(np.sum(rand_item_spm))
print(np.histogram(rand_item_spm.data, bins=[1, 2, 5, 10, 20, 30]))
print(np.histogram(item_spm.data, bins=[1, 2, 5, 10, 20, 30, 100, 200]))

994169.0
(array([610330, 140420,   8209,    539,      6]), array([ 1,  2,  5, 10, 20, 30]))
(array([49914, 28709, 12047,  7697,  3014,  4942,  1151]), array([  1,   2,   5,  10,  20,  30, 100, 200]))
