In [1]:
import numpy as np
import scipy.sparse
import RecModel

Load the pre-trained SLIM weight matrix.

In [2]:
# Load the stored SLIM weight matrix and cast it to float32.
# NOTE(review): `slim_W` is not referenced by any later visible cell; the
# Recwalk model further down is constructed with slim_W=None.
slim_W = scipy.sparse.load_npz("slim_W.npz").astype(np.float32)

Load the training and validation data.

In [3]:
# Read the training and validation matrices from disk (sparse .npz files).
train_mat_bin, eval_mat_bin = map(
    scipy.sparse.load_npz,
    ('data/mat_bin_train.npz', 'data/mat_bin_validate.npz'),
)

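Create a smaller matrix by randomly sampling users (rows) and items (columns) that each have at least one interaction in the full training matrix. Note that a sampled row or column can still end up empty once the other axis has been restricted.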

In [4]:
n_rows = 50000
n_cols = 5000

# Seed a dedicated generator so the sampled sub-matrix (and every metric
# derived from it) is reproducible after a kernel restart.
SAMPLE_SEED = 1993
sample_rng = np.random.RandomState(SAMPLE_SEED)

# Candidates: users / items with at least one interaction in the training matrix.
nonempty_rows = np.flatnonzero(train_mat_bin.sum(axis=1).A1 > 0)
nonempty_cols = np.flatnonzero(train_mat_bin.sum(axis=0).A1 > 0)

# Sampling without replacement raises ValueError when more indices are requested
# than exist, so cap the sample size at the number of candidates.
rows = sample_rng.choice(nonempty_rows, size=min(n_rows, nonempty_rows.size), replace=False)
cols = sample_rng.choice(nonempty_cols, size=min(n_cols, nonempty_cols.size), replace=False)

In [5]:
# Cut both splits down to the sampled rows first, then to the sampled columns,
# and keep independent copies of the results.
train_mat, eval_mat = (
    split_mat[rows, :][:, cols].copy()
    for split_mat in (train_mat_bin, eval_mat_bin)
)

# Problem size of the sub-sampled data: rows are users, columns are items.
n_users, n_items = train_mat.shape

In [10]:
# Build the RecWalk model for the sub-sampled problem size.
# slim_W=None: no pre-computed item model is handed in at construction time.
recwalk = RecModel.Recwalk(
    num_items=n_items,
    num_users=n_users,
    eval_method='k_step',
    k_steps=5,
    damping=0.2,
    slim_W=None,
)

In [11]:
# Fit the model; a copy is passed so `train_mat` itself is not handed to the model.
recwalk.train(
    train_mat.copy(),
    phi=0.5,
    alpha=4.42,
    l1_ratio=0.318,
    max_iter=27,
    tolerance=0.01,
    cores=8,
    verbose=2,
)

5000 models need to be fitted


  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)


In [29]:
# Overwrite the model's `k` attribute after construction (the constructor call
# above was given k_steps=5).
# NOTE(review): presumably `k` is the number of walk steps used at evaluation
# time; confirm that this is the attribute eval_topn actually reads.
recwalk.k = 8

In [None]:
# Evaluate on a copy of the validation matrix at cutoffs 4, 10, 20 and 40.
perf_all = recwalk.eval_topn(
    test_mat=eval_mat.copy(),
    topn=np.array([4, 10, 20, 40], dtype=np.int32),
    rand_sampled=1000,
    cores=8,
    random_state=1993,
)
print(perf_all)

In [28]:
# Evaluate on a copy of the validation matrix at cutoffs 4, 10, 20 and 40.
# NOTE(review): this call is identical to the other evaluation cells, yet the
# recorded results differ between them; some state outside this cell
# (presumably a model setting) changed between runs. Record it next to the result.
perf_all = recwalk.eval_topn(
    test_mat=eval_mat.copy(),
    topn=np.array([4, 10, 20, 40], dtype=np.int32),
    rand_sampled=1000,
    cores=8,
    random_state=1993,
)
print(perf_all)

{'Recall@4': 0.15254462, 'Recall@10': 0.3135492, 'Recall@20': 0.44465607, 'Recall@40': 0.5797936}


In [26]:
# Evaluate on a copy of the validation matrix at cutoffs 4, 10, 20 and 40.
# NOTE(review): this call is identical to the other evaluation cells, yet the
# recorded results differ between them; some state outside this cell
# (presumably a model setting) changed between runs. Record it next to the result.
perf_all = recwalk.eval_topn(
    test_mat=eval_mat.copy(),
    topn=np.array([4, 10, 20, 40], dtype=np.int32),
    rand_sampled=1000,
    cores=8,
    random_state=1993,
)
print(perf_all)

{'Recall@4': 0.15343048, 'Recall@10': 0.31430218, 'Recall@20': 0.44904107, 'Recall@40': 0.5828055}


In [25]:
# Evaluate on a copy of the validation matrix at cutoffs 4, 10, 20 and 40.
# NOTE(review): this call is identical to the other evaluation cells, yet the
# recorded results differ between them; some state outside this cell
# (presumably a model setting) changed between runs. Record it next to the result.
perf_all = recwalk.eval_topn(
    test_mat=eval_mat.copy(),
    topn=np.array([4, 10, 20, 40], dtype=np.int32),
    rand_sampled=1000,
    cores=8,
    random_state=1993,
)
print(perf_all)

{'Recall@4': 0.15223457, 'Recall@10': 0.31496656, 'Recall@20': 0.45019266, 'Recall@40': 0.5838242}


In [23]:
# Evaluate on a copy of the validation matrix at cutoffs 4, 10, 20 and 40.
# NOTE(review): this call is identical to the other evaluation cells, yet the
# recorded results differ between them; some state outside this cell
# (presumably a model setting) changed between runs. Record it next to the result.
perf_all = recwalk.eval_topn(
    test_mat=eval_mat.copy(),
    topn=np.array([4, 10, 20, 40], dtype=np.int32),
    rand_sampled=1000,
    cores=8,
    random_state=1993,
)
print(perf_all)

{'Recall@4': 0.15108296, 'Recall@10': 0.3140364, 'Recall@20': 0.44726935, 'Recall@40': 0.578952}


In [18]:
# Evaluate on a copy of the validation matrix at cutoffs 4, 10, 20 and 40.
# NOTE(review): this call is identical to the other evaluation cells, yet the
# recorded recall here is clearly lower than in the other runs; some state
# outside this cell (presumably a model setting) differed. Record it next to the result.
perf_all = recwalk.eval_topn(
    test_mat=eval_mat.copy(),
    topn=np.array([4, 10, 20, 40], dtype=np.int32),
    rand_sampled=1000,
    cores=8,
    random_state=1993,
)
print(perf_all)

{'Recall@4': 0.09066749, 'Recall@10': 0.22739957, 'Recall@20': 0.36072108, 'Recall@40': 0.48691145}


In [9]:
# NOTE(review): this cell's execution count (9) is lower than that of the cell
# constructing `recwalk` (10), so the recorded output stems from an earlier run
# whose `perf_all` is not reproducible from the visible cells.
print(perf_all)

{'Recall@4': 0.16596536, 'Recall@10': 0.3040705, 'Recall@20': 0.43353856, 'Recall@40': 0.5723967}


In [23]:
# Work on a copy of the model's `P` attribute so the model itself stays untouched.
# NOTE(review): presumably `P` is RecWalk's transition matrix; confirm.
P = recwalk.P.copy()

In [25]:
# Square P (two-step matrix).
# NOTE(review): the recorded run of this cell ended in MemoryError; the explicit
# matrix-matrix product does not fit in memory at this problem size.
P_2 = P.dot(P)

MemoryError: 

In [22]:
# Sanity check: does every column of P sum to 1? (Recorded output: True.)
np.allclose(recwalk.P.sum(axis=0).A1, 1.0)

True

In [18]:
# Row sums of P. The recorded values are not all 1, so in the recorded run P is
# normalised along columns rather than rows.
recwalk.P.sum(axis=1).A1

array([0.53333334, 0.52172232, 0.50197457, ..., 6.55785873, 1.67305628,
       0.90208379])

Test the internal functions of RecWalk

In [7]:
def fill_empty_row_or_col(mat, fill_value=1.0, random_state=None):
    """Return a copy of ``mat`` in which no row and no column sums to zero.

    Every row whose sum is zero receives ``fill_value`` at one randomly drawn
    column. Afterwards every column whose sum is still zero receives
    ``fill_value`` at one randomly drawn row. The input matrix is not modified.

    Parameters
    ----------
    mat : scipy sparse matrix
        Matrix to patch; it must support item assignment (e.g. CSR).
    fill_value : float, default 1.0
        Value written into the randomly chosen positions.
    random_state : None, int or numpy.random.RandomState, default None
        Source of randomness. ``None`` uses NumPy's global random state (the
        previous behaviour); an int seeds a private ``RandomState``; an
        existing ``RandomState`` is used as is.

    Returns
    -------
    scipy sparse matrix
        The patched copy of ``mat``.
    """
    mat = mat.copy()

    if random_state is None:
        rng = np.random  # module-level functions share the global state
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Use explicit index arrays: pairing a boolean mask with an integer array
    # in sparse item assignment relies on implicit mask conversion.
    empty_rows = np.flatnonzero(np.asarray(mat.sum(axis=1)).ravel() == 0)
    if empty_rows.size:
        # One random column per empty row.
        random_items = rng.randint(0, mat.shape[1], empty_rows.size)
        mat[empty_rows, random_items] = fill_value

    # Columns are checked after the row fill, which may already have covered some.
    empty_cols = np.flatnonzero(np.asarray(mat.sum(axis=0)).ravel() == 0)
    if empty_cols.size:
        # One random row per remaining empty column.
        random_users = rng.randint(0, mat.shape[0], empty_cols.size)
        mat[random_users, empty_cols] = fill_value
    return mat

def create_A_g(mat):
    """Assemble the square block matrix ``[[0, mat], [mat.T, 0]]`` in CSR format.

    ``mat`` has shape (n_users, n_items); the result has order
    n_users + n_items, with ``mat`` in the upper-right block, its transpose in
    the lower-left block and float32 zero blocks on the diagonal.
    """
    user_count, item_count = mat.shape

    def _zero_block(order):
        # Square, all-zero float32 block for the diagonal of the block matrix.
        return scipy.sparse.csr_matrix((order, order), dtype=np.float32)

    # Top block row: [0 | mat]
    top = scipy.sparse.hstack([_zero_block(user_count), mat], format='csr')
    # Bottom block row: [mat.T | 0]
    bottom = scipy.sparse.hstack([mat.T, _zero_block(item_count)], format='csr')

    return scipy.sparse.vstack([top, bottom], format='csr')

def create_H(mat):
    """Row-normalise the block matrix built by ``create_A_g(mat)``.

    Each row of ``A_g`` is scaled by the inverse of its row sum, i.e. the
    result is ``diag(1 / (A_g @ 1)) @ A_g``. Rows whose sum is zero are left
    as all-zero rows instead of being scaled by ``inf``.

    Parameters
    ----------
    mat : scipy sparse matrix of shape (n_users, n_items)

    Returns
    -------
    scipy sparse matrix of order n_users + n_items.
    """
    A_g = create_A_g(mat)

    # np.dot(A, 1_n)
    row_sums = np.asarray(A_g.sum(axis=1)).ravel()

    # Invert only the non-zero row sums; a plain ``1 / row_sums`` divides by
    # zero for rows without entries and puts ``inf`` on the diagonal.
    inv_row_sums = np.zeros(row_sums.shape, dtype=np.float32)
    has_mass = row_sums != 0
    inv_row_sums[has_mass] = 1 / row_sums[has_mass]

    # Diagonal matrix holding the inverse row sums.
    diag_inv_row_sum = scipy.sparse.diags(diagonals=inv_row_sums, offsets=0, dtype=np.float32, format='csr')

    return diag_inv_row_sum.dot(A_g)

def create_M_i(W_indptr, W_indices, W_data, n_items):
    """
    M_i is created by making W row stochastic.

    W is given by its CSR components and has shape (n_items, n_items). All
    entries are divided by the maximal row sum; the amount each row then lacks
    to sum to one is added to that row's diagonal entry.

    If the maximal row sum is zero (e.g. W has no entries) there is nothing to
    rescale and the identity matrix is returned, rather than dividing by zero
    and filling the diagonal with NaN.

    Parameters
    ----------
    W_indptr, W_indices, W_data : array-like
        CSR index pointer, column indices and values of W.
    n_items : int
        Order of the square matrix W.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (n_items, n_items), dtype float32.
    """
    W = scipy.sparse.csr_matrix((W_data, W_indices, W_indptr), shape=(n_items, n_items), dtype=np.float32)

    # Compute maximal row sum (0 for a matrix without rows).
    row_sums = np.asarray(W.sum(axis=1)).ravel()
    row_sum_max = row_sums.max() if row_sums.size else 0

    if row_sum_max == 0:
        # No mass to normalise by: every item keeps all probability on itself.
        return scipy.sparse.eye(n_items, dtype=np.float32, format='csr')

    # Normalize a copy of W by the maximal row sum; the caller's data stays untouched.
    W_normalized = W.copy()
    W_normalized.data /= row_sum_max

    # Create diagonal mat that reintroduces the residuals to make the final matrix row stochastic.
    diag_mat = scipy.sparse.diags(diagonals=1 - (row_sums / row_sum_max), offsets=0, dtype=np.float32, format='csr')

    M_i = W_normalized + diag_mat

    return M_i

def create_M(indptr, indices, data, n_users, n_items):
    """Build the block matrix ``[[I, 0], [0, M_i]]`` in CSR format.

    ``I`` is the n_users x n_users identity and ``M_i`` is the row-stochastic
    item matrix obtained from ``create_M_i(indptr, indices, data, n_items)``.
    The result has order n_users + n_items.
    """
    # Lower-right block: W made row stochastic.
    item_block = create_M_i(indptr, indices, data, n_items)

    # Upper-left block: identity over the users.
    user_identity = scipy.sparse.diags(
        diagonals=np.ones(n_users, dtype=np.float32),
        offsets=0,
        dtype=np.float32,
        format='csr',
    )

    # Off-diagonal blocks are all zero.
    top = scipy.sparse.hstack(
        [user_identity, scipy.sparse.csr_matrix((n_users, n_items), dtype=np.float32)],
        format='csr',
    )
    bottom = scipy.sparse.hstack(
        [scipy.sparse.csr_matrix((n_items, n_users), dtype=np.float32), item_block],
        format='csr',
    )

    return scipy.sparse.vstack([top, bottom], format='csr')