# Weighted ALS from Hu, Koren and Volinksky's paper

http://yifanhu.net/PUB/cf.pdf

In [126]:
import os
import pandas as pd
from scipy.sparse import coo_matrix, csr_matrix, diags
from scipy.linalg import cho_solve, cho_factor, solve
import numpy as np

import matplotlib.pyplot as plt
from sklearn.metrics import log_loss, accuracy_score, roc_curve, roc_auc_score
from sklearn.utils.extmath import safe_sparse_dot
import time

%matplotlib inline

In [22]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
lastfm_file = "/Users/timwee/projects/datasets/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv"

In [24]:
user_to_idx = {}
artist_to_idx = {}
users, artists, num_played = [], [], []
with open(lastfm_file, 'r') as f:
    for line in f:
        tup = line.strip().split("\t")
        usr, artist, cur_num_played = tup[0], tup[1], float(tup[3])
        if usr not in user_to_idx:
            user_to_idx[usr] = len(user_to_idx)
        if artist not in artist_to_idx:
            artist_to_idx[artist] = len(artist_to_idx)
        users.append(user_to_idx[usr])
        artists.append(artist_to_idx[artist])
        num_played.append(cur_num_played)

In [25]:
del user_to_idx, artist_to_idx

## hyperparams and initialization

In [26]:
num_factors = 20
alpha = 40
epsilon = 1e-7
regularization = 1e-2

In [27]:
1 + alpha * np.log(1 + 0/epsilon), alpha * np.log(1 + 0/epsilon)

(1.0, 0.0)

In [28]:
# we don't add the 1 in the confidence formula, bec. we will subtract it out later anyway
#   This lets us keep the sparse matrix, otherwise 0 becomes 1
# just remember to add 1 when calculating b when solving the linear equation to C
confidence_vals = alpha * np.log(1 + np.array(num_played) / epsilon)

In [29]:
item_users_mat = coo_matrix((confidence_vals, (artists, users))).tocsr()        

In [30]:
user_items_mat = item_users_mat.T

In [31]:
num_users, num_items = user_items_mat.shape

## bench one row

For each row $x_u$ (user or item) we solve a linear equation. 
\begin{align}
x_u = (Y^TC^uY + \lambda I)^{-1} Y^TC^up(u)
\end{align}

- $C^{u}$ is a diagonal matrix where $C_{ii}^{u} = c_{ui}$, the "confidence" value of the user with the item.
- $Y$ is the item factors matrix (with dimensions **num_items X num_factors**)
- $\lambda$ is the regularization parameter
- $p(u)$ is the vector of preferences for user $u$. (this is the binarized values from the rating matrix.


#### confidence value and preferences

Hu, et. al computed $c_{ui}$ in 2 different ways:
1. $c_{ui} = 1 + \alpha r_{ui}$
    - $\alpha$ was set to 40 in the paper.
2. $c_{ui} = 1 + \alpha \log{({1 + r_{ui}} / \epsilon)}$


#### Rewrite of equation for faster runtime
We can rewrite $Y^TC^uY$ to avoid the two big matrix multiplies with $Y$ everytime for each user:
$$(Y^TC^uY) = (Y^T(I + C^u - I)Y) = Y^TY + Y^T(C^u - I)Y\\$$

This means that we can compute $Y^TY$ once and reuse it.

**Note:** This is mostly bec. of $C^u$ having values for all entries (a min value of 1 because the confidence formulas above have a $+ 1$). Since we don't add this $1$ to our confidence values when we form the matrix, we can directly compute $(Y^TC^uY)$

#### solving for $x_u$ 
To solve for $x_u$, we treat the first part of the equation above as the inverse, and the latter part as $b$, $x_u = A^{-1}b$, and pass it to a linear equations solver. 

Note that A is hermitian/symmetric and positive definite, so it is eligible to use a Cholesky decomposition solver.
\begin{align}
A = Y^TC^uY + \lambda I\\
A = Y^TY + Y^T(C^u - I)Y + \lambda I\\
b = Y^TC^up(u)\\
\end{align}



### for user

In [33]:
# grab a single row
usr_idx = 0

##### compute A's 3 terms
1. $Y^T(C^u - I)Y$
    - since we didn't add a 1 for the initial confidence matrix, we don't have to worry about the $I$,  so this becomes $Y^T(C^u)Y$
    - $C^u$ is a diagonal matrix with values on the diagonal (i,i) corresponding to confidence values between user $u$, and item $i$ 
    - since $C^u$ is a diagonal matrix, and $Y^T$ is dense, we don't have to do a dot product
        - multiplying a dense matrix $T$ by a diagonal matrix $D$ just multiplies each row or column $i$ of $T$ by the $(i, i)$ element in D.
        - use broadcasting instead. See https://stackoverflow.com/questions/13272453/multiplying-numpy-scipy-sparse-and-dense-matrices-efficiently
2. $Y^TY$
3. $\lambda I$



In [34]:
Y = item_factors
Yt = Y.T
YtY = np.dot(Yt, Y) # we don't need this as noted above

In [61]:
def YtCuIY_mask(Cu, Y, usr_idx=0, subtract_identity=False):
    """
    Remove zero elements for speed
    (Yt (Cu - I) Y).shape should be (num_factors, num_factors)
    Yt.shape == (num_factors, num_items)
    Cu.shape == diag(num_items, num_items)
    Y.shape == (num_items, num_factors)
    """
    if subtract_identity: 
        Cu = Cu.copy()
        Cu -= 1
        Cu[Cu < epsilon] = 0.0
    mask = Cu.nonzero()[0]#np.flatnonzero(Cu) # Cu.ravel().nonzero()[0]
    Cu_masked = Cu[mask]
    Y_masked = Y[mask,:]
    CuY = Cu_masked[:,None] * Y_masked # broadcast
    return Y_masked.T.dot(CuY)

In [62]:
def YtCuIY(Cu, Y, usr_idx=0, subtract_identity=False):
    """
    (Yt (Cu - I) Y).shape should be (num_factors, num_factors)
    Yt.shape == (num_factors, num_items)
    Cu.shape == diag(num_items, num_items)
    Y.shape == (num_items, num_factors)
    """
    if subtract_identity:
        Cu = Cu.copy()
        Cu -= 1
        Cu[Cu < epsilon] = 0.0
    CuY = Cu[:,None] * Y # broadcasting
    return Y.T.dot(CuY)

In [63]:
# Runs out of memory
def YtCuIY_nobroadcast(Cu, Y, usr_idx=0, subtract_identity=False):
    """
    (Yt (Cu - I) Y).shape should be (num_factors, num_factors)
    Yt.shape == (num_factors, num_items)
    Cu.shape == diag(num_items, num_items)
    Y.shape == (num_items, num_factors)
    """
    confidence_vals = Cu
    if subtract_identity:
        confidence_vals = Cu.copy()
        confidence_vals -= 1
        confidence_vals[confidence_vals < epsilon] = 0.0
    Cu = sparse.diags(confidence_vals, [0])
    return Y.T.dot(Cu).dot(Y)

In [64]:
Cu = user_items_mat[0,:].toarray().ravel()

In [65]:
%timeit YtCuIY_mask(Cu, Y)
%timeit YtCuIY(Cu, Y)

1000 loops, best of 3: 1.19 ms per loop
100 loops, best of 3: 12.5 ms per loop


In [66]:
# runs out of memory
#%timeit YtCuIY_nobroadcast(user_items_mat, Y)

In [67]:
np.allclose(YtCuIY_mask(Cu, Y), YtCuIY(Cu, Y))

True

##### compute $b = Y^TC^up(u)$

Note that $p(u)$, a binarized (0/1) diagonal matrix is implicitly computed already (non-zero inside $C^u$)

In [140]:
# b.shape == (num_factors, num_factors)
def compute_b(Cu, Y):
    """
    Cu is a 1-d array of confidence values for a particular usr u, with all items i
    
    Expected shapes:
    Cu.shape == (num_items,)
    Y.shape == (num_items, num_factors)
    """
    return (Y.T * (Cu + 1.0)).sum(axis=1) # broadcast

In [141]:
Y.T.shape, Cu.shape

((20, 160168), (160168,))

In [142]:
(Y.T * (Cu+1.0)).shape, (Y.T * (Cu+1.0)).sum(axis=1).shape

((20, 160168), (20,))

In [143]:
%timeit compute_b(Cu, Y)

100 loops, best of 3: 12.3 ms per loop


In [144]:
A = YtCuIY_mask(Cu, Y)
b = compute_b(Cu, Y)
A.shape, b.shape

((20, 20), (20,))

In [145]:
c = cho_factor(A)
x = cho_solve(c,b)
x.shape, x

((20,), array([ 0.34421006,  4.44470192, -1.87208381,  1.41074733, -2.05504313,
        -1.9075228 ,  1.73939869, -0.8477629 ,  1.66443732,  0.27854569,
         1.39802624,  2.73733409,  0.40485103, -4.00677536,  1.25697008,
         0.36831876, -1.44033143,  0.42826174, -1.69081957, -0.36230996]))

In [151]:
lambda_I = regularization * np.eye(num_factors) # + YtY
def use_cho_solve(Cu, Y, lambda_I):
    A = YtCuIY_mask(Cu, Y) + lambda_I
    b = compute_b(Cu, Y)
    c = cho_factor(A)
    x = cho_solve(c,b)
    return x

In [153]:
def gen_solve(Cu, Y, lambda_I):
    A = YtCuIY_mask(Cu, Y) + lambda_I
    b = compute_b(Cu, Y)
    return solve(A, b)

In [154]:
use_cho_solve(Cu, Y, lambda_I).shape

(20,)

In [155]:
gen_solve(Cu, Y, lambda_I).shape

(20,)

In [156]:
%timeit use_cho_solve(Cu, Y, lambda_I)

100 loops, best of 3: 13.7 ms per loop


In [157]:
%timeit gen_solve(Cu, Y, lambda_I)

100 loops, best of 3: 14.1 ms per loop


# Putting it All Together

In [159]:
Cui = user_items_mat
Ciu = item_users_mat
lambda_I = regularization * np.eye(num_factors) # + YtY

In [165]:
def single_pass(Cui, Y, lambda_I, num_u, num_factors):
    result = np.zeros((num_u, num_factors))
    for idx, Cu in enumerate(Cui):
        result[idx] = use_cho_solve(Cu.toarray().ravel(), Y, lambda_I)
    return result

In [167]:
start = time.time()
single_pass(Cui=user_items_mat, Y=item_factors, lambda_I=lambda_I, num_u=num_users, num_factors=num_factors)
time.time() - start

5075.574373006821

In [168]:
def fit_weighted_als(user_items_mat, num_factors=15, num_iters=15):
    num_users, num_items = user_items_mat.shape
    user_factors = np.random.randn(num_users, num_factors) * 0.1
    item_factors = np.random.randn(num_items, num_factors) * 0.1
    
    item_users_mat = user_items_mat.T
    lambda_I = regularization * np.eye(num_factors)
    for num_iter in range(num_iters):
        # fit users
        start = time.time()
        user_factors = single_pass(Cui=user_items_mat, Y=item_factors, \
                                   lambda_I=lambda_I, num_u=num_users, \
                                   num_factors=num_factors)
        item_factors = single_pass(Cui=item_users_mat, Y=user_factors, \
                                   lambda_I=lambda_I, num_u=num_items, \
                                   num_factors=num_factors)
        print("finished iteration %i in %s" % (num_iter, time.time() - start))
    return user_factors, item_factors

In [None]:
start = time.time()
user_factors, item_factors = fit_weighted_als(user_items_mat, num_factors=15, num_iters=15)
end = time.time()
end - start