# WALS Matrix Factorization

Steps:
<ol>
<li>Preprocess data</li>
<li>Build model</li>
<li>Train model</li>
<li>Hyperparameter tuning</li>
<li>Validate model</li>
</ol>

In [2]:
import matplotlib as plt
import pandas as pd
import numpy as np
import os

In [3]:
# import data and convert to df
path = os.path.join(os.getcwd(), '../data/u.purchase.csv')

names = ['user_id','username','item_id','product_name','status']
df = pd.read_csv(path, ',', names=names, engine='python', skiprows=1) # skip header row
df.head()

Unnamed: 0,user_id,username,item_id,product_name,status
0,6,Andy Fajar Handika,9463,Nasi Goreng Selimut,VOID BY USER
1,6,Andy Fajar Handika,3663,Nasi Jenggo Ikan Cakalang,VOID BY USER
2,6,Andy Fajar Handika,5930,Nasi Kebuli Sapi,VOID BY USER
3,12,Thomas Dian,6845,Nasi Rames Ayam Bumbu Bali,SUCCESS
4,12,Thomas Dian,7233,Nasi Dori Asam Manis,PAYMENT EXPIRED


In [4]:
# remove first x rows (only during development)
df = df.iloc[:1000]

In [5]:
# remove unused columns
df.drop(['username', 'product_name'], axis=1, inplace=True)

# only keep SUCCESS status
df = df.drop(df[df.status!='SUCCESS'].index)
df.drop(['status'], axis=1, inplace=True)
df

Unnamed: 0,user_id,item_id
3,12,6845
9,12,6271
12,76,7120
13,76,7121
14,76,7122
...,...,...
995,5627,12303
996,5627,7940
997,5627,7500
998,5627,8548


In [6]:
# create

In [7]:
# create array of shape [u-1, i-1] for user and item interactions
users = np.array(df.user_id.unique())
items = np.array(df.item_id.unique())
n_users = len(users)
n_items = len(items)
"""
Z = np.zeros((n_users, n_items))

for user_i, u in enumerate(users):
    # find all user's item interaction 
    u_i = df[df['user_id'] == u]
    for _, i in u_i.iterrows():
        # find index of item
        item_i = np.where(items==i['item_id'])
        Z[user_i, item_i] += 1
"""

R = np.array([[0,0,0]])


for u in users:
    # find list of item interactions
    user_purchases = df[df['user_id']==u]['item_id']
    unique, counts = np.unique(user_purchases, return_counts=True)
    user_total_purchases = np.array(list(zip(unique, np.ones(counts.shape))))
    user_total_purchases = np.insert(user_total_purchases, 0, [[u]], axis=1)

    R = np.concatenate((R, user_total_purchases))

R = np.delete(R, 0, 0)


for i, interaction in enumerate(R):
    # index user
    user_id = interaction[0]
    user_i = np.where(users==user_id)[0]
    R[i][0] = user_i # replace id with index

    # index item
    item_id = interaction[1]
    item_i = np.where(items==item_id)[0]
    R[i][1] = item_i


# delete first row
R

array([[  0.,   1.,   1.],
       [  0.,   0.,   1.],
       [  1.,   2.,   1.],
       ...,
       [ 58., 453.,   1.],
       [ 58., 454.,   1.],
       [ 58., 455.,   1.]])

In [8]:
# train test split
TEST_SPLIT_RATIO = 0.1
test_set_size = round(len(R) * TEST_SPLIT_RATIO)
test_set_idx = np.random.choice(range(len(R)),
                                size=test_set_size, replace=False)

ts = R[test_set_idx]
tr = np.delete(R, test_set_idx, axis=0)
print(ts.shape)
print(tr.shape)

(63, 3)
(569, 3)


In [9]:
# create coo_matrix
from scipy.sparse import coo_matrix

u_tr, i_tr, r_tr = zip(*R)
tr_sparse = coo_matrix((r_tr, (u_tr, i_tr)), shape=(n_users, n_items))

print(tr_sparse.toarray())


[[1. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 1. 1.]]


In [15]:
# make weights
feature_wt_exp = 0.08

frac = np.array(1.0/(tr_sparse > 0.0).sum(0))
col_wts = np.array(np.power(frac, feature_wt_exp)).flatten()
np.array(1/(tr_sparse > 0.0).sum(0))
tr_sparse.shape

(59, 458)

In [16]:
# create model
import tensorflow as tf
from tensorflow.contrib.factorization.python.ops import factorization_ops

DEFAULT_PARAMS = {
    'weights': True,
    'latent_factors': 5,
    'num_iters': 20,
    'regularization': 0.07,
    'unobs_weight': 0.01,
    'wt_type': 0,
    'feature_wt_factor': 130.0,
    'feature_wt_exp': 0.08,
    'delimiter': '\t'
}

dim = DEFAULT_PARAMS['latent_factors']
num_iters = DEFAULT_PARAMS['num_iters']
reg = DEFAULT_PARAMS['regularization']
unobs = DEFAULT_PARAMS['unobs_weight']
wt_type = DEFAULT_PARAMS['wt_type']
feature_wt_exp = DEFAULT_PARAMS['feature_wt_exp']
obs_wt = DEFAULT_PARAMS['feature_wt_factor']

row_wts = None


num_rows = tr_sparse.shape[0]
num_cols = tr_sparse.shape[1]

row_wts = np.ones(num_rows)

row_factor = None
col_factor = None

with tf.Graph().as_default():

    input_tensor = tf.SparseTensor(indices=list(zip(tr_sparse.row, tr_sparse.col)),
                                   values=(tr_sparse.data).astype(np.float32),
                                   dense_shape=tr_sparse.shape)

    model = factorization_ops.WALSModel(num_rows, num_cols, dim,
                                        unobserved_weight=unobs,
                                        regularization=reg,
                                        row_weights=row_wts,
                                        col_weights=col_wts)

    # retrieve the row and column factors
    row_factor = model.row_factors[0]
    col_factor = model.col_factors[0]

In [17]:
# train model
sess = tf.Session(graph=input_tensor.graph)

with input_tensor.graph.as_default():
    row_update_op = model.update_row_factors(sp_input=input_tensor)[1]
    col_update_op = model.update_col_factors(sp_input=input_tensor)[1]

    sess.run(model.initialize_op)
    sess.run(model.worker_init)
    for _ in range(num_iters):
        sess.run(model.row_update_prep_gramian_op)
        sess.run(model.initialize_row_update_op)
        sess.run(row_update_op)
        sess.run(model.col_update_prep_gramian_op)
        sess.run(model.initialize_col_update_op)
        sess.run(col_update_op)

In [18]:
# get prediction model

# evaluate output factor matrices
output_row = row_factor.eval(session=sess)
output_col = col_factor.eval(session=sess)

pred = np.dot(output_col, output_row.T)
pred

array([[ 0.7633808 ,  0.23418975, -0.04187143, ...,  0.0861668 ,
         0.9970003 ,  0.16619682],
       [ 0.75673723,  0.24433696, -0.02583635, ...,  0.1043287 ,
         1.0069507 ,  0.17695534],
       [ 0.2216481 ,  0.89184934,  0.762364  , ..., -0.41047096,
        -0.09587336, -0.253871  ],
       ...,
       [ 0.05078867, -0.071395  , -0.15302618, ...,  0.4054911 ,
         0.34305036,  0.9346351 ],
       [ 0.05078867, -0.071395  , -0.15302618, ...,  0.4054911 ,
         0.34305036,  0.9346351 ],
       [ 0.05078867, -0.071395  , -0.15302618, ...,  0.4054911 ,
         0.34305036,  0.9346351 ]], dtype=float32)

In [19]:
print(pred.shape)
ts.shape

(458, 59)


(63, 3)

In [33]:
# convert ts to weighted matrix
u_ts = ts[:,0]
i_ts = ts[:,1]
r_ts = ts[:, 2]
ts_sparse = coo_matrix((r_ts, (u_ts, i_ts)), shape=(n_users, n_items)).toarray()

W = np.outer(row_wts, col_wts) + unobs
ts_sparse *= W
ts_sparse[ts_sparse==0] = unobs
ts_sparse

(59, 458)


27022

In [29]:
def select_from_indexes(a1, indexes):
    a2 = np.array([])
    for i in indexes:
        a2 = np.append(a1[i[0]][i[1]], a2)
    return a2

In [31]:
# evaluate model
import math
nonzero_row, nonzero_col = np.nonzero(ts_sparse)
nonzero_indexes = np.array((nonzero_row, nonzero_col)).T # gets list of indexes
print(nonzero_indexes)
# compare ts and prediction values
mse = 0
for i in nonzero_indexes:
    ts_value = ts[i[0]][i[1]]
    pred_value = pred[i[0]][i[1]]
    mse += (pred_value - ts_value)**2
    print(ts_value)
    print(pred_value)
mse /= len(nonzero_indexes)
rmse = math.sqrt(mse)
rmse

[[  1   5]
 [  3  14]
 [  3  15]
 [  3  17]
 [  5  36]
 [  5  52]
 [  8  67]
 [  8  68]
 [  8  70]
 [ 10  76]
 [ 12  83]
 [ 14  97]
 [ 16 105]
 [ 16 110]
 [ 19 128]
 [ 21 131]
 [ 22 135]
 [ 23 142]
 [ 23 143]
 [ 26 156]
 [ 27 105]
 [ 27 108]
 [ 27 158]
 [ 27 159]
 [ 27 163]
 [ 28  90]
 [ 30 122]
 [ 30 203]
 [ 30 205]
 [ 30 225]
 [ 30 226]
 [ 31   6]
 [ 31 240]
 [ 33 126]
 [ 39 284]
 [ 40   6]
 [ 40 288]
 [ 41  82]
 [ 41 298]
 [ 41 300]
 [ 41 305]
 [ 44 109]
 [ 44 177]
 [ 45 316]
 [ 50 335]
 [ 50 337]
 [ 50 342]
 [ 50 354]
 [ 51   6]
 [ 51 376]
 [ 51 381]
 [ 51 391]
 [ 51 395]
 [ 52 397]
 [ 52 398]
 [ 52 403]
 [ 52 408]
 [ 57 341]
 [ 57 433]
 [ 58 250]
 [ 58 289]
 [ 58 435]
 [ 58 455]]


IndexError: index 5 is out of bounds for axis 0 with size 3