# *Matrix Factorization*

In this notebook, we model last.fm data with Probabilistic Matrix Factorization model (implemented in pyTorch). We compare performance of a traditional approach against a decision-maker (neural network) to show that it is capable of correcting errors due to posterior approximation. The inference is carried out using automatic differentiation variational inference, and we split the data randomly into equally sized training and test set.

### Imports

In [2]:
import sys
import time
import pickle

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.distributions import Normal
from torch.nn.functional import softplus

In [3]:
if torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
    env = torch.cuda
    device = torch.device('cuda')
    print("Using GPU")
else:
    torch.set_default_tensor_type('torch.FloatTensor')
    env = torch
    device = torch.device('cpu')
    print("Using CPU")
    
    
def totorch(array):
    return torch.tensor(array, dtype=env.float32)

Using CPU


In [4]:
from aux import print2, tonumpy, flatten_first_two_dims, parse_script_args, dict2str

In [5]:
import losses, aux_optimization

In [6]:
import count_data

### Configuration

In [7]:
args = parse_script_args() # arguments can be passed in the format of NAME1=FLOATVAL1,NAME2=[STRVAL2],NAME3=INTVAL3,...

parsing: <-f>


In [8]:
# optimization general parmeters
SEED = args.get("SEED", 1)
NITER  = 30001 # number of iterations - around 70k is the right number for mininbatch=10
LR = 0.01 # 0.1 is the right number for the full batch, 0.001 is advised for mininbatch=10
MINIBATCH = 100 # how many rows of the matrix per minibatch

# model parameter
K = 20 # number of latent variables

# number of samples used to approximate ELBO term
NSAMPLES = 11

In [9]:
# selected loss: tilted/squared/exptilted/expsquared
LOSS = args.get("LOSS", "tilted")
TILTED_Q = args.get("TILTED_Q", 0.5) # relevant only for tilted and exptilted

In [10]:
S = args.get("S", 1000)
NUM_QUANTILES = args.get("NUM_QUANTILES", 20)

In [11]:
# regularization: lambda=0 means no regularization
REGULARIZATION = args.get("REGULARIZATION", "const").lower()
LAMBDA = args.get("LAMBDA", 0.)

In [12]:
print("CONFIGURATION SUMMARY: %s" % dict2str(globals()))

CONFIGURATION SUMMARY: K=20 LAMBDA=0.0 LOSS=tilted LR=0.01 MINIBATCH=100 NITER=30001 NSAMPLES=11 NUM_QUANTILES=20 REGULARIZATION=const S=1000 SEED=1 TILTED_Q=0.5


### Data

In [13]:
Y, MASK = count_data.lastfm_data(log=True)

print("Fixed (~true) data: sparsity level=%.3f" % ((Y==0).sum() / (Y.shape[0]*Y.shape[1])) )
print("Fixed (~true) data: mean=%.3f, std=%.3f" % (Y.mean(), Y.std()))

Y_TRAIN, Y_TEST, TRAIN_MASK, TEST_MASK = count_data.test_train_split(Y, MASK)

Fixed (~true) data: sparsity level=0.565
Fixed (~true) data: mean=1.202, std=1.762


### Losses

In [14]:
loss, optimal_h_bayes_estimator = losses.LossFactory(**globals()).create(LOSS)
print2("> <%s> loss: %s with (analytical/Bayes estimator) h: %s" % 
        (LOSS, loss.__name__, optimal_h_bayes_estimator.__name__))

[LossFactory] Configuration: TILTED_Q=0.5 LINEX_C=None
> <tilted> loss: tilted_loss_fixedq with (analytical/Bayes estimator) h: tilted_optimal_h_fixedq


In [15]:
def empirical_risk1(preds_flat, y, mask):
  """
    preds_flat  torch tensor (vector)
    y, mask     numpy array
  """
  assert len(preds_flat.shape)==1
  assert y.shape==mask.shape
  assert preds_flat.shape[0]==y.shape[0]*y.shape[1]
  mask_flat = torch.tensor(mask, dtype=env.uint8).view(-1)
  test_preditions = torch.masked_select(preds_flat, mask_flat)
  test_y = totorch( y[mask.astype(bool)] )  
  return loss(test_preditions, test_y).mean()
  

def empirical_risk(preds, y, mask):
  assert preds.shape==y.shape
  assert y.shape==mask.shape  
  preds_flat = preds.view(-1)
  return empirical_risk1(preds_flat, y, mask)

### Model

In [16]:
import normal_normal_mf as model

In [17]:
path = "PMF_qwqz_%s.pkl" % SEED
try:
    print("Loading the model from %s" % path)
    qw, qz = pickle.load(open(path, "rb"))
except:
    print("Failed. Recomputing.")
    qw, qz = model.vi_inference(Y, TRAIN_MASK, K, MINIBATCH, NSAMPLES, SEED, NITER, LR)
    pickle.dump((qw,qz), open(path, "wb"))

Loading the model from PMF_qwqz_1.pkl
Failed. Recomputing.
[0.05s] 0. iteration, 0. epoch
[0.08s] 1. iteration, 0. epoch
[0.12s] 2. iteration, 0. epoch
[0.15s] 3. iteration, 0. epoch
[0.16s] 4. iteration, 0. epoch
[0.20s] 5. iteration, 0. epoch
[0.25s] 6. iteration, 0. epoch
[0.29s] 7. iteration, 0. epoch
[0.32s] 8. iteration, 0. epoch
[0.36s] 9. iteration, 0. epoch
[12.34s] 1000. iteration, 100. epoch
[24.42s] 2000. iteration, 200. epoch
[36.50s] 3000. iteration, 300. epoch
[49.31s] 4000. iteration, 400. epoch
[70.45s] 5000. iteration, 500. epoch
[107.72s] 6000. iteration, 600. epoch
[134.29s] 7000. iteration, 700. epoch
[148.24s] 8000. iteration, 800. epoch
[161.92s] 9000. iteration, 900. epoch
[180.39s] 10000. iteration, 1000. epoch
[199.64s] 11000. iteration, 1100. epoch
[221.72s] 12000. iteration, 1200. epoch
[260.53s] 13000. iteration, 1300. epoch
[282.62s] 14000. iteration, 1400. epoch
[293.96s] 15000. iteration, 1500. epoch
[305.87s] 16000. iteration, 1600. epoch
[316.96s] 1700

In [18]:
from normal_normal_mf import sample_predictive_y
ys = sample_predictive_y(qw, qz, nsamples_theta=1000, nsamples_y=1)        
#pickle.dump(ys, open("PMF_samples_%i.pkl" % SEED, "wb"))

In [19]:
#ys = pickle.load(open("PMF_samples_%i.pkl" % SEED, "rb"))
Y_SAMPLES = tonumpy(ys)

### VI Evaluation

In [20]:
hvi = optimal_h_bayes_estimator(ys) # approximately optimal decisions

In [21]:
vi_train_risk = empirical_risk(hvi, Y, TRAIN_MASK).item()
vi_test_risk = empirical_risk(hvi, Y, TEST_MASK).item()
print("optimal risk: train:%.3f test:%.4f" % (vi_train_risk, vi_test_risk))

optimal risk: train:0.310 test:0.4620


### Regularization

In [22]:
def get_regularization_constant(ys, l=LAMBDA):
    return np.ones(ys.shape[1:])*l


def get_regularization_std(ys, l=LAMBDA):
    sigma = tonumpy(ys).std(0) * l
    lam = 1. / sigma
    return lam    


def get_regularization_qdiff(ys, l=LAMBDA):
    h = optimal_h_bayes_estimator(ys).clone().detach()
    Quantiles = np.array([np.percentile(Y_SAMPLES, int(q*100), axis=0) for q in [0.5-abs(l), 0.5+abs(l)]])
    sigma = abs(Quantiles[0,:,:]-Quantiles[1,:,:])
    lam = 1. / sigma
    return lam


def get_regularization(ys, method=REGULARIZATION, l=LAMBDA):
    if method.startswith("const"): return get_regularization_constant(ys, l)
    if method.startswith("std"): return get_regularization_std(ys, l)
    if method.startswith("qdiff"): return get_regularization_qdiff(ys, l)
    raise Exception("Unknown regularization method name: %s" % method)

In [23]:
HSTAR = hvi
LAMBDA_TRAIN = get_regularization(ys)

### Quantile optimization

In [24]:
def training_empirical_risk(q):    
    q = max(0., min(1., q[0]))
    h = losses.tilted_optimal_h(ys, q) # obtaing a quantile in a very convoluted way :)
    risk = empirical_risk(h, Y, TRAIN_MASK).item() 
    regularizer = ( LAMBDA_TRAIN[TRAIN_MASK.astype(bool)] * tonumpy((h-HSTAR)**2)[TRAIN_MASK.astype(bool)] ).mean()
    obj = risk + regularizer
    print("evaluating training risk @ q=%.2f => risk=%.3f => obj=%.3f" % (q, risk, obj))
    return obj            


from scipy.optimize import minimize
res = minimize(training_empirical_risk, [0.5], 
               method='Nelder-Mead', options={'xtol': 1e-3, 'disp': True, "maxiter": 100})

evaluating training risk @ q=0.50 => risk=0.310 => obj=0.310
evaluating training risk @ q=0.53 => risk=0.315 => obj=0.315
evaluating training risk @ q=0.47 => risk=0.309 => obj=0.309
evaluating training risk @ q=0.45 => risk=0.314 => obj=0.314
evaluating training risk @ q=0.45 => risk=0.314 => obj=0.314
evaluating training risk @ q=0.49 => risk=0.309 => obj=0.309
evaluating training risk @ q=0.50 => risk=0.310 => obj=0.310
evaluating training risk @ q=0.48 => risk=0.309 => obj=0.309
evaluating training risk @ q=0.49 => risk=0.309 => obj=0.309
evaluating training risk @ q=0.48 => risk=0.309 => obj=0.309
evaluating training risk @ q=0.48 => risk=0.309 => obj=0.309
evaluating training risk @ q=0.49 => risk=0.309 => obj=0.309
evaluating training risk @ q=0.49 => risk=0.309 => obj=0.309
evaluating training risk @ q=0.49 => risk=0.309 => obj=0.309
evaluating training risk @ q=0.49 => risk=0.309 => obj=0.309
evaluating training risk @ q=0.49 => risk=0.309 => obj=0.309
evaluating training risk

In [25]:
q = max(0., min(1., res["x"][0]))
h = losses.tilted_optimal_h(ys, q)
q_train_risk = empirical_risk(h, Y, TRAIN_MASK).item() 
q_test_risk = empirical_risk(h, Y, TEST_MASK).item() 
print("optimal risk: train:%.3f test:%.4f" % (q_train_risk, q_test_risk))

optimal risk: train:0.309 test:0.4598


### Decision maker

In [26]:
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f6351401470>

In [27]:
from normal_normal_mf import sample_predictive_y
ys = sample_predictive_y(qw, qz, nsamples_theta=S, nsamples_y=1)        
Y_SAMPLES = tonumpy(ys)
print(Y_SAMPLES.shape)

(1000, 992, 100)


In [28]:
# Prepare features 
step = 1.0/(NUM_QUANTILES+1)
qs = list(np.arange(step, 0.99, step))
Quantiles = np.array([np.percentile(Y_SAMPLES, int(q*100), axis=0) for q in qs])

X = Quantiles
print(X.shape)

(20, 992, 100)


In [29]:
decision_maker = nn.Sequential(
  nn.Linear(X.shape[0], 20),
  nn.ReLU(),
  nn.Linear(20, 10),
  nn.ReLU(),
  nn.Linear(10, 1)
)

print(decision_maker)

Sequential(
  (0): Linear(in_features=20, out_features=20, bias=True)
  (1): ReLU()
  (2): Linear(in_features=20, out_features=10, bias=True)
  (3): ReLU()
  (4): Linear(in_features=10, out_features=1, bias=True)
)


In [30]:
optimizer = torch.optim.Adam(decision_maker.parameters(), lr=0.01)

In [31]:
X_flat = totorch( X.reshape(X.shape[0],-1).transpose() )

In [32]:
predictions = decision_maker(X_flat).view(-1)
print("initial risk: train:%.3f test:%.3f" % (empirical_risk1(predictions, Y, TRAIN_MASK),
                                                empirical_risk1(predictions, Y, TEST_MASK)))

initial risk: train:0.712 test:0.707


In [33]:
HSTAR_FROZEN = HSTAR.detach()

In [34]:
start = time.time()
N = Y.shape[0]
for i in range(10000):
    rows, epoch_no, sgd_scale = aux_optimization.yield_minibatch_rows(i, N, MINIBATCH)
  
    minibatch_TRAIN = TRAIN_MASK[rows,:].reshape(-1).astype(bool)
  
    minibatch_X = X[:,rows,:].reshape(X.shape[0], -1)
    minibatch_X = minibatch_X[:,minibatch_TRAIN]
  
    minibatch_Y = Y[rows,:].reshape(-1)
    minibatch_Y = minibatch_Y[minibatch_TRAIN]
  
    minibatch_predictions = decision_maker( totorch(minibatch_X.transpose()) ).view(-1)
    minibatch_loss = loss(minibatch_predictions, totorch(minibatch_Y)).mean()
    
    minibatch_h = torch.masked_select(HSTAR_FROZEN[rows].view(-1), 
                                      torch.tensor(minibatch_TRAIN).type(env.ByteTensor))
    minibatch_l = totorch(LAMBDA_TRAIN[rows].reshape(-1)[minibatch_TRAIN])
    regularizer = ( minibatch_l * (minibatch_predictions-minibatch_h)**2 ).mean()
  
    optimizer.zero_grad()
    (minibatch_loss+regularizer).backward()
    #minibatch_loss.backward()
    optimizer.step()
  
    if i%1000==0 or i<100:
        print("[%.2fs] %i. iteration, %i. epoch" % (time.time()-start, i, epoch_no))    
        predictions = decision_maker(X_flat).view(-1)
        print("epoch: %s training batch risk: %.3f train risk: %.3f test risk: %.3f" % 
          (epoch_no, minibatch_loss.item(), 
           empirical_risk1(predictions, Y, TRAIN_MASK),
           empirical_risk1(predictions, Y, TEST_MASK)))

[0.04s] 0. iteration, 0. epoch
epoch: 0 training batch risk: 0.746 train risk: 0.632 test risk: 0.629
[0.09s] 1. iteration, 0. epoch
epoch: 0 training batch risk: 0.585 train risk: 0.572 test risk: 0.581
[0.14s] 2. iteration, 0. epoch
epoch: 0 training batch risk: 0.559 train risk: 0.503 test risk: 0.532
[0.21s] 3. iteration, 0. epoch
epoch: 0 training batch risk: 0.538 train risk: 0.442 test risk: 0.492
[0.24s] 4. iteration, 0. epoch
epoch: 0 training batch risk: 0.454 train risk: 0.383 test risk: 0.461
[0.31s] 5. iteration, 0. epoch
epoch: 0 training batch risk: 0.427 train risk: 0.333 test risk: 0.446
[0.35s] 6. iteration, 0. epoch
epoch: 0 training batch risk: 0.308 train risk: 0.312 test risk: 0.459
[0.42s] 7. iteration, 0. epoch
epoch: 0 training batch risk: 0.310 train risk: 0.337 test risk: 0.500
[0.47s] 8. iteration, 0. epoch
epoch: 0 training batch risk: 0.327 train risk: 0.364 test risk: 0.530
[0.52s] 9. iteration, 0. epoch
epoch: 0 training batch risk: 0.369 train risk: 0.3

epoch: 8 training batch risk: 0.254 train risk: 0.248 test risk: 0.414
[3.66s] 85. iteration, 8. epoch
epoch: 8 training batch risk: 0.265 train risk: 0.248 test risk: 0.414
[3.70s] 86. iteration, 8. epoch
epoch: 8 training batch risk: 0.264 train risk: 0.248 test risk: 0.413
[3.72s] 87. iteration, 8. epoch
epoch: 8 training batch risk: 0.241 train risk: 0.248 test risk: 0.412
[3.74s] 88. iteration, 8. epoch
epoch: 8 training batch risk: 0.260 train risk: 0.248 test risk: 0.410
[3.77s] 89. iteration, 8. epoch
epoch: 8 training batch risk: 0.238 train risk: 0.249 test risk: 0.409
[3.80s] 90. iteration, 9. epoch
epoch: 9 training batch risk: 0.229 train risk: 0.249 test risk: 0.408
[3.86s] 91. iteration, 9. epoch
epoch: 9 training batch risk: 0.234 train risk: 0.249 test risk: 0.408
[3.88s] 92. iteration, 9. epoch
epoch: 9 training batch risk: 0.263 train risk: 0.248 test risk: 0.409
[3.90s] 93. iteration, 9. epoch
epoch: 9 training batch risk: 0.255 train risk: 0.248 test risk: 0.411
[3

In [35]:
predictions = decision_maker(X_flat).view(-1)
dm_train_risk = empirical_risk1(predictions, Y, TRAIN_MASK).item()
dm_test_risk = empirical_risk1(predictions, Y, TEST_MASK).item()
print("optimal risk: train:%.3f test:%.4f" % (dm_train_risk, dm_test_risk))

optimal risk: train:0.244 test:0.4045


### Store evaluation results

In [36]:
results = [[SEED, LOSS, TILTED_Q, REGULARIZATION, LAMBDA, S, NUM_QUANTILES,
            vi_train_risk, vi_test_risk, q_train_risk, q_test_risk, dm_train_risk, dm_test_risk]]

In [37]:
path = "PMF_evaluation_%i_%s_%s_%s_%s_%s_%s.csv" % (SEED, LOSS, TILTED_Q, REGULARIZATION, LAMBDA, S, NUM_QUANTILES)
print("Saving to %s" % path)
pd.DataFrame(results).to_csv(path, header=False, index=False)

Saving to PMF_evaluation_1_tilted_0.5_const_0.0_1000_20.csv
