In [15]:
import os
import torch
from reco_encoder.data import input_layer
from reco_encoder.model import model
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR
import torch.nn as nn
from torch.autograd import Variable
import copy
import time
from pathlib import Path
from math import sqrt
import numpy as np
from datetime import datetime

In [16]:
def do_eval(encoder, evaluation_data_layer):
    """Compute the RMSE of ``encoder`` over one pass of the evaluation data.

    Args:
        encoder: model that is switched to eval mode and called on each dense
            source batch.
        evaluation_data_layer: object whose ``iterate_one_epoch_eval()`` yields
            ``(targets, source)`` pairs; both elements expose ``to_dense()``.

    Returns:
        float: ``sqrt(sum of batch losses / sum of rating counts)``, where both
        quantities are the values returned by ``model.MSEloss`` for each batch.

    Raises:
        ZeroDivisionError: if the accumulated rating count is zero (e.g. the
            data layer yields no batches).
    """
    encoder.eval()
    denom = 0.0
    total_epoch_loss = 0.0
    # Evaluation never calls backward(), so skip building the autograd graph.
    with torch.no_grad():
        for targets_mb, src_mb in evaluation_data_layer.iterate_one_epoch_eval():
            inputs = Variable(src_mb.to_dense())
            targets = Variable(targets_mb.to_dense())
            outputs = encoder(inputs)
            loss, num_ratings = model.MSEloss(outputs, targets)
            # .item() works for 0-dim and one-element tensors alike; indexing
            # with .data[0] raises IndexError on 0-dim tensors (PyTorch >= 0.4).
            total_epoch_loss += loss.item()
            denom += num_ratings.item()
    return sqrt(total_epoch_loss / denom)


def run(args_drop_prob, args_hidden_layers,
        args_path_to_train_data, args_path_to_valid_data, args_path_to_test_data,
        args_non_linearity_type, args_num_epochs, args_batch_size,
        args_log_dir,
        params):
    """Train the autoencoder and write checkpoints under ``args_log_dir``.

    Training data is loaded from ``params`` and validation data from
    ``args_path_to_valid_data``. After every regular optimisation step the
    model's own output is fed back as input/target for ``args_aug_step`` extra
    steps. The model is evaluated and saved to ``<log_dir>/model.epoch_<n>``
    every third epoch and on the last epoch, and to ``<log_dir>/model.last``
    when training finishes. If ``<log_dir>/model`` exists it is loaded first.

    Args:
        args_drop_prob: dropout probability passed to the model.
        args_hidden_layers: list of layer sizes appended after the input
            dimension reported by the data layer.
        args_path_to_train_data: unused here (the training directory is read
            from ``params``).
        args_path_to_valid_data: directory of the validation data.
        args_path_to_test_data: unused here.
        args_non_linearity_type: non-linearity name passed to the model.
        args_num_epochs: number of training epochs.
        args_batch_size: unused here (the batch size is read from ``params``).
        args_log_dir: existing directory that receives the checkpoints.
        params: dict handed to ``input_layer.UserItemRecDataProvider``.

    Raises:
        ValueError: if the configured optimizer name is not supported.
    """
    # Training hyper-parameters fixed for this notebook.
    args_lr = 0.005
    args_weight_decay = 0
    args_optimizer = "momentum"
    args_aug_step = 1
    args_noise_prob = 0
    args_constrained = True
    args_skip_last_layer_nl = False

    print("Loading training data")

    data_layer = input_layer.UserItemRecDataProvider(params=params)
    print("Data loaded")
    print("Total items found: {}".format(len(data_layer.data.keys())))
    print("Vector dim: {}".format(data_layer.vector_dim))

    print("Loading valid data")
    eval_params = copy.deepcopy(params)
    eval_params['data_dir'] = args_path_to_valid_data
    # Reuse the id mappings of the training data for the validation data.
    eval_data_layer = input_layer.UserItemRecDataProvider(params=eval_params,
                                                        user_id_map=data_layer.userIdMap,
                                                        item_id_map=data_layer.itemIdMap)
    eval_data_layer.src_data = data_layer.data

    rencoder = model.AutoEncoder(layer_sizes=[data_layer.vector_dim] + args_hidden_layers,
                               nl_type=args_non_linearity_type,
                               is_constrained=args_constrained,
                               dp_drop_prob=args_drop_prob,
                               last_layer_activations=not args_skip_last_layer_nl)

    model_checkpoint = args_log_dir + "/model"
    path_to_model = Path(model_checkpoint)
    if path_to_model.is_file():
        print("Loading model from: {}".format(model_checkpoint))
        rencoder.load_state_dict(torch.load(model_checkpoint))

    print('######################################################')
    print('######################################################')
    print('############# AutoEncoder Model: #####################')
    print(rencoder)
    print('######################################################')
    print('######################################################')

    # Only the momentum optimizer uses a learning-rate schedule.
    scheduler = None
    if args_optimizer == "adam":
        optimizer = optim.Adam(rencoder.parameters(),
                           lr=args_lr,
                           weight_decay=args_weight_decay)
    elif args_optimizer == "adagrad":
        optimizer = optim.Adagrad(rencoder.parameters(),
                              lr=args_lr,
                              weight_decay=args_weight_decay)
    elif args_optimizer == "momentum":
        optimizer = optim.SGD(rencoder.parameters(),
                          lr=args_lr, momentum=0.9,
                          weight_decay=args_weight_decay)
        scheduler = MultiStepLR(optimizer, milestones=[24, 36, 48, 66, 72], gamma=0.5)
    elif args_optimizer == "rmsprop":
        optimizer = optim.RMSprop(rencoder.parameters(),
                              lr=args_lr, momentum=0.9,
                              weight_decay=args_weight_decay)
    else:
        raise ValueError("Unknown optimizer kind: {}".format(args_optimizer))

    # Running loss for the periodic summary; it is reset only when a summary
    # is printed, so it carries over between epochs.
    t_loss = 0.0
    t_loss_denom = 0.0
    summary_frequency = 100

    if args_noise_prob > 0.0:
        dp = nn.Dropout(p=args_noise_prob)

    for epoch in range(args_num_epochs):
        print('Doing epoch {} of {}'.format(epoch, args_num_epochs))
        e_start_time = time.time()
        rencoder.train()
        total_epoch_loss = 0.0
        denom = 0.0
        for i, mb in enumerate(data_layer.iterate_one_epoch()):
            inputs = Variable(mb.to_dense())
            optimizer.zero_grad()
            outputs = rencoder(inputs)
            loss, num_ratings = model.MSEloss(outputs, inputs)
            loss = loss / num_ratings
            loss.backward()
            optimizer.step()
            # .item() instead of .data[0]: the latter raises IndexError on
            # 0-dim tensors (PyTorch >= 0.4).
            batch_loss = loss.item()
            t_loss += batch_loss
            t_loss_denom += 1

            if i % summary_frequency == 0:
                print('[lei] t_loss: %.7f, t_loss_denom: %.7f' % (t_loss, t_loss_denom))
                print('[%d, %5d] RMSE: %.7f' % (epoch, i, sqrt(1.0 * t_loss / t_loss_denom)))
                t_loss = 0
                t_loss_denom = 0.0

            total_epoch_loss += batch_loss
            denom += 1

            if args_aug_step > 0:
                # Feed the detached model output back in as both input and
                # target for a few extra optimisation steps.
                for _ in range(args_aug_step):
                    inputs = Variable(outputs.data)
                    if args_noise_prob > 0.0:
                        inputs = dp(inputs)
                    optimizer.zero_grad()
                    outputs = rencoder(inputs)
                    loss, num_ratings = model.MSEloss(outputs, inputs)
                    loss = loss / num_ratings
                    loss.backward()
                    optimizer.step()

        # Step the schedule after the epoch's optimizer updates; stepping it
        # first skips the initial learning-rate value on PyTorch >= 1.1.
        if scheduler is not None:
            scheduler.step()

        e_end_time = time.time()
        print('Total epoch {} finished in {} seconds with TRAINING RMSE loss: {}'
              .format(epoch, e_end_time - e_start_time, sqrt(1.0 * total_epoch_loss/denom)))
        if epoch % 3 == 0 or epoch == args_num_epochs - 1:
            eval_loss = do_eval(rencoder, eval_data_layer)
            print('Epoch {} EVALUATION LOSS: {}'.format(epoch, eval_loss))
            print("Saving model to {}".format(model_checkpoint + ".epoch_"+str(epoch)))
            torch.save(rencoder.state_dict(), model_checkpoint + ".epoch_"+str(epoch))

    print("Saving model to {}".format(model_checkpoint + ".last"))
    torch.save(rencoder.state_dict(), model_checkpoint + ".last")


In [17]:
def infer(args_drop_prob, args_hidden_layers,
        args_path_to_train_data, args_path_to_valid_data, args_path_to_test_data,
        args_non_linearity_type, args_num_epochs, args_batch_size,
        args_log_dir,
        params):
    """Write predictions for the test data to ``<args_log_dir>/preds.txt``.

    The model weights are loaded from ``<args_log_dir>/model.last``. For every
    non-zero entry of each target row one tab-separated line is written:
    major key, item key, predicted value, target value.

    Args:
        args_drop_prob: dropout probability passed to the model.
        args_hidden_layers: list of layer sizes appended after the input
            dimension reported by the data layer.
        args_path_to_train_data: unused here (the training directory is read
            from ``params``).
        args_path_to_valid_data: unused here.
        args_path_to_test_data: directory of the data to predict on.
        args_non_linearity_type: non-linearity name passed to the model.
        args_num_epochs: unused here.
        args_batch_size: unused here; evaluation always uses batch size 1.
        args_log_dir: directory holding ``model.last``; receives ``preds.txt``.
        params: dict handed to ``input_layer.UserItemRecDataProvider``.
    """
    args_predictions_path = args_log_dir + "/preds.txt"
    print("Loading training data")
    data_layer = input_layer.UserItemRecDataProvider(params=params)
    print("Data loaded")
    print("Total items found: {}".format(len(data_layer.data.keys())))
    print("Vector dim: {}".format(data_layer.vector_dim))

    print("Loading eval data")
    eval_params = copy.deepcopy(params)
    # must set eval batch size to 1 to make sure no examples are missed
    eval_params['batch_size'] = 1
    eval_params['data_dir'] = args_path_to_test_data
    eval_data_layer = input_layer.UserItemRecDataProvider(params=eval_params,
                                                          user_id_map=data_layer.userIdMap,
                                                          item_id_map=data_layer.itemIdMap)

    rencoder = model.AutoEncoder(layer_sizes=[data_layer.vector_dim] + args_hidden_layers,
                               nl_type=args_non_linearity_type,
                               is_constrained=True,
                               dp_drop_prob=args_drop_prob,
                               last_layer_activations=True)

    # Report the checkpoint file that is actually loaded, not just its directory.
    path_to_model = args_log_dir + '/model.last'
    print("Loading model from: {}".format(path_to_model))
    rencoder.load_state_dict(torch.load(path_to_model))

    print('######################################################')
    print('######################################################')
    print('############# AutoEncoder Model: #####################')
    print(rencoder)
    print('######################################################')
    print('######################################################')
    rencoder.eval()
    # Map internal indices back to the original keys for the output file.
    inv_userIdMap = {v: k for k, v in data_layer.userIdMap.items()}
    inv_itemIdMap = {v: k for k, v in data_layer.itemIdMap.items()}

    eval_data_layer.src_data = data_layer.data
    num_done = 0
    with open(args_predictions_path, 'w') as outf:
        # Inference never calls backward(), so skip building the autograd graph.
        with torch.no_grad():
            for i, ((out, src), majorInd) in enumerate(eval_data_layer.iterate_one_epoch_eval(for_inf=True)):
                inputs = Variable(src.to_dense())
                # Batch size is 1, so row 0 is the only row.
                targets_np = out.to_dense().numpy()[0, :]
                outputs = rencoder(inputs).cpu().data.numpy()[0, :]
                non_zeros = targets_np.nonzero()[0].tolist()
                major_key = inv_userIdMap[majorInd]
                for ind in non_zeros:
                    outf.write("{}\t{}\t{}\t{}\n".format(major_key, inv_itemIdMap[ind], outputs[ind], targets_np[ind]))
                if i % 10000 == 0:
                    print("Done: {}".format(i))
                num_done = i + 1
    # Report the number of processed rows; the loop index would be undefined
    # for empty data and is one less than the count otherwise.
    print("Total done: {}".format(num_done))

In [None]:
# Timestamp used to give every session its own log directory. The format avoids
# spaces, '|' and ':' so the name is a valid directory name on every platform
# and needs no quoting in a shell.
now_hash = datetime.now().strftime("%Y-%m-%d_%H-%M-%S.%f")
args_drop_prob = 0.8
args_hidden_layers = [512, 512, 1024]
args_path_to_train_data = "Netflix/SUB_TRAIN"
args_path_to_valid_data = "Netflix/SUB_VALID"
args_path_to_test_data = "Netflix/SUB_TEST"
args_non_linearity_type = "selu"
args_num_epochs = 12
args_batch_size = 128
args_log_dir = "model_save_{}".format(now_hash)

# Settings handed to the training data provider.
params = {
    'batch_size': args_batch_size,
    'data_dir': args_path_to_train_data,
    'major': 'users',
    'itemIdInd': 1,
    'userIdInd': 0,
    # 'delimiter': ","
}

# exist_ok avoids the check-then-create race of a separate existence test.
os.makedirs(args_log_dir, exist_ok=True)




In [None]:
# Train the model with the session configuration defined above.
run(
    args_drop_prob=args_drop_prob,
    args_hidden_layers=args_hidden_layers,
    args_path_to_train_data=args_path_to_train_data,
    args_path_to_valid_data=args_path_to_valid_data,
    args_path_to_test_data=args_path_to_test_data,
    args_non_linearity_type=args_non_linearity_type,
    args_num_epochs=args_num_epochs,
    args_batch_size=args_batch_size,
    args_log_dir=args_log_dir,
    params=params,
)

Total epoch 0 finished in 56.01937174797058 seconds with TRAINING RMSE loss: 0.689641756732826
Epoch 0 EVALUATION LOSS: 3.8512351986654445
Saving model to model_save_2017-12-13|18:40:02.972474/model.epoch_0
Doing epoch 1 of 12
[lei] t_loss: 45.5169214, t_loss_denom: 96.0000000
[1,     0] RMSE: 0.6885743
Total epoch 1 finished in 55.82537508010864 seconds with TRAINING RMSE loss: 0.5675758032500774
Doing epoch 2 of 12
[lei] t_loss: 30.8367117, t_loss_denom: 96.0000000
[2,     0] RMSE: 0.5667590
Total epoch 2 finished in 54.199846029281616 seconds with TRAINING RMSE loss: 0.5228552466730471
Doing epoch 3 of 12
[lei] t_loss: 26.2594767, t_loss_denom: 96.0000000
[3,     0] RMSE: 0.5230069
Total epoch 3 finished in 55.48429584503174 seconds with TRAINING RMSE loss: 0.5070600960274384
Epoch 3 EVALUATION LOSS: 3.8910174544333556
Saving model to model_save_2017-12-13|18:40:02.972474/model.epoch_3
Doing epoch 4 of 12
[lei] t_loss: 24.7068621, t_loss_denom: 96.0000000
[4,     0] RMSE: 0.5073097


In [None]:
# Write predictions for the test data using the session configuration.
infer(
    args_drop_prob=args_drop_prob,
    args_hidden_layers=args_hidden_layers,
    args_path_to_train_data=args_path_to_train_data,
    args_path_to_valid_data=args_path_to_valid_data,
    args_path_to_test_data=args_path_to_test_data,
    args_non_linearity_type=args_non_linearity_type,
    args_num_epochs=args_num_epochs,
    args_batch_size=args_batch_size,
    args_log_dir=args_log_dir,
    params=params,
)

In [156]:
def getRMSE(file):
    """Print and return the RMSE of a tab-separated predictions file.

    Each non-blank line must have at least four tab-separated fields; the third
    and fourth fields are parsed as floats and compared.

    Args:
        file: path of the predictions file.

    Returns:
        float: square root of the mean squared difference between the third
        and fourth field over all non-blank lines.

    Raises:
        ValueError: if the file contains no prediction lines, or a field
            cannot be parsed as a float.
    """
    print("file", file)
    n = 0
    squared_error_sum = 0.0
    # Stream the file line by line and make sure the handle is closed.
    with open(file, 'r') as preds:
        for line in preds:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            parts = line.split('\t')
            squared_error_sum += (float(parts[2]) - float(parts[3])) ** 2
            n += 1
    if n == 0:
        raise ValueError("No predictions found in {}".format(file))
    rmse = sqrt(squared_error_sum / n)
    print("####################")
    print("RMSE: {}".format(rmse))
    print("####################")
    return rmse
    

# Score the predictions file that infer() writes into the session log directory.
file = "{}/preds.txt".format(args_log_dir)
getRMSE(file)

file model_save_2017-12-12|04:27:28.508141/preds.txt
####################
RMSE: 3.8976483763475067
####################


In [None]:
# Display the log directory name configured for this session.
args_log_dir

In [None]:
# `path_to_model` is only ever assigned as a local inside run()/infer(), so the
# bare name raises NameError at notebook scope. Build the path of the final
# checkpoint that infer() loads and display it.
path_to_model = Path(args_log_dir) / "model.last"
path_to_model