In [4]:
import os
from pathlib import Path

# Step into the project root first: the data folders below are resolved
# against the current working directory at this point.
os.chdir('/content/drive/MyDrive/hnm')
DATA_PATH = Path.cwd().joinpath('data')
RAW, PROCESSED, SUBMISSION = (
    DATA_PATH.joinpath(folder) for folder in ('raw', 'processed', 'submission')
)

# Then switch into the STOSA directory -- presumably so the project-local
# modules imported right after this (datasets, trainers, models, ...) resolve
# from there; confirm against the repository layout.
os.chdir('/content/drive/MyDrive/hnm/STOSA')

import numpy as np
import pandas as pd
import random
import pickle
import argparse

import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from datasets import SASRecDataset
from trainers import FinetuneTrainer, DistSAModelTrainer
from models import S3RecModel
from seqmodels import SASRecModel, DistSAModel, DistMeanSAModel
from utils import EarlyStopping, get_user_seqs, get_item2attribute_json, check_path, set_seed

In [5]:
# Command-line style configuration, parsed inside the notebook so the same
# option names can be used as in a script-based run.
parser = argparse.ArgumentParser()

# data / bookkeeping args
parser.add_argument("--data_dir", type=str, default=f"{PROCESSED}/")
parser.add_argument("--output_dir", type=str, default=f"{SUBMISSION}/")
parser.add_argument("--data_name", type=str, default="transactions_train_sequences")
parser.add_argument("--do_eval", action="store_true")
parser.add_argument("--ckp", type=int, default=10, help="pretrain epochs 10, 20, 30...")

# model args
parser.add_argument("--model_name", type=str, default="DistSAModel")
parser.add_argument("--hidden_size", type=int, default=64, help="hidden size of transformer model")
parser.add_argument("--num_hidden_layers", type=int, default=1, help="number of layers")
parser.add_argument("--num_attention_heads", type=int, default=4)
parser.add_argument("--hidden_act", type=str, default="gelu")  # gelu relu
parser.add_argument("--attention_probs_dropout_prob", type=float, default=0.0, help="attention dropout p")
parser.add_argument("--hidden_dropout_prob", type=float, default=0.3, help="hidden dropout p")
parser.add_argument("--initializer_range", type=float, default=0.02)
parser.add_argument("--max_seq_length", type=int, default=100)
parser.add_argument("--distance_metric", type=str, default="wasserstein")
parser.add_argument("--pvn_weight", type=float, default=0.005)
parser.add_argument("--kernel_param", type=float, default=1.0)

# train args
parser.add_argument("--lr", type=float, default=0.001, help="learning rate of adam")
parser.add_argument("--batch_size", type=int, default=1024, help="number of batch_size")
parser.add_argument("--epochs", type=int, default=20, help="number of epochs")
parser.add_argument("--no_cuda", action="store_true")
parser.add_argument("--log_freq", type=int, default=1, help="per epoch print res")
parser.add_argument("--seed", type=int, default=42)

# optimizer / device args
parser.add_argument("--weight_decay", type=float, default=0.0, help="weight_decay of adam")
parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value")
parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam second beta value")
parser.add_argument("--gpu_id", type=str, default="0", help="gpu_id")

# The notebook kernel is started with `-f <connection file>` (visible as
# `f=...kernel-....json` in the printed Namespace); declaring the option keeps
# parse_args() from rejecting it.
parser.add_argument("-f")
args = parser.parse_args()

# Seed the run and prepare the output location. Both helpers come from the
# project's `utils` module; `check_path` presumably creates the directory when
# it is missing -- confirm in utils.
set_seed(args.seed)
check_path(args.output_dir)

# Restrict the visible GPUs, then record whether CUDA will actually be used.
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
args.cuda_condition = torch.cuda.is_available() and not args.no_cuda

# Build the path with os.path.join (as the log/checkpoint paths already are)
# so a --data_dir given without a trailing slash still resolves correctly.
# With the default, slash-terminated data_dir the result is unchanged.
args.data_file = os.path.join(args.data_dir, args.data_name + '.txt')
#item2attribute_file = args.data_dir + args.data_name + '_item2attributes.json'

# Slow step: the recorded run of this cell took about 26 minutes for
# 674,841 sequences.
user_seq, max_item, valid_rating_matrix, test_rating_matrix, num_users = get_user_seqs(args.data_file)

100%|██████████| 674841/674841 [25:55<00:00, 433.93it/s]


In [6]:
#item2attribute, attribute_size = get_item2attribute_json(item2attribute_file)

# Sizes derived from the loaded sequences: ids 0..max_item plus one extra
# slot (max_item + 1) reserved as the mask id.
args.item_size = max_item + 2
args.num_users = num_users
args.mask_id = max_item + 1
#args.attribute_size = attribute_size + 1

# save model args
# The run identifier concatenates the hyper-parameters that distinguish one
# experiment from another; it names both the log file and the checkpoint.
run_fields = (
    args.model_name,
    args.data_name,
    args.hidden_size,
    args.num_hidden_layers,
    args.num_attention_heads,
    args.hidden_act,
    args.attention_probs_dropout_prob,
    args.hidden_dropout_prob,
    args.max_seq_length,
    args.lr,
    args.weight_decay,
    args.ckp,
    args.kernel_param,
    args.pvn_weight,
)
args_str = '-'.join(str(field) for field in run_fields)
args.log_file = os.path.join(args.output_dir, args_str + '.txt')
print(str(args))
with open(args.log_file, 'a') as log_handle:
    log_handle.write(str(args) + '\n')

#args.item2attribute = item2attribute
# set item score in train set to `0` in validation
args.train_matrix = valid_rating_matrix

# save model
checkpoint = args_str + '.pt'
args.checkpoint_path = os.path.join(args.output_dir, checkpoint)

# Training split: drawn in random order, batched with the configured size.
train_dataset = SASRecDataset(args, user_seq, data_type='train')
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    sampler=train_sampler,
)

# Validation and test splits: iterated in dataset order. Their loaders are
# created further down, after the model, with a fixed batch size of 100.
eval_dataset = SASRecDataset(args, user_seq, data_type='valid')
eval_sampler = SequentialSampler(eval_dataset)

test_dataset = SASRecDataset(args, user_seq, data_type='test')
test_sampler = SequentialSampler(test_dataset)

# NOTE(review): the model class is fixed to DistSAModel here even though
# --model_name is configurable and is baked into the log/checkpoint names --
# confirm this is intended.
model = DistSAModel(args=args)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=100,
    sampler=eval_sampler,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=100,
    sampler=test_sampler,
)
trainer = DistSAModelTrainer(
    model,
    train_dataloader,
    eval_dataloader,
    test_dataloader,
    args,
)


Namespace(adam_beta1=0.9, adam_beta2=0.999, attention_probs_dropout_prob=0.0, batch_size=1024, ckp=10, cuda_condition=True, data_dir='/content/drive/MyDrive/hnm/data/processed/', data_file='/content/drive/MyDrive/hnm/data/processed/transactions_train_sequences.txt', data_name='transactions_train_sequences', distance_metric='wasserstein', do_eval=False, epochs=20, f='/root/.local/share/jupyter/runtime/kernel-19c76005-88f8-4b73-ab73-a1ead421b380.json', gpu_id='0', hidden_act='gelu', hidden_dropout_prob=0.3, hidden_size=64, initializer_range=0.02, item_size=81912, kernel_param=1.0, log_file='/content/drive/MyDrive/hnm/data/submission/DistSAModel-transactions_train_sequences-64-1-4-gelu-0.0-0.3-100-0.001-0.0-10-1.0-0.005.txt', log_freq=1, lr=0.001, mask_id=81911, max_seq_length=100, model_name='DistSAModel', no_cuda=False, num_attention_heads=4, num_hidden_layers=1, num_users=674841, output_dir='/content/drive/MyDrive/hnm/data/submission/', pvn_weight=0.005, seed=42, weight_decay=0.0)
Tota

In [7]:
# Either evaluate an existing checkpoint (--do_eval) or train from scratch,
# keep the best checkpoint, and evaluate it. Both paths end with `result_info`
# set, which is appended to the run's log file.
if args.do_eval:
    trainer.load(args.checkpoint_path)
    print(f'Load model from {args.checkpoint_path} for test!')
    # Use the test-time interaction matrix when scoring the test split, the
    # same way the training branch below does before calling trainer.test().
    # Previously this branch tested with the validation matrix still set.
    trainer.args.train_matrix = test_rating_matrix
    scores, result_info, _ = trainer.test(0, full_sort=True)

else:
    #pretrained_path = os.path.join(args.output_dir, f'{args.data_name}-epochs-{args.ckp}.pt')
    #try:
    #    trainer.load(pretrained_path)
    #    print(f'Load Checkpoint From {pretrained_path}!')

    #except FileNotFoundError:
    #    print(f'{pretrained_path} Not Found! The Model is same as SASRec')

    # DistSAModel gets a longer patience than the other models. Note that
    # both values exceed the default of 20 epochs, so with default settings
    # early stopping only acts as a "save the best checkpoint" hook.
    patience = 100 if args.model_name == 'DistSAModel' else 50
    early_stopping = EarlyStopping(args.checkpoint_path, patience=patience, verbose=True)

    for epoch in range(args.epochs):
        trainer.train(epoch)
        # evaluate on MRR
        # (the last entry of the returned scores is what early stopping tracks)
        scores, _, _ = trainer.valid(epoch, full_sort=True)
        early_stopping(np.array(scores[-1:]), trainer.model)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    print('---------------Change to test_rating_matrix!-------------------')
    # load the best model
    trainer.model.load_state_dict(torch.load(args.checkpoint_path))
    valid_scores, _, _ = trainer.valid('best', full_sort=True)
    trainer.args.train_matrix = test_rating_matrix
    scores, result_info, _ = trainer.test('best', full_sort=True)

print(args_str)
#print(result_info)
with open(args.log_file, 'a') as f:
    f.write(args_str + '\n')
    f.write(result_info + '\n')

{'epoch': 0, 'rec_avg_loss': '0.4119', 'rec_cur_loss': '0.2795', 'rec_avg_auc': '0.878175', 'rec_avg_pvn_loss': '0.107024'}
{'Epoch': 0, 'HIT@1': '0.00160927', 'NDCG@1': '0.00160927', 'HIT@5': '0.00574802', 'NDCG@5': '0.00366350', 'HIT@10': '0.00997568', 'NDCG@10': '0.00501878', 'HIT@15': '0.01366396', 'NDCG@15': '0.00599070', 'HIT@20': '0.01696844', 'NDCG@20': '0.00677075', 'HIT@40': '0.02788361', 'NDCG@40': '0.00898670', 'MRR': '0.00438341'}
Validation score increased.  Saving model ...
{'epoch': 1, 'rec_avg_loss': '0.2408', 'rec_cur_loss': '0.2114', 'rec_avg_auc': '0.929624', 'rec_avg_pvn_loss': '0.052112'}
{'Epoch': 1, 'HIT@1': '0.00206123', 'NDCG@1': '0.00206123', 'HIT@5': '0.00701499', 'NDCG@5': '0.00454306', 'HIT@10': '0.01220436', 'NDCG@10': '0.00619848', 'HIT@15': '0.01670023', 'NDCG@15': '0.00738678', 'HIT@20': '0.02018105', 'NDCG@20': '0.00820721', 'HIT@40': '0.03326710', 'NDCG@40': '0.01086511', 'MRR': '0.00540252'}
Validation score increased.  Saving model ...
{'epoch': 2,

In [9]:
def get_predicted_item_ids(batch, top_k=40):
    """Score every item for one batch of users and return the best `top_k` per user.

    Relies on the notebook-level `trainer` (its model, `dist_predict_full`
    and `args.train_matrix`).

    Args:
        batch: sequence of five tensors, unpacked as
            (user_ids, input_ids, target_pos, target_neg, answers). Only the
            first two are used here.
        top_k: number of item indices to return per user (default 40, the
            value that used to be hard-coded). If it is not smaller than the
            number of scored items, all items are returned in ranked order.

    Returns:
        Tuple `(user_ids, batch_pred_list)`: `user_ids` is the tensor from the
        batch after being moved to the compute device, and `batch_pred_list`
        is a 2-D NumPy array of item indices, one row per user, ordered by
        ascending predicted score (the code treats lower scores as better).
    """
    # Build the device here instead of reading a notebook-level `device`
    # global, which is only defined in a later cell.
    run_device = torch.device("cuda" if trainer.model.args.cuda_condition else "cpu")

    with torch.no_grad():
        # 0. batch_data will be sent into the device(GPU or cpu)
        batch = tuple(t.to(run_device) for t in batch)
        user_ids, input_ids, _target_pos, _target_neg, _answers = batch
        mean_output, cov_output, _, _ = trainer.model.finetune(input_ids, user_ids)

        # Only the output at the last sequence position is used for scoring.
        mean_output = mean_output[:, -1, :]
        cov_output = cov_output[:, -1, :]

        rating_pred = trainer.dist_predict_full(mean_output, cov_output)
        rating_pred = rating_pred.cpu().data.numpy().copy()
        batch_user_index = user_ids.cpu().numpy()

    # Items with a positive entry in the current train_matrix for these users
    # get a huge score so they end up last in the ascending ranking.
    rating_pred[trainer.args.train_matrix[batch_user_index].toarray() > 0] = 1e+24

    # reference: https://stackoverflow.com/a/23734295, https://stackoverflow.com/a/20104162
    n_items = rating_pred.shape[1]
    if top_k < n_items:
        # Partial selection of the top_k smallest scores (unordered) ...
        candidates = np.argpartition(rating_pred, top_k)[:, :top_k]
    else:
        # argpartition needs kth < n_items; fall back to ranking everything.
        candidates = np.tile(np.arange(n_items), (len(rating_pred), 1))

    # ... then sort just those candidates in ascending score order.
    candidate_scores = np.take_along_axis(rating_pred, candidates, axis=1)
    ascending = np.argsort(candidate_scores)
    batch_pred_list = np.take_along_axis(candidates, ascending, axis=1)

    return user_ids, batch_pred_list

In [63]:
from tqdm import tqdm 

device = torch.device("cuda" if trainer.model.args.cuda_condition else "cpu")

# Parallel lists: users_predicted[k] is the user index whose ranked item
# indices are users_predictions[k].
users_predicted = []
users_predictions = []

for batch in tqdm(test_dataloader, total=len(test_dataloader)):
    user_ids, batch_pred_list = get_predicted_item_ids(batch)

    # The lists start empty, so extending works for the first batch too.
    users_predicted.extend(user_ids.detach().cpu().numpy().tolist())
    users_predictions.extend(np.asarray(batch_pred_list, dtype=np.int64).tolist())

# Every user must have exactly one row of predictions.
assert len(users_predicted) == len(users_predictions)

100%|██████████| 6749/6749 [16:08<00:00,  6.97it/s]


In [56]:
import json

# Mappings written during preprocessing; they are inverted in the next cell to
# translate model indices back to the original identifiers.
user_map = json.loads((PROCESSED / 'umap.json').read_text())
item_map = json.loads((PROCESSED / 'imap.json').read_text())

In [57]:
# Reverse lookups: mapped value -> original key (JSON keys, so always strings).
user_map_inv = {mapped: original for original, mapped in user_map.items()}
item_map_inv = {mapped: original for original, mapped in item_map.items()}

In [97]:
# Build {original user id: "item item ..."} holding the first 12 ranked items
# per user, translated back to their original identifiers.
d = {}

for user_index, ranked_items in zip(users_predicted, users_predictions):
    # The +1 shifts the model's user index onto the ids stored in umap.json
    # -- presumably those start at 1 while the model counts from 0; confirm
    # against the preprocessing code.
    uinv = user_map_inv[user_index + 1]

    if uinv in d:
        # Fail loudly: silently stopping here would leave a truncated mapping
        # that the following cell would still write to disk.
        raise ValueError(f'Duplicate prediction for user {uinv!r}')

    # Original item ids are strings (JSON keys); each gets a '0' prefix.
    pinv = ['0' + item_map_inv[item_index] for item_index in ranked_items[0:12]]
    d[uinv] = ' '.join(pinv)

In [98]:
# Persist the per-user prediction strings next to the other processed data.
fname = f'{PROCESSED}/stosa1p.json'

with open(fname, mode='w') as out_file:
    json.dump(d, out_file)