In [1]:
import os
import time
import random
import pickle
import numpy as np
import matplotlib.pyplot as plt
from os.path import join

import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torch.autograd import Variable
from torch.backends import cudnn
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from src.utils import collate_fn
from src.dataset import load_data
from src import metric

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Config:
    """Hyper-parameters and runtime switches for the experiment.

    Every setting is a plain instance attribute so it can be tweaked on the
    ``args`` object before the later cells run.
    """

    def __init__(self):
        # Debug runs train for only a few epochs.
        debug = True
        self.DEBUG = debug

        # Data location and batching.
        self.dataset_path = 'data/ml-100k/'
        self.batch_size = 512

        # Model widths.
        self.hidden_size = 100  # GRU hidden state
        self.embed_dim = 50  # item embedding

        if debug:
            self.epochs = 5
        else:
            self.epochs = 100

        # Optimiser: learning rate, its decay factor, and the decay period in epochs.
        self.lr = 0.001
        self.lr_dc = 0.1
        self.lr_dc_step = 80

        # Evaluation.
        self.topk = 20
        self.valid_portion = 0.2
        self.test = True  # flag controlling whether the model is loaded and tested

        device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device_name)


args = Config()

In [3]:
# Run the project's preprocessing script as a shell command; its log is the cell output.
!python ./src/preprocess.py

-- Starting @ 2023-02-13 20:04:39.799690s
-- Reading data @ 2023-02-13 20:04:40.296711s
Splitting date 890694638
训练集session数量:	1636
测试集session数量:	327
-- Splitting train set and test set @ 2023-02-13 20:04:40.359684s
item number:	1342
训练集序列数:	80737
测试集序列数:	16157
序列的avg length:  50.48621041879469
Done.


In [4]:
# Load the three splits from the dataset folder; `valid_portion` is forwarded
# to load_data.
print('Loading data...')
splits = load_data(args.dataset_path, valid_portion=args.valid_portion)
train, valid, test = splits
print('ok')

Loading data...
ok


In [5]:
class NARM(nn.Module):
    """GRU session encoder with attention that scores every item as the next one.

    A GRU encodes the embedded item sequence. Its final hidden state is the
    "global" session vector; a weighted sum of all GRU outputs, with
    un-normalised weights computed from each output and the final state, is
    the "local" vector. Their concatenation is matched against a linear
    projection of every item embedding to produce one score per item.

    Args:
        n_items: Size of the item vocabulary. Index 0 is the padding item.
        hidden_size: Width of the GRU hidden state.
        embedding_dim: Width of the item embeddings.
        batch_size: Stored on the instance only; ``forward`` reads the actual
            batch size from its input.
        n_layers: Number of stacked GRU layers.
    """

    def __init__(self, n_items, hidden_size, embedding_dim, batch_size, n_layers = 1):
        super(NARM, self).__init__()
        self.n_items = n_items
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.n_layers = n_layers
        self.embedding_dim = embedding_dim
        self.emb = nn.Embedding(self.n_items, self.embedding_dim, padding_idx = 0)
        self.emb_dropout = nn.Dropout(0.25)
        self.gru = nn.GRU(self.embedding_dim, self.hidden_size, self.n_layers)
        # Attention: a_1 projects every GRU output, a_2 projects the final state,
        # v_t reduces their combination to one weight per time step.
        self.a_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.a_2 = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.v_t = nn.Linear(self.hidden_size, 1, bias=False)
        self.ct_dropout = nn.Dropout(0.5)
        # Projects item embeddings to the width of the [local; global] vector.
        self.b = nn.Linear(self.embedding_dim, 2 * self.hidden_size, bias=False)
        # Kept for backward compatibility; forward() follows the device of its
        # input instead, so the model also works after `.to(<other device>)`.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def forward(self, seq, lengths):
        """Compute next-item scores for a batch of padded sessions.

        Args:
            seq: Integer tensor of item ids laid out as (time, batch), padded
                with 0.
            lengths: True length of every session (list or CPU tensor), in
                decreasing order as required by ``pack_padded_sequence``.

        Returns:
            Tensor of shape (batch, n_items) holding un-normalised scores.
        """
        device = seq.device
        total_steps, batch = seq.size(0), seq.size(1)

        hidden = self.init_hidden(batch, device=device)
        embs = self.emb_dropout(self.emb(seq))
        packed = pack_padded_sequence(embs, lengths)
        packed_out, hidden = self.gru(packed, hidden)
        # total_length keeps the time axis equal to seq's, even when the batch
        # is padded beyond its longest session, so the mask below lines up.
        gru_out, _ = pad_packed_sequence(packed_out, total_length=total_steps)

        ht = hidden[-1]
        gru_out = gru_out.permute(1, 0, 2)  # -> (batch, time, hidden)

        c_global = ht
        q1 = self.a_1(gru_out)
        q2 = self.a_2(ht)

        # 1.0 on real items, 0.0 on padding; the final-state query is only
        # added at real positions.
        mask = (seq.permute(1, 0) > 0).to(gru_out.dtype)
        q2_masked = mask.unsqueeze(2) * q2.unsqueeze(1)

        alpha = self.v_t(torch.sigmoid(q1 + q2_masked)).squeeze(2)
        # GRU outputs at padded steps are zero, so they add nothing to the sum.
        c_local = torch.sum(alpha.unsqueeze(2) * gru_out, 1)

        c_t = torch.cat([c_local, c_global], 1)
        c_t = self.ct_dropout(c_t)

        item_embs = self.emb(torch.arange(self.n_items, device=device))
        scores = torch.matmul(c_t, self.b(item_embs).permute(1, 0))

        return scores

    def init_hidden(self, batch_size, device=None):
        """Return an all-zero initial GRU state of shape (n_layers, batch_size, hidden_size).

        Args:
            batch_size: Number of sequences in the batch.
            device: Where to allocate the state; defaults to the device that
                holds the embedding weights.
        """
        if device is None:
            device = self.emb.weight.device
        # The initial state is a constant, so it does not need a gradient.
        return torch.zeros((self.n_layers, batch_size, self.hidden_size), device=device)

In [6]:
# Vocabulary size for the embedding table; equals the "item number" printed by
# the preprocessing step.
# NOTE(review): confirm that this count already includes padding index 0.
n_items = 1342
model = NARM(
    n_items,
    args.hidden_size,
    args.embed_dim,
    args.batch_size,
).to(args.device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=args.lr)
# Multiply the learning rate by `lr_dc` every `lr_dc_step` scheduler steps.
scheduler = StepLR(optimizer, step_size=args.lr_dc_step, gamma=args.lr_dc)

In [7]:
class RecSysDataset(Dataset):
    """Map-style dataset pairing each session with its target item.

    ``data`` is indexed as ``data[0]`` (the sessions) and ``data[1]`` (the
    targets, aligned with the sessions by position). A short summary is
    printed when the dataset is created.
    """

    def __init__(self, data):
        self.data = data
        rule = '-' * 50
        summary = 'Number of sessions: {}'.format(len(data[0]))
        print(rule, 'Dataset info:', summary, rule, sep='\n')

    def __len__(self):
        """Number of sessions in the dataset."""
        return len(self.data[0])

    def __getitem__(self, index):
        """Return ``(session_items, target_item)`` for the given position."""
        return self.data[0][index], self.data[1][index]

In [8]:
# Wrap every split in a dataset (each one prints its own summary, in order).
train_data, valid_data, test_data = (
    RecSysDataset(split) for split in (train, valid, test)
)

# Only the training loader reshuffles; the evaluation loaders keep a fixed order.
train_loader, valid_loader, test_loader = (
    DataLoader(dataset, batch_size=args.batch_size, shuffle=reshuffle, collate_fn=collate_fn)
    for dataset, reshuffle in (
        (train_data, True),
        (valid_data, False),
        (test_data, False),
    )
)

--------------------------------------------------
Dataset info:
Number of sessions: 64590
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 16147
--------------------------------------------------
--------------------------------------------------
Dataset info:
Number of sessions: 16157
--------------------------------------------------


In [9]:
Loss = []  # mean training loss of every finished epoch, appended by trainForEpoch


def trainForEpoch(train_loader, model, optimizer, epoch, num_epochs, criterion):
    """Train ``model`` for one pass over ``train_loader`` and log the outcome.

    Appends the epoch's mean batch loss to the module-level ``Loss`` list and
    adds one line to ``./NARM-work/<Time>.log``. ``Time`` is the module-level
    run timestamp and must be defined before this function is called.

    Args:
        train_loader: Iterable yielding ``(seq, target, lens)`` batches.
        model: Network called as ``model(seq, lens)``.
        optimizer: Optimiser updating ``model``'s parameters.
        epoch: Zero-based index of the current epoch (logged as ``epoch + 1``).
        num_epochs: Total number of epochs, used only in the log line.
        criterion: Loss called as ``criterion(outputs, target)``.
    """
    model.train()

    sum_epoch_loss = 0.0
    start = time.time()
    for seq, target, lens in train_loader:
        seq = seq.to(args.device)
        target = target.to(args.device)

        optimizer.zero_grad()
        outputs = model(seq, lens)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()

        sum_epoch_loss += loss.item()

    mean_loss = sum_epoch_loss / len(train_loader)
    elapsed = time.time() - start
    Loss.append(mean_loss)

    log_path = './NARM-work/{}.log'.format(Time)
    # Create the log folder on first use so a fresh checkout does not fail
    # only after a whole epoch of training.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    with open(log_path, 'a+') as f:
        # The last field is the wall-clock duration of the epoch in seconds.
        f.write('[TRAIN] epoch %d/%d\tloss: %.4f\t(%.2f s)\n'
                % (epoch + 1, num_epochs, mean_loss, elapsed))

In [10]:
# Evaluation function
def validate(valid_loader, model):
    """Evaluate ``model`` on every batch of ``valid_loader``.

    Each batch's scores are turned into probabilities with a softmax and
    passed to ``metric.evaluate`` with ``k = args.topk``.

    Args:
        valid_loader: Iterable yielding ``(seq, target, lens)`` batches.
        model: Network called as ``model(seq, lens)``.

    Returns:
        ``(mean_recall, mean_mrr)``: the per-batch recall and MRR values
        averaged over all batches.
    """
    model.eval()
    batch_recalls, batch_mrrs = [], []
    with torch.no_grad():
        for seq, target, lens in valid_loader:
            scores = model(seq.to(args.device), lens)
            probs = F.softmax(scores, dim=1)
            batch_recall, batch_mrr = metric.evaluate(
                probs, target.to(args.device), k=args.topk
            )
            batch_recalls.append(batch_recall)
            batch_mrrs.append(batch_mrr)

    return np.mean(batch_recalls), np.mean(batch_mrrs)

In [11]:
# Timestamp naming this run's log file and checkpoints.
Time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
recalls = []  # validation recall after every epoch
best_re = 0.0  # best validation recall seen so far

# Create the output folders up front so logging and checkpointing cannot fail
# on a fresh checkout.
os.makedirs('./NARM-work', exist_ok=True)
os.makedirs('./NARM-temp', exist_ok=True)

print('training...')
for epoch in range(args.epochs):
    # train for one epoch
    trainForEpoch(train_loader, model, optimizer, epoch, args.epochs, criterion)
    # Advance the LR schedule after this epoch's optimizer updates. For StepLR
    # this gives the same per-epoch learning rates as the deprecated
    # `scheduler.step(epoch=epoch)` call made before training.
    scheduler.step()

    recall, mrr = validate(valid_loader, model)
    recalls.append(recall)
    with open('./NARM-work/{}.log'.format(Time), 'a+') as f:
        # Epochs are logged 1-based, matching the [TRAIN] lines and the
        # 'epoch' field stored in the checkpoint.
        f.write('Epoch {} validation: Recall@{}: {:.4f},\tMRR@{}: {:.4f} \n\n'.format(epoch + 1, args.topk, recall, args.topk, mrr))

    # Keep a checkpoint only when validation recall improves.
    if recall > best_re:
        best_re = recall
        # Information stored with the model
        ckpt_dict = {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        torch.save(ckpt_dict, './NARM-temp/{}_{}.pth.tar'.format(Time, best_re))
print('Done...')

training...
Done...


In [12]:
# Evaluate the best checkpoint on the test split, but only when the `test`
# switch in the config is on. The bare name `test` is the test-split data
# loaded earlier, which is always truthy, so it cannot act as the switch.
if args.test:
    best_ckpt_path = './NARM-temp/{}_{}.pth.tar'.format(Time, best_re)
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
    ckpt = torch.load(best_ckpt_path, map_location=args.device)
    model.load_state_dict(ckpt['state_dict'])
    recall, mrr = validate(test_loader, model)
    print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall, args.topk, mrr))

Test: Recall@20: 0.0930, MRR@20: 0.0205
