In [1]:
import math
import gc
import numpy as np
import pandas as pd
from time import time

from metric import precision_recall

# Constants

In [2]:
MAX_ITEM = 4698 + 1
MAX_SEQ_LENGTH = 200      # Max sequence length
MIN_SEQ_LENGTH = 20       # Min sequence length
EMBEDDING_DIM = 32

# Prepare data

In [3]:
from data import Dataset

data = Dataset(csvfile='interactions.csv', 
               num_test_users=500, 
               sample=0.15, 
               cut_item=100)

train, test = data.get_train_test_sequences()

#User: 7499	#Item: 4698
796479 49833


In [38]:
pop_prob = np.ones(MAX_ITEM)
pop_list = data.train.item.value_counts()
pop_prob[pop_list.index] = pop_list.values
pop_prob = np.log1p(pop_prob)

In [4]:
train.head(3)

Unnamed: 0,user,item_sequence
0,1,"[161, 190, 222, 313, 422, 534, 152, 456, 488, ..."
1,2,"[1041, 2177, 1750, 1929, 4569, 4457, 4448, 366..."
2,3,"[1041, 1750, 2177, 464, 1489, 1720, 1937, 3148..."


In [5]:
test.head(3)

Unnamed: 0,user,item_sequence,eval_sequence
0,39,"[248, 158, 1041, 201, 1, 532, 2933, 364, 478, ...","[1282, 4302, 408, 4202, 4028, 4092, 815, 2402,..."
1,59,"[139, 319, 36, 486, 460, 128, 478, 1181, 3522,...","[4333, 500, 512, 3414, 1349, 2889, 4392, 146, ..."
2,66,"[796, 282, 389, 644, 751, 1231, 280, 993, 1154...","[1793, 1908, 2601, 1010, 992]"


# Data manager

In [27]:
from sequence_utils import sliding_windows, pad_sequences

In [39]:
class MiniBatcher():
    """
    Create mini batch for df (dataframe of sequences).
    
    First sliding windows (long sequences would be cut into several pices). Then, zero padding to the left.
    
    While iterate the dataset, sequences would be shuffled. And negative items would be generated.
    """
    def __init__(self, df, n_items, 
                 batch_size=128, 
                 shuffle=True,
                 random_state=None):
        
        self._n_items = n_items
        self.batch_size = batch_size
        self.shuffle = shuffle
        
        if random_state is None:
            random_state = np.random.RandomState(123)
        self.random_state = random_state
        
        # Gen sequence
        sequences = self._gen_sequences(df, maxlen=MAX_SEQ_LENGTH, minlen=MIN_SEQ_LENGTH)
        user_ids, sequences = zip(*sequences)   # It's memory-consuming, but faster to gen mini batch.
        self.user_ids = user_ids
        # Pad zeros
        self.sequences = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH)
        self._size = len(self.sequences)
        
    def _gen_sequences(self, df, shuffle=True, maxlen=200, minlen=20):
        """
        Generate sequences.
        If too long, divide it into multiple slice; 
        If too short, drop it.
        
        Return a generator: tuple (user, sequence)
        """
        for row in df.itertuples():
            uid, seq = row[1], row[2]

            # Skip short sequence
            if len(seq) < minlen: 
                continue

            for sub in sliding_windows(seq, window_size=maxlen, 
                                        step_size=maxlen):
                yield uid, sub

    def _get_negative_items(self, shape, prob=None):
        if prob is None:
            prob = np.ones(self._n_items)
        assert(len(prob) == self._n_items)
        
        prob /= prob.sum()

        negative_items = self.random_state.choice(np.arange(self._n_items),  p=prob,
                                                  size=shape)
        return negative_items
    
    def __iter__(self):
        """
        Iterate the whole dataset per mini-batch.
        """
        indices = np.arange(self._size)
        # Shuffle indices
        if self.shuffle:
            self.random_state.shuffle(indices)
        # Generate negative items
        negative_items = self._get_negative_items(shape=(self._size, MAX_SEQ_LENGTH), 
                                                  prob=pop_prob)
        # per mini batch
        for i in range(0, self._size, self.batch_size):
            batch_indices = indices[i : i + self.batch_size]
            
            yield self.sequences[batch_indices], negative_items[batch_indices]

# Define Network

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable

from spotlight.layers import ScaledEmbedding, ZeroEmbedding

## RNN

In [9]:
class GRUNet(nn.Module):
    def __init__(self, num_items, embedding_dim=32,
                       item_embedding_layer=None, sparse=False):

        super(GRUNet, self).__init__()

        self.embedding_dim = embedding_dim

        if item_embedding_layer is not None:
            self.item_embeddings = item_embedding_layer
        else:
            self.item_embeddings = ScaledEmbedding(num_items, embedding_dim,
                                                   padding_idx=0,
                                                   sparse=sparse)

        self.item_biases = ZeroEmbedding(num_items, 1, sparse=sparse,
                                         padding_idx=0)
        
        self.constant_pad = nn.ConstantPad1d(padding=(1,0), value=0)
        
        # RNN layer
        self.gru = nn.GRU(batch_first=True,
                        input_size=embedding_dim,
                        hidden_size=embedding_dim)
        
        # Multi-layer perceptron (fully-connected layers)
        self.mlp = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(embedding_dim*2, embedding_dim),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(embedding_dim, 1)
        )
        
    def user_representation(self, sequences):
        """
        Input:
            sequences: 2D (batch, length)
        
        Return:
            tuple: 3D (batch, length, embedding_dim), 3D (batch, 1, embedding_dim)
        """
        # pad a zero
        sequences = self.constant_pad(sequences) # (batch_size, length + 1)
        
        # Embedding sequences
        emb_seq = self.item_embeddings(sequences)
        
        # User representation
        user_repr, _ = self.gru(emb_seq)      # (batch_size, seq_len, embedding_dim)
        user_final = user_repr[:, -1:, :]        # Get final representation (batch_size, 1, embedding_dim)

        return user_repr[:, :-1, :], user_final
    
    def forward(self, user_representations, targets):
        """
        Input:
            user_representations: 3D (batch_size, length, embedding_dim)

            targets: 2D (batch_size, length)
        
        Return:
            prediction score: 2D (batch, length)
        """
        # Targets
        target_bias = self.item_biases(targets).squeeze()
        emb_target = self.item_embeddings(targets)  # (batch_size, seq_len, embedding_dim)

        # Score
#         x = torch.cat([user_representations, emb_target], dim=-1)
#         out = self.mlp(x).squeeze()
        out = (user_representations * emb_target).sum(-1).squeeze()
        
        return out + target_bias

## CNN (conv 1d)

In [10]:
class SeqCNN(nn.Module):
    """
    Causal CNN for sequences.
    """
    def __init__(self, num_items, embedding_dim=32, num_layers=1, sparse=False):

        super(SeqCNN, self).__init__()

        self.embedding_dim = embedding_dim

        self.item_embeddings = ScaledEmbedding(num_items, embedding_dim,
                                               padding_idx=0,
                                               sparse=sparse)

        self.item_biases = ZeroEmbedding(num_items, 1, sparse=sparse,
                                         padding_idx=0)
        
        # Convolution layers
        self.kernel_size = k = 3
        self.num_layers = num_layers
        self.convs = [
            nn.Conv1d(embedding_dim,  # Cin
                      embedding_dim,  # Cout
                      kernel_size=k) 
            for _ in range(num_layers)
        ]
            
        self.conv = nn.Conv1d(embedding_dim,  # Cin
                              embedding_dim,  # Cout
                              kernel_size=k)
        
    def user_representation(self, sequences):
        emb_seq = self.item_embeddings(sequences)  # (N, L, E)
        emb_seq = emb_seq.permute(0, 2, 1)         # (N, E, L), embedding_dim is the channels
        
        x = F.pad(emb_seq,                # (N, E, k + L)
                  (self.kernel_size, 0))
        x = F.tanh(self.convs[0](x))      # (N, E, 1 + L)
        
        # Residual
        x = x + F.pad(emb_seq, (1, 0))    # (N, E, 1 + L)
        
        # Rest layers
        for i in range(1, self.num_layers):
            residual = x
            x = F.pad(x, (self.kernel_size - 1, 0))
            x = F.tanh(self.convs[i](x))
            x = x + residual
        
        return x[:,:,:-1], x[:,:,-1:]     # (N, E, L),  (N, E, 1)
        
    def forward(self, user_representation, targets):
        """
        user_representation: (N, E, L)
        targets: (N, L)
        """
        emb_target = self.item_embeddings(targets).permute(0,2,1) # (N, E, L)
        
        b_i = self.item_biases(targets).squeeze()  # (N, L)
        
        dot = (user_representation * emb_target).sum(1).squeeze() # (N, L)
        return dot + b_i

## CNN (conv 2d)

In [None]:
class Seq2dCNN(nn.Module):
    """
    Causal CNN for sequences (2d conv).
    """
    def __init__(self, num_items, embedding_dim=32, sparse=False):

        super(Seq2dCNN, self).__init__()

        self.embedding_dim = embedding_dim

        self.item_embeddings = ScaledEmbedding(num_items, embedding_dim,
                                               padding_idx=0,
                                               sparse=sparse)
        self.item_biases = ZeroEmbedding(num_items, 1, sparse=sparse,
                                         padding_idx=0)
        
        # Convolution layers
        self.kernel_size = k = 3
        self.conv = nn.Conv2d(embedding_dim,  # Cin
                               embedding_dim,  # Cout
                               kernel_size=(k,1))
        
    def user_representation(self, sequences):
        emb_seq = self.item_embeddings(sequences)  # (N, L, E)
        emb_seq = emb_seq.permute(0, 2, 1)         # (N, E, L), embedding_dim is the channels
        emb_seq = emb_seq.unsqueeze(3)             # (N, E, L, 1)
        
        x = F.pad(emb_seq,                # (N, E, k + L, 1)
                  (0, 0, self.kernel_size, 0))
        x = F.tanh(self.conv(x))          # (N, E, 1 + L, 1)
        
        # Residual
        x = x + F.pad(emb_seq, (0, 0, 1, 0))    # (N, E, 1 + L, 1)
        x = x.squeeze(3)
        
        return x[:,:,:-1], x[:,:,-1:]     # (N, E, L),  (N, E, 1)
        
    def forward(self, user_representation, targets):
        """
        user_representation: (N, E, L)
        targets: (N, L)
        """
        emb_target = self.item_embeddings(targets).permute(0,2,1) # (N, E, L)
        
        b_i   = self.item_biases(targets).squeeze()  # (N, L)
        
        dot = (user_representation * emb_target).sum(1).squeeze() # (N, L)
        return dot + b_i

## Prediction function

In [11]:
def predict(net, sequences, item_ids=None):
    """
    net: model
    sequences: 2D array
    item_ids: 2D array
    """
    # Set to test mode (will not dropout or batch norm)
    net.train(False)
    
    sequences = np.atleast_2d(sequences)
    
    if item_ids is None:
        item_ids = np.atleast_2d(np.arange(MAX_ITEM))
        item_ids = item_ids.repeat(len(sequences), axis=0)
    else:
        item_ids = np.atleast_2d(item_ids)
        assert(len(sequences) == len(item_ids))
    
    n_items = item_ids.shape[1]
    
    # To tensor
    sequences = torch.from_numpy(sequences.astype('int64'))
    item_ids = torch.from_numpy(item_ids.astype('int64'))
    
    # To variable
    sequence_var = Variable(sequences)
    item_var = Variable(item_ids)
    
    # Get user representation
    _, user_final = net.user_representation(sequence_var)
    
    shape = list(user_final.size())  # (N, E, 1)
    shape[2] = n_items
    user_final = user_final.expand(shape)  # (N, E, L)
    
    # Prediction
    out = net(user_final, item_var)
    
    return out.data.numpy()

# Model Training

## Loss functions

In [12]:
def bpr_loss(positive_predictions, negative_predictions, mask=None):
    """
    Bayesian Personalised Ranking [1]_ pairwise loss function.
    """
    loss = (1.0 - F.sigmoid(positive_predictions -
                            negative_predictions))
    #     loss = -F.logsigmoid(negative_predictions -
    #                          positive_predictions)
    if mask is not None:
        mask = mask.float()
        loss = loss * mask
        return loss.sum() / mask.sum()

    return loss.mean()

def hinge_loss(positive_predictions, negative_predictions):
    """
    Hinge pairwise loss function.
    """
    loss = torch.clamp(negative_predictions -
                       positive_predictions +
                       1.0, 0.0)
    return loss.mean()

In [29]:
test = test.dropna()
test_sequences = pad_sequences(test.item_sequence, maxlen=MAX_SEQ_LENGTH)
eval_sequences = test.eval_sequence

In [33]:
def evaluate(_net, sequences, eval_sequences):
    output = predict(_net, sequences)
    topk_recs = np.argsort(-output)[:, :20]
    return precision_recall(topk_recs, eval_sequences)

# Training

In [45]:
NUM_LAYERS = 1

LEARNING_RATE = 1e-2
L2_NORM = 1e-5

EPOCHS = 50
BATCH_SIZE = 128

In [None]:
# Set random state
random_state = np.random.RandomState(123)

seed = random_state.randint(-10**8, 10**8)
torch.manual_seed(seed)

# Data
batcher = MiniBatcher(train, MAX_ITEM, 
                      batch_size=BATCH_SIZE,
                      shuffle=True,
                      random_state=random_state)

# Model
_net = SeqCNN(num_items=MAX_ITEM,
              embedding_dim=EMBEDDING_DIM,
              num_layers=NUM_LAYERS)
# Optim
optimizer = optim.Adam(_net.parameters(),
                        weight_decay=L2_NORM,
                        lr=LEARNING_RATE)
# Loss function
loss_function = bpr_loss

# Iteration
for i in range(1, EPOCHS+1):
    _net.train(True)
    epoch_loss = 0.0
    start = time()

    # Batch training
    for j, (batch_seq, batch_neg) in enumerate(batcher):  # __iter__
        # Input
        sequences_var     = Variable(torch.from_numpy(batch_seq.astype('int64')))
        neg_sequences_var = Variable(torch.from_numpy(batch_neg.astype('int64')))
        mask = sequences_var > 0
        
        # Sequence representations
        user_repr, _ = _net.user_representation(sequences_var)
        
        # Score
        positive_pred = _net(user_repr, sequences_var)
        negative_pred = _net(user_repr, neg_sequences_var)
    
        optimizer.zero_grad()
    
        # Loss
        loss = loss_function(positive_pred, negative_pred, mask)
        epoch_loss += loss.data[0]
        
        # Backward & update
        loss.backward()
        optimizer.step()
    
    print("#Epoch {0} \tLoss: {1:.4f} \t{2:.1f}s".format(i, epoch_loss/(j+1), time()-start))
    
    if i % 5 == 0:
        p,r,DCG = evaluate(_net, test_sequences, eval_sequences)
        print("\tPre@20: {0:.2f}%  Rec@20: {1:.2f}%  NDCG@20: {2:.4f}".format(p*100, r*100, DCG))
    
print("Done training!")

In [10]:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from spotlight.sequence.implicit import ImplicitSequenceModel

class MySequenceModel(ImplicitSequenceModel):
    
    def predict(self, sequences, item_ids=None):
        self._net.train(False)

        sequences = np.atleast_2d(sequences)

        if item_ids is None:
            item_ids = np.arange(self._num_items).reshape(-1, 1)

        sequences = torch.from_numpy(sequences.astype(np.int64).reshape(1, -1))
        item_ids = torch.from_numpy(item_ids.astype(np.int64))

        sequence_var = Variable(sequences)
        item_var = Variable(item_ids)

        _, sequence_representations = self._net.user_representation(sequence_var)
        
        size = (len(item_var), ) + sequence_representations.size()[1:]
        out = self._net(sequence_representations.expand(*size),
                        item_var)

        return out.data.numpy()

In [186]:
from spotlight.interactions import SequenceInteractions

batcher = MiniBatcher(train, MAX_ITEM, batch_size=128)
train_seq = SequenceInteractions(num_items=MAX_ITEM, 
                                 sequences=batcher.sequences, 
                                 user_ids=batcher.user_ids)

model = MySequenceModel(n_iter=15,
                        batch_size=128,
                        learning_rate=1e-2,
                        l2=1e-5,
                        random_state=np.random.RandomState(123),
                        representation='cnn',
                        loss='bpr')

model._num_items = train_seq.num_items
model._net = SeqCNN(MAX_ITEM, EMBEDDING_DIM)
model._optimizer = optim.Adam(
                model._net.parameters(),
                weight_decay=model._l2,
                lr=model._learning_rate)
model._loss_func = bpr_loss

model.fit(train_seq, verbose=True)

Epoch 0: loss 0.2682572480902743
Epoch 1: loss 0.18318165972161649
Epoch 2: loss 0.16311376197124594
Epoch 3: loss 0.1521787647880725
Epoch 4: loss 0.1466848581139721
Epoch 5: loss 0.1427982431739124
Epoch 6: loss 0.14100971161874373
Epoch 7: loss 0.13974539735423985
Epoch 8: loss 0.14023387976991597
Epoch 9: loss 0.13828193407450148
Epoch 10: loss 0.1369533599979842
Epoch 11: loss 0.13618914293709086
Epoch 12: loss 0.1361529262208227
Epoch 13: loss 0.13653984265540964
Epoch 14: loss 0.13587565353112435


In [187]:
test = test.dropna()
test_seq = _pad_sequences(test.item_sequence, maxlen=MAX_SEQ_LENGTH)

predictions = np.zeros((len(test_seq), MAX_ITEM))
for i, seq in enumerate(test_seq):
    predictions[i] = model.predict(seq)

In [188]:
topk_recs = np.argsort(-predictions)[:, :20]
topk_recs

array([[3272, 3522, 3393, ...,  158, 3475, 2173],
       [ 139,  158, 3393, ...,  514, 1041,  509],
       [3522, 2950, 3396, ..., 2869, 1910, 3393],
       ..., 
       [ 520, 1181,  158, ..., 1041, 1540, 1937],
       [3090, 2617, 2364, ..., 1638, 3525, 1705],
       [3522, 3919, 4339, ..., 3148, 4201, 3393]])

In [189]:
p, r, DCG = precision_recall(topk_recs, test.eval_sequence)
print("Precision@20: {0:.2f}%  Recall@20: {1:.2f}%  NDCG@20: {2:.4f}".format(p*100, r*100, DCG))
# Precision@20: 4.70%  Recall@20: 9.21%
# Precision@20: 6.11%  Recall@20: 12.27%

# Precision@20: 4.79%  Recall@20: 8.30%  NDCG@20: 0.3414

Precision@20: 5.28%  Recall@20: 10.35%  NDCG@20: 0.3684


# Baseline

In [31]:
popk_items = data.train.item.value_counts().sort_values(ascending=False)[:20].index.values

n_test = len(test.item_sequence)
popk_items = popk_items.reshape(1,-1).repeat(n_test, axis=0)

p, r, DCG = precision_recall(popk_items, test.eval_sequence)
print("Precision@20: {0:.2f}%  Recall@20: {1:.2f}%  NDCG@20: {2:.4f}".format(p*100, r*100, DCG))

Precision@20: 3.62%  Recall@20: 8.15%  NDCG@20: 0.2597
