In [1]:
import math, gc
import numpy as np
import pandas as pd
from time import time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable

from spotlight.layers import ScaledEmbedding, ZeroEmbedding

In [2]:
# Notebook-level NumPy random generator with a fixed seed so that runs are repeatable.
randstate = np.random.RandomState(seed=123)

In [3]:
from data import Dataset

# Build the dataset object from the interactions CSV and pick the two splits
# used below: `train_plus` for fitting and `test_second` for evaluation.
data = Dataset(
    csvfile='interactions.csv',
    num_test_users=500,
    sample=0.15,
    cut_item=100,
)

train, test = data.train_plus, data.test_second

#User: 7499	#Item: 4698
796479 49833


In [4]:
# Sizes of the embedding tables: the user/item counts printed by the dataset
# loader plus one (presumably ids are 1-based so row 0 stays unused — TODO confirm).
N_USERS, N_ITEMS = 7499 + 1, 4698 + 1

In [5]:
# Log-scaled item popularity, indexed by item id.
# Every item starts at a count of 1; items seen in `train` get their
# interaction count, and log1p compresses the range.
pop_list = train.item.value_counts()
pop_prob = np.ones(N_ITEMS)
pop_prob[pop_list.index.values] = pop_list.values
pop_prob = np.log1p(pop_prob)
pop_prob

array([ 0.69314718,  7.69848279,  6.50578406, ...,  2.39789527,
        3.98898405,  3.91202301])

In [6]:
from sklearn import utils as skutils

class Interactions():
    """In-memory container of (user, item) interaction pairs with mini-batching.

    Attributes:
        users:   int64 array of user ids, one entry per interaction.
        items:   int64 array of item ids, aligned with ``users``.
        size:    number of interactions (rows of ``df``).
        n_users: number of distinct user ids present in ``df``.
        n_items: number of distinct item ids present in ``df``.
        shuffle: whether ``batch_generator`` reorders the interactions.
    """

    def __init__(self, df, meta=None, shuffle=True):
        """
        Input:
            df: DataFrame exposing ``user`` and ``item`` columns.
            meta: accepted for interface compatibility; currently unused.
            shuffle: shuffle the interactions every time batches are generated.
        """
        self.users = df.user.values.astype('int64')
        self.items = df.item.values.astype('int64')

        self.size = len(df)

        # Counts of distinct ids; these may differ from (max id + 1).
        self.n_users = df.user.unique().size
        self.n_items = df.item.unique().size

        self.shuffle = shuffle

    def batch_generator(self, batch_size, randstate=None):
        """Yield the interactions as consecutive mini-batches.

        Input:
            batch_size: maximum number of interactions per batch (>= 1).
            randstate: random state handed to the shuffle; when None a
                ``RandomState(123)`` is created, so the order is the same
                on every call.
        Yields:
            dict with int64 tensors under the keys 'users' and 'items'.
            The last batch may be shorter than ``batch_size``.
        Raises:
            ValueError: if ``batch_size`` is smaller than 1 (raised when the
                generator is first advanced).
        """
        if batch_size < 1:
            raise ValueError("batch_size must be >= 1, got {0}".format(batch_size))
        if randstate is None:
            randstate = np.random.RandomState(123)

        idx = np.arange(self.size)
        if self.shuffle:
            idx = skutils.shuffle(idx,
                                  random_state=randstate)
        for i in range(0, len(idx), batch_size):
            b = idx[i : i + batch_size]
            # Indexing with an index array already copies, and the arrays are
            # int64 since __init__, so no extra astype copy is needed.
            batch = {
                'users': torch.from_numpy(self.users[b]),
                'items': torch.from_numpy(self.items[b])
            }
            yield batch

In [7]:
class MFNet(nn.Module):
    """Matrix-factorisation scorer.

    The score of a (user, item) pair is the dot product of their embeddings
    plus a per-user and a per-item bias.
    """

    def __init__(self, n_users, n_items, embedding_dim=32):
        """
        Input:
            n_users: number of rows in the user embedding / bias tables.
            n_items: number of rows in the item embedding / bias tables.
            embedding_dim: size of each latent vector.
        """
        super(MFNet, self).__init__()

        self.user_embedding = ScaledEmbedding(n_users, embedding_dim)
        self.item_embedding = ScaledEmbedding(n_items, embedding_dim)

        self.user_bias = ZeroEmbedding(n_users, 1)
        self.item_bias = ZeroEmbedding(n_items, 1)

    def forward(self, user_ids, item_ids):
        """Score user/item id pairs.

        Input:
            user_ids, item_ids: integer id tensors of identical shape.
        Output:
            Tensor of scores with the same shape as the id tensors
            (assuming the embedding layers append one trailing dimension,
            as ``nn.Embedding`` does).
        """
        emb_user = self.user_embedding(user_ids)
        emb_item = self.item_embedding(item_ids)

        # Reduce over the embedding axis only. A bare .squeeze() here would
        # also drop a batch axis of length 1, and with embedding_dim == 1 it
        # would turn this sum into a sum over the whole batch.
        out = (emb_user * emb_item).sum(-1)

        # Biases carry a trailing axis of size 1; remove exactly that axis.
        b_u = self.user_bias(user_ids).squeeze(-1)
        b_i = self.item_bias(item_ids).squeeze(-1)

        return out + b_u + b_i

In [29]:
class ImplicitModel():
    """Pairwise-ranking matrix-factorisation model for implicit feedback.

    Wraps an ``MFNet`` together with an optimizer and a pairwise loss. Each
    observed (user, item) pair is contrasted with one randomly drawn item
    for the same user.
    """

    def __init__(self, n_users, n_items, embedding_dim=32,
                 optimizer=None, loss_function=None,
                 lr=1e-3,
                 l2=1e-5,
                 randstate=None):
        """
        Input:
            n_users, n_items: sizes of the user / item embedding tables.
            embedding_dim: latent dimension passed to MFNet.
            optimizer: ready-made optimizer; when given, ``lr`` and ``l2``
                are ignored. Defaults to Adam over the network parameters.
            loss_function: callable ``(positive_pred, negative_pred) -> loss``.
                Defaults to ``bpr_loss``.
            lr: learning rate of the default Adam optimizer.
            l2: weight decay of the default Adam optimizer.
            randstate: ``np.random.RandomState`` used for batch shuffling and
                negative sampling. Defaults to ``RandomState(123)``.
        """
        self.n_users = n_users
        self.n_items = n_items
        self.embedding_dim = embedding_dim
        if randstate is None:
            randstate = np.random.RandomState(123)
        self.randstate = randstate

        # Model
        self._net = MFNet(n_users,
                          n_items,
                          embedding_dim)
        # Adam default
        if optimizer is None:
            optimizer = optim.Adam(self._net.parameters(),
                                   lr=lr,
                                   weight_decay=l2)
        self._optimizer = optimizer

        # Loss function
        if loss_function is None:
            loss_function = bpr_loss
        self._loss_function = loss_function

    @staticmethod
    def _loss_value(loss):
        """Return a scalar loss as a Python float.

        Newer PyTorch exposes ``.item()``; on releases without it the value is
        read with ``loss.data[0]``, which newer releases reject.
        """
        if hasattr(loss, 'item'):
            return loss.item()
        return loss.data[0]

    def fit(self, train, epochs=10, batch_size=128, verbose=True):
        """Train the network.

        Input:
            train: Interactions
            epochs: number of passes over ``train``.
            batch_size: interactions per optimisation step.
            verbose: print the mean batch loss and duration of each epoch.
        """
        # predict() switches the network to eval mode; make sure we train.
        self._net.train(True)

        for i in range(1, epochs+1):
            epoch_loss = 0.0
            n_batches = 0
            start = time()

            # Use the model's own random state, not a notebook global, so the
            # `randstate` constructor argument actually controls shuffling.
            batcher = train.batch_generator(batch_size, self.randstate)
            for batch in batcher:
                items_var = Variable(batch['items'])
                users_var = Variable(batch['users'])

                self._optimizer.zero_grad()

                # Predict
                pred = self._net(users_var, items_var)
                neg_pred = self._get_negative_prediction(users_var)

                # Loss
                loss = self._loss_function(pred, neg_pred)
                epoch_loss += self._loss_value(loss)
                n_batches += 1

                # Update
                loss.backward()
                self._optimizer.step()

            if verbose:
                # max(..., 1) keeps an empty training set from dividing by zero.
                print("#Epoch {0} \tLoss: {1:.4f}\t {2:.1f}s".format(i, epoch_loss / max(n_batches, 1), time()-start))
        print("Done training!")

    def _get_negative_prediction(self, users_var):
        """Score one randomly drawn item for every user in ``users_var``.

        Items are drawn uniformly from ``0 .. n_items - 1``. No filtering is
        done, so a drawn item may be one the user has interacted with.
        (A popularity-weighted variant would pass ``p=`` to ``choice``.)
        """
        items = np.arange(self.n_items)
        neg_items = self.randstate.choice(items,
                                          size=users_var.size()[0])
        # To tensor
        neg_items = torch.from_numpy(neg_items.astype('int64'))
        neg_var = Variable(neg_items)

        # Predict
        return self._net(users_var, neg_var)

    def predict(self, user_ids, item_ids=None):
        """Score items for the given users.

        Input:
            user_ids: 1-D array of user ids.
            item_ids: optional array with one row of item ids per user; must
                be 2-D because the user ids are expanded to its shape. When
                None, every item id ``0 .. n_items - 1`` is scored.
        Output:
            numpy array of scores as returned by the network, one row per user.
        """
        self._net.train(False)

        user_ids = np.asarray(user_ids).reshape(-1, 1)  # 1D to 2D (batch_size, 1)

        if item_ids is None:
            # Use the size this model was built with, not a notebook global.
            item_ids = np.arange(self.n_items)
            item_ids = np.atleast_2d(item_ids)                 #          (1, n_items)
            item_ids = item_ids.repeat(len(user_ids), axis=0)  # (batch_size, n_items)
        else:
            item_ids = np.asarray(item_ids)
            assert(len(user_ids) == len(item_ids))

        # To tensor
        user_ids = torch.from_numpy(user_ids.astype('int64'))
        item_ids = torch.from_numpy(item_ids.astype('int64'))
        # To variable
        user_var = Variable(user_ids)
        item_var = Variable(item_ids)

        # Repeat vector
        user_var = user_var.expand_as(item_var) # (batch_size, n_items)

        # Predict
        predictions = self._net(user_var, item_var)

        return predictions.data.numpy()

    def predict_by_batch(self, user_ids, item_ids=None, batch_size=1000):
        """Run ``predict`` over ``user_ids`` in chunks and stack the results.

        ``item_ids``, when given, is sliced alongside ``user_ids`` so each
        chunk keeps one row of item ids per user.
        """
        preds = []
        for i in range(0, len(user_ids), batch_size):
            batch_items = None if item_ids is None else item_ids[i : i+batch_size]
            pred = self.predict(user_ids[i : i+batch_size], item_ids=batch_items)
            preds.append(pred)
        return np.vstack(preds)

In [22]:
def bpr_loss(positive_pred, negative_pred):
    """Pairwise ranking loss: mean of ``1 - sigmoid(positive - negative)``.

    The value approaches 0 when positives score far above negatives and
    approaches 1 when the order is reversed.
    """
    score_gap = positive_pred - negative_pred
    per_pair = 1.0 - F.sigmoid(score_gap)
    return per_pair.mean()

def hinge_loss(positive_pred, negative_pred):
    """Pairwise hinge loss with margin 1: mean of ``max(0, negative - positive + 1)``."""
    margin_violation = negative_pred - positive_pred + 1.0
    per_pair = F.relu(margin_violation)
    return per_pair.mean()

In [10]:
# Wrap the training frame so it can be served as shuffled mini-batches.
dtrain = Interactions(df=train, shuffle=True)

In [33]:
# Matrix-factorisation model trained with the BPR-style pairwise loss.
model = ImplicitModel(
    n_users=N_USERS,
    n_items=N_ITEMS,
    embedding_dim=32,
    loss_function=bpr_loss,
    lr=1e-2,
    l2=1e-5,
)

In [34]:
# NOTE(review): the saved output below lists only 10 epochs although 15 are
# requested here, so it looks stale — re-run to confirm.
model.fit(train=dtrain, epochs=15, batch_size=10000)

#Epoch 1 	Loss: 0.3377	 3.6s
#Epoch 2 	Loss: 0.2168	 3.6s
#Epoch 3 	Loss: 0.2104	 3.5s
#Epoch 4 	Loss: 0.2060	 3.5s
#Epoch 5 	Loss: 0.2044	 3.5s
#Epoch 6 	Loss: 0.2033	 3.5s
#Epoch 7 	Loss: 0.2026	 3.5s
#Epoch 8 	Loss: 0.2014	 3.5s
#Epoch 9 	Loss: 0.2008	 3.6s
#Epoch 10 	Loss: 0.2009	 3.5s
Done training!


# Evaluation

In [13]:
from metric import precision_recall

In [14]:
# Ground-truth items of each test user: a Series of item lists indexed by
# user id (groupby orders the index by user).
test_lists = (
    test
    .groupby('user')
    .agg({'item': lambda user_items: list(user_items)})
)['item']

In [35]:
N_TOP = 20

# Distinct test users in ascending id order, each scored against every item.
test_users = np.sort(test['user'].unique())
pred = model.predict_by_batch(test_users)

# Indices of the N_TOP best-scoring items per user. Passing every position
# 0..N_TOP-1 as `kth` puts each of them in its sorted place, so the kept
# columns are ordered by descending score.
topk = np.argpartition(-pred, kth=np.arange(N_TOP), axis=-1)[:, :N_TOP]

n_test = len(test_lists)
p, r, DCG = precision_recall(topk, test_lists)
print("Pre@20: {0:.2f}% \tRec@20: {1:.2f}% \tNDCG@20: {2:.4f}".format(p*100, r*100, DCG))

Pre@20: 3.78% 	Rec@20: 8.29% 	NDCG@20: 0.2644


# Baseline

In [16]:
k = 20
# Item ids ordered by how often they occur in the training interactions
# (value_counts lists the most frequent first).
pop_list = train['item'].value_counts().index.values
# The same k most popular items for everybody, shaped as a single (1, k) row.
topk = np.atleast_2d(pop_list[:k])
topk

array([[1041, 3662,  139, 3148, 1181,  158, 1750, 2177,  128, 1929,  114,
         248,  220,  319, 3522,  464,  462, 3393,   36, 1139]])

In [17]:
# Score the popularity baseline: repeat the single top-k row once per test user.
n_test = len(test_lists)
p, r, DCG = precision_recall(np.repeat(topk, n_test, axis=0), test_lists)
print("Pre@20: {0:.2f}%  Rec@20: {1:.2f}%  NDCG@20: {2:.4f}".format(p*100, r*100, DCG))

Pre@20: 3.62%  Rec@20: 8.15%  NDCG@20: 0.2596
