## DIN

In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tensorboardX import SummaryWriter
from torch.utils.data import Dataset, DataLoader

# Ignore warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode

### Load the training and test samples and drop unused columns

In [2]:
# Directory that holds the SparrowRecSys sample data. Set the SPARROW_SAMPLE_DATA_DIR
# environment variable to point the notebook at a different checkout; the original
# local path is kept as the fallback so existing runs behave exactly as before.
folder_path = os.environ.get(
    "SPARROW_SAMPLE_DATA_DIR",
    "/home/leon/Documents/SparrowRecSys/src/main/resources/webroot/sampledata",
)

In [3]:
# CSV with the training samples, located under the sample-data folder.
training_path = f"{folder_path}/Pytorch_data/trainingSamples.csv"

In [4]:
# CSV with the held-out test samples, located under the sample-data folder.
test_path = f"{folder_path}/Pytorch_data/testSamples.csv"

In [5]:
# Load the raw training samples. index_col=False tells pandas not to use the
# first CSV column as the row index.
training_df = pd.read_csv(training_path, index_col=False)

In [6]:
# Load the raw test samples with the same settings as the training frame.
test_df = pd.read_csv(test_path, index_col=False)

In [7]:
training_df.head()

Unnamed: 0,movieId,userId,rating,timestamp,label,releaseYear,movieGenre1,movieGenre2,movieGenre3,movieRatingCount,...,userGenre3,userGenre4,userGenre5,scaledReleaseYear,scaledmovieRatingCount,scaledmovieAvgRating,scaledmovieRatingStddev,scaleduserRatingCount,scaleduserAvgRating,scaleduserRatingStddev
0,593,10096,4.0,954365552,1,1991.0,13,4,12,13692.0,...,17,12,0,0.915493,0.936777,0.90625,0.449735,0.030612,0.688889,0.279874
1,832,10351,3.0,851791379,0,1996.0,13,12,0,3052.0,...,5,12,13,0.985916,0.208758,0.649306,0.486773,0.112245,0.726667,0.22956
2,85,10351,3.0,851791395,0,1995.0,11,5,0,592.0,...,5,12,13,0.971831,0.040438,0.690972,0.502645,0.122449,0.713333,0.22956
3,588,10351,5.0,851792205,1,1992.0,3,15,18,8980.0,...,12,5,13,0.929577,0.614369,0.729167,0.486773,0.22449,0.675555,0.207547
4,370,1090,2.0,1117852491,0,1994.0,2,7,0,3087.0,...,0,0,0,0.957747,0.211153,0.482639,0.555555,0.030612,0.2,0.204403


In [8]:
# Replace every NaN in both frames with 0. movieId starts from 1, so 0 can serve as the
# padding id for missing rated-movie entries such as 'userRatedMovie1'.
training_df = training_df.fillna(0)
test_df = test_df.fillna(0)

In [9]:
# Columns fed to the model. Their order fixes the positional indices used by
# sparse_col / dense_col further down: user features, the five userRatedMovie
# slots, then the candidate-movie features.
user_columns = [
    'userId',
    'userGenre1',
    'scaleduserRatingCount',
    'scaleduserAvgRating',
    'scaleduserRatingStddev',
]
history_columns = ['userRatedMovie' + str(slot) for slot in range(1, 6)]
movie_columns = [
    'movieId',
    'movieGenre1',
    'scaledReleaseYear',
    'scaledmovieRatingCount',
    'scaledmovieAvgRating',
    'scaledmovieRatingStddev',
]
columns2Keep = user_columns + history_columns + movie_columns

In [10]:
# Keep only the model inputs. The explicit copy makes this an independent frame, so the
# later dtype conversion of its columns does not trigger SettingWithCopyWarning.
training_feature = training_df[columns2Keep].copy()

In [11]:
# Training targets: the 'label' column of the raw frame (it is not part of columns2Keep).
training_label = training_df['label']

In [12]:
# Keep only the model inputs. The explicit copy makes this an independent frame, so the
# later dtype conversion of its columns does not trigger SettingWithCopyWarning.
test_feature = test_df[columns2Keep].copy()

In [13]:
# Test targets: the 'label' column of the raw test frame (it is not part of columns2Keep).
test_label = test_df['label']

In [14]:
test_feature

Unnamed: 0,userId,userGenre1,scaleduserRatingCount,scaleduserAvgRating,scaleduserRatingStddev,userRatedMovie1,userRatedMovie2,userRatedMovie3,userRatedMovie4,userRatedMovie5,movieId,movieGenre1,scaledReleaseYear,scaledmovieRatingCount,scaledmovieAvgRating,scaledmovieRatingStddev
0,10436,3,0.102041,0.888889,0.094340,356.0,593.0,592.0,480.0,899.0,588,3,0.929577,0.614369,0.729167,0.486773
1,10436,7,0.244898,0.828889,0.141509,104.0,750.0,2.0,586.0,141.0,368,3,0.957747,0.256928,0.631945,0.470899
2,11078,7,0.112245,0.684444,0.229560,608.0,1.0,150.0,457.0,593.0,50,13,0.971831,0.699282,0.965278,0.396825
3,11078,12,0.153061,0.686667,0.201258,922.0,628.0,47.0,50.0,608.0,527,11,0.943662,0.753746,0.947917,0.439153
4,11078,12,0.214286,0.686667,0.188679,377.0,588.0,380.0,527.0,922.0,733,2,0.985916,0.467739,0.722222,0.486773
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26603,7447,2,0.091837,0.697778,0.264151,1.0,589.0,260.0,110.0,593.0,780,2,0.985916,0.702703,0.625000,0.560847
26604,8700,11,0.071429,0.680000,0.198113,589.0,750.0,111.0,912.0,858.0,356,7,0.957747,0.987000,0.854167,0.502645
26605,8700,11,0.091837,0.717778,0.226415,296.0,356.0,589.0,750.0,111.0,593,13,0.915493,0.936777,0.906250,0.449735
26606,8700,11,0.122449,0.722222,0.220126,110.0,593.0,296.0,356.0,589.0,527,11,0.943662,0.753746,0.947917,0.439153


In [15]:
# Cast the five 'userRatedMovie*' columns to int64: they are displayed as floats
# (e.g. 356.0) before this cell, but are later used as embedding indices.
# astype with a column -> dtype mapping returns new frames, so nothing is assigned into
# a selection of training_df / test_df (no SettingWithCopyWarning) and the cell can be
# re-run safely.
history_dtypes = {"userRatedMovie" + str(i): 'int64' for i in range(1, 6)}
training_feature = training_feature.astype(history_dtypes)
test_feature = test_feature.astype(history_dtypes)

In [16]:
test_feature

Unnamed: 0,userId,userGenre1,scaleduserRatingCount,scaleduserAvgRating,scaleduserRatingStddev,userRatedMovie1,userRatedMovie2,userRatedMovie3,userRatedMovie4,userRatedMovie5,movieId,movieGenre1,scaledReleaseYear,scaledmovieRatingCount,scaledmovieAvgRating,scaledmovieRatingStddev
0,10436,3,0.102041,0.888889,0.094340,356,593,592,480,899,588,3,0.929577,0.614369,0.729167,0.486773
1,10436,7,0.244898,0.828889,0.141509,104,750,2,586,141,368,3,0.957747,0.256928,0.631945,0.470899
2,11078,7,0.112245,0.684444,0.229560,608,1,150,457,593,50,13,0.971831,0.699282,0.965278,0.396825
3,11078,12,0.153061,0.686667,0.201258,922,628,47,50,608,527,11,0.943662,0.753746,0.947917,0.439153
4,11078,12,0.214286,0.686667,0.188679,377,588,380,527,922,733,2,0.985916,0.467739,0.722222,0.486773
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26603,7447,2,0.091837,0.697778,0.264151,1,589,260,110,593,780,2,0.985916,0.702703,0.625000,0.560847
26604,8700,11,0.071429,0.680000,0.198113,589,750,111,912,858,356,7,0.957747,0.987000,0.854167,0.502645
26605,8700,11,0.091837,0.717778,0.226415,296,356,589,750,111,593,13,0.915493,0.936777,0.906250,0.449735
26606,8700,11,0.122449,0.722222,0.220126,110,593,296,356,589,527,11,0.943662,0.753746,0.947917,0.439153


In [17]:
# Print "<position> -> <column name>" for every feature column, so the index lists
# defined below can be checked by eye.
for position, name in enumerate(training_feature.columns):
    print(f"{position} -> {name}")

0 -> userId
1 -> userGenre1
2 -> scaleduserRatingCount
3 -> scaleduserAvgRating
4 -> scaleduserRatingStddev
5 -> userRatedMovie1
6 -> userRatedMovie2
7 -> userRatedMovie3
8 -> userRatedMovie4
9 -> userRatedMovie5
10 -> movieId
11 -> movieGenre1
12 -> scaledReleaseYear
13 -> scaledmovieRatingCount
14 -> scaledmovieAvgRating
15 -> scaledmovieRatingStddev


### DataSet and DataLoader

In [18]:
# Positions of the categorical inputs inside the feature frames, in the order the model
# consumes them: userId, userGenre1, movieGenre1 | userRatedMovie1..5 | movieId.
sparse_col = [0, 1, 11, 5, 6, 7, 8, 9, 10] # column_index of sparse features

In [19]:
# Embedding-table sizes, aligned one-to-one with sparse_col. Every id in a column has to
# be smaller than its size here.
# NOTE(review): the 30001 / 20 / 1001 bounds are hard-coded and not checked against the data — confirm.
sparse_col_size = [30001, 20, 20, 1001, 1001, 1001, 1001, 1001, 1001] # number of classes per sparse_feature

In [20]:
# How sparse_col splits into groups, in order: 3 plain sparse columns,
# 5 behavior (rated-movie) columns, 1 candidate column.
sparse_ratio = [3, 5, 1] # column_number of sparse column: behavior column: candidate column

In [21]:
# Positions of the continuous inputs: the three scaled user statistics (2-4) followed by
# the four scaled movie statistics (12-15).
dense_col = [2, 3, 4, 12, 13, 14, 15]

In [22]:
class ModelDataSet(Dataset):
    """Map-style dataset that yields one sample (sparse ids, dense values, label) per index.

    Args:
        input_DF: feature DataFrame; columns are selected by position.
        label_DF: Series of targets, aligned row-for-row with ``input_DF``.
        sparse_col: positional indices of the categorical columns (cast to int64).
        dense_col: positional indices of the continuous columns (cast to float32).

    The selected columns are converted to tensors once, in ``__init__``, so
    ``__getitem__`` is a plain tensor index instead of a pandas ``.iloc`` lookup plus a
    tensor construction per sample. Returned tensors are views into the stored data.
    """

    def __init__(self, input_DF, label_DF, sparse_col, dense_col):
        self.df = input_DF

        self.dense_df = input_DF.iloc[:, dense_col].astype(np.float32)
        self.sparse_df = input_DF.iloc[:, sparse_col].astype('int64')

        self.label = label_DF.astype(np.float32)

        # One-off conversion; rows are addressed positionally, like the original .iloc access.
        self._sparse_tensor = torch.tensor(self.sparse_df.to_numpy(dtype=np.int64))
        self._dense_tensor = torch.tensor(self.dense_df.to_numpy(dtype=np.float32))
        self._label_tensor = torch.tensor(self.label.to_numpy(dtype=np.float32))

    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        """Return ``{'Feature': (sparse, dense), 'Label': label}`` for ``idx``.

        ``idx`` may be an int, or a tensor (converted with ``tolist()``); a list of
        positions yields stacked rows.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        sparse_feature = self._sparse_tensor[idx]
        dense_feature = self._dense_tensor[idx]
        label = self._label_tensor[idx]
        return {'Feature': (sparse_feature, dense_feature), 'Label': label}

In [23]:
# Wrap the training features and labels for use with a DataLoader.
training_dataset = ModelDataSet(
    input_DF=training_feature,
    label_DF=training_label,
    sparse_col=sparse_col,
    dense_col=dense_col,
)

In [24]:
# Wrap the test features and labels for use with a DataLoader.
test_dataset = ModelDataSet(
    input_DF=test_feature,
    label_DF=test_label,
    sparse_col=sparse_col,
    dense_col=dense_col,
)

In [25]:
test_dataset[0]

{'Feature': (tensor([10436,     3,     3,   356,   593,   592,   480,   899,   588]),
  tensor([0.1020, 0.8889, 0.0943, 0.9296, 0.6144, 0.7292, 0.4868])),
 'Label': tensor(1.)}

In [26]:
# Number of samples per mini-batch, used by both the training and the test loader.
BATCH_SIZE = 100

In [27]:
# Training batches; shuffle=True reshuffles the samples at every epoch.
training_dataloader = DataLoader(training_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [28]:
# Evaluation batches; shuffle is left at its default, so the dataset order is kept.
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

### Model and loss function

First, define the Attention module

In [29]:
class Attention_ElementWiseProduct(nn.Module):
    """
      Scores every behavior embedding against the candidate embedding.

      For each behavior field the unit concatenates
      [behavior, candidate, behavior - candidate, behavior * candidate] and passes it
      through Linear(4k -> 32) -> PReLU -> Linear(32 -> 1) -> sigmoid, so every weight lies
      in (0, 1). The weights are NOT normalised across fields.

      Input:
          behavior: 3D tensor with shape: ``(batch_size,field_size,embedding_size)``.
          candidate: 3D tensor with shape: ``(batch_size,1,embedding_size)``.
      Output:
          attention_weight: 3D tensor with shape: ``(batch_size, field_size, 1)``.
    """

    def __init__(self, embedding_size):
        super().__init__()
        self.linear1 = nn.Linear(4*embedding_size, 32)
        self.linear2 = nn.Linear(32, 1)
        self.prelu = nn.PReLU()

    def forward(self, behavior, candidate):
        # Repeat the single candidate row along the field axis so it pairs with every behavior.
        candidate = candidate.expand_as(behavior)
        pair_features = torch.cat(
            [behavior, candidate, behavior - candidate, behavior * candidate], dim=2
        )  # (B,F,4k)
        hidden = self.prelu(self.linear1(pair_features))  # (B,F,32)
        # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid.
        return torch.sigmoid(self.linear2(hidden))  # (B,F,1)

In [30]:
import math
class DIN(nn.Module):
    """Deep Interest Network-style click/rating classifier.

    The sparse inputs are expected in the order
    ``[plain sparse columns..., behavior columns..., candidate column]``. Every sparse
    column is embedded with its own table (so the behavior slots and the candidate do not
    share embeddings). The behavior embeddings are weighted by an attention unit against
    the candidate, sum-pooled, concatenated with the plain sparse embeddings, the dense
    features and the candidate embedding, and fed to a 128 -> 64 -> 1 MLP.

    Args:
        sparse_col_size (list[int]): number of classes of each sparse column.
        dense_col_size (int): number of dense features.
        sparse_ratio (list[int]): ``[n_plain_sparse, n_behavior, n_candidate]``. The
            candidate is always taken as the LAST sparse column. All behavior columns and
            the candidate column must have the same class count, because their embeddings
            are stacked and compared element-wise.

    ``forward(sparse_feature, dense_feature)`` takes ``(B, n_sparse)`` integer ids and
    ``(B, dense_col_size)`` floats (or their 1-D single-sample forms) and returns a
    ``(B,)`` tensor of probabilities.
    """

    def __init__(self, sparse_col_size, dense_col_size, sparse_ratio):
        # sparse_col_size: list[int]
        # dense_col_size: int
        super().__init__()
        self.sparse_col_size = sparse_col_size
        self.dense_col_size = dense_col_size
        self.sparse_ratio = sparse_ratio

        # For categorical features, we embed the features in dense vectors of dimension of 6 * category cardinality^1/4
        embedding_size = [int(6 * pow(class_size, 0.25)) for class_size in self.sparse_col_size]

        # Attention layer. The first behavior column sits right after the plain sparse
        # columns, so its embedding width is looked up from sparse_ratio instead of a
        # hard-coded position.
        n_plain = self.sparse_ratio[0]
        movieId_embed_size = embedding_size[n_plain]
        self.attention_layer = Attention_ElementWiseProduct(movieId_embed_size)

        # Embedding layer for all sparse features (one table per column)
        self.sparse_embedding_layer = nn.ModuleList([
            nn.Embedding(class_size, embed_size, scale_grad_by_freq=True)
            for class_size, embed_size in zip(self.sparse_col_size, embedding_size)
        ])

        # Deep layers. The input is exactly what forward() concatenates:
        # plain sparse embeddings + dense features + ONE pooled behavior vector + candidate.
        deep_input_size = int(
            sum(embedding_size[:n_plain])
            + dense_col_size
            + movieId_embed_size
            + embedding_size[-1]
        )
        self.linear1 = nn.Linear(deep_input_size, 128)
        self.prelu1 = nn.PReLU()
        self.linear2 = nn.Linear(128, 64)
        self.prelu2 = nn.PReLU()
        self.linear3 = nn.Linear(64, 1)

    def forward(self, sparse_feature, dense_feature):
        if sparse_feature.dim() == 1:  # a single un-batched sample: add the batch axis
            sparse_feature = sparse_feature.view(1, -1)
            dense_feature = dense_feature.view(1, -1)

        # Embed every sparse column with its own table -> list of (B, embedding_size)
        embedding_list = [
            embedding_layer(sparse_feature[:, i])
            for i, embedding_layer in enumerate(self.sparse_embedding_layer)
        ]

        # Split into "other sparse feature", behavior feature, candidate feature
        n_plain, n_behavior = self.sparse_ratio[0], self.sparse_ratio[1]
        sparse_embed_list = embedding_list[:n_plain]
        behavior_embed_list = embedding_list[n_plain:n_plain + n_behavior]
        candidate = embedding_list[-1].unsqueeze(1)  # B x 1 x embedding_size

        sparse_embedding = torch.cat(sparse_embed_list, dim=1)
        behavior = torch.stack(behavior_embed_list, dim=1)  # B x field_number x embedding_size

        # Cal the attention weight
        attention_weight = self.attention_layer(behavior, candidate)  # B x field_number x 1

        # Apply attention weight and do sumPooling.
        # NOTE(review): padded history entries (id 0) are not masked out here.
        sumPool_behavior = (attention_weight * behavior).sum(dim=1)  # B x embedding_size

        # squeeze(1) removes only the field axis; a bare squeeze() would also drop the
        # batch axis when B == 1 and make the concatenation fail.
        deep_input = torch.cat(
            [sparse_embedding, dense_feature, sumPool_behavior, candidate.squeeze(1)], dim=1
        )

        # Deep layer
        deep_output = self.prelu1(self.linear1(deep_input))
        deep_output = self.prelu2(self.linear2(deep_output))
        deep_output = self.linear3(deep_output)
        return torch.sigmoid(deep_output).view(-1)  # (B,)

### Training and evaluation

In [31]:
from sklearn.metrics import roc_auc_score

In [32]:
# The dense input width is derived from dense_col so the model cannot drift out of sync
# with the feature selection (it was previously the literal 7).
model = DIN(sparse_col_size, len(dense_col), sparse_ratio)

In [33]:
# Binary cross-entropy on probabilities; DIN.forward already applies the sigmoid.
loss_fn = nn.BCELoss()

In [34]:
# Number of full passes over the training loader.
EPOCHS = 5

In [35]:
# Learning rate handed to the optimizer.
LR = 0.001

In [36]:
# One optimizer parameter group per tensor. PReLU parameters get an explicit
# weight_decay of 0; every other group omits the key and therefore uses the
# optimizer-level default.
para_list = [
    {'params': tensor, 'weight_decay': 0} if 'prelu' in name else {'params': tensor}
    for name, tensor in model.named_parameters()
]

In [37]:
# Adam over the per-tensor groups; weight_decay=0.001 is the default for groups that do
# not set their own value (the PReLU groups override it with 0).
optimizer = optim.Adam(para_list, lr=LR, weight_decay=0.001)

In [38]:
# optimizer = optim.SGD(model.parameters(), lr=LR, momentum=0.9)

In [39]:
# Run on the GPU when one is available, otherwise on the CPU.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [40]:
# summary = SummaryWriter()

In [41]:
class Train_Eval():
    """Minimal training / evaluation driver for a binary classifier.

    Args:
        model: module called as ``model(sparse_feature, dense_feature)`` that returns
            per-sample probabilities.
        loss_fn: loss called as ``loss_fn(prediction, label)``.
        optim: optimizer already bound to the model's parameters.
        device: device the model and every batch are moved to.
        train_dataloader / test_dataloader: iterables of batches shaped
            ``{'Feature': (sparse, dense), 'Label': label}``.
    """

    def __init__(self, model, loss_fn, optim, device, train_dataloader, test_dataloader):
        self.device = device
        self.model = model.to(self.device)
        self.optim = optim
        self.loss_fn = loss_fn
        self.train_dataloader = train_dataloader
        self.test_dataloader = test_dataloader
        self.threashold = 0.5 # threashold for positive class

    def train(self, epochs):
        """Train for ``epochs`` passes, printing training and test metrics after each one."""
        for epoch in range(epochs):
            # eval() at the end of the previous epoch left the model in eval mode, so
            # training mode has to be re-entered at the start of every epoch.
            self.model.train()
            print("==========================================================")
            print("start training epoch: {}".format(epoch+1))
            loss_list = []
            pred_list = []
            label_list = []

            for iteration, train_data in enumerate(self.train_dataloader, start=1):
                sparse_feature = train_data['Feature'][0].to(self.device)
                dense_feature = train_data['Feature'][1].to(self.device)
                label = train_data['Label'].to(self.device)
                prediction = self.model(sparse_feature, dense_feature)

                pred_list.extend(prediction.tolist())
                label_list.extend(label.tolist())

                cur_loss = self.loss_fn(prediction, label)
                loss_list.append(cur_loss.item())
                # Clear gradients before backward so nothing stale leaks into this step.
                self.optim.zero_grad()
                cur_loss.backward()
                self.optim.step()

                # logging every 20 iteration
                if iteration % 20 == 0:
                    print("---------------------------------------------------------")
                    print("epoch {}/{}, cur_iteration is {}, logloss is {:.2f}"
                          .format(epoch+1, epochs, iteration, cur_loss.item()))

            # validation every epoch
            training_loss, training_accuracy, training_roc_score = self._getMetric(loss_list, pred_list, label_list)
            print("==========================================================")
            print("Result of epoch {}".format(epoch+1))
            print(f"training loss: {training_loss:.2f}, accuracy: {training_accuracy:.3f}, roc_score: {training_roc_score:.2f}")

            test_loss, test_accuracy, test_roc_score = self.eval()
            print(f"test loss: {test_loss:.2f}, accuracy: {test_accuracy:.3f}, roc_score: {test_roc_score:.2f}")
            # summary.add_embedding(np.reshape(np.array(loss_list), (1, -1)), tag="loss_list")
            # summary.add_embedding(np.reshape(np.array(pred_list), (1, -1)), tag="pred_list")
            # summary.add_embedding(np.reshape(np.array(label_list), (1, -1)), tag="label_list")
            # summary.add_scalar("training_loss", training_loss)
            # summary.add_scalar("training_accuracy", training_accuracy)
            # summary.add_scalar("training_roc_score", training_roc_score)

    def eval(self):
        """Evaluate on the test loader; returns ``(logloss, accuracy, roc_score)``.

        Switches the model to eval mode and leaves it there.
        """
        self.model.eval()
        loss_list = []
        pred_list = []
        label_list = []
        with torch.no_grad():
            for test_data in self.test_dataloader:
                sparse_feature = test_data['Feature'][0].to(self.device)
                dense_feature = test_data['Feature'][1].to(self.device)
                label = test_data['Label'].to(self.device)
                prediction = self.model(sparse_feature, dense_feature)
                cur_loss = self.loss_fn(prediction, label)

                loss_list.append(cur_loss.item())
                pred_list.extend(prediction.tolist())
                label_list.extend(label.tolist())
        return self._getMetric(loss_list, pred_list, label_list)

    def _getMetric(self, loss_list, pred_list, label_list):
        """Return ``(avg_loss, avg_accuracy, roc_score)``.

        ``avg_loss`` is the unweighted mean of the per-batch losses; accuracy thresholds
        the predictions at ``self.threashold``.
        """
        # average logloss
        avg_loss = np.mean(loss_list)
        # roc_score
        roc_score = roc_auc_score(label_list, pred_list)
        # average accuracy
        correct_count = sum(
            1
            for pred, label in zip(pred_list, label_list)
            if (1 if pred >= self.threashold else 0) == label
        )
        avg_accuracy = correct_count / len(label_list)

        return avg_loss, avg_accuracy, roc_score

In [42]:
# Bundle model, loss, optimizer, device and both loaders; the model is moved to `dev` here.
train_eval = Train_Eval(model, loss_fn, optimizer, dev, training_dataloader, test_dataloader)

In [None]:
# Run the training loop; metrics on the training and test sets are printed after every epoch.
train_eval.train(EPOCHS)

start training epoch: 1
---------------------------------------------------------
epoch 1/5, cur_iteration is 20, logloss is 0.68
---------------------------------------------------------
epoch 1/5, cur_iteration is 40, logloss is 0.68
---------------------------------------------------------
epoch 1/5, cur_iteration is 60, logloss is 0.68
---------------------------------------------------------
epoch 1/5, cur_iteration is 80, logloss is 0.65
---------------------------------------------------------
epoch 1/5, cur_iteration is 100, logloss is 0.66
---------------------------------------------------------
epoch 1/5, cur_iteration is 120, logloss is 0.66
---------------------------------------------------------
epoch 1/5, cur_iteration is 140, logloss is 0.67
---------------------------------------------------------
epoch 1/5, cur_iteration is 160, logloss is 0.63
---------------------------------------------------------
epoch 1/5, cur_iteration is 180, logloss is 0.60
-----------------

---------------------------------------------------------
epoch 2/5, cur_iteration is 660, logloss is 0.57
---------------------------------------------------------
epoch 2/5, cur_iteration is 680, logloss is 0.57
---------------------------------------------------------
epoch 2/5, cur_iteration is 700, logloss is 0.62
---------------------------------------------------------
epoch 2/5, cur_iteration is 720, logloss is 0.52
---------------------------------------------------------
epoch 2/5, cur_iteration is 740, logloss is 0.59
---------------------------------------------------------
epoch 2/5, cur_iteration is 760, logloss is 0.54
---------------------------------------------------------
epoch 2/5, cur_iteration is 780, logloss is 0.62
---------------------------------------------------------
epoch 2/5, cur_iteration is 800, logloss is 0.51
---------------------------------------------------------
epoch 2/5, cur_iteration is 820, logloss is 0.54
-------------------------------------

---------------------------------------------------------
epoch 4/5, cur_iteration is 420, logloss is 0.60
---------------------------------------------------------
epoch 4/5, cur_iteration is 440, logloss is 0.65
---------------------------------------------------------
epoch 4/5, cur_iteration is 460, logloss is 0.54
---------------------------------------------------------
epoch 4/5, cur_iteration is 480, logloss is 0.62
---------------------------------------------------------
epoch 4/5, cur_iteration is 500, logloss is 0.52
---------------------------------------------------------
epoch 4/5, cur_iteration is 520, logloss is 0.62
---------------------------------------------------------
epoch 4/5, cur_iteration is 540, logloss is 0.59
---------------------------------------------------------
epoch 4/5, cur_iteration is 560, logloss is 0.56
---------------------------------------------------------
epoch 4/5, cur_iteration is 580, logloss is 0.48
-------------------------------------

In [None]:
# summary.close()