# DIN Base

DIN模型的输入特征大致上分为三类：Dense（连续型）、Sparse（离散型）、VarlenSparse（变长离散型，即用户的历史行为数据）。不同类型的特征决定了后续处理方式的不同：
* Dense型特征：由于是数值型了，这里为每个这样的特征建立Input层接收这种输入， 然后拼接起来先放着，等离散的那边处理好之后，和离散的拼接起来进DNN
* Sparse型特征，为离散型特征建立Input层接收输入，然后需要先通过embedding层转成低维稠密向量，然后拼接起来放着，等变长离散那边处理好之后， 一块拼起来进DNN， 但是这里面要注意有个特征的embedding向量还得拿出来用，就是候选商品的embedding向量，这个还得和后面的计算相关性，对历史行为序列加权。
* VarlenSparse型特征：这个一般指的用户的历史行为特征，变长数据， 首先会进行padding操作成等长， 然后建立Input层接收输入，然后通过embedding层得到各自历史行为的embedding向量， 拿着这些向量与上面的候选商品embedding向量进入AttentionPoolingLayer去对这些历史行为特征加权合并，最后得到输出。


DIN 模型的应用场景是阿里最典型的电商广告推荐，有大量的用户历史行为信息（历史购买过的商品或类别信息）。对于付了广告费的商品，阿里会根据模型预测的点击率高低，把合适的广告商品推荐给合适的用户，所以 DIN 模型本质上是一个点击率预估模型。

下面的图 1 就是 DIN 的基础模型 Base Model。我们可以看到，Base Model 是一个典型的 Embedding + MLP 的结构。它的输入特征有用户属性特征（User Profile Features）、用户行为特征（User Behaviors）、候选广告特征（Candidate Ad）和场景特征（Context Features）。

<img src="../data/img/DIN_base_model.webp" style="zoom:50%" />


# DIN 

<img src="../data/img/DIN.webp" style="zoom:50%" />

In [None]:
class ActivationUnit(torch.nn.Module):
    """Attention unit that pools a behaviour sequence against a target item.

    For every position of the history sequence, the target embedding, the
    history embedding, their difference and their element-wise product are
    concatenated and scored by an MLP. The scores then weight a sum over the
    sequence dimension.

    Args:
        emb_dim (int): embedding size shared by `history` and `target`.
        dims (list): hidden layer sizes handed to the scoring MLP
            (default=[36]).
        activation (str): activation name handed to the scoring MLP
            (default='dice').
        use_softmax (bool): if `True`, normalise the scores over the sequence
            with softmax before pooling (default=False).

    Shape:
        - history: `(batch_size, seq_length, emb_dim)`
        - target: `(batch_size, emb_dim)`
        - Output: `(batch_size, emb_dim)`
    """

    def __init__(self, emb_dim, dims=None, activation="dice", use_softmax=False):
        super(ActivationUnit, self).__init__()
        # A `None` sentinel avoids sharing one mutable default list between
        # instances; the effective default is still [36].
        if dims is None:
            dims = [36]
        self.emb_dim = emb_dim
        self.use_softmax = use_softmax
        # Scoring network over the 4 concatenated views of (target, history).
        self.attention = MLP(4 * self.emb_dim, dims=dims, activation=activation)

    def forward(self, history, target):
        """Return the attention-weighted sum of `history` w.r.t. `target`."""
        seq_length = history.size(1)
        # Repeat the target along the sequence axis so it lines up with every
        # history position.
        target = target.unsqueeze(1).expand(-1, seq_length, -1)
        # Concat: (batch_size, seq_length, 4 * emb_dim)
        att_input = torch.cat([target, history, target - history, target * history], dim=-1)
        # Score every position independently; the scorer is expected to emit
        # one value per row so the result can be viewed as one weight per
        # position.
        att_weight = self.attention(att_input.view(-1, 4 * self.emb_dim))
        att_weight = att_weight.view(-1, seq_length)
        if self.use_softmax:
            att_weight = att_weight.softmax(dim=-1)
        # Weighted sum over the sequence: (batch_size, emb_dim)
        output = (att_weight.unsqueeze(-1) * history).sum(dim=1)
        # The original ended with a bare `return`, which discarded the pooled
        # tensor and handed `None` to the caller.
        return output

In [None]:
class MLP(nn.Module):
    """Multi Layer Perceptron module.

    Every entry of `dims` adds a `Linear` -> `BatchNorm1d` -> activation ->
    `Dropout` group. An optional final `Linear(*, 1)` turns the stack into an
    output layer.

    Args:
        input_dim (int): input size of the first Linear layer.
        output_layer (bool): whether this MLP module is the output layer. If
            `True`, one `Linear(*, 1)` module is appended (default=True).
        dims (list): output size of each hidden Linear layer; `None` or an
            empty list means no hidden layers (default=None).
        dropout (float): probability of an element to be zeroed (default=0).
        activation (str): activation name, resolved by `activation_layer`
            (documented there as supporting `sigmoid`, `relu`, `prelu`,
            `dice`, `softmax`; default='relu').

    Shape:
        - Input: `(batch_size, input_dim)`
        - Output: `(batch_size, 1)` or `(batch_size, dims[-1])`
    """

    def __init__(self, input_dim, output_layer=True, dims=None, dropout=0, activation="relu"):
        super().__init__()
        # A `None` sentinel replaces the former mutable `[]` default.
        if dims is None:
            dims = []
        layers = []
        for i_dim in dims:
            layers.append(nn.Linear(input_dim, i_dim))
            layers.append(nn.BatchNorm1d(i_dim))
            layers.append(activation_layer(activation))
            layers.append(nn.Dropout(p=dropout))
            # The next hidden layer consumes what this one produced.
            input_dim = i_dim
        if output_layer:
            layers.append(nn.Linear(input_dim, 1))
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the stacked layers."""
        return self.mlp(x)

In [None]:
import torch
import torch.nn as nn
import numpy as np
from torch.nn.modules.activation import Sigmoid

class DIN(nn.Module):
    """Deep Interest Network over candidate, history, user and context inputs.

    Each feature group is a 2-D tensor whose leading columns are categorical
    ids (one column per entry of the matching `*_dict`) followed by continuous
    columns. Categorical columns are embedded, the recent-rate history is
    pooled with an attention (activation) unit against the candidate, and the
    concatenation of all branches goes through a sigmoid-terminated DNN.

    Args:
        candidate_movie_num (int): total number of candidate columns
            (categorical + continuous).
        recent_rate_num (int): number of recent-rate columns (not read by
            this class).
        user_profile_num (int): total number of user-profile columns.
        context_feature_num (int): total number of context columns.
        candidate_movie_dict (dict): vocabulary size per categorical candidate
            column (the values are used, in order).
        recent_rate_dict (dict): vocabulary size per recent-rate column.
        user_profile_dict (dict): vocabulary size per categorical user column.
        context_feature_dict (dict): vocabulary size per categorical context
            column.
        history_num (int): number of times the candidate branch is repeated
            along the sequence axis.
        embed_dim (int): embedding size of every categorical column.
        activation_dim (int): hidden size of the activation unit.
        hidden_dim (list): the two hidden sizes of the DNN
            (default=[128, 64]).

    Shape:
        - Output of `forward`: `(batch_size, 1)`, values produced by a sigmoid.
    """

    def __init__(self, candidate_movie_num, recent_rate_num, user_profile_num, context_feature_num, candidate_movie_dict,
            recent_rate_dict, user_profile_dict, context_feature_dict, history_num, embed_dim, activation_dim, hidden_dim=None):
        super().__init__()
        # A `None` sentinel replaces the former mutable `[128, 64]` default.
        if hidden_dim is None:
            hidden_dim = [128, 64]
        self.candidate_vocab_list = list(candidate_movie_dict.values())
        self.recent_rate_list = list(recent_rate_dict.values())
        self.user_profile_list = list(user_profile_dict.values())
        self.context_feature_list = list(context_feature_dict.values())
        self.embed_dim = embed_dim
        self.history_num = history_num
        # One embedding table per categorical column of each feature group.
        self.candidate_embedding_list = nn.ModuleList([nn.Embedding(vocab_size, embed_dim) for vocab_size in self.candidate_vocab_list])
        self.recent_rate_embedding_list = nn.ModuleList([nn.Embedding(vocab_size, embed_dim) for vocab_size in self.recent_rate_list])
        self.user_profile_embedding_list = nn.ModuleList([nn.Embedding(vocab_size, embed_dim) for vocab_size in self.user_profile_list])
        self.context_embedding_list = nn.ModuleList([nn.Embedding(vocab_size, embed_dim) for vocab_size in self.context_feature_list])

        # Activation unit: scores each history position from the concatenated
        # (history, candidate, difference, product) vector.
        self.activation_unit = nn.Sequential(nn.Linear(4*embed_dim, activation_dim),
                                            nn.PReLU(),
                                            nn.Linear(activation_dim, 1),
                                            nn.Sigmoid())

        # DNN input width: per group, embedded categorical columns plus the
        # remaining continuous columns; the pooled history adds `embed_dim`.
        n_candidate = len(self.candidate_embedding_list)
        n_user = len(self.user_profile_embedding_list)
        n_context = len(self.context_embedding_list)
        self.dnn_input_dim = (
            n_candidate * embed_dim + (candidate_movie_num - n_candidate)
            + embed_dim
            + n_user * embed_dim + (user_profile_num - n_user)
            + n_context * embed_dim + (context_feature_num - n_context)
        )

        self.dnn = nn.Sequential(nn.Linear(self.dnn_input_dim, hidden_dim[0]),
                             nn.BatchNorm1d(hidden_dim[0]),
                             nn.PReLU(),
                             nn.Linear(hidden_dim[0], hidden_dim[1]),
                             nn.BatchNorm1d(hidden_dim[1]),
                             nn.PReLU(),
                             nn.Linear(hidden_dim[1], 1),
                             nn.Sigmoid())

    @staticmethod
    def _embed_columns(features, embedding_list):
        """Look up column i of `features` in the i-th table of `embedding_list`."""
        return [layer(features[:, i].long()) for i, layer in enumerate(embedding_list)]

    def forward(self, candidate_features, recent_features, user_features, context_features):
        """Return the DNN output for one batch, shape `(batch_size, 1)`."""
        bs = candidate_features.shape[0]
        # Candidate branch: embedded categorical columns flattened to one
        # vector per sample, with a length-1 sequence axis.
        candidate_embed_features = self._embed_columns(candidate_features, self.candidate_embedding_list)
        candidate_embed_features = torch.stack(candidate_embed_features, dim=1).reshape(bs, -1).unsqueeze(1)
        # Continuous candidate columns start after the categorical ones. The
        # original sliced from `len(candidate_features)` (the batch size),
        # which picks the wrong columns.
        candidate_continous_features = candidate_features[:, len(self.candidate_embedding_list):]
        candidate_branch_features = torch.cat([candidate_continous_features.unsqueeze(1), candidate_embed_features], dim=2).repeat(1, self.history_num, 1)

        # Recent-rate branch: one embedding per history column, stacked along
        # the sequence axis.
        recent_embed_features = self._embed_columns(recent_features, self.recent_rate_embedding_list)
        recent_branch_features = torch.stack(recent_embed_features, dim=1)

        # User-profile branch: embedded categorical columns + continuous columns.
        user_profile_embed_features = torch.cat(
            self._embed_columns(user_features, self.user_profile_embedding_list), dim=1)
        user_profile_continous_features = user_features[:, len(self.user_profile_list):]
        user_profile_branch_features = torch.cat([user_profile_embed_features, user_profile_continous_features], dim=1)

        # Context branch: embedded categorical columns + continuous columns.
        context_embed_features = torch.cat(
            self._embed_columns(context_features, self.context_embedding_list), dim=1)
        context_continous_features = context_features[:, len(self.context_embedding_list):]
        context_branch_features = torch.cat([context_embed_features, context_continous_features], dim=1)

        # Activation unit input. NOTE: the element-wise ops and the
        # Linear(4 * embed_dim) layer require the candidate branch to be
        # `embed_dim` wide, like each history embedding.
        sub_unit_input = recent_branch_features - candidate_branch_features
        product_unit_input = torch.mul(recent_branch_features, candidate_branch_features)
        unit_input = torch.cat([recent_branch_features, candidate_branch_features, sub_unit_input, product_unit_input], dim=2)
        # Weighted pooling: broadcast each position's score over the embedding
        # axis, then average over the sequence axis.
        activation_unit_out = self.activation_unit(unit_input).repeat(1, 1, self.embed_dim)
        recent_branch_pooled_features = torch.mean(torch.mul(activation_unit_out, recent_branch_features), dim=1)
        # DNN part: the candidate branch is identical at every sequence
        # position, so position 0 is used.
        dnn_input = torch.cat([candidate_branch_features[:, 0, :], recent_branch_pooled_features, user_profile_branch_features, context_branch_features], dim=1)
        dnn_out = self.dnn(dnn_input)
        # The original ended with a bare `return`, discarding the prediction.
        return dnn_out

<img src="../data/img/din_build.webp" >

In [None]:
class DIN_Build(torch.nn.Module):
    """DIN assembled from a shared embedding layer, attention units and an MLP.

    Args:
        features: feature descriptors embedded and fed to the MLP as-is.
        history_features: descriptors of the behaviour-sequence features; one
            `ActivationUnit` is created per entry.
        target_features: descriptors of the target features; entry `i` is
            attended against history entry `i`.
        mlp_params (dict): keyword arguments forwarded to the final `MLP`.
        attention_mlp_params (dict): keyword arguments forwarded to every
            `ActivationUnit`.

    Each descriptor must expose an `embed_dim` attribute.
    """

    def __init__(self, features, history_features, target_features, mlp_params, attention_mlp_params):
        super().__init__()
        self.features = features
        self.history_features = history_features
        self.target_features = target_features
        # Number of behaviour-sequence features.
        self.num_history_features = len(history_features)

        every_feature = features + history_features + target_features
        # Total embedding width across all three feature groups; this is the
        # input size of the final MLP.
        self.all_dims = sum(fea.embed_dim for fea in every_feature)

        # Embedding layer shared by all feature groups.
        self.embedding = EmbeddingLayer(every_feature)
        # One attention unit per behaviour-sequence feature.
        units = [ActivationUnit(fea.embed_dim, **attention_mlp_params) for fea in self.history_features]
        self.attention_layers = nn.ModuleList(units)
        self.mlp = MLP(self.all_dims, activation="dice", **mlp_params)

    def forward(self, x):
        """Return the sigmoid of the MLP output with dim 1 squeezed out."""
        embed_x_features = self.embedding(x, self.features)
        embed_x_history = self.embedding(x, self.history_features)
        embed_x_target = self.embedding(x, self.target_features)

        # Pool every history sequence against its matching target, keeping a
        # per-feature axis at dim 1.
        pooled = [
            unit(embed_x_history[:, idx, :, :], embed_x_target[:, idx, :]).unsqueeze(1)
            for idx, unit in enumerate(self.attention_layers)
        ]
        attention_pooling = torch.cat(pooled, dim=1)

        # Concat & flatten: pooled history, target and plain features.
        flat_parts = [
            attention_pooling.flatten(start_dim=1),
            embed_x_target.flatten(start_dim=1),
            embed_x_features.flatten(start_dim=1),
        ]
        mlp_in = torch.cat(flat_parts, dim=1)

        # e.g. hidden sizes [80, 200] can be passed in through `mlp_params`.
        y = self.mlp(mlp_in)

        # sigmoid(1) + BCELoss is used here; the effect is similar to the
        # softmax(2) + CELoss of the DIN paper.
        logits = y.squeeze(1)
        return torch.sigmoid(logits)