In [28]:
import tensorflow as tf
import operations as op
import layers
from tensorflow.keras.layers import Conv1D,GlobalMaxPooling1D

In [2]:
import utils.douban_evalutaion as eva

ImportError: No module named douban_evalutaion

In [35]:
# Dummy rank-3 float input used to try out the Conv1D + pooling stack.
input_x = tf.placeholder(dtype=tf.float32, shape=(10, 30, 40))
input_x

[10, 30, 40]

In [3]:
# Scratch check: slicing with [-1:] yields a one-element list, not a scalar.
x = list(range(1, 6))
x[-1:]

[5]

In [34]:
# 1-D convolution (128 filters, width 5, ReLU) followed by a global max over
# the remaining time axis.
x = GlobalMaxPooling1D()(
    Conv1D(filters=128, kernel_size=5, activation='relu')(input_x)
)
x

<tf.Tensor 'global_max_pooling1d_1/Max:0' shape=(10, 128) dtype=float32>

In [2]:

def attention(
        Q, K, V,
        Q_lengths, K_lengths,
        attention_type='dot',
        is_mask=True, mask_value=-2**32+1,
        drop_prob=None):
    '''Attend from Q over K and return the attention-weighted sum of V.

    Args:
        Q: a tensor with shape [batch, Q_time, Q_dimension]
        K: a tensor with shape [batch, time, K_dimension]
        V: a tensor with shape [batch, time, V_dimension]
        Q_lengths: a tensor with shape [batch], forwarded to op.mask
        K_lengths: a tensor with shape [batch], forwarded to op.mask
        attention_type: 'dot' (uses op.dot_sim) or 'bilinear'
            (uses op.bilinear_sim)
        is_mask: when true, logits are blended with `mask_value` using the
            mask returned by op.mask
        mask_value: value written into the logits wherever the mask is 0
        drop_prob: when not None, passed as the second positional argument
            of tf.nn.dropout applied to the attention weights

    Returns:
        a tensor with shape [batch, Q_time, V_dimension]

    Raises:
        AssertionError: if attention_type is not 'dot' or 'bilinear', or if
            Q_dimension differs from K_dimension when attention type is dot.
    '''
    assert attention_type in ('dot', 'bilinear')
    use_dot = attention_type == 'dot'
    if use_dot:
        # Dot similarity needs matching feature sizes.
        assert Q.shape[-1] == K.shape[-1]

    q_steps, k_steps = Q.shape[1], K.shape[1]

    if use_dot:
        scores = op.dot_sim(Q, K)  # [batch, Q_time, time]
    elif attention_type == 'bilinear':
        scores = op.bilinear_sim(Q, K)

    if is_mask:
        # [batch, Q_time, K_time]; entries equal to 0 get replaced by mask_value.
        keep = op.mask(Q_lengths, K_lengths, q_steps, k_steps)
        scores = keep * scores + (1 - keep) * mask_value

    weights = tf.nn.softmax(scores)

    if drop_prob is not None:
        print('use attention drop')
        # NOTE(review): whether this argument means keep-prob or drop-rate
        # depends on the installed TensorFlow version -- confirm.
        weights = tf.nn.dropout(weights, drop_prob)

    return op.weighted_sum(weights, V)


In [13]:
def scaled_dot_product_attention(q, k, v, mask):
    '''Compute attention(Q, K, V) = softmax(Q * K^T / sqrt(dk)) * V.

    Args:
        q: query tensor; its last dimension is used as dk.
        k: key tensor, multiplied against q with its last two axes transposed.
        v: value tensor, multiplied by the attention weights.
        mask: None, or a tensor broadcastable to the logits holding 1 at the
            positions to hide (e.g. padding) and 0 elsewhere.

    Returns:
        (outputs, attention_weights):
            outputs            -- attention_weights matmul v
            attention_weights  -- softmax over the last axis of the scaled,
                                  masked logits
    '''
    # Raw similarity between every query and every key.
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    # Scale by sqrt(dk), the size of the query's last axis.
    dk = tf.cast(tf.shape(q)[-1], tf.float32)
    scaled_logits = matmul_qk / tf.sqrt(dk)
    if mask is not None:
        # Push masked logits towards -infinity so the softmax assigns them
        # ~0 weight. The bias must be a large negative number (-1e9); the
        # previous -1e-9 was effectively zero and left the mask without effect.
        scaled_logits = scaled_logits + mask * -1e9
    # Normalise over the key axis; masked positions end up with ~0 weight.
    attention_weights = tf.nn.softmax(scaled_logits)  # [..., seq_len_q, seq_len_k]
    # Weighted sum of the values.
    outputs = tf.matmul(attention_weights, v)  # [..., seq_len_q, depth]
    return outputs, attention_weights
class MultiHeadAttention(tf.keras.layers.Layer):
    '''Multi-head attention layer.

    q, k and v are each passed through their own Dense(d_model) projection,
    split into `num_heads` heads of size `d_model // num_heads`, run through
    scaled_dot_product_attention, merged back and passed through a final
    Dense(d_model).
    '''

    def __init__(self, d_model, num_heads):
        '''Create the projection layers.

        Args:
            d_model: output size of every Dense projection.
            num_heads: number of attention heads; must divide d_model.

        Raises:
            AssertionError: if d_model is not divisible by num_heads.
        '''
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        # d_model must split evenly across the heads.
        assert d_model % num_heads == 0
        # Feature size of each head after the split.
        self.depth = d_model // num_heads
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        '''Reshape x to [batch_size, -1, num_heads, depth] and move the head
        axis in front of the sequence axis.

        Returns:
            a tensor with shape [batch_size, num_heads, seq_len, depth]
        '''
        x = tf.reshape(x, [batch_size, -1, self.num_heads, self.depth])
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, q, k, v, mask=None):
        '''Run multi-head attention.

        Args:
            q, k, v: query / key / value tensors; the leading axis of q is
                used as the batch size.
            mask: optional mask forwarded to scaled_dot_product_attention.

        Returns:
            (output, attention_weights):
                output            -- [batch_size, seq_len_q, d_model]
                attention_weights -- [batch_size, num_heads, seq_len_q, seq_len_k]
        '''
        batch_size = tf.shape(q)[0]
        # Independent linear projections for queries, keys and values.
        # (Previously k and v were also sent through self.wq, which left
        # self.wk / self.wv unused and tied all three projections together.)
        q = self.wq(q)  # shape=[batch_size, seq_len_q, d_model]
        k = self.wk(k)
        v = self.wv(v)
        # Split into heads.
        q = self.split_heads(q, batch_size)  # shape=[batch_size, num_heads, seq_len_q, depth]
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)
        # Scaled dot-product attention per head.
        # scaled_attention shape=[batch_size, num_heads, seq_len_q, depth]
        # attention_weights shape=[batch_size, num_heads, seq_len_q, seq_len_k]
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        # Move the head axis back behind the sequence axis.
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # shape=[batch_size, seq_len_q, num_heads, depth]
        # Merge the heads.
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))  # shape=[batch_size, seq_len_q, d_model]
        # Final output projection.
        output = self.dense(concat_attention)
        return output, attention_weights

In [3]:
# Toy dimensions for the shape experiments below.
batch_size = 2
turns_len = 5
words = 10
dim = 10

# Build every placeholder from the constants above so the shapes cannot drift
# apart when a dimension is changed.
input_turns = tf.placeholder(tf.float32, [batch_size, turns_len, words, dim])
respones = tf.placeholder(tf.float32, [batch_size, words, dim])
respones_len = tf.placeholder(tf.int32, [batch_size])

In [6]:
print(input_turns)
_turn_match = []

# Walk over the turn axis (axis 1). tf.unstack removes exactly that axis, so
# the turn count is taken from the tensor's static shape instead of a
# hard-coded 5, and size-1 batch/word axes are never squeezed away by accident.
for _t in tf.unstack(input_turns, axis=1):
    # NOTE(review): respones_len is passed for both the query and the key
    # lengths; there is no per-turn length placeholder -- confirm intended.
    _match_result = attention(respones, _t, _t, respones_len, respones_len)
    # Re-insert the turn axis so the per-turn results can be concatenated.
    _turn_match.append(tf.expand_dims(_match_result, 1))

Tensor("Placeholder:0", shape=(2, 5, 10, 10), dtype=float32)


In [8]:
# Join the per-turn match tensors along axis 1.
best_turn_match = tf.concat(_turn_match, axis=1)
best_turn_match

<tf.Tensor 'concat_11:0' shape=(2, 5, 10, 10) dtype=float32>

In [14]:
# Two heads over a model width of `dim`.
multihead = MultiHeadAttention(d_model=dim, num_heads=2)

In [10]:

def FFN(x, out_dimension_0=None, out_dimension_1=None):
    '''Two stacked dense layers with a ReLU in between:
    max(0, x*W0+b0)*W1+b1.

    Args:
        x: a tensor with shape [batch, time, dimension]
        out_dimension_0: output size handed to op.dense for the first layer
        out_dimension_1: output size handed to op.dense for the second layer

    Returns:
        the output of the second op.dense call
        (presumably [batch, time, out_dimension_1] -- confirm against op.dense)
    '''
    # First layer and its activation live in the 'FFN_1' scope.
    with tf.variable_scope('FFN_1'):
        hidden = tf.nn.relu(op.dense(x, out_dimension_0))
    # Second layer is linear; it is called with op.dense's default settings.
    with tf.variable_scope('FFN_2'):
        return op.dense(hidden, out_dimension_1)

In [18]:
# Attend from the response (query) over the stacked per-turn match features
# (used as both key and value); the returned attention weights are discarded.
result,_ = multihead(respones, best_turn_match, best_turn_match)
result

<tf.Tensor 'multi_head_attention_1/dense_3/BiasAdd:0' shape=(2, 10, 10) dtype=float32>

In [19]:
# Feed the attention output through the feed-forward block; both output sizes
# stay at their None defaults and are passed on to op.dense unchanged.
# Note: this rebinds `result`, so re-running the cell stacks another FFN.
result = FFN(x=result)

In [20]:
# Display the symbolic tensor currently bound to `result`.
result

<tf.Tensor 'FFN_2/add:0' shape=(2, 10, 10) dtype=float32>