In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, LayerNormalization, Dropout, Embedding

In [None]:
def positional_encoding(position,d_model):
    """Build a sinusoidal positional-encoding table.

    Args:
        position: Number of positions (rows) to generate.
        d_model: Number of feature columns per position.

    Returns:
        A float32 tensor of shape (1, position, d_model). Even-indexed
        columns hold the sine of the angle and odd-indexed columns hold
        the cosine; columns 2i and 2i+1 share the same angle rate.
    """
    # Column vector of position indices: shape (position, 1).
    positions = np.arange(position)[:, np.newaxis]

    # Each sin/cos column pair shares one exponent: 2*(j//2)/d_model.
    pair_exponents = 2 * (np.arange(d_model) // 2) / np.float32(d_model)

    # Broadcasts to shape (position, d_model).
    table = positions / np.power(10000, pair_exponents)

    # Overwrite in place: sine on even columns, cosine on odd columns.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Prepend a leading axis of size 1 so the table broadcasts over a batch.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)



# position = 50   # length of the sequence
# d_model = 512   # dimensionality of the model's output
# pos_encoding = positional_encoding(position, d_model)
# print(pos_encoding)


In [None]:
## Multi Head Attention

class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head scaled dot-product attention.

  Queries, keys and values are each projected by their own Dense(d_model)
  layer, split into `num_heads` heads of size `d_model // num_heads`,
  attended per head, merged back to `d_model` features and passed through a
  final Dense(d_model) layer.

  Args:
    d_model: Size of the projected feature axis; must be divisible by
      `num_heads`.
    num_heads: Number of attention heads.
  """

  def __init__(self,d_model,num_heads):
    super().__init__()
    self.d_model = d_model
    self.num_heads = num_heads
    assert d_model%num_heads==0, "d_model must be divisible by num_heads"

    # Feature size handled by each individual head.
    self.depth = d_model//num_heads

    self.wq = Dense(d_model)
    self.wv = Dense(d_model)
    self.wk = Dense(d_model)
    self.dense = Dense(d_model)

  def split_heads(self,x,batch_size):
    """Split the last axis of `x` into (num_heads, depth).

    Args:
      x: Tensor whose last axis has size `d_model`.
      batch_size: Size of the leading (batch) axis of `x`.

    Returns:
      Tensor reshaped to (batch_size, -1, num_heads, depth) and then
      transposed to (batch_size, num_heads, -1, depth).
    """
    x = tf.reshape(x,(batch_size,-1,self.num_heads,self.depth))
    return tf.transpose(x,perm=[0,2,1,3])

  # Original spelling of the method, kept so existing callers still work.
  splitHead = split_heads

  def call(self,q,k,v,mask=None):
    """Run multi-head attention.

    Args:
      q: Query tensor.
      k: Key tensor.
      v: Value tensor.
      mask: Optional tensor broadcastable to the attention logits; non-zero
        entries mark positions to suppress. `None` disables masking.

    Returns:
      Tensor of shape (batch_size, -1, d_model): the merged head outputs
      after the final Dense projection.
    """
    q = self.wq(q)
    k = self.wk(k)
    v = self.wv(v)

    batch_size = tf.shape(q)[0]

    q = self.split_heads(q, batch_size)
    k = self.split_heads(k, batch_size)
    v = self.split_heads(v, batch_size)

    # Only the attended values are needed here; the weights are discarded.
    attention, _ = self.scaled_dotproduct_attention(q,k,v,mask)

    # Undo split_heads: bring the head axis back next to depth, then merge.
    attention = tf.transpose(attention, perm=[0, 2, 1, 3])
    attention = tf.reshape(attention,(batch_size,-1,self.d_model))
    return self.dense(attention)

  def scaled_dotproduct_attention(self,q,k,v,mask):
    """Compute softmax(q @ k^T / sqrt(dk)) @ v.

    Args:
      q: Query tensor.
      k: Key tensor; `dk` is the size of its last axis.
      v: Value tensor.
      mask: Tensor added to the logits after scaling by -1e9, or `None`.

    Returns:
      Tuple `(output, attention_weights)` where `attention_weights` is the
      softmax over the last axis of the (masked) scaled logits and
      `output` is `attention_weights @ v`.
    """
    qkt = tf.matmul(q,k,transpose_b=True)
    dk = tf.cast(tf.shape(k)[-1],tf.float32)
    scaled_attention_logits = qkt/tf.math.sqrt(dk)

    if mask is not None:
      # Cast so bool/int masks work too; masked positions get a large
      # negative logit and therefore ~0 weight after the softmax.
      mask = tf.cast(mask, scaled_attention_logits.dtype)
      scaled_attention_logits += mask* -1e9

    attention_weights = tf.nn.softmax(scaled_attention_logits,axis=-1)
    output = tf.matmul(attention_weights,v)
    return output, attention_weights




In [None]:
##Feed Forward Network

class PositionWiseFeedForward(tf.keras.layers.Layer):
  """Two stacked Dense layers: `dff` units with ReLU, then `d_model` units.

  Args:
    d_model: Number of units in the second (output) Dense layer.
    dff: Number of units in the first (hidden) Dense layer.
  """

  def __init__(self,d_model,dff):
    super().__init__()
    self.d_model, self.dff = d_model, dff
    # Hidden expansion with ReLU, followed by a projection without activation.
    self.layer1 = Dense(dff, activation="relu")
    self.layer2 = Dense(d_model)

  def call(self,input):
    """Pass `input` through `layer1` and then `layer2`; return the result."""
    hidden = self.layer1(input)
    return self.layer2(hidden)


In [None]:
class TransformerBlock(tf.keras.layers.Layer):
  """Self-attention sub-layer followed by a feed-forward sub-layer.

  Each sub-layer's output goes through dropout, is added to that
  sub-layer's own input (residual connection) and is then layer-normalised.

  Args:
    d_model: Feature size passed to the attention and feed-forward layers.
    num_heads: Number of attention heads.
    dff: Hidden size of the feed-forward network.
    dropout_rate: Rate used by both Dropout layers.
  """

  def __init__(self,d_model,num_heads,dff,dropout_rate=0.1):
    super().__init__()
    self.att = MultiHeadAttention(d_model,num_heads)
    self.ffn = PositionWiseFeedForward(d_model,dff)
    self.layer1nor = LayerNormalization(epsilon = 1e-6)
    self.layer2nor = LayerNormalization(epsilon = 1e-6)
    self.droplayer1 = Dropout(dropout_rate)
    self.droplayer2 = Dropout(dropout_rate)

  def call(self,x,training,mask):
    """Apply the block to `x`.

    Args:
      x: Input tensor; used as query, key and value of the attention layer.
      training: Forwarded to both Dropout layers.
      mask: Forwarded to the attention layer.

    Returns:
      The layer-normalised output of the feed-forward sub-layer.
    """
    # Sub-layer 1: self-attention with a residual around the block input.
    attn_output = self.att(x,x,x,mask)
    attn_output = self.droplayer1(attn_output,training = training)
    out1 = self.layer1nor(x+attn_output)

    # Sub-layer 2: feed-forward. Its residual must wrap its own input
    # (out1), not the original block input, otherwise the normalised
    # attention output is dropped from the skip path.
    ffn_out = self.ffn(out1)
    ffn_out = self.droplayer2(ffn_out, training=training)
    out2 = self.layer2nor(out1+ffn_out)
    return out2



In [None]:
class Encoder(tf.keras.layers.Layer):
  """Token embedding plus positional encoding, then a stack of TransformerBlocks.

  Args:
    num_layers: Number of TransformerBlock layers to create and apply.
    d_model: Embedding size, also passed to every TransformerBlock.
    num_heads: Number of attention heads per block.
    dff: Feed-forward hidden size per block.
    input_vocab_size: Vocabulary size given to the Embedding layer.
    maximum_position_encoding: Number of positions in the precomputed
      positional-encoding table.
    dropout_rate: Rate for the input Dropout and for every block.
  """

  def __init__(self,num_layers,d_model,num_heads,dff,input_vocab_size,maximum_position_encoding, dropout_rate=0.1):
    super().__init__()
    self.d_model = d_model
    self.num_layers = num_layers
    self.embedding = Embedding(input_vocab_size,d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding,d_model)
    self.dropout = Dropout(dropout_rate)

    # Build the stack first, then assign it as a single list attribute.
    blocks = []
    for _ in range(num_layers):
      blocks.append(TransformerBlock(d_model,num_heads,dff,dropout_rate))
    self.enc_layers = blocks

  def call(self,x,training,mask):
    """Encode `x`.

    Args:
      x: Input passed to the Embedding layer; its axis 1 is taken as the
        sequence length.
      training: Forwarded to Dropout and to every block.
      mask: Forwarded to every block.

    Returns:
      Output of the last TransformerBlock.
    """
    seq_len = tf.shape(x)[1]

    # Embed, then add the first `seq_len` rows of the positional table.
    hidden = self.embedding(x) + self.pos_encoding[:,:seq_len, :]
    hidden = self.dropout(hidden, training = training)

    for layer_idx in range(self.num_layers):
      block = self.enc_layers[layer_idx]
      hidden = block(hidden, training, mask)
    return hidden


In [None]:
class Decoder(tf.keras.layers.Layer):
  """Decoder layer: embedding, positional encoding, dropout and a block stack.

  NOTE(review): `call` is unfinished in the visible code -- it stops after
  the input dropout, never applies `self.dec_layer`, and returns nothing.
  """

  def __init__(self,num_layers,d_model,num_heads,dff,target_vocab_size, maximum_position_encoding, dropout_rate = 0.1):
    """Create the sub-layers.

    Args:
      num_layers: Number of TransformerBlock layers to create.
      d_model: Embedding size, also passed to every TransformerBlock.
      num_heads: Number of attention heads per block.
      dff: Feed-forward hidden size per block.
      target_vocab_size: Vocabulary size given to the Embedding layer.
      maximum_position_encoding: Number of positions in the precomputed
        positional-encoding table.
      dropout_rate: Rate for the input Dropout and for every block.
    """
    super().__init__()
    self.d_model = d_model
    self.num_layers = num_layers
    self.embedding = Embedding(target_vocab_size,d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
    self.dropout = Dropout(dropout_rate)
    # NOTE(review): these are the same self-attention-only blocks the Encoder
    # uses; nothing here attends over `enc_output` -- confirm this is intended.
    self.dec_layer = [TransformerBlock(d_model,num_heads, dff, dropout_rate) for _ in range(num_layers)]

  def call(self,x,enc_output,training, look_ahead_mask, padding_mask):
    """Embed `x`, add positional encodings and apply dropout.

    Args:
      x: Input passed to the Embedding layer; its axis 1 is taken as the
        sequence length.
      enc_output: Not used by the visible code.
      training: Forwarded to the Dropout layer.
      look_ahead_mask: Not used by the visible code.
      padding_mask: Not used by the visible code.

    Returns:
      None -- there is no return statement yet.
      TODO: apply `self.dec_layer` and return the result.
    """
    seq_length = tf.shape(x)[1]
    # Created but never filled or returned in the visible code.
    attention_weights = {}
    x = self.embedding(x)
    # Add the first `seq_length` rows of the positional table.
    x+=self.pos_encoding[:,:seq_length, :]
    x = self.dropout(x,training=training)



SyntaxError: incomplete input (<ipython-input-7-d93b77958ea9>, line 13)