In [4]:
import tensorflow as tf

def batch_fetch_element_per_group(data,idx):
    #here data is of shape [?,m,n] ? is the data batch size, m -number of groups, n-number of candidates in each group
    #idx is of shape [?,m] m - index to select a particular target from each group, so we have m of them.
    # output would be [?,m], selected targets for all groups
    
    nRows = tf.shape(data)[0]  
    #print(nRows)
    
    nCols = tf.constant(tf.shape(data)[1] , dtype=tf.int32) 
    #print(nCols)
    
    m1 = tf.reshape(tf.tile(tf.range(nCols), [nRows]),
                                           shape=[nRows, nCols])
    #print(m1)
    m2 = tf.transpose(tf.reshape(tf.tile(tf.range(nRows), [nCols]),
                                            shape=[nCols, nRows]))
    #print(m2)
    indices = tf.stack([m2, m1, idx], axis=-1)
    # indices should be of shape [?, 5, 3] with indices[i,j]==[i,j,idx[i,j]]
    #print(indices)
    output = tf.gather_nd(data, indices=indices)
    #print(output)
    return output

data = tf.constant([[[1,2,3,4],[1,2,3,4],[1,2,3,4],[1,2,3,4],[1,2,3,4]],
                   [[1,2,3,4],[1,2,3,4],[1,2,3,4],[1,2,3,4],[1,2,3,4]],
                   [[1,2,3,4],[1,2,3,4],[1,2,3,4],[1,2,3,4],[1,2,3,4]]])

idx= tf.constant ([[0,1,2,3,3],[0,1,2,3,3],[0,1,2,3,3]])

 
print('given: ',data)
print('fetch indices: ',idx)
batch_fetch_element_per_group(data,idx)

given:  tf.Tensor(
[[[1 2 3 4]
  [1 2 3 4]
  [1 2 3 4]
  [1 2 3 4]
  [1 2 3 4]]

 [[1 2 3 4]
  [1 2 3 4]
  [1 2 3 4]
  [1 2 3 4]
  [1 2 3 4]]

 [[1 2 3 4]
  [1 2 3 4]
  [1 2 3 4]
  [1 2 3 4]
  [1 2 3 4]]], shape=(3, 5, 4), dtype=int32)
fetch indices:  tf.Tensor(
[[0 1 2 3 3]
 [0 1 2 3 3]
 [0 1 2 3 3]], shape=(3, 5), dtype=int32)


<tf.Tensor: shape=(3, 5), dtype=int32, numpy=
array([[1, 2, 3, 4, 4],
       [1, 2, 3, 4, 4],
       [1, 2, 3, 4, 4]], dtype=int32)>

In [5]:
#how to get the best span text given the logits of start and end points.
i0 = tf.constant(0)
m0 = tf.ones([2, 2])
c = lambda i, m: i < 10
b = lambda i, m: [i+1, tf.concat([m, m], axis=0)]
tf.while_loop(
    c, b, loop_vars=[i0, m0],
    shape_invariants=[i0.get_shape(), tf.TensorShape([None, 2])])

[<tf.Tensor: shape=(), dtype=int32, numpy=10>,
 <tf.Tensor: shape=(2048, 2), dtype=float32, numpy=
 array([[1., 1.],
        [1., 1.],
        [1., 1.],
        ...,
        [1., 1.],
        [1., 1.],
        [1., 1.]], dtype=float32)>]

In [100]:
import tensorflow as tf

my_tensor = tf.constant(0, shape=[6 ,2]) # Tensor('Const:0' shape=(6, 2) dtype=int32)
my_dynamic_shape = tf.shape(my_tensor) 
print(my_dynamic_shape.numpy())
# -> Tensor('Shape:0' shape=(2,) dtype=int32)
# The shape of the tensor "Shape" is (2,) because my_tensor is a 2-D tensor
# so the dynamic shape is a 1-D tensor containing sizes of my_tensor dimensions
# and in this case, we have 2 dimensions.

my_reshaped_tensor = tf.reshape(my_tensor, [2, 3, 2]) 
print(my_reshaped_tensor)
# -> Tensor('Reshape:0' shape=(2, 3, 2) dtype=int32)

# To access a dynamic shape value, you need to run your graph and feed any placeholder that your tensor my depended upon:
@tf.function
def get_dynamic_shape(x):
    return tf.shape(x)

print('shape is:',get_dynamic_shape([[1., 2.], [1., 2.], [1., 2.], [1., 2.], [1., 2.], [1., 2.]]).numpy())

[6 2]
tf.Tensor(
[[[0 0]
  [0 0]
  [0 0]]

 [[0 0]
  [0 0]
  [0 0]]], shape=(2, 3, 2), dtype=int32)
shape is: [6 2]


In [None]:
def get_best_span_prediction(ids,start_logits, end_logits):
    _, starts = tf.nn.top_k(start_logits, k=1)
    _, ends = tf.nn.top_k(end_logits, k=1)
    
    batch_size = tf.shape(ids)[0]
    str_len = tf.shape(ids)[1]
    
    span_array = []
    mask_array = []

    
    def condition(id_str,start,end,i):
        return tf.less(i,batch_size) && tf.less(j,str_len)
        
    
    def body(id_str,start,end,i):
       
        span_array.append(tf.strided_slice(id_str, start, end + 1))
        mask_array.append(tf.strided_slice(tf.fill([str_len], 1), start , end  + 1))
        
        def inside_body(i,j):
            span_array[i] = tf.concat([span_array[i], [0]], axis=0)
            mask_array[i] = tf.concat([mask_array[i], [0]], axis=0)
            j=j+1
        tf.while_loop(
            cond = lambda i,j: tf.less(j,str_len-len(span_array[i])),
            body=inside_body,
            loop_vars=[i,j]
        )
        
        i=i+1
    
    returned = tf.while_loop(
        cond = condition,
        body= body,
        loop_vars=[ids,starts,ends,0]
    )
    
    
    

In [6]:
batch_size = 2
seq_len = 3
feature_size = 4

def rnn_step(inp, state):
  return inp + state

@tf.function
def dynamic_rnn(rnn_step, input_data, initial_state):
  # [batch, time, features] -> [time, batch, features]
  input_data = tf.transpose(input_data, [1, 0, 2])
  max_seq_len = input_data.shape[0]

  states = tf.TensorArray(tf.float32, size=max_seq_len)
  state = initial_state
  for i in tf.range(max_seq_len):
    state = rnn_step(input_data[i], state)
    states = states.write(i, state)
  return tf.transpose(states.stack(), [1, 0, 2])
  
dynamic_rnn(rnn_step,
            tf.random.uniform([batch_size, seq_len, feature_size]),
            tf.zeros([batch_size, feature_size]))

<tf.Tensor: shape=(2, 3, 4), dtype=float32, numpy=
array([[[0.04177725, 0.7987331 , 0.52792335, 0.21513033],
        [0.10943258, 1.3330783 , 0.6829629 , 0.65409565],
        [0.91260433, 1.6601439 , 1.1433008 , 0.9245236 ]],

       [[0.8501966 , 0.67991745, 0.7845633 , 0.66094744],
        [1.4661397 , 1.6632185 , 0.7948631 , 0.992458  ],
        [2.0233245 , 1.9321158 , 1.4580377 , 1.3229467 ]]], dtype=float32)>

In [None]:
def get_best_span_prediction(ids,start_logits, end_logits):
    _, starts = tf.nn.top_k(start_logits, k=1)
    _, ends = tf.nn.top_k(end_logits, k=1)
    
    batch_size = tf.shape(ids)[0]
    str_len = tf.shape(ids)[1]
    
    span_array = []
    mask_array = []
    
    dynamic_rnn(rnn_step, ids,starts,ends)
    
    def rnn_step(inp, state):
    return inp + state


    @tf.function
    def dynamic_rnn(rnn_step, input_data, starts,ends):
      # [batch, time, features] -> [time, batch, features]
      input_data = tf.transpose(input_data, [1, 0])
      max_seq_len = input_data.shape[0]

      spans = tf.TensorArray(tf.float32, size=max_seq_len)
      masks = tf.TensorArray(tf.float32, size=max_seq_len)
      start = starts[0]
      end = ends[0]
      for i in tf.range(max_seq_len):
        state = rnn_step(input_data[i], state)
        states = states.write(i, state)
      return tf.transpose(states.stack(), [1, 0, 2])
  
    
    
    

In [124]:
max_seq_length=400

    # ids = tf.keras.layers.Input(
    #       shape=(max_seq_length,), dtype=tf.int32, name='input_ids')
    # starts = tf.keras.layers.Input(
    #       shape=(1,), dtype=tf.int32, name='starts')

    # ends = tf.keras.layers.Input(
    #       shape=(1,), dtype=tf.int32, name='ends')

ids = tf.constant([[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5]])
starts = tf.constant([[0],[0],[1],[2],[3],[3]])
ends = tf.constant([[3],[4],[4],[4],[4],[4]])

#@tf.function
def get_best_span_prediction(ids,starts, ends):
    span_array = tf.TensorArray(dtype=tf.int32, size=0,dynamic_size=True)
    mask_array =  tf.TensorArray(dtype=tf.int32, size=0,dynamic_size=True)


    batch_size =tf.shape(ids)[0]
    str_len = tf.shape(ids)[0]
    i=0
    while(i<batch_size):
        span_array.write(i,tf.strided_slice(ids[i], starts[i], ends[i] + 1))
        mask_array.write(i,tf.strided_slice(tf.fill([str_len], 1), starts[i], ends[i] + 1))
        j=0
        while j <(str_len - len(span_array.read(i))):
            x=tf.concat([span_array.read(i), tf.constant([0])], axis=0)
            span_array.write(i,x)
            y=tf.concat([mask_array.read(i), tf.constant([0])], axis=0)
            mask_array.write(i,y )
            j=j+1
        i=i+1
        
    spans =  span_array.stack()
    masks =  mask_array.stack()

    return (spans,masks)
    
spans,masks =get_best_span_prediction(ids,starts, ends)
print(spans,masks)

InvalidArgumentError: Could not read index 0 twice because it was cleared after a previous read (perhaps try setting clear_after_read = false?)

In [88]:
import numpy as np
ids = np.random.randint(30000, size=(1000,32))
starts = np.random.randint(low=0,high=15,size=(1000,1))
ends = np.random.randint(low=16,high=32,size=(1000,1))
dataset = tf.data.Dataset.from_tensor_slices((ids,starts,ends))
print( dataset)  
dataset.batch(10)
dataset.repeat(100)

dataset

<TensorSliceDataset shapes: ((32,), (1,), (1,)), types: (tf.int64, tf.int64, tf.int64)>


<TensorSliceDataset shapes: ((32,), (1,), (1,)), types: (tf.int64, tf.int64, tf.int64)>

In [102]:
ids = tf.keras.layers.Input(
      shape=(max_seq_length,), dtype=tf.int32, name='input_ids')
stt = tf.keras.layers.Input(
      shape=(max_seq_length,), dtype=tf.int32, name='input_ids')

ids = tf.keras.layers.Input(
      shape=(max_seq_length,), dtype=tf.int32, name='input_ids')


ids = tf.constant([[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5]])
starts = tf.constant([[0],[0],[1],[2],[3],[3]])
ends = tf.constant([[3],[4],[4],[4],[4],[4]])

span_array = []
mask_array = []


batch_size =tf.shape(ids)[0]
str_len = tf.shape(ids)[0]
for i in tf.range(batch_size):
    span_array.append(tf.strided_slice(ids[i], starts[i], ends[i] + 1))
    mask_array.append(tf.strided_slice(tf.fill([str_len], 1), starts[i], ends[i] + 1))
    for j in range(str_len - len(span_array[i])):
        span_array[i] = tf.concat([span_array[i], [0]], axis=0)
        mask_array[i] = tf.concat([mask_array[i], [0]], axis=0)

spans = tf.stack(span_array, axis=0)
masks = tf.stack(mask_array, axis=0)

print(spans,masks)

OperatorNotAllowedInGraphError: using a `tf.Tensor` as a Python `bool` is not allowed in Graph execution. Use Eager execution or decorate this function with @tf.function.

In [49]:
import timeit
lstm_cell = tf.keras.layers.LSTMCell(10)

@tf.function
def lstm_fn(input, state):
  return lstm_cell(input, state)

input = tf.zeros([10, 10])
state = [tf.zeros([10, 10])] * 2
print(input,state[0],state[1])
# warm up
lstm_cell(input, state); lstm_fn(input, state)
print("eager lstm:", timeit.timeit(lambda: lstm_cell(input, state), number=10))
print("function lstm:", timeit.timeit(lambda: lstm_fn(input, state), number=10))

tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]], shape=(10, 10), dtype=float32) tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]], shape=(10, 10), dtype=float32) tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0.

In [130]:
ids = tf.constant([[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5]])
starts = tf.constant([[0],[0],[1],[2],[3],[3]])
ends = tf.constant([[3],[4],[4],[4],[4],[4]])

starts_1= tf.tile(starts,[1,5])

print(tf.reduce_max(starts_1[0]))
print(starts_1)
starts_t=tf.transpose(starts_1,[1,0])
print(starts,starts_t)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(
[[0 0 0 0 0]
 [0 0 0 0 0]
 [1 1 1 1 1]
 [2 2 2 2 2]
 [3 3 3 3 3]
 [3 3 3 3 3]], shape=(6, 5), dtype=int32)
tf.Tensor(
[[0]
 [0]
 [1]
 [2]
 [3]
 [3]], shape=(6, 1), dtype=int32) tf.Tensor(
[[0 0 1 2 3 3]
 [0 0 1 2 3 3]
 [0 0 1 2 3 3]
 [0 0 1 2 3 3]
 [0 0 1 2 3 3]], shape=(5, 6), dtype=int32)


In [156]:
def square_if_positive_vectorized(x):
  return tf.where(x > 0, x ** 2, x)


square_if_positive_vectorized(tf.range(-5, 5))

print(tf.range(-5, 5))

tf.Tensor([-5 -4 -3 -2 -1  0  1  2  3  4], shape=(10,), dtype=int32)


In [4]:
import tensorflow as tf

starts = tf.constant([[0],[0],[1],[2],[3],[3]])
ends = tf.constant([[3],[4],[4],[4],[4],[4]])
ids = tf.constant([[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5]])
max_len=5
batch_size =6

#tf.compat.v1.disable_eager_execution()
print(tf.executing_eagerly())   

s= tf.transpose(tf.tile(starts,[1,max_len]))
e= tf.transpose(tf.tile(ends,[1,max_len]))

ta=tf.TensorArray(dtype = tf.int32, size=5)
#print(s,e)
for i in tf.range(max_len):
    x=tf.where(i>=s[i],1,0)
    y=tf.where(i<e[i],1,0)
    #tf.print(x,y)
    ta.write(i,x*y)

m=tf.transpose(ta.stack(),[1,0])
spans=ids*m
 
#for i in tf.range(max_len):
    #new_spans=tf.roll(spans, shift=s[i], axis=[1])
    
 

def f_roll(arg):
    x, s = arg
    
    #return tf.roll(x, shift=-1 * s, axis=[0])
    z=tf.roll(x, shift= -1*s , axis=[0])
    print(z)
    return z
    
new_spans = tf.vectorized_map(
    fn=f_roll,
    elems=(spans, starts)
)
 
new_mask = tf.vectorized_map(
    fn=f_roll,
    elems=(m, starts)
)
 
print(new_spans,new_mask)

True
Tensor("loop_body/Roll:0", shape=(5,), dtype=int32)


StagingError: in converted code:
    relative to /Users/wweschen/tf2/env/lib/python3.7/site-packages:

    tensorflow_core/python/ops/parallel_for/control_flow_ops.py:183 f  *
        return _pfor_impl(loop_fn, iters, parallel_iterations=parallel_iterations)
    tensorflow_core/python/ops/parallel_for/control_flow_ops.py:256 _pfor_impl
        outputs.append(converter.convert(loop_fn_output))
    tensorflow_core/python/ops/parallel_for/pfor.py:1280 convert
        output = self._convert_helper(y)
    tensorflow_core/python/ops/parallel_for/pfor.py:1453 _convert_helper
        if flags.FLAGS.op_conversion_fallback_to_while_loop:
    tensorflow_core/python/platform/flags.py:84 __getattr__
        wrapped(_sys.argv)
    absl/flags/_flagvalues.py:633 __call__
        name, value, suggestions=suggestions)

    UnrecognizedFlagError: Unknown command line flag 'f'


In [79]:
 
fw=tf.keras.layers.LSTM(5, return_sequences=True,return_state=True )
bw=tf.keras.layers.LSTM(5, return_sequences=True,go_backwards=True,return_state=True )

 

inputs = tf.ones([2, 12, 5])


out=fw(inputs)
out2 =bw(inputs)
 
print(tf.concat([out[0],out2[0]],axis=2).shape)
      
#print(tf.split(s,2,axis=-1))

(2, 12, 10)


In [72]:
 
c=tf.keras.layers.LSTMCell(5) 
l  =  tf.keras.layers.Dense(5)
#print(c.output_size)
#print(c.state_size)

inputs = tf.ones([3,5])
state=[tf.ones([3,5]),tf.ones([3,5])]

print(state)
out=c(inputs,state)
  
out2=l(out[1][0])
print(out[0],out[1][0],out[1][1])
print(out2)
      
#print(tf.split(s,2,axis=-1))

[<tf.Tensor: shape=(3, 5), dtype=float32, numpy=
array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]], dtype=float32)>, <tf.Tensor: shape=(3, 5), dtype=float32, numpy=
array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]], dtype=float32)>]
tf.Tensor(
[[0.00270986 0.5078718  0.2911771  0.5298484  0.08189317]
 [0.00270986 0.5078718  0.2911771  0.52984846 0.08189316]
 [0.00270986 0.5078718  0.29117706 0.52984846 0.08189316]], shape=(3, 5), dtype=float32) tf.Tensor(
[[0.00270986 0.5078718  0.2911771  0.5298484  0.08189317]
 [0.00270986 0.5078718  0.2911771  0.52984846 0.08189316]
 [0.00270986 0.5078718  0.29117706 0.52984846 0.08189316]], shape=(3, 5), dtype=float32) tf.Tensor(
[[0.00501275 1.1753801  0.9873215  1.044925   0.6098953 ]
 [0.00501275 1.1753801  0.9873215  1.044925   0.6098953 ]
 [0.00501275 1.1753801  0.9873214  1.044925   0.6098953 ]], shape=(3, 5), dtype=float32)
tf.Tensor(
[[-0.20794933 -0.17963332  0.39797

In [279]:
max_len=5
ids = tf.keras.layers.Input(
      shape=(max_len,), dtype=tf.int32, name='input_ids')
starts = tf.keras.layers.Input(
      shape=(1,), dtype=tf.int32, name='starts')

ends = tf.keras.layers.Input(
      shape=(1,), dtype=tf.int32, name='ends')
# starts = tf.constant([[0],[0],[1],[2],[3],[3]])
# ends = tf.constant([[3],[4],[4],[4],[4],[4]])
# ids = tf.constant([[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5]])
max_len=5

@tf.function
def get_best_span_prediction(ids,start_logits, end_logits):
    #_, starts = tf.nn.top_k(start_logits, k=1)
    #_, ends = tf.nn.top_k(end_logits, k=1)
    
    s= tf.transpose(tf.tile(starts,[1,max_len]))
    e= tf.transpose(tf.tile(ends,[1,max_len]))

    ta=tf.TensorArray(dtype = tf.int32, size=5)
    #print(s,e)
    for i in tf.range(max_len):
        x=tf.where(i>=s[i],1,0)
        y=tf.where(i<e[i],1,0)
        #tf.print(x,y)
        ta.write(i,x*y)

    m=tf.transpose(ta.stack(),[1,0])
    spans=ids*m
    #print(spans,starts)

    #for i in tf.range(max_len):
        #new_spans=tf.roll(spans, shift=s[i], axis=[1])

    return (spans,m)

get_best_span_prediction(ids,starts,ends)

_SymbolicException: Inputs to eager execution function cannot be Keras symbolic tensors, but found [<tf.Tensor 'input_ids_22:0' shape=(None, 5) dtype=int32>, <tf.Tensor 'starts_19:0' shape=(None, 1) dtype=int32>, <tf.Tensor 'ends_19:0' shape=(None, 1) dtype=int32>, <tf.Tensor 'starts_19:0' shape=(None, 1) dtype=int32>, <tf.Tensor 'ends_19:0' shape=(None, 1) dtype=int32>]

In [147]:
max_seq_length=400

ids = tf.keras.layers.Input(
       shape=(max_seq_length,), dtype=tf.int32, name='input_ids')
starts = tf.keras.layers.Input(
       shape=(1,), dtype=tf.int32, name='starts')

ends = tf.keras.layers.Input(
       shape=(1,), dtype=tf.int32, name='ends')

m=[]

@tf.function
def make_a_mask(starts,ends):
   
    batch_size=tf.shape(starts)[0]

    k=0
    while(k<batch_size):

        a =[]
        for i in tf.range(max_seq_length):
            if(i<starts[k] or i>=ends[k]):
                a.append(0)
            else: 
                a.append(1)

        k=k+1 
        m.append(a)
    
m= make_a_mask(starts,ends)

tf.print(m)

_SymbolicException: Inputs to eager execution function cannot be Keras symbolic tensors, but found [<tf.Tensor 'starts_17:0' shape=(None, 1) dtype=int32>, <tf.Tensor 'ends_17:0' shape=(None, 1) dtype=int32>]

In [507]:

class LinearLayer(tf.keras.layers.Layer):
  def __init__(self,
               output_size,
               use_bias=False,
               kernel_initializer=None,
               bias_initializer="zeros",
               activation=None,
               **kwargs):
    super(LinearLayer, self).__init__(**kwargs)
    self.output_size = output_size
    self.kernel_initializer = kernel_initializer
    self.bias_initializer = bias_initializer
    self.activation = activation
    self.use_bias = use_bias

  def build(self, input_shape):
    """Implements build() for the layer."""

    total_arg_size = 0
    shapes = input_shape  
    if type(shapes) is not list:
        shapes=[shapes] 
        
    shapes=list(itertools.chain(*shapes))
     
    for shape in shapes: 
       
        if len(shape) != 2:
            raise ValueError("Linear is expecting 2D arguments: %s" % str(shapes))
        if not shape[1]:
            raise ValueError("Linear expects shape[1] of arguments: %s" % str(shapes))
        else:
            total_arg_size += shape[1] 
             
    self.kernel = self.add_weight(
        "kernel",
        shape=[total_arg_size, self.output_size],
        initializer=self.kernel_initializer,
        dtype=self.dtype,
        trainable=True)
    self.bias = self.add_weight(
        "bias",
        shape=[self.output_size],
        initializer=self.bias_initializer,
        dtype=self.dtype,
        trainable=True)

    super(LinearLayer, self).build(input_shape)

  def call(self, inputs):
     
        
      if type(inputs) is not list:
        inputs=[inputs]  
        
      inputs=list(itertools.chain(*inputs))
    
      if len(inputs) == 1:
          
          res = tf.matmul(inputs[0], self.kernel)
      else:
          res = tf.matmul(tf.concat(axis=1, values=inputs), self.kernel)
      
      if not self.use_bias:
            return res
        
      return res + self.bias



In [114]:
def get_shape_list(tensor, expected_rank=None, name=None):
  """Returns a list of the shape of tensor, preferring static dimensions.

  Args:
    tensor: A tf.Tensor object to find the shape of.
    expected_rank: (optional) int. The expected rank of `tensor`. If this is
      specified and the `tensor` has a different rank, and exception will be
      thrown.
    name: Optional name of the tensor for the error message.

  Returns:
    A list of dimensions of the shape of tensor. All static dimensions will
    be returned as python integers, and dynamic dimensions will be returned
    as tf.Tensor scalars.
  """
  if expected_rank is not None:
    assert_rank(tensor, expected_rank, name)

  shape = tensor.shape.as_list()

  non_static_indexes = []
  for (index, dim) in enumerate(shape):
    if dim is None:
      non_static_indexes.append(index)

  if not non_static_indexes:
    return shape

  dyn_shape = tf.shape(tensor)
  for index in non_static_indexes:
    shape[index] = dyn_shape[index]
  return shape


In [504]:
from modeling import tf_utils
class AttentionLayer(tf.keras.layers.Layer):
    def __init__(self, vector_size,
                 use_coverage,
                 initializer=None,
                 float_type=tf.float32,
                 **kwargs):
        super(AttentionLayer , self).__init__(**kwargs)
        self.initializer = initializer
        self.float_type = float_type
        self.vector_size = vector_size
        self.use_coverage=use_coverage

        #self.w_h = self.add_weight(shape=[1, 1, self.attention_length, self.vector_size], name="W_h")
        self.v = self.add_weight(shape=[self.vector_size], name="v")
        #self.w_c = self.add_weight(shape=[1, 1, 1, self.vector_size], name="W_c")

    def build(self,  unused_input_shapes):


        self.linear_layer =  LinearLayer(self.vector_size)
            # shape (batch_size, attention_vec_size)

        self.coverage_layer =  tf.keras.layers.Conv2D(self.vector_size,(1,1), padding= "SAME")
        # c has shape (batch_size, attn_length, 1, attention_vec_size)

        super(AttentionLayer, self).build(unused_input_shapes)

    def __call__(self,
                 decoder_state,
                 encoder_features,
                 input_mask,
                 coverage=None,
                 **kwargs):
        inputs = (encoder_features,decoder_state, input_mask,coverage )
        return super(AttentionLayer, self).__call__(inputs, **kwargs)

    def call(self, inputs):
         
        encoder_features = inputs[0]
        batch_size = tf_utils.get_shape_list(encoder_features)[0]

        decoder_states=inputs[1]
        input_mask=inputs[2] 
        coverage  =inputs[3]
 
        decoder_features= self.linear_layer([decoder_states])
        decoder_features = tf.expand_dims(tf.expand_dims(decoder_features, 1),1)
          # reshape to (batch_size, 1, 1, attention_vec_size)


        def masked_attention(e):
            """Take softmax of e then apply enc_padding_mask and re-normalize"""
            attn_dist = tf.nn.softmax(e)  # take softmax. shape (batch_size, attn_length)
            attn_dist *= input_mask  # apply mask
            masked_sums = tf.reduce_sum(attn_dist, axis=1)  # shape (batch_size)
            return attn_dist / tf.reshape(masked_sums, [-1, 1])  # re-normalize

        if self.use_coverage and coverage is not None:  # non-first step of coverage
            # Multiply coverage vector by w_c to get coverage_features.
            coverage_features = self.coverage_layer(coverage )  # c has shape (batch_size, attn_length, 1, attention_vec_size)

            # Calculate v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
            e = tf.reduce_sum(self.v * tf.tanh(encoder_features + decoder_features + coverage_features),
                                    [2, 3])  # shape (batch_size,attn_length)

            # Calculate attention distribution
            attn_dist = masked_attention(e)

            # Update coverage vector
            coverage += tf.reshape(attn_dist, [batch_size, -1, 1, 1])
        else:
            # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
            e = tf.reduce_sum(self.v * tf.tanh(encoder_features + decoder_features), [2, 3])  # calculate e

            # Calculate attention distribution
            attn_dist = masked_attention(e)

            if self.use_coverage:  # first step of training
                coverage = tf.expand_dims(tf.expand_dims(attn_dist, 2), 2)  # initialize coverage

        # Calculate the context vector from attn_dist and encoder_states
        context_vector = tf.reduce_sum(tf.reshape(attn_dist, [batch_size, -1, 1, 1]) * encoder_features,
                                             [1, 2])  # shape (batch_size, attn_size).
       
        context_vector = tf.reshape(context_vector, [-1, self.vector_size])
       
         
        return context_vector, attn_dist, coverage

    def compute_output_shape(self,inputShape):
    
        #calculate shapes from input shape
        return [[None,self.vector_size],
                [None,self.max_seq_length],
                [None,self.max_seq_length,1,1]
               ]


In [512]:
class AttentionDecoder (tf.keras.layers.Layer):
    def __init__(self,
                  hidden_dim,
                 vector_size,
                 attention_length,
                 initial_state_attention=False,
                 pointer_gen=True,
                 use_coverage=False,
                 initializer=None,
                 float_type=tf.float32,
                 **kwargs):
        super(AttentionDecoder , self).__init__(**kwargs)
        self.initializer = initializer
        self.float_type = float_type
        self.hidden_dim=hidden_dim
        self.vector_size = vector_size
        self.attention_length = attention_length
        self.pointer_gen = pointer_gen
        self.use_coverage = use_coverage
        self.initial_state_attention=initial_state_attention

    def build(self, unused_input_shapes):

        self.lstm_layer = tf.keras.layers.LSTMCell(self.vector_size)

        self.encoder_layer = tf.keras.layers.Conv2D(filters=self.vector_size, kernel_size=(1, 1), padding="SAME")
        # shape (batch_size,attn_length,1,attention_vec_size)
        self.linear = LinearLayer(self.vector_size )
        self.linear2 = LinearLayer(1)
        self.attention_layer = AttentionLayer(self.vector_size,self.attention_length,True)



    def __call__(self,
                 decoder_inputs,
                 dec_initial_state,
                 encoder_states,
                 enc_padding_mask,
                 prev_coverage=None,
                 **kwargs):
        
        inputs = (decoder_inputs,  
                                       dec_initial_state,
                                       encoder_states,
                                       enc_padding_mask,
                                       prev_coverage) 
        return super(AttentionDecoder, self).__call__(inputs, **kwargs)

    def call(self, inputs):
        #unpacked_inputs = tf_utils.unpack_inputs(inputs)

        decoder_inputs = inputs[0]
        initial_state = inputs[1]
        encoder_states = inputs[2]
        enc_padding_mask = inputs[3]
        prev_coverage =  inputs[4]

        outputs = []
        attn_dists = []
        p_gens = []
  
        encoder_states = tf.expand_dims(encoder_states, axis=2)  # now is shape (batch_size, attn_len, 1, attn_size)
        
        encoder_features = self.encoder_layer(encoder_states)  # shape (batch_size,attn_length,1,attention_vec_size)
        state =initial_state # [initial_state,initial_state]
        #state=[initial_state]*2
        batch_size=tf_utils.get_shape_list(encoder_states)[0]

        coverage = prev_coverage  # initialize coverage to None or whatever was passed in
         
        context_vector = tf.zeros([batch_size, self.vector_size])
        context_vector.set_shape([None, self.vector_size])  # Ensure the second shape of attention vectors is set.
        if self.initial_state_attention:  # true in decode mode
            # Re-calculate the context vector from the previous step so that we can pass it through a linear layer
            # with this step's input to get a modified version of the input
            context_vector, _, coverage = self.attention_layer ( encoder_features=encoder_features,
                                                                 decoder_state=state,
                                                                 coverage =coverage,
                                                                 input_mask=enc_padding_mask )
             
            # in decode mode, this is what updates the coverage vector

        for i, inp in enumerate(decoder_inputs):

            # Merge input and previous attentions into one vector x of the same size as inp
            input_size = inp.get_shape().with_rank(2)[1]
            
            if input_size is None:
                raise ValueError("Could not infer input size from input: %s" % inp.name)
            
             
            x = self.linear([[inp], [context_vector]])

            # Run the decoder RNN cell. cell_output = decoder state
            #print(i, x, state)
            cell_output, state = self.lstm_layer(x,state)
           
            # Run the attention mechanism.
            if i == 0 and self.initial_state_attention:  # always true in decode mode
                context_vector, attn_dist, _ = self.attention_layer (encoder_features=encoder_features,
                                                                     decoder_state=state,
                                                                     coverage=coverage,
                                                                     input_mask=enc_padding_mask)  # don't allow coverage to update
            else:
                context_vector, attn_dist, coverage = self.attention_layer(encoder_features=encoder_features,
                                                                           decoder_state=state,
                                                                           coverage=coverage,
                                                                           input_mask=enc_padding_mask)
            attn_dists.append(attn_dist)

            # Calculate p_gen
            if self.pointer_gen: 
                p_gen = self.linear2( [[context_vector],[state[0]], [state[1]], [x]])
                # Tensor shape (batch_size, 1)
                p_gen = tf.sigmoid(p_gen)
                p_gens.append(p_gen)

                # Concatenate the cell_output (= decoder state) and the context vector, and pass them through a linear layer
                # This is V[s_t, h*_t] + b in the paper
                output = self.linear( [[cell_output], [context_vector]])
            outputs.append(output)

        # If using coverage, reshape it
        if coverage is not None:
            coverage = tf.reshape(coverage, [batch_size, -1])

        return outputs, state, attn_dists, p_gens, coverage
    # def compute_output_shape(self,inputShape):
    #      #calculate shapes from input shape
    #      return [[None,self.max_seq_length,self.hidden_dim],
    #              [None,self.hidden_dim],
    #              [None, self.hidden_dim],
    #              [None, self.hidden_dim],
    #              [None, self.hidden_dim],
    #              ]


In [177]:
def get_initializer(initializer_range=0.02):
  """Creates a `tf.initializers.truncated_normal` with the given range.

  Args:
    initializer_range: float, initializer range for stddev.

  Returns:
    TruncatedNormal initializer with stddev = `initializer_range`.
  """
  return tf.keras.initializers.TruncatedNormal(stddev=initializer_range)

In [267]:
batch_size = 4
seq_len=10
feature_size = 12
hidden_size=240
max_dec_length = 8


dec_features=[tf.random.uniform([batch_size,feature_size])]*max_dec_length

dec_states=tf.random.uniform([batch_size,feature_size])
dec_mask=tf.ones([batch_size,seq_len],tf.float32)
enc_states=tf.random.uniform([batch_size, seq_len,feature_size])
 
coverage = tf.random.uniform([batch_size, seq_len,1,1])


decoder = AttentionDecoder(hidden_size, feature_size, 
                                        seq_len, get_initializer(),
                                        name="attention_decoder")

out = decoder(
            dec_features,
            dec_states,
            enc_states,
            dec_mask,
            coverage)
    
    
#print(out)

 

#print(get_shape_list(input))
#input=tf.random.uniform([ seq_len])
#zero_state=tf.zeros([batch_size, feature_size])

#linear = LinearLayer(feature_size,False)

# print(input)
# out=linear(input)
# encoder_layer = tf.keras.layers.Conv2D(filters=feature_size, kernel_size=(1, 1), padding="SAME")

# inputs = tf.expand_dims(inputs, axis=2)  # now is shape (batch_size, attn_len, 1, attn_size)

# enc_feature= encoder_layer(inputs)     

# atten=AttentionLayer(feature_size,seq_len,True)

# out = atten(encoder_features=enc_feature, decoder_state=dec_states, coverage =coverage, input_mask=input_mask )




In [273]:
out[4]

<tf.Tensor: shape=(4, 10), dtype=float32, numpy=
array([[1.4945861 , 1.6792537 , 1.085821  , 1.547793  , 1.6920732 ,
        1.082137  , 1.0859984 , 0.47908726, 0.9617234 , 0.9767699 ],
       [1.9037642 , 1.3531944 , 1.8471627 , 1.0240195 , 1.3833439 ,
        1.4301822 , 0.77310556, 1.5611751 , 1.5469098 , 0.63561773],
       [1.7449133 , 1.0637496 , 1.3419441 , 0.92907566, 1.6138995 ,
        0.72120076, 1.827379  , 1.837569  , 0.7337004 , 1.6414684 ],
       [1.7412142 , 1.7451642 , 1.1470199 , 1.9235015 , 1.3093983 ,
        1.7299534 , 1.0788914 , 0.6208656 , 1.6557912 , 1.5660311 ]],
      dtype=float32)>

In [274]:

class EmbeddingLookup(tf.keras.layers.Layer):
  """Looks up words embeddings for id tensor."""

  def __init__(self,
               vocab_size,
               embedding_size=768,
               initializer_range=0.02,
               **kwargs):
    super(EmbeddingLookup, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.initializer_range = initializer_range

  def build(self, unused_input_shapes):
    """Implements build() for the layer."""
    self.embeddings = self.add_weight(
        "embeddings",
        shape=[self.vocab_size, self.embedding_size],
        initializer=get_initializer(self.initializer_range),
        dtype=self.dtype)
    super(EmbeddingLookup, self).build(unused_input_shapes)

  def call(self, inputs):
    """Implements call() for the layer."""

    input_shape = tf_utils.get_shape_list(inputs)

    flat_input = tf.reshape(inputs, [-1])
    output = tf.gather(self.embeddings, flat_input)
    output = tf.reshape(output, input_shape + [self.embedding_size])


    return output


In [418]:

class ReduceStateLayer(tf.keras.layers.Layer):
    def __init__(self,
                 hidden_dim, **kwargs):
        super(ReduceStateLayer, self).__init__(**kwargs)
        self.hidden_dim=hidden_dim

    def build(self, unused_input_shapes):
        hidden_dim = self.hidden_dim
        self.w_reduce_c = self.add_weight('w_reduce_c', [hidden_dim * 2, hidden_dim], dtype=tf.float32,
                                     initializer=tf.keras.initializers.TruncatedNormal())
        self.w_reduce_h = self.add_weight('w_reduce_h', [hidden_dim * 2, hidden_dim], dtype=tf.float32,
                                     initializer=tf.keras.initializers.TruncatedNormal())
        self.bias_reduce_c = self.add_weight('bias_reduce_c', [hidden_dim], dtype=tf.float32,
                                        initializer=tf.keras.initializers.TruncatedNormal())
        self.bias_reduce_h = self.add_weight('bias_reduce_h', [hidden_dim], dtype=tf.float32,
                                        initializer=tf.keras.initializers.TruncatedNormal())
        super(ReduceStateLayer, self).build(unused_input_shapes)

    def __call__(self,
                 fw_state_h,fw_state_c, bw_state_h,bw_state_c,
                 **kwargs):
        inputs =  (fw_state_h,fw_state_c, bw_state_h,bw_state_c)

        return super(ReduceStateLayer, self).__call__(inputs, **kwargs)
    def call(self, inputs):

        fw_state_h = inputs[0]
        fw_state_c = inputs[1]
        bw_state_h = inputs[2]
        bw_state_c = inputs[3]

        # Apply linear layer
        old_c = tf.concat(axis=1, values=[fw_state_c, bw_state_c])  # Concatenation of fw and bw cell
        old_h = tf.concat(axis=1, values=[fw_state_h, bw_state_h])  # Concatenation of fw and bw state
        new_c = tf.nn.relu(tf.matmul( old_c, self.w_reduce_c) + self.bias_reduce_c)  # Get new cell from old cell
        new_h = tf.nn.relu(tf.matmul( old_h, self.w_reduce_h) + self.bias_reduce_h)  # Get new state from old state
        
        return [new_c, new_h]  # Return new cell and state

   


In [363]:

class Encoder (tf.keras.layers.Layer):
    def __init__(self,
                 hidden_dim,max_seq_length,  **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.hidden_dim=hidden_dim
        self.max_seq_length=max_seq_length

    def build(self, unused_input_shapes):
        lstm_layer_fw = tf.keras.layers.LSTM(self.hidden_dim, return_sequences=True, return_state=True)
        lstm_layer_bw = tf.keras.layers.LSTM(self.hidden_dim, return_sequences=True, go_backwards=True,
                                             return_state=True)
        self.bidirection = tf.keras.layers.Bidirectional(lstm_layer_fw,backward_layer=lstm_layer_bw , merge_mode="concat")

        self.state_reducer = ReduceStateLayer(self.hidden_dim)

        super(Encoder, self).build(unused_input_shapes)

    def __call__(self,
                 input_word_ids,
                 input_mask=None ,
                 **kwargs):
        inputs = (input_word_ids, input_mask)
        return super(Encoder, self).__call__(inputs, **kwargs)

    def call(self, inputs):
        #unpacked_inputs = tf_utils.unpack_inputs(inputs)
        input_ids=inputs[0]
        masks = inputs[1]
        masks = tf.expand_dims(masks, axis=2)

        outputs = self.bidirection(input_ids*masks)
 
        encoder_outputs=outputs[0]

        fw_state_h,fw_state_ch = outputs[1],outputs[2]
        bw_state_h,bw_state_ch = outputs[3],outputs[4]

        state = self.state_reducer(fw_state_h,fw_state_ch ,bw_state_h, bw_state_ch)

        return encoder_outputs, state
    def compute_output_shape(self,inputShape):
        #calculate shapes from input shape
        return [[None,self.max_seq_length,2*self.hidden_dim],
                 [[None,self.hidden_dim],[None,self.hidden_dim]]]



In [533]:

class PGNetSummaryModel(tf.keras.layers.Layer):
  def __init__(self,
               config,
               float_type=tf.float32,
               **kwargs):
    super(PGNetSummaryModel, self).__init__(**kwargs)

    self.config = (
        PGNetConfig.from_dict(config)
        if isinstance(config, dict) else copy.deepcopy(config))

    self.float_type = float_type



  def build(self, unused_input_shapes):
    """Implements build() for the layer."""
    self.embedding_lookup = EmbeddingLookup(self.config.vocab_size,self.config.hidden_size)
    self.encoder = Encoder(self.config.hidden_size,self.config.max_seq_length, dynamic=True)
    self.decoder = AttentionDecoder(self.config.hidden_size,self.config.hidden_size,
                                    self.config.max_seq_length,get_initializer())
    self.output_projector = OutputProjectionLayer(self.config.hidden_size,self.config.vocab_size)
    self.final_distribution = FinalDistributionLayer(self.config.hidden_size,self.config.vocab_size,self.config.max_oov_size)

    super(PGNetSummaryModel, self).build(unused_input_shapes)

  def __call__(self,
               input_word_ids,
               input_mask=None,
               answer_ids=None,
               answer_mask=None, 
               **kwargs):
    inputs = (input_word_ids, input_mask, answer_ids,answer_mask)
    return super(PGNetSummaryModel, self).__call__(inputs, **kwargs)

  def call(self, inputs,mode="pgnet"):

      input_word_ids = inputs[0]
      input_mask = inputs[1]
      answer_ids= inputs[2]
      answer_mask= inputs[3] 

      emb_enc_inputs = self.embedding_lookup(input_word_ids)  # tensor with shape (batch_size, max_seq_length, emb_size)
      emb_dec_inputs = [self.embedding_lookup(x) for x in tf.unstack(answer_ids, axis=1)]  # list length max_dec_steps containing shape (batch_size, emb_size)

      enc_outputs, enc_state = self.encoder(emb_enc_inputs,input_mask )
 
      self._enc_states = enc_outputs

      self._dec_in_state = enc_state

      if mode=="encoder":
         return (self._enc_states,self._dec_in_state )

      prev_coverage =None # self.prev_coverage #if self.config.mode == "decode" and self.config.use_coverage  else None

      decoder_outputs, self._dec_out_state, self.attn_dists, self.p_gens, self.coverage = self.decoder(
                emb_dec_inputs,
                 self._dec_in_state ,
                 self._enc_states ,
                 input_mask,
                 prev_coverage=prev_coverage)
      if mode == "decoder":
           return (decoder_outputs, self._dec_out_state, self.attn_dists, self.p_gens, self.coverage)

      vocab_dists=self.output_projector(decoder_outputs)

      if self.config.use_pointer_gen:
           final_dists = self.final_distribution(vocab_dists, self.attn_dists,self.p_gens, input_word_ids)
      else:  # final distribution is just vocabulary distribution
           final_dists = vocab_dists

      return  final_dists,self.attn_dists

  def get_config(self):
    config = {"config": self.config.to_dict()}
    base_config = super(PGNetSummaryModel, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))


In [470]:
class OutputProjectionLayer(tf.keras.layers.Layer):
    def __init__(self,
                 hidden_dim,
                 vocab_size,
                 **kwargs):
        super(OutputProjectionLayer, self).__init__(**kwargs)
        self.hidden_dim=hidden_dim
        self.vocab_size =vocab_size


    def build(self, unused_input_shapes):
        self.w = self.add_weight('w', [self.hidden_dim, self.vocab_size], dtype=tf.float32, initializer=tf.keras.initializers.TruncatedNormal())
        self.w_t = tf.transpose(self.w)
        self.v = self.add_weight('v', [self.vocab_size], dtype=tf.float32, initializer=tf.keras.initializers.TruncatedNormal())
      

    def call(self, inputs):
        decoder_outputs = inputs

        vocab_scores = []  # vocab_scores is the vocabulary distribution before applying softmax. Each entry on the list corresponds to one decoder step
        for i, output in enumerate(decoder_outputs):
            vocab_scores.append(tf.matmul(output, self.w)+ self.v)  # apply the linear layer

        vocab_dists = [tf.nn.softmax(s) for s in
                       vocab_scores]
        # The vocabulary distributions. List length max_dec_steps of (batch_size, vsize) arrays. The words are in the order they appear in the vocabulary file.

        return vocab_dists



In [543]:


class FinalDistributionLayer(tf.keras.layers.Layer):
    def __init__(self,
                 hidden_dim,
                 vocab_size,
                 max_oov_size,
                 **kwargs):
        super(FinalDistributionLayer, self).__init__(**kwargs)

        self.hidden_dim=hidden_dim
        self.vocab_size=vocab_size
        self.max_oov_size=max_oov_size

    #def build(self, unused_input_shapes):


    def __call__(self,
                 vocab_dists, attn_dists,p_gens,input_ids,
                 **kwargs):
        inputs = (vocab_dists, attn_dists,p_gens,input_ids)

        return super(FinalDistributionLayer, self).__call__(inputs, **kwargs)

    def call(self, inputs):

        vocab_dists=inputs[0]
        attn_dists=inputs[1]
        p_gens=inputs[2]
        input_ids=inputs[3]
        max_oov_size=self.max_oov_size

        vocab_dists = [p_gen * dist for (p_gen, dist) in zip( p_gens, vocab_dists)]
        attn_dists = [(1 - p_gen) * dist for (p_gen, dist) in zip( p_gens, attn_dists)]
 
        batch_size = tf_utils.get_shape_list(vocab_dists[0])[0]
        
        # Concatenate some zeros to each vocabulary dist, to hold the probabilities for in-article OOV words
        extended_vsize = self.vocab_size + max_oov_size  # the maximum (over the batch) size of the extended vocabulary
        extra_zeros = tf.zeros(( batch_size,  max_oov_size))
        vocab_dists_extended = [tf.concat(axis=1, values=[dist, extra_zeros]) for dist in
                                vocab_dists]  # list length max_dec_steps of shape (batch_size, extended_vsize)

        # Project the values in the attention distributions onto the appropriate entries in the final distributions
        # This means that if a_i = 0.1 and the ith encoder word is w, and w has index 500 in the vocabulary, then we add 0.1 onto the 500th entry of the final distribution
        # This is done for each decoder timestep.
        # This is fiddly; we use tf.scatter_nd to do the projection
        batch_nums = tf.range(0, limit= batch_size)  # shape (batch_size)
        batch_nums = tf.expand_dims(batch_nums, 1)  # shape (batch_size, 1)
        attn_len =  tf_utils.get_shape_list(input_ids)[1]  # number of states we attend over
        batch_nums = tf.tile(batch_nums, [1, attn_len])  # shape (batch_size, attn_len)
        indices = tf.stack((batch_nums,  input_ids), axis=2)  # shape (batch_size, enc_t, 2)
        shape = [ batch_size, extended_vsize]
        
        attn_dists_projected = [tf.scatter_nd(indices, copy_dist, shape) for copy_dist in
                                attn_dists]  # list length max_dec_steps (batch_size, extended_vsize)

        # Add the vocab distributions and the copy distributions together to get the final distributions
        # final_dists is a list length max_dec_steps; each entry is a tensor shape (batch_size, extended_vsize) giving the final distribution for that decoder timestep
        # Note that for decoder timesteps and examples corresponding to a [PAD] token, this is junk - ignore.
        final_dists = [vocab_dist + copy_dist for (vocab_dist, copy_dist) in
                       zip(vocab_dists_extended, attn_dists_projected)]

        return final_dists

In [545]:
import numpy as np
import bert.bert_modeling as bert_modeling
import copy 

batch_size = 4
seq_len=10
feature_size = 12
hidden_size=12
max_dec_length = 8
max_oov = 11
vocab_size = 3000
float_type=tf.float32

bert_config = bert_modeling.BertConfig(vocab_size)

bert_config.add_from_dict({"hidden_size":hidden_size, 
                           "max_seq_length":seq_len,
                           "use_pointer_gen":True,
                           "max_oov_size":max_oov})

input_word_ids = np.random.randint(vocab_size, size=(batch_size,seq_len),dtype=np.int32)
masks=tf.ones([batch_size,seq_len],tf.float32)

 
#print(oov_ids)
answer_ids=np.random.randint(vocab_size, size=(batch_size,max_dec_length))
answer_mask=tf.ones([batch_size,max_dec_length],tf.float32)

pgnet_model_layer =PGNetSummaryModel(config=bert_config ,
                                                  float_type=float_type,
                                                 name='pgnet_summary_model')


final_dists, attn_dists = pgnet_model_layer(  input_word_ids,
                                                masks,
                                                answer_ids,
                                                answer_mask 
                                              )


attn_len: 10


In [547]:
attn_dists

ListWrapper([<tf.Tensor: shape=(4, 10), dtype=float32, numpy=
array([[0.09971954, 0.09981675, 0.10018633, 0.10011055, 0.10031537,
        0.10050201, 0.09985542, 0.0998798 , 0.09986451, 0.09974983],
       [0.10005995, 0.10005333, 0.10025822, 0.09989783, 0.09962853,
        0.09992645, 0.09980742, 0.09966271, 0.10031735, 0.10038821],
       [0.09967137, 0.09958483, 0.09996431, 0.09994293, 0.09985147,
        0.10009833, 0.10029086, 0.10019293, 0.10028132, 0.10012164],
       [0.0999478 , 0.10043196, 0.10009155, 0.10033999, 0.10009501,
        0.09973274, 0.09990396, 0.09998098, 0.09963658, 0.09983946]],
      dtype=float32)>, <tf.Tensor: shape=(4, 10), dtype=float32, numpy=
array([[0.09971951, 0.09981698, 0.10018635, 0.10011049, 0.10031542,
        0.10050216, 0.09985526, 0.0998797 , 0.09986446, 0.09974967],
       [0.10006024, 0.10005345, 0.10025822, 0.09989749, 0.09962828,
        0.0999262 , 0.09980732, 0.09966278, 0.10031758, 0.10038846],
       [0.09967156, 0.09958502, 0.09996435,

In [531]:
batch_size=5
max_oov_size=5
vocab_size=3000
seq_len=30
extended_vsize = vocab_size + max_oov_size
input_word_ids = np.random.randint(vocab_size, size=(batch_size,seq_len))
oov_ids = np.random.randint(vocab_size,high=vocab_size+max_oov_size, size=[batch_size,max_oov_size],dtype=np.int64)
print(oov_ids)

batch_nums = tf.range(0, limit= batch_size)  # shape (batch_size)
batch_nums = tf.expand_dims(batch_nums, 1)  # shape (batch_size, 1)
attn_len = seq_len  # number of states we attend over
batch_nums = tf.tile(batch_nums, [1, attn_len])  # shape (batch_size, attn_len)
print (batch_nums)
indices = tf.stack((batch_nums,  input_word_ids), axis=2)  # shape (batch_size, enc_t, 2)
shape = [ batch_size, extended_vsize]
print ('indices:',indices)


[[3002 3002 3001 3004 3003]
 [3001 3004 3004 3003 3001]
 [3004 3003 3004 3003 3004]
 [3004 3000 3000 3001 3004]
 [3000 3004 3002 3002 3002]]
tf.Tensor(
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
 [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
 [3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]
 [4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4]], shape=(5, 30), dtype=int32)
indices: tf.Tensor(
[[[   0  991]
  [   0 2577]
  [   0  603]
  [   0 1900]
  [   0  343]
  [   0 1173]
  [   0  232]
  [   0 1571]
  [   0 1715]
  [   0 1679]
  [   0 1429]
  [   0 2389]
  [   0 1546]
  [   0  540]
  [   0 1864]
  [   0 1541]
  [   0  812]
  [   0 1256]
  [   0  965]
  [   0 1988]
  [   0 1171]
  [   0  241]
  [   0  204]
  [   0 2577]
  [   0 1058]
  [   0 2442]
  [   0 1463]
  [   0   80]
  [   0 2389]
  [   0 1628]]

 [[   1 2905]
  [   1  845]
  [   1 1733]
  [   1  575]
  [  

In [446]:
[[1,2,3],[4,5,6]]

[[1, 2, 3], [4, 5, 6]]