<a href="https://colab.research.google.com/github/vardansaini/Custom-T5/blob/main/CUSTOM_T5_Tensorflow(Not_working_so_far).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np
import os
import torch
import time

In [None]:
# from tensorflow import keras
# keras.__version__

In [None]:
# Mount Google Drive (Colab-only) so later cells can read files under
# /content/drive/MyDrive (the YAML config and the GloVe vectors).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def scaled_dot_product_attention(q, k, v, mask=None):
  '''Compute softmax(q @ k^T / sqrt(depth) + mask * -1e9) @ v.

  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Optional tensor with shape broadcastable
          to (..., seq_len_q, seq_len_k). Non-zero entries mark positions
          that must NOT be attended to (they receive a -1e9 logit offset).
          Defaults to None (no masking).

  Returns:
    output, attention_weights
  '''

  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

  # Scale by sqrt(depth) so the logits do not grow with the key dimension.
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  if mask is not None:
    # Additive masking: a large negative logit becomes ~0 after the softmax.
    scaled_attention_logits += tf.cast(mask, tf.float32) * -1e9

  # Normalise over the key axis.
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

  return output, attention_weights

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention built on `scaled_dot_product_attention`.

  The inputs are linearly projected to `d_model`, split into `num_heads`
  heads of size `d_model // num_heads`, attended per head, re-concatenated
  and passed through a final dense projection.
  """

  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    # The model width must split evenly across the heads.
    assert d_model % self.num_heads == 0, (
        "d_model ({}) must be divisible by num_heads ({})".format(
            d_model, num_heads))

    self.depth = d_model // self.num_heads

    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Reshape (batch, seq_len, d_model) -> (batch, num_heads, seq_len, depth)."""
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    """Attend from `q` over `k`/`v`.

    Args:
      v, k, q: value, key and query tensors; the batch size is read from `q`.
      mask: mask forwarded unchanged to `scaled_dot_product_attention`
            (may be None).

    Returns:
      (output, attention_weights) where output is the result of the final
      dense projection and attention_weights comes from
      `scaled_dot_product_attention`.
    """
    batch_size = tf.shape(q)[0]

    q = self.wq(q)  # (batch_size, seq_len, d_model)
    k = self.wk(k)  # (batch_size, seq_len, d_model)
    v = self.wv(v)  # (batch_size, seq_len, d_model)

    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

    scaled_attention, attention_weights = scaled_dot_product_attention(
      q, k, v, mask)

    # Move the head axis back next to depth so the heads can be concatenated.
    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

    concat_attention = tf.reshape(scaled_attention,
                    (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

    return output, attention_weights

In [None]:
# """
#   MultiHeadAttention Usage:
# """
# mha = MultiHeadAttention(d_model=512, num_heads=8)
# batch_size = 1
# encoder_sequence=60
# dimensions = 512
# y = tf.random.uniform((batch_size, encoder_sequence, dimensions))
# out, attn = mha(y, k=y, q=y, mask=None)
# print(out.shape, attn.shape)

In [None]:
def point_wise_feed_forward_network(d_model, dff):
  """Build the two-layer position-wise feed-forward sub-network.

  Args:
    d_model: output width of the second (linear) dense layer.
    dff: width of the first dense layer, which uses a ReLU activation.

  Returns:
    A `tf.keras.Sequential` of Dense(dff, relu) followed by Dense(d_model).
  """
  hidden = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
  projection = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  return tf.keras.Sequential([hidden, projection])

class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention followed by a feed-forward network.

  Each sub-layer output is added to its input, layer-normalised, and then
  passed through dropout.

  NOTE(review): dropout is applied AFTER the residual add + layer norm here,
  whereas the original Transformer applies it to the sub-layer output before
  the residual add. Behaviour is left unchanged; confirm which is intended.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):
    """Apply the block to `x`.

    Args:
      x: input tensor; used as query, key and value of the self-attention.
      training: forwarded to the dropout layers.
      mask: attention mask forwarded to the multi-head attention.

    Returns:
      Tensor produced by the second dropout layer.
    """
    attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
    out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
    attn_output = self.dropout1(out1, training=training)

    ffn_output = self.ffn(attn_output)  # (batch_size, input_seq_len, d_model)
    out2 = self.layernorm2(attn_output + ffn_output)  # (batch_size, input_seq_len, d_model)
    ffn_output = self.dropout2(out2, training=training)
    return ffn_output

In [None]:
# # check this one again -- with the hugginface model
# def point_wise_feed_forward_network(d_model, dff, rate=0.1):
#   return tf.keras.Sequential([
#       tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
#       tf.keras.layers.Dense(d_model),  # (batch_size, seq_len, d_model)
#       tf.keras.layers.Dropout(rate)
#   ])

# class EncoderLayer(tf.keras.layers.Layer):
#   def __init__(self, d_model, num_heads, dff, rate=0.1):
#     super(EncoderLayer, self).__init__()

#     self.mha = MultiHeadAttention(d_model, num_heads)
#     self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
#     self.dropout1 = tf.keras.layers.Dropout(rate)
#     self.ffn = point_wise_feed_forward_network(d_model, dff)

#     # self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
#     self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

#     self.dropout2 = tf.keras.layers.Dropout(rate)
#     # self.dropout3 = tf.keras.layers.Dropout(rate)

#   def call(self, x, training, mask):

#     attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
#     attn_output = self.dropout1(attn_output, training=training)
#     out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

#     ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
#     ffn_output = self.dropout2(ffn_output, training=training)
#     out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

#     return out2

In [None]:
def get_angles(pos, i, d_model):
  """Return the sinusoidal positional-encoding angles pos / 10000^(2*(i//2)/d_model).

  Args:
    pos: position index (scalar or array, broadcast against `i`).
    i: embedding-dimension index (scalar or array).
    d_model: embedding width used to normalise the exponent.

  Returns:
    `pos` multiplied element-wise by the per-dimension angle rate.
  """
  # Dimensions 2k and 2k+1 share the same frequency, hence the i // 2.
  exponent = (2 * (i // 2)) / np.float32(d_model)
  return pos * (1 / np.power(10000, exponent))

def positional_encoding(position, d_model):
  """Build the sinusoidal positional-encoding table.

  Args:
    position: number of positions (rows) to generate.
    d_model: embedding width (columns).

  Returns:
    float32 tensor of shape (1, position, d_model); even columns hold the
    sine of the angle, odd columns the cosine.
  """
  rows = np.arange(position)[:, np.newaxis]
  cols = np.arange(d_model)[np.newaxis, :]
  table = get_angles(rows, cols, d_model)

  # sin on even indices (2i), cos on odd indices (2i+1)
  table[:, 0::2] = np.sin(table[:, 0::2])
  table[:, 1::2] = np.cos(table[:, 1::2])

  # Leading axis of size 1 lets the table broadcast over the batch.
  return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

  # print("###############################################################")
  # print([pos/10000 ** (2*i/d_model) for pos in range(position) for i in range(d_model)])
  # angle_rads = tf.constant([pos/10000 ** (2*i/d_model) for pos in range(position) for i in range(d_model)], dtype=tf.float32)

  # # apply sin to even indices in the array; 2i
  # selected_elements = angle_rads[::2]
  # sine_values = tf.sin(selected_elements)
  # angle_rads = tf.tensor_scatter_nd_update(angle_rads, indices=tf.range(1, tf.size(angle_rads), 2)[:, tf.newaxis], updates=sine_values)

  # angle_rads = np.sin(angle_rads[::2])
  # angle_rads = angle_rads[::2]

  # apply cos to odd indices in the array; 2i+1
  # angle_rads = np.cos(angle_rads[1::2])
  # angle_rads = angle_rads[1::2]

#   selected_elements = angle_rads[1::2]

# # Compute the cosine of the selected elements
#   cosine_values = tf.cos(selected_elements)

# # Update the selected elements with their cosine values
#   angle_rads = tf.tensor_scatter_nd_update(angle_rads, indices=tf.range(1, tf.size(angle_rads), 2)[:, tf.newaxis], updates=cosine_values)


#     # return encoded_vec.reshape([sentence_length, dim])

#   # pos_encoding = angle_rads[np.newaxis, ...]

#   # return tf.cast(pos_encoding, dtype=tf.float32)
#    return encoded_vec.reshape([sentence_length, dim])




In [None]:
class Encoder(tf.keras.layers.Layer):
  """Stack of `num_layers` EncoderLayer blocks with positional encoding.

  Integer inputs are treated as token ids and embedded; floating-point
  inputs are treated as already-embedded vectors and are used as-is.
  A final layer norm and dropout are applied to the stack output.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               maximum_position_encoding, rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    # The table is indexed by sequence position, so it is sized by the
    # maximum sequence length, not by the vocabulary size.
    self.pos_encoding = positional_encoding(maximum_position_encoding,
                                            self.d_model)

    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]

    self.dropout = tf.keras.layers.Dropout(rate)

    self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)

  def call(self, x, training, mask):
    """Encode `x`.

    Args:
      x: integer token ids, or floating-point pre-embedded inputs.
         NOTE(review): the float path assumes x is already
         (batch_size, input_seq_len, d_model) - confirm against the caller.
      training: forwarded to the dropout layers.
      mask: attention mask forwarded to every encoder layer.

    Returns:
      Tensor produced by the final dropout layer.
    """
    seq_len = tf.shape(x)[1]

    # Token ids need the embedding lookup; float inputs skip it (previous
    # behaviour). Integer inputs used to fail on the float multiply below.
    if tf.as_dtype(x.dtype).is_integer:
      x = self.embedding(x)  # (batch_size, input_seq_len, d_model)

    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]

    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)

    x = self.layernorm(x)  # (batch_size, input_seq_len, d_model)
    x = self.dropout(x, training=training)

    return x  # (batch_size, input_seq_len, d_model)

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block: self-attention, encoder-decoder attention, feed-forward.

  After each sub-layer the result is added to the sub-layer input,
  layer-normalised and then passed through dropout.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    # Attention over the decoder input, then over the encoder output.
    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)

  def call(self, x, enc_output, training,
           look_ahead_mask, padding_mask):
    """Apply the block.

    Args:
      x: decoder input; query, key and value of the first attention.
      enc_output: encoder output; key and value of the second attention.
      training: forwarded to the dropout layers.
      look_ahead_mask: mask for the first (self) attention.
      padding_mask: mask for the second (encoder-decoder) attention.

    Returns:
      (output, self_attention_weights, encoder_decoder_attention_weights)
    """
    # Sub-layer 1: attention over the decoder input itself.
    self_attn, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
    self_attn = self.dropout1(self.layernorm1(self_attn + x),
                              training=training)

    # Sub-layer 2: queries from the decoder, keys/values from the encoder.
    cross_attn, attn_weights_block2 = self.mha2(
        enc_output, enc_output, self_attn, padding_mask)
    cross_attn = self.dropout2(self.layernorm2(cross_attn + self_attn),
                               training=training)

    # Sub-layer 3: position-wise feed-forward network.
    ffn_result = self.ffn(cross_attn)
    ffn_result = self.dropout3(self.layernorm3(ffn_result + cross_attn),
                               training=training)

    return ffn_result, attn_weights_block1, attn_weights_block2

In [None]:
class Decoder(tf.keras.layers.Layer):
  """Stack of `num_layers` DecoderLayer blocks on top of a token embedding.

  Token ids are embedded, scaled by sqrt(d_model), offset by the positional
  encoding, run through the decoder layers, then layer-normalised and
  passed through dropout.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
               maximum_position_encoding, rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)

    self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)

  def call(self, x, enc_output, training,
           look_ahead_mask, padding_mask):
    """Decode `x` against `enc_output`.

    Args:
      x: target token ids, fed to the embedding layer.
      enc_output: encoder output attended to by every decoder layer.
      training: forwarded to the dropout layers.
      look_ahead_mask: mask for each layer's self-attention.
      padding_mask: mask for each layer's encoder-decoder attention.

    Returns:
      (x, attention_weights) where attention_weights maps
      'decoder_layer{N}_block1' / 'decoder_layer{N}_block2' to the attention
      weights returned by layer N.
    """
    seq_len = tf.shape(x)[1]
    attention_weights = {}

    x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]

    for i in range(self.num_layers):
      x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                             look_ahead_mask, padding_mask)

      attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

    # Final normalisation, mirroring the Encoder. The previous code called a
    # non-existent `self.layernorm1` and added the attention-weights dict to
    # the activations, which cannot work.
    x = self.layernorm(x)
    x = self.dropout(x, training=training)

    # x.shape == (batch_size, target_seq_len, d_model)
    return x, attention_weights

In [None]:
class Transformer(tf.keras.Model):
  """Encoder-decoder model with a dense projection onto the target vocabulary."""

  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               target_vocab_size, pe_input, pe_target, rate=0.1):
    super().__init__()

    # self.shared = tf.keras.layers.Embedding(input_vocab_size, d_model)

    self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                           input_vocab_size,
                           maximum_position_encoding=pe_input,
                           rate=rate)

    self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                           target_vocab_size,
                           maximum_position_encoding=pe_target,
                           rate=rate)

    # One output unit per target-vocabulary entry (no activation).
    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inp, tar, training, enc_padding_mask,
           look_ahead_mask, dec_padding_mask):
    """Run encoder, decoder and the output projection.

    Args:
      inp: encoder input.
      tar: decoder input.
      training: forwarded to encoder and decoder.
      enc_padding_mask: mask for the encoder self-attention.
      look_ahead_mask: mask for the decoder self-attention.
      dec_padding_mask: mask for the decoder's attention over the encoder output.

    Returns:
      (final_output, attention_weights) - the projected decoder output and
      the decoder's attention-weights dict.
    """
    # share = self.shared()
    memory = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

    # decoded.shape == (batch_size, tar_seq_len, d_model)
    decoded, attention_weights = self.decoder(
        tar, memory, training, look_ahead_mask, dec_padding_mask)

    # (batch_size, tar_seq_len, target_vocab_size)
    return self.final_layer(decoded), attention_weights

In [None]:
# import tensorflow_datasets as tfds

# def load_tokenizers(inputs_outputs_savepaths):
#   print("Loading tokenizers...")
#   inputs_savepath, outputs_savepath = inputs_outputs_savepaths
#   inputs_tokenizer = tfds.features.text.SubwordTextEncoder.load_from_file(inputs_savepath)
#   outputs_tokenizer = tfds.features.text.SubwordTextEncoder.load_from_file(outputs_savepath)

#   return inputs_tokenizer, outputs_tokenizer

# def create_tokenizers(inputs_outputs, inputs_outputs_savepaths, target_vocab_size):
#   inputs, outputs = inputs_outputs
#   inputs_savepath, outputs_savepath = inputs_outputs_savepaths
#   inputs_tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
#     inputs, target_vocab_size=target_vocab_size)
#   outputs_tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
#     outputs, target_vocab_size=target_vocab_size)
#   print("Saving tokenizers...")
#   inputs_tokenizer.save_to_file(inputs_savepath)
#   outputs_tokenizer.save_to_file(outputs_savepath)

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Warm-up learning-rate schedule.

  lr(step) = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
  """

  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    # Keep the plain Python value so get_config() stays serialisable.
    self._d_model_value = d_model
    self.d_model = tf.cast(d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    """Return the learning rate for optimizer step `step`."""
    # The optimizer passes its integer iteration counter; rsqrt and the
    # float multiplication below need a floating-point tensor.
    step = tf.cast(step, tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

  def get_config(self):
    """Return the constructor arguments so the schedule can be serialised."""
    return {"d_model": self._d_model_value,
            "warmup_steps": self.warmup_steps}

In [None]:
import yaml

# Load the hyper-parameters from config.yml on the mounted Drive.
config_path = "/content/drive/MyDrive/"
with open(os.path.join(config_path, "config.yml")) as cf:
  # safe_load only builds plain Python objects, which is all a
  # hyper-parameter file needs.
  config = yaml.safe_load(cf)

# Model hyper-parameters
num_layers = config["num_layers"]
d_model = config["d_model"]
dff = config["dff"]
num_heads = config["num_heads"]
dropout_rate = config["dropout_rate"]
max_length = config["max_length"]
target_vocab_size = config["target_vocab_size"]

# Training / checkpoint settings
epochs = config["epochs"]
batch_size = config["batch_size"]
checkpoint = config["checkpoint"]
max_checkpoint = config["max_checkpoint"]
custom_checkpoint = config["custom_checkpoint"]

# Evaluation / interactive settings
eval_limit = config["eval_limit"]
exit_phrase = config["exit_phrase"]

In [None]:
# setup config file
# num_layers: 4
# target_vocab_size: 16000
# d_model: 128
# dff: 512
# num_heads: 8
# dropout_rate: 0.1
# storage_path: null
# max_length: 40
# epochs: 100
# batch_size: 64
# ckpt_path : null
# # save a checkpoint every x epochs
# checkpoint: 5
# max_checkpoint: 10
# # select a custom checkpoint to start with
# # (else it will automatically select the latest checkpoint)
# custom_checkpoint: null
# reddit_data: True

# # train/eval/test/script
# mode: "test"
# eval_limit: 10
# exit_phrase: ".exit"

In [None]:
def create_masks(src, tgt):
  """Build the attention masks for one (source, target) batch.

  The masks follow the convention of `scaled_dot_product_attention`, which
  adds `mask * -1e9` to the logits: True marks a position that must NOT be
  attended to. Padding is identified by the value 0.

  Args:
    src: encoder input of shape (batch_size, src_seq_len).
    tgt: decoder input of shape (batch_size, tgt_seq_len).

  Returns:
    (enc_padding_mask, combined_mask, dec_padding_mask), all boolean:
      enc_padding_mask: (batch_size, 1, 1, src_seq_len), source padding.
      combined_mask: (batch_size, 1, tgt_seq_len, tgt_seq_len), target
        padding OR future positions.
      dec_padding_mask: (batch_size, 1, 1, src_seq_len), source padding, for
        the decoder's attention over the encoder output.
  """
  # Padding masks live on the key axis (last), broadcasting over heads and
  # query positions.
  src_padding = tf.math.equal(src, 0)[:, tf.newaxis, tf.newaxis, :]
  tgt_padding = tf.math.equal(tgt, 0)[:, tf.newaxis, tf.newaxis, :]

  seq_length = tf.shape(tgt)[1]
  # The lower triangle (incl. diagonal) is what each query may see; every
  # entry outside it is a future position and therefore masked.
  visible = tf.linalg.band_part(tf.ones((seq_length, seq_length)), -1, 0)
  look_ahead = tf.math.equal(visible, 0)

  combined_mask = tf.math.logical_or(tgt_padding, look_ahead)

  # The decoder's cross-attention keys come from the encoder, so its padding
  # mask is derived from the source, not the target.
  return src_padding, combined_mask, src_padding

In [None]:
learning_rate = CustomSchedule(d_model)
# NOTE(review): the warm-up schedule above is built but the optimizer still
# uses a fixed rate; pass `learning_rate` here if the schedule is intended.
# Keras Adam takes the two decay rates as separate beta_1 / beta_2 arguments
# (a (0.9, 0.98) tuple is the PyTorch `betas` convention).
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9,
                                     beta_2=0.98, epsilon=1e-9)
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
# The model's final Dense layer has no activation (it emits logits) and the
# accuracy metric is the sparse variant, so the loss must be the sparse,
# from-logits cross-entropy as well.
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Keep the vocabulary size loaded from the config instead of clobbering it
# with 1: a one-entry vocabulary makes every token id other than 0 an
# out-of-range embedding lookup.
# NOTE(review): the source vocabulary is assumed to be the same size as the
# target one - confirm once the tokenizers are wired in.
input_vocab_size = target_vocab_size

transformer = Transformer(
                          num_layers, d_model,
                          num_heads, dff,
                          input_vocab_size,
                          target_vocab_size,
                          # Positional tables are indexed by sequence position,
                          # so they are sized by the maximum sequence length.
                          pe_input=max_length,
                          pe_target=max_length,
                          rate=dropout_rate)


# train_step_signature = [
#       tf.TensorSpec(shape=(None, None), dtype=tf.int64),
#       tf.TensorSpec(shape=(None, None), dtype=tf.int64),
#     ]
# @tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
    """Run one optimisation step on a single (input, target) batch.

    The target is shifted by one position: `tar[:, :-1]` is fed to the
    decoder and `tar[:, 1:]` is what the model is asked to predict.
    Updates the module-level `train_loss` and `train_accuracy` metrics.

    Args:
      inp: encoder input batch.
      tar: target batch; the second axis is the sequence axis.
    """
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp,
                                     True,
                                     enc_padding_mask,
                                     combined_mask,
                                     dec_padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)


def train(train_dataset, num_epochs=None):
    """Train for `num_epochs` epochs, printing progress.

    This loop used to be nested inside `train_step`, so every step re-entered
    the whole epoch loop (and referenced an undefined `train_dataset`).

    Args:
      train_dataset: pair `(batches_in, batches_out)` of equally long
        iterables of input and target batches.
      num_epochs: number of epochs; defaults to the configured `epochs`.
    """
    if num_epochs is None:
        num_epochs = epochs

    for epoch in range(num_epochs):
        start = time.time()

        # Metrics accumulate across calls, so clear them at each epoch start.
        train_loss.reset_state()
        train_accuracy.reset_state()

        batches_in, batches_out = train_dataset
        for (batch, (inp, tar)) in enumerate(zip(batches_in, batches_out)):
            train_step(inp, tar)

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                    epoch + 1, batch, train_loss.result(), train_accuracy.result()))

        # if (epoch + 1) % checkpoint == 0:
        #   ckpt_save_path = ckpt_manager.save()
        #   print (f"Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}")

        print("Epoch {} Loss {:.4f} Accuracy {:.4f}".format(
            epoch + 1, train_loss.result(), train_accuracy.result()))
        print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))


50
5
50
5
50
5
50
5
50
5
50
5
50
5
50
5
50
5
50
5
50
5
50
5
I AM before train_step


In [None]:
# criterion = tf.keras.losses.CategoricalCrossentropy()
# optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=(0.9, 0.98), epsilon=1e-9)

# transformer.generate()

# for epoch in range(100):
#     optimizer.zero_grad()
#     output = transformer(src_data, tgt_data[:, :-1])
#     loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_data[:, 1:].contiguous().view(-1))
#     loss.backward()
#     optimizer.step()
#     print(f"Epoch: {epoch+1}, Loss: {loss.item()}")


In [None]:
# Location of the 50-dimensional GloVe 6B vectors on the mounted Drive.
glove = '/content/drive/MyDrive/glove.6B.50d.txt'

In [None]:
# Reading glove text file
# Parse the GloVe text file into {word: [component strings]}.
# Each line is "<word> <c1> <c2> ...", e.g. ("the" : ["0.22323", "0.0023232", ......])
glove_dict = {}

# The file is read line by line (no need to hold all of it in memory) and
# decoded explicitly as UTF-8 so the result does not depend on the locale.
with open(glove, encoding="utf-8") as f:
  for line in f:
    # Drop the line terminator so the last component carries no "\n".
    tokens = line.rstrip("\n").split(" ")
    if tokens == [""]:
      continue  # skip blank lines
    glove_dict[tokens[0]] = tokens[1:]

print(len(glove_dict))

400000


In [None]:
# Reading glove text file
# This cell was an exact duplicate of the previous one and re-read the whole
# GloVe file a second time. `glove_dict` is already populated, so only the
# size check is kept.
print(len(glove_dict))

400000


In [None]:
# Convert every vector component from its string form to a Python float
# (no rescaling is applied).
float_glove_dict = {
    word: [float(component) for component in components]
    for word, components in glove_dict.items()
}

In [None]:
# converting to tensor

# Turn the first three GloVe vectors into (1, dim) float tensors.
tens = []
# zip against range(3) stops after three vectors without copying all values.
for _, embeddings in zip(range(3), float_glove_dict.values()):
  # The values are already floats, so they can be passed to tf.constant as-is.
  j = tf.expand_dims(tf.constant(embeddings), axis=0)
  tens.append(j)

# Summarise instead of dumping the full vectors.
print(type(tens))
print(len(tens))
print([t.shape for t in tens])

[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.044457, -0.49688, -0.17862, -0.00066023, -0.6566, 0.27843, -0.14767, -0.55677, 0.14658, -0.0095095, 0.011658, 0.10204, -0.12792, -0.8443, -0.12181, -0.016801, -0.33279, -0.1552, -0.23131, -0.19181, -1.8823, -0.76746, 0.099051, -0.42125, -0.19526, 4.0071, -0.18594, -0.52287, -0.31681, 0.00059213, 0.0074449, 0.17778, -0.15897, 0.012041, -0.054223, -0.29871, -0.15749, -0.34758, -0.045637, -0.44251, 0.18785, 0.0027849, -0.18411, -0.11514, -0.78581]
0.418
[0.013441, 0.23682, -0.16899, 0.40951, 0.63812, 0.47709, -0.42852, -0.55641, -0.364, -0.23938, 0.13001, -0.063734, -0.39575, -0.48162, 0.23291, 0.090201, -0.13324, 0.078639, -0.41634, -0.15428, 0.10068, 0.48891, 0.31226, -0.1252, -0.037512, -1.5179, 0.12612, -0.02442, -0.042961, -0.28351, 3.5416, -0.11956, -0.014533, -0.1499, 0.21864, -0.33412, -0.13872, 0.31806, 0.70358, 0.44858, -0.080262, 0.63003, 0.32111, -0.46765, 0.22786, 0.36034, -0.37818, -0.56657, 0.044691, 0.30392]
0.013441
[0.15164,

In [None]:
# input_ids = encoded_input.input_ids
# Use the first prepared tensor as the encoder input and show its shape,
# contents, Python type and the length of its single row.
input_ids = tens[0]
print(input_ids.shape,
      input_ids,
      type(input_ids),
      len(input_ids[0]),
      sep="\n")
# attention_mask = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

(1, 50)
tf.Tensor(
[[ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
  -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
  -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
  -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
  -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
   4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
   1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
  -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.7849e-03 -1.8411e-01
  -1.1514e-01 -7.8581e-01]], shape=(1, 50), dtype=float32)
<class 'tensorflow.python.framework.ops.EagerTensor'>
50


In [None]:
# decoder_input_ids = decoded_input.input_ids

# Use the second prepared tensor as the decoder input and show its shape,
# contents, Python type and the length of its single row.
decoder_input_ids = tens[1]
print(decoder_input_ids.shape,
      decoder_input_ids,
      type(decoder_input_ids),
      len(decoder_input_ids[0]),
      sep="\n")
# decoder_attention_mask = [1, 1, 1, 1, 1, 1, 1]

(1, 50)
tf.Tensor(
[[ 0.013441  0.23682  -0.16899   0.40951   0.63812   0.47709  -0.42852
  -0.55641  -0.364    -0.23938   0.13001  -0.063734 -0.39575  -0.48162
   0.23291   0.090201 -0.13324   0.078639 -0.41634  -0.15428   0.10068
   0.48891   0.31226  -0.1252   -0.037512 -1.5179    0.12612  -0.02442
  -0.042961 -0.28351   3.5416   -0.11956  -0.014533 -0.1499    0.21864
  -0.33412  -0.13872   0.31806   0.70358   0.44858  -0.080262  0.63003
   0.32111  -0.46765   0.22786   0.36034  -0.37818  -0.56657   0.044691
   0.30392 ]], shape=(1, 50), dtype=float32)
<class 'tensorflow.python.framework.ops.EagerTensor'>
50


In [None]:
# Smoke-test a single training step on the two tensors prepared above.
# NOTE(review): the recorded output of this cell ends in InvalidArgumentError.
# Both inputs are (1, 50) float32 GloVe vectors, not batches of integer token
# ids - confirm the intended input format before relying on this call.
train_step(input_ids, decoder_input_ids)

tf.Tensor(
[[ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
  -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
  -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
  -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
  -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
   4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
   1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
  -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.7849e-03 -1.8411e-01
  -1.1514e-01 -7.8581e-01]], shape=(1, 50), dtype=float32)
tf.Tensor(
[[ 0.013441  0.23682  -0.16899   0.40951   0.63812   0.47709  -0.42852
  -0.55641  -0.364    -0.23938   0.13001  -0.063734 -0.39575  -0.48162
   0.23291   0.090201 -0.13324   0.078639 -0.41634  -0.15428   0.10068
   0.48891   0.31226  -0.1252   -0.037512 -1.5179    0.12612  -0.02442
  -0.042961 -0.28351   3.5416   -0.11956  -

InvalidArgumentError: ignored