In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from albert import modeling
from albert import optimization
from six.moves import range
import tensorflow.compat.v1 as tf
# from tensorflow.contrib import cluster_resolver as contrib_cluster_resolver
# from tensorflow.contrib import data as contrib_data
# from tensorflow.contrib import tpu as contrib_tpu
from tensorflow.data.experimental import parallel_interleave
import tensorflow

In [2]:
def model_fn_builder(albert_config, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings, optimizer, poly_power,
                     start_warmup_step):
  """Returns `model_fn` closure for TPUEstimator.

  Args:
    albert_config: Model configuration; passed to `modeling.AlbertModel` and to
      the masked-LM and sentence-order heads.
    init_checkpoint: Checkpoint to initialise variables from. When falsy, no
      checkpoint initialisation is set up.
    learning_rate: Forwarded to `optimization.create_optimizer`.
    num_train_steps: Forwarded to `optimization.create_optimizer`.
    num_warmup_steps: Forwarded to `optimization.create_optimizer`.
    use_tpu: When true, checkpoint initialisation is deferred to a
      `scaffold_fn`; also forwarded to `optimization.create_optimizer`.
    use_one_hot_embeddings: Forwarded to `modeling.AlbertModel`.
    optimizer: Forwarded to `optimization.create_optimizer`.
    poly_power: Forwarded to `optimization.create_optimizer`.
    start_warmup_step: Forwarded to `optimization.create_optimizer`.

  Returns:
    A `model_fn(features, labels, mode, params)` that builds a
    `TPUEstimatorSpec` for TRAIN or EVAL mode.
  """

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Builds the ALBERT model plus both pre-training heads and returns a
    `TPUEstimatorSpec`. Raises `ValueError` for any mode other than TRAIN or
    EVAL.
    """

    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s", name, features[name].shape)

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    masked_lm_positions = features["masked_lm_positions"]
    masked_lm_ids = features["masked_lm_ids"]
    masked_lm_weights = features["masked_lm_weights"]
    # Note: We keep this feature name `next_sentence_labels` to be compatible
    # with the original data created by lanzhzh@. However, in the ALBERT case
    # it does represent sentence_order_labels.
    sentence_order_labels = features["next_sentence_labels"]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    model = modeling.AlbertModel(
        config=albert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    (masked_lm_loss, masked_lm_example_loss,
     masked_lm_log_probs) = get_masked_lm_output(albert_config,
                                                 model.get_sequence_output(),
                                                 model.get_embedding_table(),
                                                 masked_lm_positions,
                                                 masked_lm_ids,
                                                 masked_lm_weights)

    (sentence_order_loss, sentence_order_example_loss,
     sentence_order_log_probs) = get_sentence_order_output(
         albert_config, model.get_pooled_output(), sentence_order_labels)

    # The training objective is the plain sum of the two pre-training losses.
    total_loss = masked_lm_loss + sentence_order_loss

    tvars = tf.trainable_variables()

    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
      tf.logging.info("number of hidden group %d to initialize",
                      albert_config.num_hidden_groups)
      num_of_initialize_group = 1
      # `PARAMS_init_from_group0` is a notebook-level global; it is looked up
      # when `model_fn` runs, so the configuration cell must have been executed
      # before the estimator calls this function.
      if PARAMS_init_from_group0:
        num_of_initialize_group = albert_config.num_hidden_groups
        if albert_config.net_structure_type > 0:
          num_of_initialize_group = albert_config.num_hidden_layers
      (assignment_map, initialized_variable_names
      ) = modeling.get_assignment_map_from_checkpoint(
              tvars, init_checkpoint, num_of_initialize_group)

      def restore_from_checkpoint():
        """Registers checkpoint initialisation for every assignment map."""
        for gid in range(num_of_initialize_group):
          tf.logging.info("initialize the %dth layer", gid)
          tf.logging.info(assignment_map[gid])
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map[gid])

      if use_tpu:

        # On TPU the restore is deferred: it only runs when the estimator
        # invokes the scaffold function.
        def tpu_scaffold():
          restore_from_checkpoint()
          return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold
      else:
        restore_from_checkpoint()

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
      train_op = optimization.create_optimizer(
          total_loss, learning_rate, num_train_steps, num_warmup_steps,
          use_tpu, optimizer, poly_power, start_warmup_step)

      output_spec = tf.estimator.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op,
          scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:

      def metric_fn(*args):
        """Computes the loss and accuracy of the model.

        Expects the seven tensors listed in `metric_values` below, in that
        order, and returns a dict of streaming metrics for both heads.
        """
        (masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
         masked_lm_weights, sentence_order_example_loss,
         sentence_order_log_probs, sentence_order_labels) = args[:7]

        # Masked-LM metrics: everything is flattened so that each row is one
        # prediction slot, and the weights are applied to both metrics.
        masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
                                         [-1, masked_lm_log_probs.shape[-1]])
        masked_lm_predictions = tf.argmax(
            masked_lm_log_probs, axis=-1, output_type=tf.int32)
        masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
        masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
        masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
        masked_lm_accuracy = tf.metrics.accuracy(
            labels=masked_lm_ids,
            predictions=masked_lm_predictions,
            weights=masked_lm_weights)
        masked_lm_mean_loss = tf.metrics.mean(
            values=masked_lm_example_loss, weights=masked_lm_weights)

        metrics = {
            "masked_lm_accuracy": masked_lm_accuracy,
            "masked_lm_loss": masked_lm_mean_loss,
        }

        # Sentence-order metrics (unweighted).
        sentence_order_log_probs = tf.reshape(
            sentence_order_log_probs, [-1, sentence_order_log_probs.shape[-1]])
        sentence_order_predictions = tf.argmax(
            sentence_order_log_probs, axis=-1, output_type=tf.int32)
        sentence_order_labels = tf.reshape(sentence_order_labels, [-1])
        sentence_order_accuracy = tf.metrics.accuracy(
            labels=sentence_order_labels,
            predictions=sentence_order_predictions)
        sentence_order_mean_loss = tf.metrics.mean(
            values=sentence_order_example_loss)
        metrics.update({
            "sentence_order_accuracy": sentence_order_accuracy,
            "sentence_order_loss": sentence_order_mean_loss
        })
        return metrics

      metric_values = [
          masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
          masked_lm_weights, sentence_order_example_loss,
          sentence_order_log_probs, sentence_order_labels
      ]

      eval_metrics = (metric_fn, metric_values)

      output_spec = tf.estimator.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=eval_metrics,
          scaffold_fn=scaffold_fn)
    else:
      raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))

    return output_spec

  return model_fn

In [3]:
def get_masked_lm_output(albert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Builds the masked-LM head and returns its loss and log-probabilities.

  Args:
    albert_config: Configuration object; `embedding_size`, `hidden_act`,
      `initializer_range` and `vocab_size` are read from it.
    input_tensor: Sequence output; the vectors at `positions` are gathered
      from it with `gather_indexes`.
    output_weights: Matrix the transformed vectors are multiplied with
      (transposed) to obtain the vocabulary logits.
    positions: Positions to gather from `input_tensor`.
    label_ids: Target ids; flattened before use.
    label_weights: Per-prediction weights; flattened before use.

  Returns:
    A tuple `(loss, per_example_loss, log_probs)` where `loss` is the
    weight-normalised mean of `per_example_loss`.
  """
  gathered = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # One extra non-linear projection sits in front of the output layer; it is
    # only needed for pre-training.
    with tf.variable_scope("transform"):
      transformed = tf.layers.dense(
          gathered,
          units=albert_config.embedding_size,
          activation=modeling.get_activation(albert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              albert_config.initializer_range))
      transformed = modeling.layer_norm(transformed)

    # The projection matrix is supplied by the caller (`output_weights`); only
    # the per-token bias is created here.
    vocab_bias = tf.get_variable(
        "output_bias",
        shape=[albert_config.vocab_size],
        initializer=tf.zeros_initializer())
    raw_logits = tf.matmul(transformed, output_weights, transpose_b=True)
    biased_logits = tf.nn.bias_add(raw_logits, vocab_bias)
    log_probs = tf.nn.log_softmax(biased_logits, axis=-1)

    flat_label_ids = tf.reshape(label_ids, [-1])
    flat_label_weights = tf.reshape(label_weights, [-1])

    one_hot_targets = tf.one_hot(
        flat_label_ids, depth=albert_config.vocab_size, dtype=tf.float32)

    # `positions` may be zero-padded when a sequence has fewer than the
    # maximum number of predictions. Real predictions carry weight 1.0 and
    # padding carries 0.0, so padded slots drop out of the weighted mean; the
    # small epsilon guards against an all-padding batch.
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_targets, axis=[-1])
    weighted_loss_sum = tf.reduce_sum(flat_label_weights * per_example_loss)
    weight_total = tf.reduce_sum(flat_label_weights) + 1e-5
    loss = weighted_loss_sum / weight_total

  return (loss, per_example_loss, log_probs)

In [4]:
def get_sentence_order_output(albert_config, input_tensor, labels):
  """Builds the sentence-order head and returns its loss and log-probabilities.

  Args:
    albert_config: Configuration object; `hidden_size` and `initializer_range`
      are read from it.
    input_tensor: Input to the two-class classifier (the caller passes the
      model's pooled output).
    labels: Class labels; flattened and one-hot encoded with depth 2.

  Returns:
    A tuple `(loss, per_example_loss, log_probs)` where `loss` is the
    unweighted mean of `per_example_loss`.
  """
  # Plain two-way classification. NOTE(review): presumably label 0 means the
  # segments are in their original order and 1 means they were swapped --
  # confirm against the data-creation code. These weights are only needed for
  # pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    class_weights = tf.get_variable(
        "output_weights",
        shape=[2, albert_config.hidden_size],
        initializer=modeling.create_initializer(
            albert_config.initializer_range))
    class_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    raw_logits = tf.matmul(input_tensor, class_weights, transpose_b=True)
    biased_logits = tf.nn.bias_add(raw_logits, class_bias)
    log_probs = tf.nn.log_softmax(biased_logits, axis=-1)
    flat_labels = tf.reshape(labels, [-1])
    one_hot_targets = tf.one_hot(flat_labels, depth=2, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_targets * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

  return (loss, per_example_loss, log_probs)

In [5]:
def gather_indexes(sequence_tensor, positions):
  """Picks out the vectors at `positions` from every sequence in the batch.

  Args:
    sequence_tensor: Rank-3 tensor (rank enforced via `expected_rank=3`),
      treated as [batch_size, seq_length, width].
    positions: Per-example indices into the sequence axis.
      NOTE(review): assumed to be int32 with shape [batch_size, num_positions]
      so that it broadcasts against the per-example offsets -- confirm.

  Returns:
    A 2-D tensor holding one `width`-sized row per entry of `positions`.
  """
  batch_size, seq_length, width = modeling.get_shape_list(
      sequence_tensor, expected_rank=3)

  # Row offset of each example once the batch and sequence axes are merged.
  example_offsets = tf.reshape(
      tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
  absolute_positions = tf.reshape(positions + example_offsets, [-1])
  flattened_sequence = tf.reshape(sequence_tensor,
                                  [batch_size * seq_length, width])
  return tf.gather(flattened_sequence, absolute_positions)

In [6]:
def input_fn_builder(input_files,
                     max_seq_length,
                     max_predictions_per_seq,
                     is_training,
                     num_cpu_threads=4):
  """Creates an `input_fn` closure to be passed to TPUEstimator.

  Args:
    input_files: List of TFRecord file paths to read.
    max_seq_length: Length of the fixed-size per-token features.
    max_predictions_per_seq: Length of the fixed-size masked-LM features.
    is_training: Selects the shuffled, interleaved training pipeline when
      true and a sequential pipeline otherwise.
    num_cpu_threads: Upper bound on files read in parallel and the number of
      parallel batches used while decoding.

  Returns:
    An `input_fn(params)` that reads `params["batch_size"]` and returns a
    batched `tf.data.Dataset` of decoded examples.
  """

  def input_fn(params):
    """Builds the dataset for the batch size given in `params`."""
    batch_size = params["batch_size"]

    def seq_feature():
      """Spec for an int64 feature of length `max_seq_length`."""
      return tf.FixedLenFeature([max_seq_length], tf.int64)

    name_to_features = {
        "input_ids": seq_feature(),
        "input_mask": seq_feature(),
        "segment_ids": seq_feature(),
        # Note: We keep this feature name `next_sentence_labels` to be
        # compatible with the original data created by lanzhzh@. However, in
        # the ALBERT case it does represent sentence_order_labels.
        "next_sentence_labels": tf.FixedLenFeature([1], tf.int64),
    }

    # `PARAMS_masked_lm_budget` is a notebook-level global looked up when the
    # input function runs. When it is set, the records are parsed with
    # `token_boundary` and WITHOUT the `masked_lm_*` features.
    if PARAMS_masked_lm_budget:
      name_to_features["token_boundary"] = seq_feature()
    else:
      name_to_features["masked_lm_positions"] = tf.FixedLenFeature(
          [max_predictions_per_seq], tf.int64)
      name_to_features["masked_lm_ids"] = tf.FixedLenFeature(
          [max_predictions_per_seq], tf.int64)
      name_to_features["masked_lm_weights"] = tf.FixedLenFeature(
          [max_predictions_per_seq], tf.float32)

    if is_training:
      # Training: shuffle the (endlessly repeated) file list, read several
      # files in parallel and shuffle the resulting records again.
      dataset = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
      dataset = dataset.repeat()
      dataset = dataset.shuffle(buffer_size=len(input_files))

      # `cycle_length` is the number of files that get read in parallel.
      cycle_length = min(num_cpu_threads, len(input_files))

      # `sloppy` interleaving does not preserve an exact order, which adds
      # further randomness to the training pipeline.
      interleave_files = parallel_interleave(
          tf.data.TFRecordDataset,
          sloppy=is_training,
          cycle_length=cycle_length)
      dataset = dataset.apply(interleave_files)
      dataset = dataset.shuffle(buffer_size=100)
    else:
      # Evaluation: read sequentially without shuffling. The dataset repeats
      # because evaluation runs for a fixed number of steps and must not hit
      # an out-of-range error.
      dataset = tf.data.TFRecordDataset(input_files)
      dataset = dataset.repeat()

    def decode(record):
      """Parses one serialized record with the feature spec built above."""
      return _decode_record(record, name_to_features)

    # Partial batches are dropped in BOTH modes (`drop_remainder=True`), so
    # every batch has a fixed size.
    decode_and_batch = tf.data.experimental.map_and_batch_with_legacy_function(
        decode,
        batch_size=batch_size,
        num_parallel_batches=num_cpu_threads,
        drop_remainder=True)
    dataset = dataset.apply(decode_and_batch)
    tf.logging.info(dataset)
    return dataset

  return input_fn

In [7]:
def _decode_record(record, name_to_features):
  """Decodes a record to a TensorFlow example.

  Args:
    record: Serialized example to parse.
    name_to_features: Feature specification handed to
      `tf.parse_single_example`.

  Returns:
    The parsed feature dict, with every int64 tensor cast to int32.
  """
  example = tf.parse_single_example(record, name_to_features)

  # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
  # So cast all int64 to int32. `tf.cast` is used because `tf.to_int32` is
  # deprecated.
  for name in list(example.keys()):
    tensor = example[name]
    if tensor.dtype == tf.int64:
      example[name] = tf.cast(tensor, tf.int32)

  return example

In [8]:
# ---------------------------------------------------------------------------
# Run configuration. Every value the cells of this notebook read is set here.
# ---------------------------------------------------------------------------

# Paths.
PARAMS_input_file = './processed_data/Discharge_summary_pretrain.tfrecord'
PARAMS_output_dir = './model/'
PARAMS_init_checkpoint = './model/albert_base_v1/model.ckpt-best'
PARAMS_albert_config_file = './model/albert_base_v1/albert_config.json'

# Which phases to run.
PARAMS_do_train = True
PARAMS_do_eval = False

# Batch and sequence sizes.
PARAMS_train_batch_size = 4096
PARAMS_eval_batch_size = 64
PARAMS_max_seq_length = 512
PARAMS_max_predictions_per_seq = 20
PARAMS_masked_lm_budget = 0

# Optimisation schedule. The step counts are tiny smoke-test values; the
# full-scale values noted in the original cell are given alongside.
PARAMS_optimizer = 'lamb'
PARAMS_learning_rate = 0.00176
PARAMS_poly_power = 1.0
PARAMS_num_train_steps = 10  # full-scale value: 125000
PARAMS_num_warmup_steps = 5  # full-scale value: 3125
PARAMS_start_warmup_step = 5

# Checkpointing and evaluation.
PARAMS_save_checkpoints_steps = 5  # full-scale value: 5000
PARAMS_keep_checkpoint_max = 5
PARAMS_max_eval_steps = 100
PARAMS_init_from_group0 = False

# TPU / cluster settings (TPU is switched off for this run).
PARAMS_use_tpu = False
PARAMS_master = None
PARAMS_iterations_per_loop = 1000
PARAMS_num_tpu_cores = 8
PARAMS_gcp_project = None
PARAMS_tpu_zone = None
PARAMS_tpu_name = None

In [9]:
# Driver cell: builds the estimator from the PARAMS_* configuration and runs
# the training and/or evaluation phases that are switched on.
tf.logging.set_verbosity(tf.logging.INFO)

if not PARAMS_do_train and not PARAMS_do_eval:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

albert_config = modeling.AlbertConfig.from_json_file(PARAMS_albert_config_file)

tf.gfile.MakeDirs(PARAMS_output_dir)

# PARAMS_input_file may hold several comma-separated glob patterns.
input_files = []
for input_pattern in PARAMS_input_file.split(","):
    input_files.extend(tf.gfile.Glob(input_pattern))

# Fail fast with a readable message instead of letting an empty file list
# surface later as an obscure error inside the input pipeline.
if not input_files:
    raise ValueError(
        "No input files matched PARAMS_input_file: %r" % PARAMS_input_file)

tf.logging.info("*** Input Files ***")
for input_file in input_files:
    tf.logging.info("  %s", input_file)

tpu_cluster_resolver = None
if PARAMS_use_tpu and PARAMS_tpu_name:
    tpu_cluster_resolver = tensorflow.distribute.cluster_resolver.TPUClusterResolver(
        PARAMS_tpu_name, zone=PARAMS_tpu_zone, project=PARAMS_gcp_project)

is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.estimator.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    master=PARAMS_master,
    model_dir=PARAMS_output_dir,
    save_checkpoints_steps=PARAMS_save_checkpoints_steps,
    keep_checkpoint_max=PARAMS_keep_checkpoint_max,
    tpu_config=tf.estimator.tpu.TPUConfig(
        iterations_per_loop=PARAMS_iterations_per_loop,
        num_shards=PARAMS_num_tpu_cores,
        per_host_input_for_training=is_per_host))

model_fn = model_fn_builder(
    albert_config=albert_config,
    init_checkpoint=PARAMS_init_checkpoint,
    learning_rate=PARAMS_learning_rate,
    num_train_steps=PARAMS_num_train_steps,
    num_warmup_steps=PARAMS_num_warmup_steps,
    use_tpu=PARAMS_use_tpu,
    use_one_hot_embeddings=PARAMS_use_tpu,
    optimizer=PARAMS_optimizer,
    poly_power=PARAMS_poly_power,
    start_warmup_step=PARAMS_start_warmup_step)

# If TPU is not available, this will fall back to normal Estimator on CPU
# or GPU.
estimator = tf.estimator.tpu.TPUEstimator(
    use_tpu=PARAMS_use_tpu,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=PARAMS_train_batch_size,
    eval_batch_size=PARAMS_eval_batch_size)

if PARAMS_do_train:
    tf.logging.info("***** Running training *****")
    train_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=PARAMS_max_seq_length,
        max_predictions_per_seq=PARAMS_max_predictions_per_seq,
        is_training=True)
    estimator.train(input_fn=train_input_fn, max_steps=PARAMS_num_train_steps)

if PARAMS_do_eval:
    tf.logging.info("***** Running evaluation *****")
    global_step = -1
    output_eval_file = os.path.join(PARAMS_output_dir, "eval_results.txt")
    eval_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=PARAMS_max_seq_length,
        max_predictions_per_seq=PARAMS_max_predictions_per_seq,
        is_training=False)
    best_perf = 0
    key_name = "masked_lm_accuracy"
    # The context manager guarantees the results file is flushed and closed,
    # even if evaluation raises.
    with tf.gfile.GFile(output_eval_file, "w") as writer:
        # Keep evaluating the newest checkpoint until one at or beyond the
        # final training step has been evaluated.
        while global_step < PARAMS_num_train_steps:
            # Resolve the checkpoint once and evaluate exactly that one, so
            # the files copied as "best" below are the ones that were scored.
            checkpoint_path = estimator.latest_checkpoint()
            if checkpoint_path is None:
                tf.logging.info("No checkpoint found yet. Sleeping.")
                time.sleep(1)
                continue

            result = estimator.evaluate(
                input_fn=eval_input_fn,
                steps=PARAMS_max_eval_steps,
                checkpoint_path=checkpoint_path)
            global_step = result["global_step"]
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                writer.write("%s = %s\n" % (key, str(result[key])))

            # Keep a copy of the best-scoring checkpoint under a stable
            # "-best" name. This is checked once per evaluation.
            if result[key_name] > best_perf:
                best_perf = result[key_name]
                for ext in ["meta", "data-00000-of-00001", "index"]:
                    src_ckpt = checkpoint_path + ".{}".format(ext)
                    tgt_ckpt = checkpoint_path.rsplit(
                        "-", 1)[0] + "-best.{}".format(ext)
                    tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True)
                    writer.write("saved {} to {}\n".format(src_ckpt, tgt_ckpt))

INFO:tensorflow:*** Input Files ***
INFO:tensorflow:  C:\Users\XinXining\Desktop\Yale\Thesis_Project\Albert_Model\processed_data\Discharge_summary_pretrain.tfrecord
INFO:tensorflow:Using config: {'_model_dir': 'C:/Users/XinXining/Desktop/Yale/Thesis_Project/Albert_Model/model/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', 

  self.pooled_output = tf.layers.dense(
  return layer.apply(inputs)
  input_tensor = tf.layers.dense(


INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into C:/Users/XinXining/Desktop/Yale/Thesis_Project/Albert_Model/model/model.ckpt.
INFO:tensorflow:training_loop marked as finished


NotFoundError: Failed to create a NewWriteableFile: C:/Users/XinXining/Desktop/Yale/Thesis_Project/Albert_Model/model/model.ckpt-0_temp\part-00000-of-00001.data-00000-of-00001.tempstate99496378179852786 : The system cannot find the path specified.
; No such process
	 [[node save/SaveV2
 (defined at C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py:1512)
]]

Errors may have originated from an input operation.
Input Source operations connected to node save/SaveV2:
In[0] save/ShardedFilename:	
In[1] save/SaveV2/tensor_names:	
In[2] save/SaveV2/shape_and_slices:	
In[3] bert/embeddings/layer_normalization/beta/Read/ReadVariableOp (defined at C:\Users\XinXining\anaconda3\lib\site-packages\keras\engine\base_layer_utils.py:117)	
In[4] bert/embeddings/layer_normalization/beta/adam_m/Read/ReadVariableOp (defined at C:\Users\XinXining\Desktop\Yale\Thesis_Project\Albert_Model\albert\lamb_optimizer.py:76)	
In[5] bert/embeddings/layer_normalization/beta/adam_v/Read/ReadVariableOp (defined at C:\Users\XinXining\Desktop\Yale\Thesis_Project\Albert_Model\albert\lamb_optimizer.py:82)	
In[6] bert/embeddings/layer_normalization/gamma/Read/ReadVariableOp:	
In[7] bert/embeddings/layer_normalization/gamma/adam_m/Read/ReadVariableOp:	
In[8] bert/embeddings/layer_normalization/gamma/adam_v/Read/ReadVariableOp:	
In[9] bert/embeddings/position_embeddings/Read/ReadVariableOp (defined at C:\Users\XinXining\Desktop\Yale\Thesis_Project\Albert_Model\albert\modeling.py:602)	
In[10] bert/embeddings/position_embeddings/adam_m/Read/ReadVariableOp:	
In[11] bert/embeddings/position_embeddings/adam_v/Read/ReadVariableOp:	
In[12] bert/embeddings/token_type_embeddings/Read/ReadVariableOp (defined at C:\Users\XinXining\Desktop\Yale\Thesis_Project\Albert_Model\albert\modeling.py:582)	
In[13] bert/embeddings/token_type_embeddings/adam_m/Read/ReadVariableOp:	
In[14] bert/embeddings/token_type_embeddings/adam_v/Read/ReadVariableOp:	
In[15] bert/embeddings/word_embeddings/Read/ReadVariableOp (defined at C:\Users\XinXining\Desktop\Yale\Thesis_Project\Albert_Model\albert\modeling.py:512)	
In[16] bert/embeddings/word_embeddings/adam_m/Read/ReadVariableOp:	
In[17] bert/embeddings/word_embeddings/adam_v/Read/ReadVariableOp:	
In[18] bert/encoder/embedding_hidden_mapping_in/bias/Read/ReadVariableOp (defined at C:\Users\XinXining\Desktop\Yale\Thesis_Project\Albert_Model\albert\modeling.py:781)	
In[19] bert/encoder/embedding_hidden_mapping_in/bias/adam_m/Read/ReadVariableOp:	
In[20] bert/encoder/embedding_hidden_mapping_in/bias/adam_v/Read/ReadVariableOp:	
In[21] bert/encoder/embedding_hidden_mapping_in/kernel/Read/ReadVariableOp (defined at C:\Users\XinXining\Desktop\Yale\Thesis_Project\Albert_Model\albert\modeling.py:777)	
In[22] bert/encoder/embedding_hidden_mapping_in/kernel/adam_m/Read/ReadVariableOp:	
In[23] bert/encoder/embedding_hidden_mapping_in/kernel/adam_v/Read/ReadVariableOp:	
In[24] bert/encoder/transformer/group_0/inner_group_0/attention_1/output/dense/bias/Read/ReadVariableOp (defined at C:\Users\XinXining\Desktop\Yale\Thesis_Project\Albert_Model\albert\modeling.py:739)	
In[25] bert/encoder/transformer/group_0/inner_group_0/attention_1/output/dense/bias/adam_m/Read/ReadVariableOp:	
In[26] bert/encoder/transformer/group_0/inner_group_0/attention_1/output/dense/bias/adam_v/Read/ReadVariableOp:	
In[27] bert/encoder/transformer/group_0/inner_group_0/attention_1/output/dense/kernel/Read/ReadVariableOp (defined at C:\Users\XinXining\Desktop\Yale\Thesis_Project\Albert_Model\albert\modeling.py:734)	
In[28] bert/encoder/transformer/group_0/inner_group_0/attention_1/output/dense/kernel/adam_m/Read/ReadVariableOp:	
In[29] bert/encoder/transformer/group_0/inner_group_0/attention_1/output/dense/kernel/adam_v/Read/ReadVariableOp:	
In[30] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/key/bias/Read/ReadVariableOp (defined at C:\Users\XinXining\Desktop\Yale\Thesis_Project\Albert_Model\albert\modeling.py:693)	
In[31] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/key/bias/adam_m/Read/ReadVariableOp:	
In[32] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/key/bias/adam_v/Read/ReadVariableOp:	
In[33] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/key/kernel/Read/ReadVariableOp (defined at C:\Users\XinXining\Desktop\Yale\Thesis_Project\Albert_Model\albert\modeling.py:688)	
In[34] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/key/kernel/adam_m/Read/ReadVariableOp:	
In[35] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/key/kernel/adam_v/Read/ReadVariableOp:	
In[36] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/bias/Read/ReadVariableOp:	
In[37] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/bias/adam_m/Read/ReadVariableOp:	
In[38] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/bias/adam_v/Read/ReadVariableOp:	
In[39] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/kernel/Read/ReadVariableOp:	
In[40] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/kernel/adam_m/Read/ReadVariableOp:	
In[41] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/kernel/adam_v/Read/ReadVariableOp:	
In[42] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/value/bias/Read/ReadVariableOp:	
In[43] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/value/bias/adam_m/Read/ReadVariableOp:	
In[44] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/value/bias/adam_v/Read/ReadVariableOp:	
In[45] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/value/kernel/Read/ReadVariableOp:	
In[46] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/value/kernel/adam_m/Read/ReadVariableOp:	
In[47] bert/encoder/transformer/group_0/inner_group_0/attention_1/self/value/kernel/adam_v/Read/ReadVariableOp:	
In[48] bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/dense/bias/Read/ReadVariableOp:	
In[49] bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/dense/bias/adam_m/Read/ReadVariableOp:	
In[50] bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/dense/bias/adam_v/Read/ReadVariableOp:	
In[51] bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/dense/kernel/Read/ReadVariableOp:	
In[52] bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/dense/kernel/adam_m/Read/ReadVariableOp:	
In[53] bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/dense/kernel/adam_v/Read/ReadVariableOp:	
In[54] bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/output/dense/bias/Read/ReadVariableOp:	
In[55] bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/output/dense/bias/adam_m/Read/ReadVariableOp:	
In[56] bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/output/dense/bias/adam_v/Read/ReadVariableOp:	
In[57] bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/output/dense/kernel/Read/ReadVariableOp:	
In[58] bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/output/dense/kernel/adam_m/Read/ReadVariableOp:	
In[59] bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/output/dense/kernel/adam_v/Read/ReadVariableOp:	
In[60] bert/encoder/transformer/group_0/layer_0/inner_group_0/layer_normalization_1/beta/Read/ReadVariableOp:	
In[61] bert/encoder/transformer/group_0/layer_0/inner_group_0/layer_normalization_1/beta/adam_m/Read/ReadVariableOp:	
In[62] bert/encoder/transformer/group_0/layer_0/inner_group_0/layer_normalization_1/beta/adam_v/Read/ReadVariableOp:	
In[63] bert/encoder/transformer/group_0/layer_0/inner_group_0/layer_normalization_1/gamma/Read/ReadVariableOp:	
In[64] bert/encoder/transformer/group_0/layer_0/inner_group_0/layer_normalization_1/gamma/adam_m/Read/ReadVariableOp:	
In[65] bert/encoder/transformer/group_0/layer_0/inner_group_0/layer_normalization_1/gamma/adam_v/Read/ReadVariableOp:	
In[66] bert/encoder/transformer/group_0/layer_0/inner_group_0/layer_normalization_2/beta/Read/ReadVariableOp:	
In[67] bert/encoder/transformer/group_0/layer_0/inner_group_0/layer_normalization_2/beta/adam_m/Read/ReadVariableOp:	
In[68] bert/encoder/transformer/group_0/layer_0/inner_group_0/layer_normalization_2/beta/adam_v/Read/ReadVariableOp:	
In[69] bert/encoder/transformer/group_0/layer_0/inner_group_0/layer_normalization_2/gamma/Read/ReadVariableOp:	
In[70] bert/encoder/transformer/group_0/layer_0/inner_group_0/layer_normalization_2/gamma/adam_m/Read/ReadVariableOp:	
In[71] bert/encoder/transformer/group_0/layer_0/inner_group_0/layer_normalization_2/gamma/adam_v/Read/ReadVariableOp:	
In[72] bert/encoder/transformer/group_0_1/layer_1/inner_group_0/layer_normalization_3/beta/Read/ReadVariableOp:	
In[73] bert/encoder/transformer/group_0_1/layer_1/inner_group_0/layer_normalization_3/beta/adam_m/Read/ReadVariableOp:	
In[74] bert/encoder/transformer/group_0_1/layer_1/inner_group_0/layer_normalization_3/beta/adam_v/Read/ReadVariableOp:	
In[75] bert/encoder/transformer/group_0_1/layer_1/inner_group_0/layer_normalization_3/gamma/Read/ReadVariableOp:	
In[76] bert/encoder/transformer/group_0_1/layer_1/inner_group_0/layer_normalization_3/gamma/adam_m/Read/ReadVariableOp:	
In[77] bert/encoder/transformer/group_0_1/layer_1/inner_group_0/layer_normalization_3/gamma/adam_v/Read/ReadVariableOp:	
In[78] bert/encoder/transformer/group_0_1/layer_1/inner_group_0/layer_normalization_4/beta/Read/ReadVariableOp:	
In[79] bert/encoder/transformer/group_0_1/layer_1/inner_group_0/layer_normalization_4/beta/adam_m/Read/ReadVariableOp:	
In[80] bert/encoder/transformer/group_0_1/layer_1/inner_group_0/layer_normalization_4/beta/adam_v/Read/ReadVariableOp:	
In[81] bert/encoder/transformer/group_0_1/layer_1/inner_group_0/layer_normalization_4/gamma/Read/ReadVariableOp:	
In[82] bert/encoder/transformer/group_0_1/layer_1/inner_group_0/layer_normalization_4/gamma/adam_m/Read/ReadVariableOp:	
In[83] bert/encoder/transformer/group_0_1/layer_1/inner_group_0/layer_normalization_4/gamma/adam_v/Read/ReadVariableOp:	
In[84] bert/encoder/transformer/group_0_10/layer_10/inner_group_0/layer_normalization_21/beta/Read/ReadVariableOp:	
In[85] bert/encoder/transformer/group_0_10/layer_10/inner_group_0/layer_normalization_21/beta/adam_m/Read/ReadVariableOp:	
In[86] bert/encoder/transformer/group_0_10/layer_10/inner_group_0/layer_normalization_21/beta/adam_v/Read/ReadVariableOp:	
In[87] bert/encoder/transformer/group_0_10/layer_10/inner_group_0/layer_normalization_21/gamma/Read/ReadVariableOp:	
In[88] bert/encoder/transformer/group_0_10/layer_10/inner_group_0/layer_normalization_21/gamma/adam_m/Read/ReadVariableOp:	
In[89] bert/encoder/transformer/group_0_10/layer_10/inner_group_0/layer_normalization_21/gamma/adam_v/Read/ReadVariableOp:	
In[90] bert/encoder/transformer/group_0_10/layer_10/inner_group_0/layer_normalization_22/beta/Read/ReadVariableOp:	
In[91] bert/encoder/transformer/group_0_10/layer_10/inner_group_0/layer_normalization_22/beta/adam_m/Read/ReadVariableOp:	
In[92] bert/encoder/transformer/group_0_10/layer_10/inner_group_0/layer_normalization_22/beta/adam_v/Read/ReadVariableOp:	
In[93] bert/encoder/transformer/group_0_10/layer_10/inner_group_0/layer_normalization_22/gamma/Read/ReadVariableOp:	
In[94] bert/encoder/transformer/group_0_10/layer_10/inner_group_0/layer_normalization_22/gamma/adam_m/Read/ReadVariableOp:	
In[95] bert/encoder/transformer/group_0_10/layer_10/inner_group_0/layer_normalization_22/gamma/adam_v/Read/ReadVariableOp:	
In[96] bert/encoder/transformer/group_0_11/layer_11/inner_group_0/layer_normalization_23/beta/Read/ReadVariableOp:	
In[97] bert/encoder/transformer/group_0_11/layer_11/inner_group_0/layer_normalization_23/beta/adam_m/Read/ReadVariableOp:	
In[98] bert/encoder/transformer/group_0_11/layer_11/inner_group_0/layer_normalization_23/beta/adam_v/Read/ReadVariableOp:	
In[99] bert/encoder/transformer/group_0_11/layer_11/inner_group_0/layer_normalization_23/gamma/Read/ReadVariableOp:	
In[100] bert/encoder/transformer/group_0_11/layer_11/inner_group_0/layer_normalization_23/gamma/adam_m/Read/ReadVariableOp:	
In[101] bert/encoder/transformer/group_0_11/layer_11/inner_group_0/layer_normalization_23/gamma/adam_v/Read/ReadVariableOp:	
In[102] bert/encoder/transformer/group_0_11/layer_11/inner_group_0/layer_normalization_24/beta/Read/ReadVariableOp:	
In[103] bert/encoder/transformer/group_0_11/layer_11/inner_group_0/layer_normalization_24/beta/adam_m/Read/ReadVariableOp:	
In[104] bert/encoder/transformer/group_0_11/layer_11/inner_group_0/layer_normalization_24/beta/adam_v/Read/ReadVariableOp:	
In[105] bert/encoder/transformer/group_0_11/layer_11/inner_group_0/layer_normalization_24/gamma/Read/ReadVariableOp:	
In[106] bert/encoder/transformer/group_0_11/layer_11/inner_group_0/layer_normalization_24/gamma/adam_m/Read/ReadVariableOp:	
In[107] bert/encoder/transformer/group_0_11/layer_11/inner_group_0/layer_normalization_24/gamma/adam_v/Read/ReadVariableOp:	
In[108] bert/encoder/transformer/group_0_2/layer_2/inner_group_0/layer_normalization_5/beta/Read/ReadVariableOp:	
In[109] bert/encoder/transformer/group_0_2/layer_2/inner_group_0/layer_normalization_5/beta/adam_m/Read/ReadVariableOp:	
In[110] bert/encoder/transformer/group_0_2/layer_2/inner_group_0/layer_normalization_5/beta/adam_v/Read/ReadVariableOp:	
In[111] bert/encoder/transformer/group_0_2/layer_2/inner_group_0/layer_normalization_5/gamma/Read/ReadVariableOp:	
In[112] bert/encoder/transformer/group_0_2/layer_2/inner_group_0/layer_normalization_5/gamma/adam_m/Read/ReadVariableOp:	
In[113] bert/encoder/transformer/group_0_2/layer_2/inner_group_0/layer_normalization_5/gamma/adam_v/Read/ReadVariableOp:	
In[114] bert/encoder/transformer/group_0_2/layer_2/inner_group_0/layer_normalization_6/beta/Read/ReadVariableOp:	
In[115] bert/encoder/transformer/group_0_2/layer_2/inner_group_0/layer_normalization_6/beta/adam_m/Read/ReadVariableOp:	
In[116] bert/encoder/transformer/group_0_2/layer_2/inner_group_0/layer_normalization_6/beta/adam_v/Read/ReadVariableOp:	
In[117] bert/encoder/transformer/group_0_2/layer_2/inner_group_0/layer_normalization_6/gamma/Read/ReadVariableOp:	
In[118] bert/encoder/transformer/group_0_2/layer_2/inner_group_0/layer_normalization_6/gamma/adam_m/Read/ReadVariableOp:	
In[119] bert/encoder/transformer/group_0_2/layer_2/inner_group_0/layer_normalization_6/gamma/adam_v/Read/ReadVariableOp:	
In[120] bert/encoder/transformer/group_0_3/layer_3/inner_group_0/layer_normalization_7/beta/Read/ReadVariableOp:	
In[121] bert/encoder/transformer/group_0_3/layer_3/inner_group_0/layer_normalization_7/beta/adam_m/Read/ReadVariableOp:	
In[122] bert/encoder/transformer/group_0_3/layer_3/inner_group_0/layer_normalization_7/beta/adam_v/Read/ReadVariableOp:	
In[123] bert/encoder/transformer/group_0_3/layer_3/inner_group_0/layer_normalization_7/gamma/Read/ReadVariableOp:	
In[124] bert/encoder/transformer/group_0_3/layer_3/inner_group_0/layer_normalization_7/gamma/adam_m/Read/ReadVariableOp:	
In[125] bert/encoder/transformer/group_0_3/layer_3/inner_group_0/layer_normalization_7/gamma/adam_v/Read/ReadVariableOp:	
In[126] bert/encoder/transformer/group_0_3/layer_3/inner_group_0/layer_normalization_8/beta/Read/ReadVariableOp:	
In[127] bert/encoder/transformer/group_0_3/layer_3/inner_group_0/layer_normalization_8/beta/adam_m/Read/ReadVariableOp:	
In[128] bert/encoder/transformer/group_0_3/layer_3/inner_group_0/layer_normalization_8/beta/adam_v/Read/ReadVariableOp:	
In[129] bert/encoder/transformer/group_0_3/layer_3/inner_group_0/layer_normalization_8/gamma/Read/ReadVariableOp:	
In[130] bert/encoder/transformer/group_0_3/layer_3/inner_group_0/layer_normalization_8/gamma/adam_m/Read/ReadVariableOp:	
In[131] bert/encoder/transformer/group_0_3/layer_3/inner_group_0/layer_normalization_8/gamma/adam_v/Read/ReadVariableOp:	
In[132] bert/encoder/transformer/group_0_4/layer_4/inner_group_0/layer_normalization_10/beta/Read/ReadVariableOp:	
In[133] bert/encoder/transformer/group_0_4/layer_4/inner_group_0/layer_normalization_10/beta/adam_m/Read/ReadVariableOp:	
In[134] bert/encoder/transformer/group_0_4/layer_4/inner_group_0/layer_normalization_10/beta/adam_v/Read/ReadVariableOp:	
In[135] bert/encoder/transformer/group_0_4/layer_4/inner_group_0/layer_normalization_10/gamma/Read/ReadVariableOp:	
In[136] bert/encoder/transformer/group_0_4/layer_4/inner_group_0/layer_normalization_10/gamma/adam_m/Read/ReadVariableOp:	
In[137] bert/encoder/transformer/group_0_4/layer_4/inner_group_0/layer_normalization_10/gamma/adam_v/Read/ReadVariableOp:	
In[138] bert/encoder/transformer/group_0_4/layer_4/inner_group_0/layer_normalization_9/beta/Read/ReadVariableOp:	
In[139] bert/encoder/transformer/group_0_4/layer_4/inner_group_0/layer_normalization_9/beta/adam_m/Read/ReadVariableOp:	
In[140] bert/encoder/transformer/group_0_4/layer_4/inner_group_0/layer_normalization_9/beta/adam_v/Read/ReadVariableOp:	
In[141] bert/encoder/transformer/group_0_4/layer_4/inner_group_0/layer_normalization_9/gamma/Read/ReadVariableOp:	
In[142] bert/encoder/transformer/group_0_4/layer_4/inner_group_0/layer_normalization_9/gamma/adam_m/Read/ReadVariableOp:	
In[143] bert/encoder/transformer/group_0_4/layer_4/inner_group_0/layer_normalization_9/gamma/adam_v/Read/ReadVariableOp:	
In[144] bert/encoder/transformer/group_0_5/layer_5/inner_group_0/layer_normalization_11/beta/Read/ReadVariableOp:	
In[145] bert/encoder/transformer/group_0_5/layer_5/inner_group_0/layer_normalization_11/beta/adam_m/Read/ReadVariableOp:	
In[146] bert/encoder/transformer/group_0_5/layer_5/inner_group_0/layer_normalization_11/beta/adam_v/Read/ReadVariableOp:	
In[147] bert/encoder/transformer/group_0_5/layer_5/inner_group_0/layer_normalization_11/gamma/Read/ReadVariableOp:	
In[148] bert/encoder/transformer/group_0_5/layer_5/inner_group_0/layer_normalization_11/gamma/adam_m/Read/ReadVariableOp:	
In[149] bert/encoder/transformer/group_0_5/layer_5/inner_group_0/layer_normalization_11/gamma/adam_v/Read/ReadVariableOp:	
In[150] bert/encoder/transformer/group_0_5/layer_5/inner_group_0/layer_normalization_12/beta/Read/ReadVariableOp:	
In[151] bert/encoder/transformer/group_0_5/layer_5/inner_group_0/layer_normalization_12/beta/adam_m/Read/ReadVariableOp:	
In[152] bert/encoder/transformer/group_0_5/layer_5/inner_group_0/layer_normalization_12/beta/adam_v/Read/ReadVariableOp:	
In[153] bert/encoder/transformer/group_0_5/layer_5/inner_group_0/layer_normalization_12/gamma/Read/ReadVariableOp:	
In[154] bert/encoder/transformer/group_0_5/layer_5/inner_group_0/layer_normalization_12/gamma/adam_m/Read/ReadVariableOp:	
In[155] bert/encoder/transformer/group_0_5/layer_5/inner_group_0/layer_normalization_12/gamma/adam_v/Read/ReadVariableOp:	
In[156] bert/encoder/transformer/group_0_6/layer_6/inner_group_0/layer_normalization_13/beta/Read/ReadVariableOp:	
In[157] bert/encoder/transformer/group_0_6/layer_6/inner_group_0/layer_normalization_13/beta/adam_m/Read/ReadVariableOp:	
In[158] bert/encoder/transformer/group_0_6/layer_6/inner_group_0/layer_normalization_13/beta/adam_v/Read/ReadVariableOp:	
In[159] bert/encoder/transformer/group_0_6/layer_6/inner_group_0/layer_normalization_13/gamma/Read/ReadVariableOp:	
In[160] bert/encoder/transformer/group_0_6/layer_6/inner_group_0/layer_normalization_13/gamma/adam_m/Read/ReadVariableOp:	
In[161] bert/encoder/transformer/group_0_6/layer_6/inner_group_0/layer_normalization_13/gamma/adam_v/Read/ReadVariableOp:	
In[162] bert/encoder/transformer/group_0_6/layer_6/inner_group_0/layer_normalization_14/beta/Read/ReadVariableOp:	
In[163] bert/encoder/transformer/group_0_6/layer_6/inner_group_0/layer_normalization_14/beta/adam_m/Read/ReadVariableOp:	
In[164] bert/encoder/transformer/group_0_6/layer_6/inner_group_0/layer_normalization_14/beta/adam_v/Read/ReadVariableOp:	
In[165] bert/encoder/transformer/group_0_6/layer_6/inner_group_0/layer_normalization_14/gamma/Read/ReadVariableOp:	
In[166] bert/encoder/transformer/group_0_6/layer_6/inner_group_0/layer_normalization_14/gamma/adam_m/Read/ReadVariableOp:	
In[167] bert/encoder/transformer/group_0_6/layer_6/inner_group_0/layer_normalization_14/gamma/adam_v/Read/ReadVariableOp:	
In[168] bert/encoder/transformer/group_0_7/layer_7/inner_group_0/layer_normalization_15/beta/Read/ReadVariableOp:	
In[169] bert/encoder/transformer/group_0_7/layer_7/inner_group_0/layer_normalization_15/beta/adam_m/Read/ReadVariableOp:	
In[170] bert/encoder/transformer/group_0_7/layer_7/inner_group_0/layer_normalization_15/beta/adam_v/Read/ReadVariableOp:	
In[171] bert/encoder/transformer/group_0_7/layer_7/inner_group_0/layer_normalization_15/gamma/Read/ReadVariableOp:	
In[172] bert/encoder/transformer/group_0_7/layer_7/inner_group_0/layer_normalization_15/gamma/adam_m/Read/ReadVariableOp:	
In[173] bert/encoder/transformer/group_0_7/layer_7/inner_group_0/layer_normalization_15/gamma/adam_v/Read/ReadVariableOp:	
In[174] bert/encoder/transformer/group_0_7/layer_7/inner_group_0/layer_normalization_16/beta/Read/ReadVariableOp:	
In[175] bert/encoder/transformer/group_0_7/layer_7/inner_group_0/layer_normalization_16/beta/adam_m/Read/ReadVariableOp:	
In[176] bert/encoder/transformer/group_0_7/layer_7/inner_group_0/layer_normalization_16/beta/adam_v/Read/ReadVariableOp:	
In[177] bert/encoder/transformer/group_0_7/layer_7/inner_group_0/layer_normalization_16/gamma/Read/ReadVariableOp:	
In[178] bert/encoder/transformer/group_0_7/layer_7/inner_group_0/layer_normalization_16/gamma/adam_m/Read/ReadVariableOp:	
In[179] bert/encoder/transformer/group_0_7/layer_7/inner_group_0/layer_normalization_16/gamma/adam_v/Read/ReadVariableOp:	
In[180] bert/encoder/transformer/group_0_8/layer_8/inner_group_0/layer_normalization_17/beta/Read/ReadVariableOp:	
In[181] bert/encoder/transformer/group_0_8/layer_8/inner_group_0/layer_normalization_17/beta/adam_m/Read/ReadVariableOp:	
In[182] bert/encoder/transformer/group_0_8/layer_8/inner_group_0/layer_normalization_17/beta/adam_v/Read/ReadVariableOp:	
In[183] bert/encoder/transformer/group_0_8/layer_8/inner_group_0/layer_normalization_17/gamma/Read/ReadVariableOp:	
In[184] bert/encoder/transformer/group_0_8/layer_8/inner_group_0/layer_normalization_17/gamma/adam_m/Read/ReadVariableOp:	
In[185] bert/encoder/transformer/group_0_8/layer_8/inner_group_0/layer_normalization_17/gamma/adam_v/Read/ReadVariableOp:	
In[186] bert/encoder/transformer/group_0_8/layer_8/inner_group_0/layer_normalization_18/beta/Read/ReadVariableOp:	
In[187] bert/encoder/transformer/group_0_8/layer_8/inner_group_0/layer_normalization_18/beta/adam_m/Read/ReadVariableOp:	
In[188] bert/encoder/transformer/group_0_8/layer_8/inner_group_0/layer_normalization_18/beta/adam_v/Read/ReadVariableOp:	
In[189] bert/encoder/transformer/group_0_8/layer_8/inner_group_0/layer_normalization_18/gamma/Read/ReadVariableOp:	
In[190] bert/encoder/transformer/group_0_8/layer_8/inner_group_0/layer_normalization_18/gamma/adam_m/Read/ReadVariableOp:	
In[191] bert/encoder/transformer/group_0_8/layer_8/inner_group_0/layer_normalization_18/gamma/adam_v/Read/ReadVariableOp:	
In[192] bert/encoder/transformer/group_0_9/layer_9/inner_group_0/layer_normalization_19/beta/Read/ReadVariableOp:	
In[193] bert/encoder/transformer/group_0_9/layer_9/inner_group_0/layer_normalization_19/beta/adam_m/Read/ReadVariableOp:	
In[194] bert/encoder/transformer/group_0_9/layer_9/inner_group_0/layer_normalization_19/beta/adam_v/Read/ReadVariableOp:	
In[195] bert/encoder/transformer/group_0_9/layer_9/inner_group_0/layer_normalization_19/gamma/Read/ReadVariableOp:	
In[196] bert/encoder/transformer/group_0_9/layer_9/inner_group_0/layer_normalization_19/gamma/adam_m/Read/ReadVariableOp:	
In[197] bert/encoder/transformer/group_0_9/layer_9/inner_group_0/layer_normalization_19/gamma/adam_v/Read/ReadVariableOp:	
In[198] bert/encoder/transformer/group_0_9/layer_9/inner_group_0/layer_normalization_20/beta/Read/ReadVariableOp:	
In[199] bert/encoder/transformer/group_0_9/layer_9/inner_group_0/layer_normalization_20/beta/adam_m/Read/ReadVariableOp:	
In[200] bert/encoder/transformer/group_0_9/layer_9/inner_group_0/layer_normalization_20/beta/adam_v/Read/ReadVariableOp:	
In[201] bert/encoder/transformer/group_0_9/layer_9/inner_group_0/layer_normalization_20/gamma/Read/ReadVariableOp:	
In[202] bert/encoder/transformer/group_0_9/layer_9/inner_group_0/layer_normalization_20/gamma/adam_m/Read/ReadVariableOp:	
In[203] bert/encoder/transformer/group_0_9/layer_9/inner_group_0/layer_normalization_20/gamma/adam_v/Read/ReadVariableOp:	
In[204] bert/pooler/dense/bias/Read/ReadVariableOp (defined at C:\Users\XinXining\anaconda3\lib\site-packages\keras\engine\base_layer_v1.py:423)	
In[205] bert/pooler/dense/bias/adam_m/Read/ReadVariableOp:	
In[206] bert/pooler/dense/bias/adam_v/Read/ReadVariableOp:	
In[207] bert/pooler/dense/kernel/Read/ReadVariableOp:	
In[208] bert/pooler/dense/kernel/adam_m/Read/ReadVariableOp:	
In[209] bert/pooler/dense/kernel/adam_v/Read/ReadVariableOp:	
In[210] cls/predictions/output_bias/Read/ReadVariableOp (defined at <ipython-input-3-a9c4ee69f617>:21)	
In[211] cls/predictions/output_bias/adam_m/Read/ReadVariableOp:	
In[212] cls/predictions/output_bias/adam_v/Read/ReadVariableOp:	
In[213] cls/predictions/transform/dense/bias/Read/ReadVariableOp:	
In[214] cls/predictions/transform/dense/bias/adam_m/Read/ReadVariableOp:	
In[215] cls/predictions/transform/dense/bias/adam_v/Read/ReadVariableOp:	
In[216] cls/predictions/transform/dense/kernel/Read/ReadVariableOp:	
In[217] cls/predictions/transform/dense/kernel/adam_m/Read/ReadVariableOp:	
In[218] cls/predictions/transform/dense/kernel/adam_v/Read/ReadVariableOp:	
In[219] cls/predictions/transform/layer_normalization_25/beta/Read/ReadVariableOp:	
In[220] cls/predictions/transform/layer_normalization_25/beta/adam_m/Read/ReadVariableOp:	
In[221] cls/predictions/transform/layer_normalization_25/beta/adam_v/Read/ReadVariableOp:	
In[222] cls/predictions/transform/layer_normalization_25/gamma/Read/ReadVariableOp:	
In[223] cls/predictions/transform/layer_normalization_25/gamma/adam_m/Read/ReadVariableOp:	
In[224] cls/predictions/transform/layer_normalization_25/gamma/adam_v/Read/ReadVariableOp:	
In[225] cls/seq_relationship/output_bias/Read/ReadVariableOp (defined at <ipython-input-4-fb70aba96a2b>:12)	
In[226] cls/seq_relationship/output_bias/adam_m/Read/ReadVariableOp:	
In[227] cls/seq_relationship/output_bias/adam_v/Read/ReadVariableOp:	
In[228] cls/seq_relationship/output_weights/Read/ReadVariableOp (defined at <ipython-input-4-fb70aba96a2b>:7)	
In[229] cls/seq_relationship/output_weights/adam_m/Read/ReadVariableOp:	
In[230] cls/seq_relationship/output_weights/adam_v/Read/ReadVariableOp:	
In[231] global_step/Read/ReadVariableOp (defined at C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\tpu\tpu_estimator.py:147)

Operation defined at: (most recent call last)
>>>   File "C:\Users\XinXining\anaconda3\lib\runpy.py", line 194, in _run_module_as_main
>>>     return _run_code(code, main_globals, None,
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\runpy.py", line 87, in _run_code
>>>     exec(code, run_globals)
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
>>>     app.launch_new_instance()
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\traitlets\config\application.py", line 845, in launch_instance
>>>     app.start()
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 612, in start
>>>     self.io_loop.start()
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
>>>     self.asyncio_loop.run_forever()
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\asyncio\base_events.py", line 570, in run_forever
>>>     self._run_once()
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\asyncio\base_events.py", line 1859, in _run_once
>>>     handle._run()
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\asyncio\events.py", line 81, in _run
>>>     self._context.run(self._callback, *self._args)
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\ioloop.py", line 688, in <lambda>
>>>     lambda f: self._run_callback(functools.partial(callback, future))
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\ioloop.py", line 741, in _run_callback
>>>     ret = callback()
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\gen.py", line 814, in inner
>>>     self.ctx_run(self.run)
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\gen.py", line 775, in run
>>>     yielded = self.gen.send(value)
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 381, in dispatch_queue
>>>     yield self.process_one()
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\gen.py", line 250, in wrapper
>>>     runner = Runner(ctx_run, result, future, yielded)
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\gen.py", line 741, in __init__
>>>     self.ctx_run(self.run)
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\gen.py", line 775, in run
>>>     yielded = self.gen.send(value)
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 365, in process_one
>>>     yield gen.maybe_future(dispatch(*args))
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\gen.py", line 234, in wrapper
>>>     yielded = ctx_run(next, result)
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 268, in dispatch_shell
>>>     yield gen.maybe_future(handler(stream, idents, msg))
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\gen.py", line 234, in wrapper
>>>     yielded = ctx_run(next, result)
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 543, in execute_request
>>>     self.do_execute(
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\gen.py", line 234, in wrapper
>>>     yielded = ctx_run(next, result)
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 306, in do_execute
>>>     res = shell.run_cell(code, store_history=store_history, silent=silent)
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
>>>     return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2894, in run_cell
>>>     result = self._run_cell(
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2940, in _run_cell
>>>     return runner(coro)
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
>>>     coro.send(None)
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3165, in run_cell_async
>>>     has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3357, in run_ast_nodes
>>>     if (await self.run_code(code, result,  async_=asy)):
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3437, in run_code
>>>     exec(code_obj, self.user_global_ns, self.user_ns)
>>> 
>>>   File "<ipython-input-9-d55d79ca8a29>", line 65, in <module>
>>>     estimator.train(input_fn=train_input_fn, max_steps=PARAMS_num_train_steps)
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\tpu\tpu_estimator.py", line 3092, in train
>>>     return super(TPUEstimator, self).train(
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 360, in train
>>>     loss = self._train_model(input_fn, hooks, saving_listeners)
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1186, in _train_model
>>>     return self._train_model_default(input_fn, hooks, saving_listeners)
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1217, in _train_model_default
>>>     return self._train_with_estimator_spec(estimator_spec, worker_hooks,
>>> 
>>>   File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1512, in _train_with_estimator_spec
>>>     with training.MonitoredTrainingSession(
>>> 

Original stack trace for 'save/SaveV2':
  File "C:\Users\XinXining\anaconda3\lib\runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\XinXining\anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\XinXining\anaconda3\lib\site-packages\traitlets\config\application.py", line 845, in launch_instance
    app.start()
  File "C:\Users\XinXining\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 612, in start
    self.io_loop.start()
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\XinXining\anaconda3\lib\asyncio\base_events.py", line 570, in run_forever
    self._run_once()
  File "C:\Users\XinXining\anaconda3\lib\asyncio\base_events.py", line 1859, in _run_once
    handle._run()
  File "C:\Users\XinXining\anaconda3\lib\asyncio\events.py", line 81, in _run
    self._context.run(self._callback, *self._args)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\ioloop.py", line 688, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\ioloop.py", line 741, in _run_callback
    ret = callback()
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\gen.py", line 814, in inner
    self.ctx_run(self.run)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\gen.py", line 775, in run
    yielded = self.gen.send(value)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 381, in dispatch_queue
    yield self.process_one()
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\gen.py", line 250, in wrapper
    runner = Runner(ctx_run, result, future, yielded)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\gen.py", line 741, in __init__
    self.ctx_run(self.run)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\gen.py", line 775, in run
    yielded = self.gen.send(value)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 365, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 268, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 543, in execute_request
    self.do_execute(
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tornado\gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 306, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2894, in run_cell
    result = self._run_cell(
  File "C:\Users\XinXining\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2940, in _run_cell
    return runner(coro)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3165, in run_cell_async
    has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  File "C:\Users\XinXining\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3357, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "C:\Users\XinXining\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-d55d79ca8a29>", line 65, in <module>
    estimator.train(input_fn=train_input_fn, max_steps=PARAMS_num_train_steps)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\tpu\tpu_estimator.py", line 3092, in train
    return super(TPUEstimator, self).train(
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 360, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1186, in _train_model
    return self._train_model_default(input_fn, hooks, saving_listeners)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1217, in _train_model_default
    return self._train_with_estimator_spec(estimator_spec, worker_hooks,
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1512, in _train_with_estimator_spec
    with training.MonitoredTrainingSession(
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 613, in MonitoredTrainingSession
    return MonitoredSession(
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 1058, in __init__
    super(MonitoredSession, self).__init__(
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 761, in __init__
    self._sess = _RecoverableSession(self._coordinated_creator)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 1267, in __init__
    _WrappedSession.__init__(self, self._create_session())
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 1272, in _create_session
    return self._sess_creator.create_session()
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 914, in create_session
    self.tf_sess = self._session_creator.create_session()
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 672, in create_session
    self._scaffold.finalize()
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 244, in finalize
    self._saver.build()
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 935, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 963, in _build
    self.saver_def = self._builder._build_internal(  # pylint: disable=protected-access
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 525, in _build_internal
    save_tensor = self._AddShardedSaveOps(filename_tensor, per_device)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 317, in _AddShardedSaveOps
    return self._AddShardedSaveOpsForV2(filename_tensor, per_device)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 291, in _AddShardedSaveOpsForV2
    sharded_saves.append(self._AddSaveOps(sharded_filename, saveables))
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 223, in _AddSaveOps
    save = self.save_op(filename_tensor, saveables)
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\training\saver.py", line 138, in save_op
    return io_ops.save_v2(filename_tensor, tensor_names, tensor_slices,
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\ops\gen_io_ops.py", line 1712, in save_v2
    _, _, _op, _outputs = _op_def_library._apply_op_helper(
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 744, in _apply_op_helper
    op = g._create_op_internal(op_type_name, inputs, dtypes=None,
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3697, in _create_op_internal
    ret = Operation(
  File "C:\Users\XinXining\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 2101, in __init__
    self._traceback = tf_stack.extract_stack_for_node(self._c_op)
