In [1]:
import os
import sys
import pandas as pd
from datetime import datetime

from google.colab import drive

In [2]:
# Mount Google Drive under /content/gdrive so the notebook can reach the project
# and data folders configured below. force_remount=True is passed so the mount is
# redone even when the cell is re-run in the same session.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
# Notebook folder on the mounted Google Drive; it is later appended to sys.path
# and used to locate the requirements file.
ROOT_DIR = '/content/gdrive/MyDrive/Colab Notebooks'
# Data folder of the disaster-tweets task: the formatted train/valid/test CSVs are
# read from here, and model output directories and submission files are written here.
DATA_DIR = os.path.join(ROOT_DIR, 'data', 'disaster_tweets')

In [4]:
# Echo the configured locations, one per line, as a quick sanity check.
print(ROOT_DIR, DATA_DIR, sep='\n')

/content/gdrive/MyDrive/Colab Notebooks
/content/gdrive/MyDrive/Colab Notebooks/data/disaster_tweets


In [5]:
# Register the notebook folder on the import path exactly once, so re-running
# this cell never appends a duplicate entry.
if sys.path.count(ROOT_DIR) == 0:
  sys.path.append(ROOT_DIR)

In [6]:
# Install the notebook's dependencies from the requirements file kept in ROOT_DIR.
# The path is quoted because ROOT_DIR contains a space ("Colab Notebooks").
!pip install -r "$ROOT_DIR/requirementscl.txt"



In [7]:
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Fine-tuning the library models for sequence classification."""
import logging
import os
from dataclasses import dataclass, field
from typing import Dict, Optional

import datasets
import numpy as np
import tensorflow as tf

from transformers import (
    AutoConfig,
    AutoTokenizer,
    EvalPrediction,
    HfArgumentParser,
    PreTrainedTokenizer,
    TFAutoModelForSequenceClassification,
    TFTrainer,
    TFTrainingArguments,
)
from transformers.utils import logging as hf_logging


# Configure the transformers library logging: INFO verbosity, its default handler
# enabled, and the explicit record format (presumably the
# "[LEVEL|file:line] time >> message" lines seen in the cell output — confirm
# against the transformers logging docs).
hf_logging.set_verbosity_info()
hf_logging.enable_default_handler()
hf_logging.enable_explicit_format()


def get_tfds(
    train_file: str,
    eval_file: str,
    test_file: str,
    tokenizer: PreTrainedTokenizer,
    label_column_id: int,
    max_seq_length: Optional[int] = None,
):
    """Load CSV splits, tokenize them and wrap each split in a ``tf.data.Dataset``.

    Args:
        train_file: Path of the training CSV, or ``None`` to skip that split.
        eval_file: Path of the validation CSV, or ``None`` to skip that split.
        test_file: Path of the test CSV, or ``None`` to skip that split.
        tokenizer: Tokenizer used to encode the text column(s). Only the encoded
            fields listed in ``tokenizer.model_input_names`` are fed to the model.
        label_column_id: Zero-based position of the label column among the CSV
            columns. The one or two remaining columns are treated as text input.
        max_seq_length: Length every example is truncated and padded to.

    Returns:
        ``(train_ds, val_ds, test_ds, label2id)``. Each dataset yields
        ``(features_dict, label_id)`` pairs and is ``None`` when its file was not
        given. ``label2id`` maps every label value found in the first provided
        split to an integer id.

    Raises:
        ValueError: If no file is given at all, or if the CSV does not have
            exactly one or two columns besides the label column.
    """
    files = {}

    if train_file is not None:
        files[datasets.Split.TRAIN] = [train_file]
    if eval_file is not None:
        files[datasets.Split.VALIDATION] = [eval_file]
    if test_file is not None:
        files[datasets.Split.TEST] = [test_file]

    if not files:
        raise ValueError("At least one of train_file, eval_file or test_file must be provided.")

    ds = datasets.load_dataset("csv", data_files=files)

    # Column names and the label vocabulary are taken from the first provided split.
    first_split = next(iter(files))
    features_name = list(ds[first_split].features.keys())
    print(f"Features Name: {features_name}")

    label_name = features_name.pop(label_column_id)
    unique_labels = set(ds[first_split][label_name])
    # Sort so that the label -> id mapping is identical on every run; iterating a
    # set directly gives an order that is not guaranteed (notably for strings).
    try:
        label_list = sorted(unique_labels)
    except TypeError:
        # Labels of mixed, non-comparable types: fall back to a stable textual order.
        label_list = sorted(unique_labels, key=repr)
    label2id = {label: i for i, label in enumerate(label_list)}
    input_names = tokenizer.model_input_names

    if len(features_name) == 1:

        def encode(batch):
            return tokenizer.batch_encode_plus(
                batch[features_name[0]], truncation=True, max_length=max_seq_length, padding="max_length"
            )

    elif len(features_name) == 2:

        def encode(batch):
            # NOTE(review): the two text columns are passed as a tuple of column
            # batches, exactly as before — confirm the tokenizer encodes this as
            # sentence pairs.
            return tokenizer.batch_encode_plus(
                (batch[features_name[0]], batch[features_name[1]]),
                truncation=True,
                max_length=max_seq_length,
                padding="max_length",
            )

    else:
        raise ValueError(
            f"Expected one or two text columns besides the label column, "
            f"got {len(features_name)}: {features_name}"
        )

    transformed_ds = {split: ds[split].map(encode, batched=True) for split in files}

    def _to_tf_dataset(split):
        """Wrap one tokenized split in a tf.data.Dataset; None if the split is absent."""
        if split not in transformed_ds:
            return None

        def gen():
            for ex in transformed_ds[split]:
                model_inputs = {k: v for k, v in ex.items() if k in input_names}
                yield (model_inputs, label2id[ex[label_name]])

        tf_ds = tf.data.Dataset.from_generator(
            gen,
            ({k: tf.int32 for k in input_names}, tf.int64),
            ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])),
        )
        # A generator-backed dataset has no known size; declaring the cardinality
        # is what makes len(dataset) usable by the callers.
        return tf_ds.apply(tf.data.experimental.assert_cardinality(len(ds[split])))

    train_ds = _to_tf_dataset(datasets.Split.TRAIN)
    val_ds = _to_tf_dataset(datasets.Split.VALIDATION)
    test_ds = _to_tf_dataset(datasets.Split.TEST)

    return train_ds, val_ds, test_ds, label2id


# Module-level logger; run_training and the main loop write their progress messages to it.
logger = logging.getLogger(__name__)


@dataclass
class DataTrainingArguments:
    """
    Arguments describing the data used for training, evaluation and prediction.

    The class is handed to `HfArgumentParser`, which turns each field into a
    command-line argument of the same name, so the values can be specified on
    the command line (or through `sys.argv`).
    """

    # Zero-based position of the label column; the only field without a default.
    label_column_id: int = field(metadata={"help": "Which column contains the label"})
    # All three file paths default to None, hence Optional; a missing path means
    # the corresponding split is simply not loaded.
    train_file: Optional[str] = field(default=None, metadata={"help": "The path of the training file"})
    dev_file: Optional[str] = field(default=None, metadata={"help": "The path of the development file"})
    test_file: Optional[str] = field(default=None, metadata={"help": "The path of the test file"})
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    # NOTE(review): not read by any code visible in this notebook.
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )


@dataclass
class ModelArguments:
    """
    Selects the pretrained model, config and tokenizer that fine-tuning starts from.
    """

    # Required: there is no default, so the argument must always be supplied.
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"},
    )
    config_name: Optional[str] = field(
        metadata={"help": "Pretrained config name or path if not the same as model_name"},
        default=None,
    )
    tokenizer_name: Optional[str] = field(
        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
        default=None,
    )
    # Further tokenizer tweaks belong in a separate script or in the tokenizer's
    # own tokenizer_config.json, not in this argument set.
    use_fast: bool = field(
        metadata={"help": "Set this flag to use fast tokenization."},
        default=False,
    )
    cache_dir: Optional[str] = field(
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
        default=None,
    )

In [8]:
def run_training(model_name, results_name):
    """Fine-tune, evaluate and run prediction for one pretrained model.

    Every setting other than the two parameters is parsed from ``sys.argv`` by
    ``HfArgumentParser`` (model, data and TF training arguments), so the caller
    has to populate ``sys.argv`` before calling this function.

    Args:
        model_name: Model identifier. It is only printed for traceability; the
            model that is actually loaded comes from ``--model_name_or_path``.
        results_name: Run label embedded in the names of the evaluation-results
            file and of the submission CSV.

    Raises:
        ValueError: If the output directory already exists, is not empty,
            training is requested and ``--overwrite_output_dir`` is not set; or
            if ``--do_predict`` is given without a test file.

    Side effects:
        Writes the trained model and tokenizer to ``output_dir``, an
        ``eval_results_<results_name>.txt`` file there when evaluating, and a
        ``sub_<accuracy>_<results_name>.csv`` submission into the notebook-level
        ``DATA_DIR`` when predicting.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_replicas,
        bool(training_args.n_replicas > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    train_dataset, eval_dataset, test_ds, label2id = get_tfds(
        train_file=data_args.train_file,
        eval_file=data_args.dev_file,
        test_file=data_args.test_file,
        tokenizer=tokenizer,
        label_column_id=data_args.label_column_id,
        max_seq_length=data_args.max_seq_length,
    )

    # Inverse mapping: used for the model config and to turn predicted class
    # ids back into the original label values.
    id2label = {label_id: label for label, label_id in label2id.items()}

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=len(label2id),
        label2id=label2id,
        id2label=id2label,
        finetuning_task="text-classification",
        cache_dir=model_args.cache_dir,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)

        return {"acc": (preds == p.label_ids).mean()}

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # A split is None when its file was not supplied, so guard the len() calls.
    for split_label, split_ds in (("train", train_dataset), ("valid", eval_dataset), ("test", test_ds)):
        split_size = len(split_ds) if split_ds is not None else "not provided"
        print(f"{split_label}: {split_size}")
    print(f"Model name: {model_name}")
    print(f"Results name: {results_name}")

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    # Fallback tag for the submission file name when no accuracy is available
    # (evaluation skipped, or no accuracy metric reported).
    accuracy_str = "aunk"
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        result = trainer.evaluate()
        # The directory may not exist yet when evaluating without training.
        os.makedirs(training_args.output_dir, exist_ok=True)
        output_eval_file = os.path.join(training_args.output_dir, f"eval_results_{results_name}.txt")

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")

            for key, value in result.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))

        accuracy_keys = [key for key in result if "acc" in key]
        if accuracy_keys:
            accuracy = result[accuracy_keys[0]]
            # Accuracy in thousandths, zero-padded to four digits: 0.954 -> "a0954".
            accuracy_str = f"a{int(round(accuracy * 1000)):04d}"
        print(accuracy_str)

    # Prediction
    if training_args.do_predict:
        if test_ds is None:
            raise ValueError("--do_predict requires --test_file to be provided.")

        logger.info("*** predictions ***")
        preds = trainer.predict(test_ds)

        logger.info("*** RESUTS: ***")
        logger.info(preds)
        logger.info("*** :RESUTS ***")

        # argmax yields class ids; translate them back to the original labels.
        predicted_ids = np.argmax(preds.predictions, axis=1).tolist()
        decoded_preds = [id2label[class_id] for class_id in predicted_ids]

        # test.csv supplies the 'id' column of the submission; presumably its rows
        # are in the same order as the file passed via --test_file — verify.
        org_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
        org_test['target'] = decoded_preds
        submission_path = os.path.join(DATA_DIR, f'sub_{accuracy_str}_{results_name}.csv')
        org_test[['id', 'target']].to_csv(submission_path, index=False)


In [9]:
#############
# MAIN LOOP #
#############
# Fine-tunes every model in `models` in turn: builds the command line that
# run_training() parses from sys.argv, runs training/evaluation/prediction, then
# deletes the bulky training artefacts from the model's output directory.
import shutil

# AVAILABLE MODELS:
# ConvBertConfig, DistilBertConfig, AlbertConfig, CamembertConfig, XLMRobertaConfig, LongformerConfig, 
# RobertaConfig, BertConfig, XLNetConfig, MobileBertConfig, FlaubertConfig, XLMConfig, ElectraConfig, FunnelConfig, GPT2Config, MPNetConfig, 
# OpenAIGPTConfig, TransfoXLConfig, CTRLConfig.

models = ['distilbert-base-uncased', 'roberta-base', 'bert-base-uncased', 'bert-base-multilingual-uncased']
devsetmode = 'v1024o'
num_epochs = "3"
# Steps in one epoch: 460 for test 256, 444 for test 512, 476 for all records in
# training. Used for both --logging_steps and --save_steps.
steps_per_epoch = "476"

for model_name in models:
  # Same "yymmdd_HHMM" stamp the results names have always carried.
  timestamp = datetime.now().strftime('%y%m%d_%H%M')

  # Hub identifiers may contain '/', which must not leak into file or directory names.
  safe_model_name = model_name.replace('/', '-')
  model_results_name = f"{safe_model_name}_{num_epochs}e_{devsetmode}_{timestamp}"
  model_full_path = os.path.join(DATA_DIR, f"mod_{safe_model_name}")

  # First element stands in for the script name, as argparse expects in sys.argv[0].
  args = ['run_tf_text_classification.py']
  args.extend([
    "--train_file", os.path.join(DATA_DIR, "train_formatted.csv"),
    "--test_file", os.path.join(DATA_DIR, "test_formatted.csv"),
    "--dev_file", os.path.join(DATA_DIR, "valid_formatted.csv"),
    "--label_column_id", "0",
    "--model_name_or_path", model_name,
    "--output_dir", model_full_path,
    "--num_train_epochs", num_epochs,
    "--per_device_train_batch_size", "16",
    "--per_device_eval_batch_size", "32",
    "--do_train",
    "--do_eval",
    "--do_predict",
    "--logging_steps", steps_per_epoch,
    "--evaluation_strategy", "steps",
    "--save_steps", steps_per_epoch,
    "--overwrite_output_dir",
    "--max_seq_length", "128"
  ])

  print("*******************")
  print("*** Processing: ***")
  print("*******************")
  print(f"MODEL NAME: {model_name}")
  print(f"RESULTS NAME: {model_results_name}")
  print(f"TIMESTAMP: {timestamp}")

  sys.argv = args

  # Ensure all new files are accessible (it is not always the case for Google Drive)
  drive.mount('/content/gdrive', force_remount=True)

  run_training(model_name, model_results_name)

  # Clean training artefacts
  logger.info("removing checkpoints...")
  checkpoint_dir = os.path.join(model_full_path, 'checkpoint')
  # No checkpoint directory exists if training ended before the first save step.
  if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)

  for file_name in os.listdir(model_full_path):
    if os.path.splitext(file_name)[1] == '.h5':
      logger.info(f"removing file {file_name}")
      os.remove(os.path.join(model_full_path, file_name))

*******************
*** Processing: ***
*******************
MODEL NAME: distilbert-base-uncased
RESULTS NAME: distilbert-base-uncased_3e_v1024o_210319_2234
TIMESTAMP: 210319_2234


[INFO|training_args.py:631] 2021-03-19 22:34:23,164 >> PyTorch: setting up devices
[INFO|training_args.py:555] 2021-03-19 22:34:23,227 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
[INFO|training_args_tf.py:192] 2021-03-19 22:34:23,236 >> Tensorflow: setting up strategy


Mounted at /content/gdrive


03/19/2021 22:34:23 - INFO - __main__ -   n_replicas: 1, distributed training: False, 16-bits training: False
03/19/2021 22:34:23 - INFO - __main__ -   Training/evaluation parameters TFTrainingArguments(output_dir='/content/gdrive/MyDrive/Colab Notebooks/data/disaster_tweets/mod_distilbert-base-uncased', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=True, evaluation_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=16, per_device_eval_batch_size=32, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, warmup_ratio=0.0, warmup_steps=0, logging_dir='runs/Mar19_22-34-23_a7d129682a59', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=

Features Name: ['target', 'text']


[INFO|configuration_utils.py:463] 2021-03-19 22:34:25,774 >> loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.d423bdf2f58dc8b77d5f5d18028d7ae4a72dcfd8f468e81fe979ada957a8c361
[INFO|configuration_utils.py:499] 2021-03-19 22:34:25,775 >> Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "text-classification",
  "hidden_dim": 3072,
  "id2label": {
    "0": 0,
    "1": 1
  },
  "initializer_range": 0.02,
  "label2id": {
    "0": 0,
    "1": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_v

train: 7613
valid: 1024
test: 3263
Model name: distilbert-base-uncased
Results name: distilbert-base-uncased_3e_v1024o_210319_2234
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported


Cause: while/else statement not yet supported


Cause: while/else statement not yet supported










[INFO|trainer_tf.py:306] 2021-03-19 22:37:03,672 >> ***** Running Evaluation *****
[INFO|trainer_tf.py:307] 2021-03-19 22:37:03,673 >>   Num examples in dataset = 1024
[INFO|trainer_tf.py:309] 2021-03-19 22:37:03,674 >>   Num examples in used in evaluation = 1024
[INFO|trainer_tf.py:310] 2021-03-19 22:37:03,675 >>   Batch size = 32








[INFO|trainer_tf.py:404] 2021-03-19 22:37:09,202 >> {'eval_loss': 0.29513081908226013, 'eval_acc': 0.884765625, 'epoch': 1.0, 'step': 476}
[INFO|trainer_tf.py:404] 2021-03-19 22:37:09,213 >> {'loss': 0.41401082, 'learning_rate': 3.333333e-05, 'epoch': 1.0, 'step': 476}
[INFO|trainer_tf.py:595] 2021-03-19 22:37:12,667 >> Saving checkpoint for step 476 at /content/gdrive/MyDrive/Colab Notebooks/data/disaster_tweets/mod_distilbert-base-uncased/checkpoint/ckpt-1
[INFO|trainer_tf.py:306] 2021-03-19 22:38:54,510 >> ***** Running Evaluation *****
[INFO|trainer_tf.py:307] 2021-03-19 22:38:54,511 >>   Num examples in dataset = 1024
[INFO|trainer_tf.py:309] 2021-03-19 22:38:54,512 >>   Num examples in used in evaluation = 1024
[INFO|trainer_tf.py:310] 2021-03-19 22:38:54,513 >>   Batch size = 32
[INFO|trainer_tf.py:404] 2021-03-19 22:38:59,113 >> {'eval_loss': 0.18254512548446655, 'eval_acc': 0.9375, 'epoch': 2.0, 'step': 952}
[INFO|trainer_tf.py:404] 2021-03-19 22:38:59,121 >> {'loss': 0.264261







[INFO|trainer_tf.py:404] 2021-03-19 22:40:58,864 >> {'eval_loss': 0.13496039807796478, 'eval_acc': 0.9541015625, 'epoch': 3.0, 'step': 1428}
03/19/2021 22:40:58 - INFO - __main__ -   ***** Eval results *****
03/19/2021 22:40:58 - INFO - __main__ -     eval_loss = 0.13496039807796478
03/19/2021 22:40:58 - INFO - __main__ -     eval_acc = 0.9541015625
03/19/2021 22:40:58 - INFO - __main__ -   *** predictions ***
[INFO|trainer_tf.py:306] 2021-03-19 22:40:58,891 >> ***** Running Prediction *****
[INFO|trainer_tf.py:307] 2021-03-19 22:40:58,896 >>   Num examples in dataset = 3263
[INFO|trainer_tf.py:310] 2021-03-19 22:40:58,899 >>   Batch size = 32


a0954






03/19/2021 22:41:14 - INFO - __main__ -   *** RESUTS: ***
03/19/2021 22:41:14 - INFO - __main__ -   PredictionOutput(predictions=array([[-0.9648697,  1.1234663],
       [-2.4418776,  2.7539551],
       [-2.1869729,  2.5227158],
       ...,
       [-2.8132024,  2.9213922],
       [-1.3580459,  1.590666 ],
       [-2.6961486,  2.908961 ]], dtype=float32), label_ids=array([0, 0, 0, ..., 0, 0, 0]), metrics={'eval_loss': 1.7891193464690565, 'eval_acc': 0.6019000919399325})
03/19/2021 22:41:14 - INFO - __main__ -   *** :RESUTS ***
03/19/2021 22:41:14 - INFO - __main__ -   removing checkpoints...
03/19/2021 22:41:14 - INFO - __main__ -   removing file tf_model.h5


*******************
*** Processing: ***
*******************
MODEL NAME: roberta-base
RESULTS NAME: roberta-base_3e_v1024o_210319_2241
TIMESTAMP: 210319_2241


[INFO|training_args.py:631] 2021-03-19 22:41:15,231 >> PyTorch: setting up devices
[INFO|training_args.py:555] 2021-03-19 22:41:15,234 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
[INFO|training_args_tf.py:192] 2021-03-19 22:41:15,238 >> Tensorflow: setting up strategy
03/19/2021 22:41:15 - INFO - __main__ -   n_replicas: 1, distributed training: False, 16-bits training: False
03/19/2021 22:41:15 - INFO - __main__ -   Training/evaluation parameters TFTrainingArguments(output_dir='/content/gdrive/MyDrive/Colab Notebooks/data/disaster_tweets/mod_roberta-base', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=True, evaluation_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=16, per_device_eval_batch

Mounted at /content/gdrive


[INFO|configuration_utils.py:463] 2021-03-19 22:41:15,456 >> loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
[INFO|configuration_utils.py:499] 2021-03-19 22:41:15,457 >> Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.4.2",
  "type_vocab_size": 1,
  "use_cache"

Features Name: ['target', 'text']


[INFO|configuration_utils.py:463] 2021-03-19 22:41:17,540 >> loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
[INFO|configuration_utils.py:499] 2021-03-19 22:41:17,541 >> Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "finetuning_task": "text-classification",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": 0,
    "1": 1
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "0": 0,
    "1": 1
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  

train: 7613
valid: 1024
test: 3263
Model name: roberta-base
Results name: roberta-base_3e_v1024o_210319_2241














[INFO|trainer_tf.py:306] 2021-03-19 22:45:04,203 >> ***** Running Evaluation *****
[INFO|trainer_tf.py:307] 2021-03-19 22:45:04,206 >>   Num examples in dataset = 1024
[INFO|trainer_tf.py:309] 2021-03-19 22:45:04,207 >>   Num examples in used in evaluation = 1024
[INFO|trainer_tf.py:310] 2021-03-19 22:45:04,208 >>   Batch size = 32








[INFO|trainer_tf.py:404] 2021-03-19 22:45:13,973 >> {'eval_loss': 0.36592695116996765, 'eval_acc': 0.859375, 'epoch': 1.0, 'step': 476}
[INFO|trainer_tf.py:404] 2021-03-19 22:45:13,982 >> {'loss': 0.4499769, 'learning_rate': 3.333333e-05, 'epoch': 1.0, 'step': 476}
[INFO|trainer_tf.py:595] 2021-03-19 22:45:22,687 >> Saving checkpoint for step 476 at /content/gdrive/MyDrive/Colab Notebooks/data/disaster_tweets/mod_roberta-base/checkpoint/ckpt-1
[INFO|trainer_tf.py:306] 2021-03-19 22:48:42,890 >> ***** Running Evaluation *****
[INFO|trainer_tf.py:307] 2021-03-19 22:48:42,891 >>   Num examples in dataset = 1024
[INFO|trainer_tf.py:309] 2021-03-19 22:48:42,892 >>   Num examples in used in evaluation = 1024
[INFO|trainer_tf.py:310] 2021-03-19 22:48:42,893 >>   Batch size = 32
[INFO|trainer_tf.py:404] 2021-03-19 22:48:51,210 >> {'eval_loss': 0.2956276535987854, 'eval_acc': 0.8857421875, 'epoch': 2.0, 'step': 952}
[INFO|trainer_tf.py:404] 2021-03-19 22:48:51,220 >> {'loss': 0.33803, 'learning







[INFO|trainer_tf.py:404] 2021-03-19 22:53:03,869 >> {'eval_loss': 0.21554146707057953, 'eval_acc': 0.923828125, 'epoch': 3.0, 'step': 1428}
03/19/2021 22:53:03 - INFO - __main__ -   ***** Eval results *****
03/19/2021 22:53:03 - INFO - __main__ -     eval_loss = 0.21554146707057953
03/19/2021 22:53:03 - INFO - __main__ -     eval_acc = 0.923828125
03/19/2021 22:53:03 - INFO - __main__ -   *** predictions ***
[INFO|trainer_tf.py:306] 2021-03-19 22:53:03,901 >> ***** Running Prediction *****
[INFO|trainer_tf.py:307] 2021-03-19 22:53:03,903 >>   Num examples in dataset = 3263
[INFO|trainer_tf.py:310] 2021-03-19 22:53:03,906 >>   Batch size = 32


a0924






03/19/2021 22:53:30 - INFO - __main__ -   *** RESUTS: ***
03/19/2021 22:53:30 - INFO - __main__ -   PredictionOutput(predictions=array([[-2.242197 ,  2.0956461],
       [-1.7960083,  1.6218038],
       [-2.0303984,  1.8653785],
       ...,
       [-2.7090876,  2.6523266],
       [-0.9058579,  1.0671012],
       [-1.9163721,  1.7595011]], dtype=float32), label_ids=array([0, 0, 0, ..., 0, 0, 0]), metrics={'eval_loss': 1.6623221004710478, 'eval_acc': 0.5758504443763408})
03/19/2021 22:53:30 - INFO - __main__ -   *** :RESUTS ***
03/19/2021 22:53:30 - INFO - __main__ -   removing checkpoints...
03/19/2021 22:53:31 - INFO - __main__ -   removing file tf_model.h5


*******************
*** Processing: ***
*******************
MODEL NAME: bert-base-uncased
RESULTS NAME: bert-base-uncased_3e_v1024o_210319_2253
TIMESTAMP: 210319_2253


[INFO|training_args.py:631] 2021-03-19 22:53:32,304 >> PyTorch: setting up devices
[INFO|training_args.py:555] 2021-03-19 22:53:32,309 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
[INFO|training_args_tf.py:192] 2021-03-19 22:53:32,316 >> Tensorflow: setting up strategy
03/19/2021 22:53:32 - INFO - __main__ -   n_replicas: 1, distributed training: False, 16-bits training: False
03/19/2021 22:53:32 - INFO - __main__ -   Training/evaluation parameters TFTrainingArguments(output_dir='/content/gdrive/MyDrive/Colab Notebooks/data/disaster_tweets/mod_bert-base-uncased', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=True, evaluation_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=16, per_device_eval_

Mounted at /content/gdrive


[INFO|configuration_utils.py:463] 2021-03-19 22:53:32,543 >> loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.637c6035640bacb831febcc2b7f7bee0a96f9b30c2d7e9ef84082d9f252f3170
[INFO|configuration_utils.py:499] 2021-03-19 22:53:32,545 >> Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.4.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

[INFO|tokeniz

Features Name: ['target', 'text']


[INFO|configuration_utils.py:463] 2021-03-19 22:53:34,411 >> loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.637c6035640bacb831febcc2b7f7bee0a96f9b30c2d7e9ef84082d9f252f3170
[INFO|configuration_utils.py:499] 2021-03-19 22:53:34,413 >> Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": "text-classification",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": 0,
    "1": 1
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "0": 0,
    "1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type"

train: 7613
valid: 1024
test: 3263
Model name: bert-base-uncased
Results name: bert-base-uncased_3e_v1024o_210319_2253














[INFO|trainer_tf.py:306] 2021-03-19 22:57:10,484 >> ***** Running Evaluation *****
[INFO|trainer_tf.py:307] 2021-03-19 22:57:10,490 >>   Num examples in dataset = 1024
[INFO|trainer_tf.py:309] 2021-03-19 22:57:10,491 >>   Num examples in used in evaluation = 1024
[INFO|trainer_tf.py:310] 2021-03-19 22:57:10,492 >>   Batch size = 32








[INFO|trainer_tf.py:404] 2021-03-19 22:57:20,746 >> {'eval_loss': 0.3039873540401459, 'eval_acc': 0.8857421875, 'epoch': 1.0, 'step': 476}
[INFO|trainer_tf.py:404] 2021-03-19 22:57:20,754 >> {'loss': 0.41882545, 'learning_rate': 3.333333e-05, 'epoch': 1.0, 'step': 476}
[INFO|trainer_tf.py:595] 2021-03-19 22:57:26,513 >> Saving checkpoint for step 476 at /content/gdrive/MyDrive/Colab Notebooks/data/disaster_tweets/mod_bert-base-uncased/checkpoint/ckpt-1
[INFO|trainer_tf.py:306] 2021-03-19 23:00:39,621 >> ***** Running Evaluation *****
[INFO|trainer_tf.py:307] 2021-03-19 23:00:39,622 >>   Num examples in dataset = 1024
[INFO|trainer_tf.py:309] 2021-03-19 23:00:39,623 >>   Num examples in used in evaluation = 1024
[INFO|trainer_tf.py:310] 2021-03-19 23:00:39,624 >>   Batch size = 32
[INFO|trainer_tf.py:404] 2021-03-19 23:00:48,536 >> {'eval_loss': 0.19302938878536224, 'eval_acc': 0.9326171875, 'epoch': 2.0, 'step': 952}
[INFO|trainer_tf.py:404] 2021-03-19 23:00:48,545 >> {'loss': 0.281751







[INFO|trainer_tf.py:404] 2021-03-19 23:04:46,630 >> {'eval_loss': 0.11916890740394592, 'eval_acc': 0.9609375, 'epoch': 3.0, 'step': 1428}
03/19/2021 23:04:46 - INFO - __main__ -   ***** Eval results *****
03/19/2021 23:04:46 - INFO - __main__ -     eval_loss = 0.11916890740394592
03/19/2021 23:04:46 - INFO - __main__ -     eval_acc = 0.9609375
03/19/2021 23:04:46 - INFO - __main__ -   *** predictions ***
[INFO|trainer_tf.py:306] 2021-03-19 23:04:46,656 >> ***** Running Prediction *****
[INFO|trainer_tf.py:307] 2021-03-19 23:04:46,659 >>   Num examples in dataset = 3263
[INFO|trainer_tf.py:310] 2021-03-19 23:04:46,660 >>   Batch size = 32


a0961






03/19/2021 23:05:15 - INFO - __main__ -   *** RESUTS: ***
03/19/2021 23:05:15 - INFO - __main__ -   PredictionOutput(predictions=array([[-1.791618 ,  2.1779225],
       [-1.9666513,  2.3399668],
       [-1.6043497,  2.0729783],
       ...,
       [-2.4841366,  2.6329815],
       [-2.0199013,  2.3375275],
       [-2.1570654,  2.3207397]], dtype=float32), label_ids=array([0, 0, 0, ..., 0, 0, 0]), metrics={'eval_loss': 1.7081491806927849, 'eval_acc': 0.5887220349371743})
03/19/2021 23:05:15 - INFO - __main__ -   *** :RESUTS ***
03/19/2021 23:05:16 - INFO - __main__ -   removing checkpoints...
03/19/2021 23:05:16 - INFO - __main__ -   removing file tf_model.h5


*******************
*** Processing: ***
*******************
MODEL NAME: bert-base-multilingual-uncased
RESULTS NAME: bert-base-multilingual-uncased_3e_v1024o_210319_2305
TIMESTAMP: 210319_2305


[INFO|training_args.py:631] 2021-03-19 23:05:17,664 >> PyTorch: setting up devices
[INFO|training_args.py:555] 2021-03-19 23:05:17,674 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
[INFO|training_args_tf.py:192] 2021-03-19 23:05:17,683 >> Tensorflow: setting up strategy
03/19/2021 23:05:17 - INFO - __main__ -   n_replicas: 1, distributed training: False, 16-bits training: False
03/19/2021 23:05:17 - INFO - __main__ -   Training/evaluation parameters TFTrainingArguments(output_dir='/content/gdrive/MyDrive/Colab Notebooks/data/disaster_tweets/mod_bert-base-multilingual-uncased', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=True, evaluation_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=16, per

Mounted at /content/gdrive


[INFO|configuration_utils.py:463] 2021-03-19 23:05:17,922 >> loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
[INFO|configuration_utils.py:499] 2021-03-19 23:05:17,923 >> Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 12

Features Name: ['target', 'text']


[INFO|configuration_utils.py:463] 2021-03-19 23:05:19,933 >> loading configuration file https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/af4e101d208f361f141144dca21e9c4148aaf0e85441c2e335743d10829c6cad.d63adade93e44e64bedd306ec82ffd33eedabaf0ff08aabe581acaa48616a508
[INFO|configuration_utils.py:499] 2021-03-19 23:05:19,934 >> Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": "text-classification",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": 0,
    "1": 1
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "0": 0,
    "1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_

train: 7613
valid: 1024
test: 3263
Model name: bert-base-multilingual-uncased
Results name: bert-base-multilingual-uncased_3e_v1024o_210319_2305














[INFO|trainer_tf.py:306] 2021-03-19 23:09:15,199 >> ***** Running Evaluation *****
[INFO|trainer_tf.py:307] 2021-03-19 23:09:15,200 >>   Num examples in dataset = 1024
[INFO|trainer_tf.py:309] 2021-03-19 23:09:15,202 >>   Num examples in used in evaluation = 1024
[INFO|trainer_tf.py:310] 2021-03-19 23:09:15,203 >>   Batch size = 32








[INFO|trainer_tf.py:404] 2021-03-19 23:09:25,532 >> {'eval_loss': 0.3208419680595398, 'eval_acc': 0.8642578125, 'epoch': 1.0, 'step': 476}
[INFO|trainer_tf.py:404] 2021-03-19 23:09:25,539 >> {'loss': 0.4490564, 'learning_rate': 3.333333e-05, 'epoch': 1.0, 'step': 476}
[INFO|trainer_tf.py:595] 2021-03-19 23:09:46,095 >> Saving checkpoint for step 476 at /content/gdrive/MyDrive/Colab Notebooks/data/disaster_tweets/mod_bert-base-multilingual-uncased/checkpoint/ckpt-1
[INFO|trainer_tf.py:306] 2021-03-19 23:13:12,564 >> ***** Running Evaluation *****
[INFO|trainer_tf.py:307] 2021-03-19 23:13:12,566 >>   Num examples in dataset = 1024
[INFO|trainer_tf.py:309] 2021-03-19 23:13:12,567 >>   Num examples in used in evaluation = 1024
[INFO|trainer_tf.py:310] 2021-03-19 23:13:12,568 >>   Batch size = 32
[INFO|trainer_tf.py:404] 2021-03-19 23:13:21,483 >> {'eval_loss': 0.22632597386837006, 'eval_acc': 0.9169921875, 'epoch': 2.0, 'step': 952}
[INFO|trainer_tf.py:404] 2021-03-19 23:13:21,492 >> {'los







[INFO|trainer_tf.py:404] 2021-03-19 23:18:00,548 >> {'eval_loss': 0.1612837016582489, 'eval_acc': 0.9453125, 'epoch': 3.0, 'step': 1428}
03/19/2021 23:18:00 - INFO - __main__ -   ***** Eval results *****
03/19/2021 23:18:00 - INFO - __main__ -     eval_loss = 0.1612837016582489
03/19/2021 23:18:00 - INFO - __main__ -     eval_acc = 0.9453125
03/19/2021 23:18:00 - INFO - __main__ -   *** predictions ***
[INFO|trainer_tf.py:306] 2021-03-19 23:18:00,573 >> ***** Running Prediction *****
[INFO|trainer_tf.py:307] 2021-03-19 23:18:00,576 >>   Num examples in dataset = 3263
[INFO|trainer_tf.py:310] 2021-03-19 23:18:00,579 >>   Batch size = 32


a0945






03/19/2021 23:18:29 - INFO - __main__ -   *** RESUTS: ***
03/19/2021 23:18:29 - INFO - __main__ -   PredictionOutput(predictions=array([[-1.3270354,  1.4175619],
       [-2.5180933,  2.3808997],
       [-2.4241726,  2.336313 ],
       ...,
       [-2.8179977,  2.4849398],
       [-0.9030537,  1.0315692],
       [-2.525087 ,  2.4081502]], dtype=float32), label_ids=array([0, 0, 0, ..., 0, 0, 0]), metrics={'eval_loss': 1.6473717783011643, 'eval_acc': 0.5985289610787619})
03/19/2021 23:18:29 - INFO - __main__ -   *** :RESUTS ***
03/19/2021 23:18:29 - INFO - __main__ -   removing checkpoints...
03/19/2021 23:18:29 - INFO - __main__ -   removing file tf_model.h5
