# Notebook to train segmenter model

This notebook uses the previously created tokenizer and labeled data to effectively train Transformer Encoder models to segment Brazilian legal text data.

In [7]:
import os

import transformers
import torch
import torch.nn
import datasets

try:
    from segmentador import *

except ImportError:
    from src import *


import eval_model


TRAIN_MODEL = True
PREDICT_TEST_SET_AT_END = True
DEBUG_RUN = False
RESOURCE_DIR = ".."


USE_FP16 = False
DEVICE = "cuda"
LOCAL_FILES_ONLY = True
LOAD_PRETRAINED_WEIGHTS = True
NUM_TRAIN_EPOCHS = 4
NUM_HIDDEN_LAYERS = 2
GRAD_ACCUMULATION_STEPS = 16
VOCAB_SIZE = 6000


TRAINED_MODEL_SAVE_PATH = os.path.join(
    RESOURCE_DIR,
    "segmenter_model_v2",
    f"{NUM_HIDDEN_LAYERS}_{VOCAB_SIZE}_layer_model",
)

TRAINER_STATE_SAVE_PATH = os.path.join(
    RESOURCE_DIR,
    "saved_trainer_states",
    f"{NUM_HIDDEN_LAYERS}_{VOCAB_SIZE}_layer_model",
)

# Setup model weights and labeled data

In [8]:
MODEL_URIS = {
    "trained_model": (TRAINED_MODEL_SAVE_PATH, True),
    "pretrained_bertimbau_hugginface_hub": ("neuralmind/bert-base-portuguese-cased", False),
    "pretrained_bertimbau_local": ("../base_models/bert-base-portuguese-cased", False),
}


if LOCAL_FILES_ONLY:
    MODEL_URIS.pop("pretrained_bertimbau_hugginface_hub")


for uri_tag, (uri_model, init_from_pretrained) in MODEL_URIS.items():
    try:
        seg_model = Segmenter(
            local_files_only=LOCAL_FILES_ONLY,
            device=DEVICE,
            uri_model=uri_model,
            init_from_pretrained_weights=init_from_pretrained,
            uri_tokenizer=os.path.join(RESOURCE_DIR, "tokenizers", f"{VOCAB_SIZE}_subwords"),
            num_hidden_layers=NUM_HIDDEN_LAYERS,
        )
        print(f"Loaded model from ({uri_tag}, {uri_model}).")
        break

    except OSError as e:
        print(e)
        continue


seg_model.model

401 Client Error: Unauthorized for url: https://huggingface.co/segmenter_model_v2/2_6000_layer_model/resolve/main/config.json


We couldn't connect to 'https://huggingface.co/' to load this model and it looks like ../segmenter_model_v2/2_6000_layer_model is not the path to a directory conaining a config.json file.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.


401 Client Error: Unauthorized for url: https://huggingface.co/base_models/bert-base-portuguese-cased/resolve/main/config.json


We couldn't connect to 'https://huggingface.co/' to load this model and it looks like ../base_models/bert-base-portuguese-cased is not the path to a directory conaining a config.json file.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(6000, 768, padding_idx=0)
      (position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [None]:
seg_model.model.config

In [4]:
# df_tokenized_split = datasets.load_from_disk(
#     os.path.join(RESOURCE_DIR, f"data/df_tokenized_split_0_120000_{VOCAB_SIZE}"),
# )
df_tokenized_split = datasets.load_from_disk(
    os.path.join("final_curated_dataset_for_hyperparameter_tuning"),
)
df_tokenized_split

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 151302
    })
    eval: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2015
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2299
    })
})

In [None]:
save_steps = int(df_tokenized_split["train"].num_rows / GRAD_ACCUMULATION_STEPS * 0.10)


training_args = transformers.TrainingArguments(
    output_dir=os.path.join(
        RESOURCE_DIR, "segmenter_checkpoint", f"{NUM_HIDDEN_LAYERS}_{VOCAB_SIZE}_layer_model"
    ),
    logging_dir=os.path.join(
        RESOURCE_DIR, "loggings", f"{NUM_HIDDEN_LAYERS}_{VOCAB_SIZE}_layer_model"
    ),
    fp16=USE_FP16 and torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    evaluation_strategy="steps",
    logging_strategy="steps",
    save_strategy="steps",
    save_steps=save_steps,
    eval_steps=save_steps,
    logging_steps=save_steps,
    save_total_limit=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
    learning_rate=5e-4,
    max_grad_norm=1.0,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=0.0,
    report_to="all",
)

data_collator = transformers.DataCollatorForTokenClassification(
    seg_model.tokenizer,
    pad_to_multiple_of=8 if USE_FP16 else 1,
)

if DEBUG_RUN:
    df_tokenized_split["train"] = df_tokenized_split["train"].shard(num_shards=500, index=0)
    df_tokenized_split["eval"] = df_tokenized_split["eval"].shard(num_shards=500, index=0)
    df_tokenized_split["test"] = df_tokenized_split["test"].shard(num_shards=500, index=0)
    print(df_tokenized_split)


trainer = transformers.Trainer(
    model=seg_model.model,
    tokenizer=seg_model.tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=df_tokenized_split["train"],
    eval_dataset=df_tokenized_split["eval"],
    compute_metrics=eval_model.compute_metrics,
)

# Train model

In [None]:
if TRAIN_MODEL:
    try:
        train_results = trainer.train(resume_from_checkpoint=TRAINER_STATE_SAVE_PATH)

    except ValueError:
        train_results = trainer.train()

    train_metrics = train_results.metrics
    trainer.log_metrics(split="all", metrics=train_metrics)
    trainer.save_metrics(split="all", metrics=train_metrics)

    trainer.save_model(TRAINED_MODEL_SAVE_PATH)
    trainer.save_state()

# Evaluate trained model

In [None]:
if PREDICT_TEST_SET_AT_END:
    y_preds_test = trainer.predict(df_tokenized_split["test"])

    test_metrics = y_preds_test.metrics
    trainer.log_metrics(split="test", metrics=test_metrics)
    trainer.save_metrics(split="test", metrics=test_metrics)