In [8]:
import transformers
import torch
import datasets
import numpy as np

import segmentador

In [2]:
seg_model = segmentador.Segmenter(local_files_only=True)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model check

In [3]:
df = datasets.load_dataset(
    "csv",
    data_files=["../data/content.txt"],
    header=None,
    names=["text"],
    cache_dir="../cache",
    nrows=1000,
)
df = df.filter(lambda item: isinstance(item["text"], str) and len(item["text"]) >= 128)

def preprocess_instance(item, ind):
    preprocessed_text = seg_model.preprocess_legal_text(item["text"])
    tokens = preprocessed_text.split(" ")
    labels = [0] * len(tokens)

    ret = {
        "id": str(ind),
        "labels": labels,
        "tokens": tokens,
    }
    
    return ret

df = df.map(preprocess_instance, with_indices=True)

Using custom data configuration default-be64f6fa7b07dcb7
Reusing dataset csv (../cache/csv/default-be64f6fa7b07dcb7/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at ../cache/csv/default-be64f6fa7b07dcb7/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-6db184cfe38bd6f3.arrow


0ex [00:00, ?ex/s]

In [4]:
def tokenize_and_align_labels(examples):
    # source: https://huggingface.co/docs/transformers/custom_datasets#preprocess
    tokenized_inputs = seg_model.tokenizer(
        examples["tokens"],
        truncation=True,
        max_length=512,
        is_split_into_words=True,
    )

    labels = []
    
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    
    return tokenized_inputs


df_tokenized = df.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [5]:
df_tokenized_train_eval_test = df_tokenized["train"].train_test_split(test_size=0.2, shuffle=True, seed=16)
df_tokenized_test_eval = df_tokenized_train_eval_test["test"].train_test_split(test_size=0.5, shuffle=False)
df_tokenized_split = datasets.DatasetDict({
    "train": df_tokenized_train_eval_test["train"],
    "eval": df_tokenized_test_eval["train"],
    "test": df_tokenized_test_eval["test"],
})
df_tokenized_split

DatasetDict({
    train: Dataset({
        features: ['text', 'id', 'labels', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 800
    })
    eval: Dataset({
        features: ['text', 'id', 'labels', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100
    })
    test: Dataset({
        features: ['text', 'id', 'labels', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100
    })
})

In [6]:
metric = datasets.load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

save_steps = int(df_tokenized_split["train"].num_rows * 0.10)

training_args = transformers.TrainingArguments(
    output_dir="../segmenter_checkpoint",
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    evaluation_strategy="steps",
    logging_strategy="steps",
    save_strategy="steps",
    save_steps=save_steps,
    eval_steps=save_steps,
    logging_steps=save_steps,
    save_total_limit=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,
    learning_rate=5e-5,
    max_grad_norm=1.0,
    num_train_epochs=5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=5e-3,
)

data_collator = transformers.DataCollatorForTokenClassification(seg_model.tokenizer)

trainer = transformers.Trainer(
    model=seg_model.model,
    tokenizer=seg_model.tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=df_tokenized_split["train"],
    eval_dataset=df_tokenized_split["eval"],
    compute_metrics=compute_metrics,
)

Using amp half precision backend


In [9]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, id, text.
***** Running training *****
  Num examples = 800
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 60


RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 3.82 GiB total capacity; 2.52 GiB already allocated; 86.25 MiB free; 2.56 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF