In [1]:
!lscpu 

Architecture:            x86_64
  CPU op-mode(s):        32-bit, 64-bit
  Address sizes:         39 bits physical, 48 bits virtual
  Byte Order:            Little Endian
CPU(s):                  24
  On-line CPU(s) list:   0-23
Vendor ID:               GenuineIntel
  Model name:            13th Gen Intel(R) Core(TM) i7-13700KF
    CPU family:          6
    Model:               183
    Thread(s) per core:  2
    Core(s) per socket:  12
    Socket(s):           1
    Stepping:            1
    BogoMIPS:            6835.19
    Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mc
                         a cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscal
                         l nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopo
                         logy tsc_reliable nonstop_tsc cpuid pni pclmulqdq ssse3
                          fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c
                          rdrand hypervisor lahf_lm abm 3dnowprefetc

In [2]:
import torch

print(torch.cuda.is_available())

for i in range(torch.cuda.device_count()):
   print(torch.cuda.get_device_properties(i))

True
_CudaDeviceProperties(name='NVIDIA GeForce GTX 1070', major=6, minor=1, total_memory=8191MB, multi_processor_count=15, uuid=ce792cd3-464b-1176-c4ca-2ecf6eef4afa, L2_cache_size=2MB)


In [3]:
from multiprocessing import cpu_count
import os, transformers 
import tqdm as notebook_tqdm

# set cache directory out of $HOME to $WORK
default_cache_dir = ".cache/"
os.environ["HF_HOME"] = default_cache_dir

print(transformers.__version__)

4.53.3


## Dataset

In [4]:
from datasets import load_dataset

wikipedia = load_dataset("wikimedia/wikipedia", 
                         "20231101.pt", 
                         split="train[:1_000]", 
                         num_proc=cpu_count(),
                         cache_dir=default_cache_dir
                         )

wikipedia = wikipedia.remove_columns([col for col in wikipedia.column_names if col != "text"])  # only keep the 'text' column

print(wikipedia, '\n')

print(wikipedia[0]["text"][:1000])

Dataset({
    features: ['text'],
    num_rows: 1000
}) 

Astronomia é uma ciência natural que estuda corpos celestes (como estrelas, planetas, cometas, nebulosas, aglomerados de estrelas, galáxias) e fenômenos que se originam fora da atmosfera da Terra (como a radiação cósmica de fundo em micro-ondas). Preocupada com a evolução, a física e a química de objetos celestes, bem como a formação e o desenvolvimento do universo.

A astronomia é uma das mais antigas ciências. Culturas pré-históricas deixaram registrados vários artefatos astronômicos, como Stonehenge, os montes de Newgrange e os menires. As primeiras civilizações, como os babilônios, gregos, chineses, indianos, persas e maias realizaram observações metódicas do céu noturno. No entanto, a invenção do telescópio permitiu o desenvolvimento da astronomia moderna. Historicamente, a astronomia incluiu disciplinas tão diversas como astrometria, navegação astronômica, astronomia observacional e a elaboração de calendários. Durante o p

In [5]:
from datasets import concatenate_datasets

# raw_datasets = concatenate_datasets([wikipedia, brwac])
raw_datasets = concatenate_datasets([wikipedia])

raw_datasets

Dataset({
    features: ['text'],
    num_rows: 1000
})

In [6]:
import nltk

stopwords = nltk.corpus.stopwords.words('portuguese')

stopwords

['a',
 'à',
 'ao',
 'aos',
 'aquela',
 'aquelas',
 'aquele',
 'aqueles',
 'aquilo',
 'as',
 'às',
 'até',
 'com',
 'como',
 'da',
 'das',
 'de',
 'dela',
 'delas',
 'dele',
 'deles',
 'depois',
 'do',
 'dos',
 'e',
 'é',
 'ela',
 'elas',
 'ele',
 'eles',
 'em',
 'entre',
 'era',
 'eram',
 'éramos',
 'essa',
 'essas',
 'esse',
 'esses',
 'esta',
 'está',
 'estamos',
 'estão',
 'estar',
 'estas',
 'estava',
 'estavam',
 'estávamos',
 'este',
 'esteja',
 'estejam',
 'estejamos',
 'estes',
 'esteve',
 'estive',
 'estivemos',
 'estiver',
 'estivera',
 'estiveram',
 'estivéramos',
 'estiverem',
 'estivermos',
 'estivesse',
 'estivessem',
 'estivéssemos',
 'estou',
 'eu',
 'foi',
 'fomos',
 'for',
 'fora',
 'foram',
 'fôramos',
 'forem',
 'formos',
 'fosse',
 'fossem',
 'fôssemos',
 'fui',
 'há',
 'haja',
 'hajam',
 'hajamos',
 'hão',
 'havemos',
 'haver',
 'hei',
 'houve',
 'houvemos',
 'houver',
 'houvera',
 'houverá',
 'houveram',
 'houvéramos',
 'houverão',
 'houverei',
 'houverem',
 'hou

In [7]:
def analize_texts(row):

    list_para = []
    list_words = []
    list_stopwords = []
    list_average = []

    for doc in row["text"]:
        for paragraph in doc.split('\n'):

            # strip whitespaces
            paragraph = paragraph.strip()

            # skip single or empty worded paragraphs
            if (len(paragraph.split()) < 2):
                continue

            # count how many stopwords are in the paragraph
            stopwords_cnt = 0    
            for word in paragraph.split():
                for stop in stopwords:
                    if stop.casefold() == word.casefold():  # insensitive case
                        stopwords_cnt += 1
                        break # count once and speed up everything
            
            # count non whitespace characters
            characters = 0
            for word in paragraph.split():
                characters += len(word)

            list_para.append(paragraph)
            list_words.append(len(paragraph.split()))
            list_stopwords.append(stopwords_cnt)
            list_average.append(characters/len(paragraph.split()))

    return {"paragraphs" : list_para, "num_words" : list_words, "stopwords" : list_stopwords, "average" : list_average}

preprocessed_datasets = raw_datasets.map(analize_texts,
                                         batched = True,
                                         remove_columns=["text"],
                                         num_proc = cpu_count(),
                                        )

preprocessed_datasets = preprocessed_datasets.rename_column("paragraphs", "text")

preprocessed_datasets

Dataset({
    features: ['text', 'num_words', 'stopwords', 'average'],
    num_rows: 74024
})

In [8]:
filtered_datasets = preprocessed_datasets.filter(
    lambda example: 
            example["num_words"] >= 10 
        and example["num_words"] <= 512
        and example["stopwords"] >= 1
        and example["average"] >= 2 
        and example["average"] <= 15
    )

filtered_datasets

Dataset({
    features: ['text', 'num_words', 'stopwords', 'average'],
    num_rows: 42945
})

In [9]:
total_words_corpus = sum( filtered_datasets["num_words"] )

print(f"{total_words_corpus:_}")

3_031_590


In [10]:
split_dataset = filtered_datasets.train_test_split(test_size=0.05, shuffle=True, seed=42)

split_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'num_words', 'stopwords', 'average'],
        num_rows: 40797
    })
    test: Dataset({
        features: ['text', 'num_words', 'stopwords', 'average'],
        num_rows: 2148
    })
})

## Tokenizer 

In [11]:
from transformers import AutoTokenizer

modern_tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

modern_tokenizer

PreTrainedTokenizerFast(name_or_path='answerdotai/ModernBERT-base', vocab_size=50280, model_max_length=8192, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("|||IP_ADDRESS|||", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	1: AddedToken("<|padding|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50254: AddedToken("                        ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50255: AddedToken("                       ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50256: AddedToken("                      ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50257: AddedToken("                    

In [12]:
def batch_iterator(dataset, batch_size=1000):
    for i in range(0, len(dataset), batch_size):
        yield dataset[i: i + batch_size]["text"]

tokenizer = modern_tokenizer.train_new_from_iterator(
    text_iterator = batch_iterator(split_dataset["train"]),
    vocab_size = modern_tokenizer.vocab_size
    )






In [13]:
tokenizer

PreTrainedTokenizerFast(name_or_path='answerdotai/ModernBERT-base', vocab_size=50280, model_max_length=8192, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("<|padding|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	6: A

In [14]:
def test_tokenizer(sample):
    encoding = tokenizer.encode(sample)
    print(encoding)
    print(tokenizer.convert_ids_to_tokens(encoding))
    print()

test_tokenizer('''
Atiraram o pau no gato, mas o gato não morreu
''')

test_tokenizer('''
Não sei, só sei que foi assim...
''')

test_tokenizer('''
Testando o modo continuo, e tambem o modo subjuntivo ( soubesse )
''')

test_tokenizer('''
justo, justa, justiça, injusto, injustamente, justamente, junto
''')

test_tokenizer('''
    testando acentos, será que manter os acentos melhora a acurácia do meu modelo? 
''')

test_tokenizer('''
    amigo: amiguinho, amiga, amiguinha, amigão, amigaço, amigalhaço
''')

test_tokenizer('''
    meu endereço de ip é 10.10.10.10/24 para a minha subnet
''')

[3, 205, 6370, 919, 413, 280, 11268, 343, 10329, 18, 641, 280, 10329, 477, 4441, 205, 4]
['[CLS]', 'Ċ', 'At', 'ira', 'ram', 'Ġo', 'Ġpau', 'Ġno', 'Ġgato', ',', 'Ġmas', 'Ġo', 'Ġgato', 'ĠnÃ£o', 'Ġmorreu', 'Ċ', '[SEP]']

[3, 205, 7533, 46039, 18, 1608, 46039, 313, 442, 1191, 5307, 205, 4]
['[CLS]', 'Ċ', 'NÃ£o', 'Ġsei', ',', 'ĠsÃ³', 'Ġsei', 'Ġque', 'Ġfoi', 'Ġassim', '...', 'Ċ', '[SEP]']

[3, 205, 58, 1234, 346, 280, 2135, 1471, 85, 18, 264, 19646, 303, 280, 2135, 20666, 271, 681, 351, 30533, 1167, 6542, 205, 4]
['[CLS]', 'Ċ', 'T', 'esta', 'ndo', 'Ġo', 'Ġmodo', 'Ġcontinu', 'o', ',', 'Ġe', 'Ġtamb', 'em', 'Ġo', 'Ġmodo', 'Ġsubju', 'nt', 'ivo', 'Ġ(', 'Ġsoub', 'esse', 'Ġ)', 'Ċ', '[SEP]']

[3, 205, 80, 8577, 18, 20623, 18, 7202, 18, 12436, 8577, 18, 12436, 364, 1156, 18, 15026, 18, 3270, 205, 4]
['[CLS]', 'Ċ', 'j', 'usto', ',', 'Ġjusta', ',', 'ĠjustiÃ§a', ',', 'Ġinj', 'usto', ',', 'Ġinj', 'us', 'tamente', ',', 'Ġjustamente', ',', 'Ġjunto', 'Ċ', '[SEP]']

[3, 205, 27769, 14858, 346, 542, 1853, 18, 

In [15]:
def normalize_and_pre_tokenize(text):
    normalized = tokenizer.backend_tokenizer.normalizer.normalize_str(text)
    processed = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(normalized)
    return processed

normalize_and_pre_tokenize( split_dataset["train"][0]["text"] )

[('A', (0, 1)),
 ('ĠpopulaÃ§Ã£o', (1, 11)),
 ('Ġtotal', (11, 17)),
 ('Ġda', (17, 20)),
 ('ĠAmÃ©rica', (20, 28)),
 ('Ġera', (28, 32)),
 ('Ġde', (32, 35)),
 ('Ġ', (35, 36)),
 ('Ġhabitantes', (36, 47)),
 ('Ġsegundo', (47, 55)),
 ('Ġestimativas', (55, 67)),
 ('Ġde', (67, 70)),
 ('Ġ2008', (70, 75)),
 ('.', (75, 76)),
 ('ĠA', (76, 78)),
 ('ĠpopulaÃ§Ã£o', (78, 88)),
 ('Ġda', (88, 91)),
 ('ĠAmÃ©rica', (91, 99)),
 ('Ġcompreende', (99, 110)),
 ('Ġdescendentes', (110, 123)),
 ('Ġde', (123, 126)),
 ('Ġgrandes', (126, 134)),
 ('Ġgrupos', (134, 141)),
 ('ĠÃ©tnicos', (141, 149)),
 (',', (149, 150)),
 ('Ġcomo', (150, 155)),
 ('Ġos', (155, 158)),
 ('ĠindÃŃgenas', (158, 168)),
 ('Ġ(', (168, 170)),
 ('inclusive', (170, 179)),
 ('ĠinuÃŃtes', (179, 187)),
 ('Ġe', (187, 189)),
 ('ĠaleÃºtas', (189, 197)),
 ('),', (197, 199)),
 ('Ġos', (199, 202)),
 ('Ġeuropeus', (202, 211)),
 ('Ġ(', (211, 213)),
 ('principalmente', (213, 227)),
 ('ĠespanhÃ³is', (227, 237)),
 (',', (237, 238)),
 ('Ġingleses', (238, 247)),
 ('

In [16]:
def group_texts(examples):
    tokenized_inputs = tokenizer(
        examples["text"], 
        truncation=True,
        padding="max_length",
        return_special_tokens_mask=True,
    )
    return tokenized_inputs

# preprocess dataset

tokenized_datasets = split_dataset.map(group_texts, 
                                      batched=True,
                                      remove_columns=["text"], 
                                      # num_proc=cpu_count()
                                      )

tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['num_words', 'stopwords', 'average', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 40797
    })
    test: Dataset({
        features: ['num_words', 'stopwords', 'average', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 2148
    })
})

In [17]:
training_dataset = tokenized_datasets["train"]
evaluation_dataset = tokenized_datasets["test"]

In [18]:
from transformers import ModernBertConfig

config = ModernBertConfig.from_pretrained("answerdotai/ModernBERT-base", reference_compile=False)

config

ModernBertConfig {
  "architectures": [
    "ModernBertForMaskedLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 50281,
  "classifier_activation": "gelu",
  "classifier_bias": false,
  "classifier_dropout": 0.0,
  "classifier_pooling": "mean",
  "cls_token_id": 50281,
  "decoder_bias": true,
  "deterministic_flash_attn": false,
  "embedding_dropout": 0.0,
  "eos_token_id": 50282,
  "global_attn_every_n_layers": 3,
  "global_rope_theta": 160000.0,
  "gradient_checkpointing": false,
  "hidden_activation": "gelu",
  "hidden_size": 768,
  "initializer_cutoff_factor": 2.0,
  "initializer_range": 0.02,
  "intermediate_size": 1152,
  "layer_norm_eps": 1e-05,
  "local_attention": 128,
  "local_rope_theta": 10000.0,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "mlp_dropout": 0.0,
  "model_type": "modernbert",
  "norm_bias": false,
  "norm_eps": 1e-05,
  "num_attention_heads": 12,
  "num_hidden_layers": 22,
  "pad_token_id": 50283,
  "position_embedd

In [19]:
config.hidden_size = 128
config.intermediate_size = 256
config.num_attention_heads = 4
config.num_hidden_layers = 8

config

ModernBertConfig {
  "architectures": [
    "ModernBertForMaskedLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 50281,
  "classifier_activation": "gelu",
  "classifier_bias": false,
  "classifier_dropout": 0.0,
  "classifier_pooling": "mean",
  "cls_token_id": 50281,
  "decoder_bias": true,
  "deterministic_flash_attn": false,
  "embedding_dropout": 0.0,
  "eos_token_id": 50282,
  "global_attn_every_n_layers": 3,
  "global_rope_theta": 160000.0,
  "gradient_checkpointing": false,
  "hidden_activation": "gelu",
  "hidden_size": 128,
  "initializer_cutoff_factor": 2.0,
  "initializer_range": 0.02,
  "intermediate_size": 256,
  "layer_norm_eps": 1e-05,
  "local_attention": 128,
  "local_rope_theta": 10000.0,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "mlp_dropout": 0.0,
  "model_type": "modernbert",
  "norm_bias": false,
  "norm_eps": 1e-05,
  "num_attention_heads": 4,
  "num_hidden_layers": 8,
  "pad_token_id": 50283,
  "position_embedding

In [20]:
from transformers import ModernBertForMaskedLM

model = ModernBertForMaskedLM(config=config)

print("parameters: ", model.num_parameters())

model

parameters:  7826880


ModernBertForMaskedLM(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(50368, 128, padding_idx=50283)
      (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=128, out_features=384, bias=False)
          (rotary_emb): ModernBertRotaryEmbedding()
          (Wo): Linear(in_features=128, out_features=128, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=128, out_features=512, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=256, out_features=128, bias=False)
        )
      )
      (1-7): 7 

In [21]:
from transformers import DataCollatorForLanguageModeling

# mask 30% of the tokens
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm = True,
    mlm_probability=0.3
) 

In [22]:
model_name = f"ModBertBr/{10.0}"

In [23]:
from transformers import Trainer, TrainingArguments, get_wsd_schedule
from torch.optim import AdamW

total_steps = 100

training_args = TrainingArguments(
    output_dir=f'training/{model_name}',
    overwrite_output_dir=True,
    
    # num_train_epochs=1,                     # number of training epochs
    max_steps=total_steps,
    # max_steps=100,

    gradient_accumulation_steps = 1,
    # eval_accumulation_steps = 1,

    per_device_train_batch_size=4,          # batch size for training
    # per_device_eval_batch_size=32,           # batch size for evaluation

    
    logging_strategy="steps",
    logging_first_step=True, # output the initial loss
    logging_steps=1_000,
    # logging_dir=f"training-logs/{model_name}",
    # report_to=["tensorboard"],

    save_strategy="steps",
    save_steps=1_000,                      # Save checkpoints every 100 steps
    save_total_limit=5,                  # Limit the total number of saved checkpoints

    fp16=True,                            # Enable mixed precision for faster training
)

# Create default optimizer
optimizer = AdamW(
    model.parameters(),
    lr = 8e-4,
    weight_decay=1e-2,
    betas = (0.9, 0.999),
)

scheduler = get_wsd_schedule(
    optimizer=optimizer,                  # Your optimizer
    num_warmup_steps=total_steps * 0.1,   # Number of warmup steps
    num_stable_steps=total_steps * 0.8,   # Number of stable steps
    num_decay_steps=total_steps * 0.1,   # Number of decay steps
    warmup_type="linear",   # Warmup type
    decay_type="1-sqrt",    # Decay type
    num_cycles=0.5,         # Number of cosine cycles
    min_lr_ratio=0.0,       # Minimum learning rate ratio
)

trainer = Trainer(
    model=model,                        # Model to train
    args=training_args,                 # Training arguments
    train_dataset=training_dataset,     # Training dataset
    # eval_dataset=evaluation_dataset,  # Evaluation dataset
    data_collator=data_collator,
    optimizers=(optimizer, scheduler),
)

In [None]:
trainer.train()