In [1]:
# Point the Hugging Face cache out of $HOME (to $WORK).
# This has to happen BEFORE `datasets` / `transformers` are imported: those
# libraries resolve their cache locations from HF_HOME when they are first
# imported, so exporting the variable afterwards is too late to take effect.
import os

os.environ["HF_HOME"] = "cache/"
default_cache_dir = "cache/"

from multiprocessing import cpu_count

import transformers
from datasets import load_dataset, load_from_disk
from tqdm import tqdm

# Record the library version used for this run alongside the notebook output.
print(transformers.__version__)

4.48.1


merge

In [2]:
# Sequence-length setting. Not referenced by any other visible cell —
# presumably the maximum token length used when the dataset was tokenized
# (TODO confirm against the tokenization notebook).
context_size = 512

In [3]:
from transformers import AutoTokenizer

# Load the tokenizer published under the "wallacelw/ModBERTBr" repository id.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="wallacelw/ModBERTBr",
)

In [4]:
# Location of the pre-tokenized dataset saved with `save_to_disk`.
folder_path = "dataset/tokenized"
folder_exists = os.path.isdir(folder_path)

# Fail loudly here instead of silently skipping the load: every later cell
# needs `tokenized_datasets`, and without this check a missing folder only
# surfaces further down as an unrelated-looking NameError.
if not folder_exists:
    raise FileNotFoundError(
        f"Tokenized dataset not found at '{folder_path}'. "
        "Run the tokenization step first or fix `folder_path`."
    )

tokenized_datasets = load_from_disk(folder_path)

Loading dataset from disk:   0%|          | 0/713 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/80 [00:00<?, ?it/s]

In [5]:
# Display the loaded DatasetDict (splits, feature columns, row counts) as the cell output.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 98904411
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 10989380
    })
})

In [6]:
# Local checkpoint directory the model is initialised from.
# Written as a plain string literal rather than interpolating a float into an
# f-string; the resulting value is unchanged.
old_model_name = "trained/Modern/4.6"

In [7]:
from transformers import ModernBertForMaskedLM

# Initialise the masked-LM model from the checkpoint named by `old_model_name`.
model = ModernBertForMaskedLM.from_pretrained(
    pretrained_model_name_or_path=old_model_name,
)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [8]:
# Run identifier; used below to build the training output and log directories.
# Written as a plain string literal rather than interpolating a float into an
# f-string; the resulting value is unchanged.
model_name = "Modern/5.0"

In [9]:
from transformers import DataCollatorForLanguageModeling

# Masked-language-modelling settings: masking enabled, with 30% of the tokens
# selected for masking in each batch.
mlm_settings = {"mlm": True, "mlm_probability": 0.3}

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, **mlm_settings)

In [10]:
from transformers import Trainer, TrainingArguments
from transformers import get_wsd_schedule, AdamW

# Total number of optimizer steps for this run. The learning-rate schedule
# below is expressed as shares of this budget.
total_steps = 400_000

# Warmup / stable / decay split of the step budget: 10% warmup, 30% decay and
# the remainder (60%) at the peak learning rate. Integer arithmetic keeps the
# counts as exact ints (the scheduler expects step counts, not floats), and
# giving the remainder to the stable phase guarantees the three phases always
# add up to `total_steps`.
warmup_steps = total_steps // 10
decay_steps = (3 * total_steps) // 10
stable_steps = total_steps - warmup_steps - decay_steps

training_args = TrainingArguments(
    output_dir=f'training/{model_name}',
    overwrite_output_dir=True,

    max_steps=total_steps,

    gradient_accumulation_steps=1,

    # NOTE(review): the recorded run output warns that the resumed checkpoint
    # was saved with per_device_train_batch_size=64 — confirm that 32 is the
    # intended value for this stage.
    per_device_train_batch_size=32,

    logging_strategy="steps",
    logging_first_step=True,              # output the initial loss
    logging_steps=1_000,
    logging_dir=f"training-logs/{model_name}",
    report_to=["tensorboard"],

    save_strategy="steps",
    save_steps=1_000,                     # Save a checkpoint every 1,000 steps
    save_total_limit=5,                   # Limit the total number of saved checkpoints

    fp16=True,                            # Enable mixed precision for faster training

    learning_rate=1e-5,
    weight_decay=1e-5,
    # adam_beta1=0.9,
    # adam_beta2=0.999,
    # adam_epsilon=1e-06,
)

# The optimizer is built by hand (instead of letting Trainer create it) so it
# can be paired with the custom warmup-stable-decay schedule below.
# NOTE(review): `transformers.AdamW` is deprecated in favour of
# `torch.optim.AdamW`; migrating changes defaults and the saved optimizer
# state layout, so confirm checkpoint compatibility before switching.
optimizer = AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay,
)

lr_scheduler = get_wsd_schedule(
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_stable_steps=stable_steps,
    num_decay_steps=decay_steps,
    min_lr_ratio=0,
    num_cycles=0.5,
)

# Passing `optimizers=` makes Trainer use the optimizer/scheduler pair defined
# above rather than creating its own from `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
    optimizers=(optimizer, lr_scheduler),
)



In [11]:
import torch

# Ask PyTorch's CUDA caching allocator to release unused cached GPU memory
# before training starts.
torch.cuda.empty_cache()

In [12]:
from transformers.trainer_utils import get_last_checkpoint

# Resume from the newest checkpoint in the output directory when one exists,
# otherwise start from step 0. `resume_from_checkpoint=True` raises when the
# directory holds no checkpoint, which makes the very first run of a new
# stage impossible without editing this cell.
output_dir = training_args.output_dir
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None

trainer.train(resume_from_checkpoint=last_checkpoint)

There were missing keys in the checkpoint model loaded: ['decoder.weight'].
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
	per_device_train_batch_size: 32 (from args) != 64 (from trainer_state.json)
  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss


TrainOutput(global_step=400001, training_loss=3.8058215601607604e-06, metrics={'train_runtime': 21.9659, 'train_samples_per_second': 4661768.783, 'train_steps_per_second': 18210.034, 'total_flos': 3.4903458558276796e+19, 'train_loss': 3.8058215601607604e-06, 'epoch': 1.0353439662892847})

In [18]:
# Save the final weights next to the earlier stages. The path is derived from
# `model_name` (currently "Modern/5.0") so it cannot drift from the run
# identifier used for the training output and log directories.
trainer.save_model(f"trained/{model_name}")

In [19]:
from transformers import ModernBertForMaskedLM

# Reload the saved model from disk to check that the export round-trips.
# The path is derived from `model_name` (currently "Modern/5.0") instead of
# being hard-coded, so it always matches the run identifier.
model = ModernBertForMaskedLM.from_pretrained(f"trained/{model_name}")

# Show the module tree as the cell output.
model

ModernBertForMaskedLM(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(32768, 768, padding_idx=0)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (rotary_emb): ModernBertUnpaddedRotaryEmbedding(dim=64, base=160000.0, scale_base=None)
          (Wo): Linear(in_features=768, out_features=768, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=768, out_features=2304, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=1152, out_features=768, 

In [22]:
# Upload the reloaded model weights to the Hugging Face Hub repository.
model.push_to_hub(repo_id="wallacelw/ModBERTBr2")


CommitInfo(commit_url='https://huggingface.co/wallacelw/ModBERTBr2/commit/a9d973c35d2d760ddd2e23a11ef435a1202ffde7', commit_message='Upload ModernBertForMaskedLM', commit_description='', oid='a9d973c35d2d760ddd2e23a11ef435a1202ffde7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/wallacelw/ModBERTBr2', endpoint='https://huggingface.co', repo_type='model', repo_id='wallacelw/ModBERTBr2'), pr_revision=None, pr_num=None)

In [23]:
# Upload the tokenizer files to the same Hub repository as the model.
tokenizer.push_to_hub(repo_id="wallacelw/ModBERTBr2")

CommitInfo(commit_url='https://huggingface.co/wallacelw/ModBERTBr2/commit/4dae1410a9b87e591c6146fb4fee0f86eaa014a3', commit_message='Upload tokenizer', commit_description='', oid='4dae1410a9b87e591c6146fb4fee0f86eaa014a3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/wallacelw/ModBERTBr2', endpoint='https://huggingface.co', repo_type='model', repo_id='wallacelw/ModBERTBr2'), pr_revision=None, pr_num=None)