## ModernBERT

BERT is an ancient (given the speed of innovation) model. While it has proven to be useful in multiple places, it has (relative) shortcomings. ModernBert takes all the recent advances in the area and applies them to BERT-style model.

1. Architectural changes:
    1. Rotary Positional Encodings
    2. GeGLU instead of MLP layers
    3. Extra normalization layer after embeddings
    4. Alternating Global and Local attention layers
2. Training Data:
    2 Trillion tokens of highly diverse data
3. Training process:
    3 phases:
    1. 1.7 trillion tokens at a sequence length of 1024
    2. Long-context adaption phase: 250 billion tokens at sequence length of 8192
    3. Annealing on 50 billion tokens

In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM, Trainer, TrainingArguments
from datasets import load_dataset
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
model = AutoModelForMaskedLM.from_pretrained("answerdotai/ModernBERT-base",
                                            #  attn_implementation="flash_attention_2", 
                                             torch_dtype=torch.float16)
# model = model.to('cuda')

In [4]:
text = "I really [MASK] the movie. It was quite interesting."

inputs = tokenizer(text, return_tensors='pt')

print(f"MASK token id: {tokenizer.mask_token_id}")
print(inputs)

for key in inputs:
    # inputs[key] = inputs[key].to('cuda')
    inputs[key] = inputs[key]

output = model(**inputs)

print(output['logits'][0, 3].shape)
predicted_index = output['logits'][0,3].argmax()

predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
print(predicted_token)

MASK token id: 50284
{'input_ids': tensor([[50281,    42,  1663, 50284,   253,  6440,    15,   733,   369,  3240,
          4722,    15, 50282]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
torch.Size([50368])
Ġliked


In [5]:
text = "I really [MASK] the movie. It was loud and boring."

inputs = tokenizer(text, return_tensors='pt')

print(f"MASK token id: {tokenizer.mask_token_id}")
print(inputs)

for key in inputs:
    # inputs[key] = inputs[key].to('cuda')
    inputs[key] = inputs[key]

output = model(**inputs)

print(output['logits'][0, 3].shape)
predicted_index = output['logits'][0,3].argmax()

predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
print(predicted_token)

MASK token id: 50284
{'input_ids': tensor([[50281,    42,  1663, 50284,   253,  6440,    15,   733,   369, 11216,
           285, 22258,    15, 50282]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
torch.Size([50368])
Ġhated


In [6]:
del model

## Finetuning using HF Trainer API

In [1]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

NameError: name 'torch' is not defined

In [7]:
# Load the dataset
dataset = load_dataset("fancyzhx/ag_news")

In [8]:
dataset['train'][0]

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label': 2}

In [9]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", max_length=100, truncation=True, return_tensors="pt").to(device)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [10]:
labels = tokenized_dataset["train"].features["label"].names
num_labels = len(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = i
    id2label[i] = label

In [11]:
tokenized_dataset['test']['label']

tensor([2, 3, 3,  ..., 1, 2, 2])

In [12]:
label2id

{'World': 0, 'Sports': 1, 'Business': 2, 'Sci/Tech': 3}

In [13]:
id2label

{0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}

In [14]:
model = AutoModelForSequenceClassification.from_pretrained("answerdotai/ModernBERT-base", 
                                                           attn_implementation="flash_attention_2", 
                                                           torch_dtype=torch.bfloat16,
                                                           num_labels=len(label2id),
                                                           label2id=label2id,
                                                           id2label=id2label
                                                           )

# model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
#                                                            num_labels=2,
#                                                         #    label2id=label2id,
#                                                         #    id2label=id2label
#                                                            )
model = model.to(device)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
output_model_dir = "./test-api-finetuning/"

In [16]:
training_args = TrainingArguments(
    output_dir = output_model_dir,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 32,
    learning_rate = 5e-5,
    num_train_epochs = 2,
    bf16 = True, # bfloat16 training 
    optim = "adamw_torch_fused", # improved optimizer 
    # logging & evaluation strategies
    logging_strategy = "steps",
    logging_steps = 100,
    eval_strategy = "steps",
    save_strategy = "steps",
    save_total_limit = 2,
    load_best_model_at_end = True,
    metric_for_best_model = "f1",
    # push to hub parameters
    push_to_hub = False,
    report_to="none",
    
)

In [17]:
import numpy as np
from sklearn.metrics import f1_score, precision_recall_fscore_support

In [18]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis = 1)
    score = f1_score(labels, predictions, average='macro')
    return {"f1": float(score) if score == 1 else score}

In [19]:
# def compute_loss(eval_pred, num_items_in_batch):
#     logits, labels = eval_pred
#     loss_fct = torch.nn.CrossEntropyLoss()
#     return loss_fct(logits.view(-1, num_labels), labels.view(-1))

In [20]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset["train"].select(range(10000)),
    eval_dataset = tokenized_dataset["test"].select(range(100)),
    compute_metrics = compute_metrics,
)

In [21]:
trainer.train()

Step,Training Loss,Validation Loss,F1
100,0.6769,0.372692,0.833167
200,0.4096,0.330427,0.857928
300,0.3779,0.335434,0.848628


Could not locate the best model at ./test-api-finetuning/checkpoint-200/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=314, training_loss=0.48316904693652113, metrics={'train_runtime': 68.1225, 'train_samples_per_second': 293.589, 'train_steps_per_second': 4.609, 'total_flos': 1331103792000000.0, 'train_loss': 0.48316904693652113, 'epoch': 2.0})

In [22]:
outputs = trainer.predict(tokenized_dataset["test"].select(range(100)))
preds = outputs.predictions.argmax(-1)
labels = outputs.label_ids

In [23]:
print(classification_report(labels, preds, target_names=label2id.keys()))

              precision    recall  f1-score   support

       World       1.00      0.83      0.91        30
      Sports       0.91      1.00      0.95        21
    Business       0.53      0.83      0.65        12
    Sci/Tech       0.94      0.84      0.89        37

    accuracy                           0.87       100
   macro avg       0.84      0.88      0.85       100
weighted avg       0.90      0.87      0.88       100

