In [1]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    AutoModelForMaskedLM,
)

from chop import MaseGraph
import chop.passes as passes


# Model checkpoint to fine-tune. Alternatives left here for quick switching:
#   "bert-base-uncased"
#   "albert/albert-base-v2"
checkpoint = "roberta-base"

# The tokenizer is loaded from the same hub id as the model.
tokenizer_checkpoint = checkpoint

# Hub id of the text corpus used for masked-language-model fine-tuning.
dataset_name = "xu-song/cc100-samples"



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset

# English configuration of the corpus; "[:100%]" takes the whole train split
# (lower the percentage for a quicker experiment).
dataset = load_dataset(
    dataset_name,
    name="en",
    split="train[:100%]",
)

# Tokenizer matching the checkpoint that is fine-tuned later on.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example`` with truncation enabled.

    Uses the module-level ``tokenizer`` and returns whatever it returns.
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True)

# Tokenize
dataset = dataset.map(tokenize_function, batched=True)

# split the dataset in train and test
# A fixed seed keeps the 80/20 partition identical across kernel restarts, so
# evaluation losses from different runs are measured on the same held-out rows.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

print(dataset)
# print(dataset["train"][0])

# Collator for the masked-language-modelling objective: masks tokens in each
# batch with the given probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15  # Standard BERT masking probability
)



Map: 100%|██████████| 10000/10000 [00:00<00:00, 47524.51 examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})





In [3]:
# MaseGraph conversion

model = AutoModelForMaskedLM.from_pretrained(checkpoint)
# Show the architectural params like hidden_size, num_attention_heads, etc.
print(model.config)

# Names of the model inputs handed to MaseGraph; "labels" is listed alongside
# the usual tokenizer outputs (presumably so the graph can return the MLM loss
# that Trainer needs — TODO confirm).
traced_input_names = ["input_ids", "attention_mask", "labels"]
mg = MaseGraph(model, hf_input_names=traced_input_names)

# Run the two metadata analysis passes in order; each returns (graph, info)
# and the info part is discarded here.
mg, _ = passes.init_metadata_analysis_pass(mg)
mg, _ = passes.add_common_metadata_analysis_pass(mg)


`past_key_values` were not specified as input names, but model.config.use_cache = True. Setting model.config.use_cache = False.
[32mINFO    [0m [34mGetting dummy input for roberta-base.[0m


RobertaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

tensor([[    0, 15238,   189,   185,    81,     5,   232,    65,   183,     2],
        [    0,   713,    16,   596,    47,   197,  1532,  4516, 10463,     2]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[    0, 15238,   189,   185,    81,     5,   232,    65,   183,     2],
        [    0,   7

In [4]:

# Training configuration: outputs go to "mase-trainer", no experiment tracker
# is attached, and fine-tuning runs for three epochs.
training_args = TrainingArguments(
    output_dir="mase-trainer",
    report_to="none",
    num_train_epochs=3,
    save_safetensors=False,  # fixes safetensor can't save error after training for an epoch
)

trainer = Trainer(
    model=mg.model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    # `tokenizer=` is deprecated in Trainer.__init__ (it triggers a
    # FutureWarning on transformers 4.48); `processing_class` replaces it.
    processing_class=tokenizer,
)

# Baseline loss of the pretrained weights, before any fine-tuning.
eval_results = trainer.evaluate()
print(f"Evaluation loss: {eval_results['eval_loss']}")

trainer.train()

# Loss after fine-tuning.
# NOTE(review): the collator masks tokens randomly, so repeated evaluations
# likely differ slightly even on identical weights — confirm before comparing.
eval_results = trainer.evaluate()
print(f"Evaluation loss: {eval_results['eval_loss']}")

# Persist the fine-tuned graph under the "test_1" prefix so it can be reloaded.
mg.export("test_1")


NVIDIA GeForce RTX 5080 with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_37 sm_90 compute_37.
If you want to use the NVIDIA GeForce RTX 5080 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/

  trainer = Trainer(


Evaluation loss: 2.2507145404815674


Step,Training Loss
500,2.352
1000,2.2572
1500,2.1064
2000,2.0185
2500,1.8798
3000,1.8307


[32mINFO    [0m [34mExporting MaseGraph to test_1.pt, test_1.mz[0m
[32mINFO    [0m [34mExporting GraphModule to test_1.pt[0m


Evaluation loss: 1.9630663394927979


[32mINFO    [0m [34mExporting MaseMetadata to test_1.mz[0m


In [5]:
# Load the model back to check mase save function is working

mg2 = MaseGraph.from_checkpoint("test_1")

# Fresh Trainer around the reloaded model, reusing the same arguments, data
# and collator as the training run.
trainer = Trainer(
    model=mg2.model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    # `tokenizer=` is deprecated in Trainer.__init__ (it triggers a
    # FutureWarning on transformers 4.48); `processing_class` replaces it.
    processing_class=tokenizer,
)


# The loss should be close to the post-training loss of the exported model.
# NOTE(review): an exact match is not expected if the collator's random
# masking differs between evaluations — confirm.
eval_results = trainer.evaluate()
print(f"Evaluation loss: {eval_results['eval_loss']}")

  loaded_model = torch.load(f)
  trainer = Trainer(


Evaluation loss: 1.89931321144104


In [6]:
from chop.passes.module import report_trainable_parameters_analysis_pass

# Print a per-submodule table of trainable parameter counts. The pass is given
# the underlying module (mg.model), not the MaseGraph wrapper; both returned
# values are discarded.
_, _ = report_trainable_parameters_analysis_pass(mg.model)

+-----------------------------------------------------+------------------------+
| Submodule                                           |   Trainable Parameters |
| roberta                                             |              124055040 |
+-----------------------------------------------------+------------------------+
| roberta.embeddings                                  |               39000576 |
+-----------------------------------------------------+------------------------+
| roberta.embeddings.word_embeddings                  |               38603520 |
+-----------------------------------------------------+------------------------+
| roberta.embeddings.token_type_embeddings            |                    768 |
+-----------------------------------------------------+------------------------+
| roberta.embeddings.position_embeddings              |                 394752 |
+-----------------------------------------------------+------------------------+
| roberta.embeddings.LayerNo

In [7]:
# debug pass that prints all the call_module nodes

from chop.tools import get_logger

# Logger used by the debug pass defined in this cell.
# NOTE(review): at "INFO" the pass's logger.debug(...) lines are presumably
# filtered out; set "DEBUG" to see them — confirm get_logger's behaviour.
logger = get_logger("mase_logger")
logger.setLevel("INFO")


def count_nodes_analysis_pass(mg, pass_args=None):
    """Log and summarise every ``call_module`` node in ``mg.fx_graph``.

    The graph is only read, never modified.

    Args:
        mg: Object exposing ``fx_graph.nodes``; every node must provide
            ``op``, ``name`` and ``target`` attributes.
        pass_args: Unused by this pass. Defaults to ``None`` instead of a
            shared mutable ``{}``.

    Returns:
        A tuple ``(mg, summary)``: ``mg`` is the object that was passed in and
        ``summary`` is a dict with ``"total_nodes"`` (number of
        ``call_module`` nodes) and ``"node_info"`` (one dict per node with
        ``"name"``, ``"op"`` and ``"target"`` keys, in graph order).
    """
    # Filter and describe in a single pass over the graph.
    node_info = [
        {"name": node.name, "op": node.op, "target": str(node.target)}
        for node in mg.fx_graph.nodes
        if node.op == "call_module"
    ]
    total_nodes = len(node_info)

    # Full per-node record at DEBUG level, emitted before the INFO summary.
    for info in node_info:
        logger.debug(f"Node: {info}")

    logger.info(f"Total number of call_module nodes: {total_nodes}")
    for info in node_info:
        logger.info(f"Node name: {info['name']}, target: {info['target']}")

    return mg, {"total_nodes": total_nodes, "node_info": node_info}

# Run the debug pass; it returns the graph it was given plus a summary dict.
mg, pass_out = count_nodes_analysis_pass(mg)

# "total_nodes" counts call_module nodes only, not every node in the FX graph.
logger.info(f"Total node count is: {pass_out['total_nodes']}")


[32mINFO    [0m [34mTotal number of call_module nodes: 129[0m
[32mINFO    [0m [34mNode name: roberta_embeddings_word_embeddings, target: roberta.embeddings.word_embeddings[0m
[32mINFO    [0m [34mNode name: roberta_embeddings_token_type_embeddings, target: roberta.embeddings.token_type_embeddings[0m
[32mINFO    [0m [34mNode name: roberta_embeddings_position_embeddings, target: roberta.embeddings.position_embeddings[0m
[32mINFO    [0m [34mNode name: roberta_embeddings_layer_norm, target: roberta.embeddings.LayerNorm[0m
[32mINFO    [0m [34mNode name: roberta_embeddings_dropout, target: roberta.embeddings.dropout[0m
[32mINFO    [0m [34mNode name: roberta_encoder_layer_0_attention_self_query, target: roberta.encoder.layer.0.attention.self.query[0m
[32mINFO    [0m [34mNode name: roberta_encoder_layer_0_attention_self_key, target: roberta.encoder.layer.0.attention.self.key[0m
[32mINFO    [0m [34mNode name: roberta_encoder_layer_0_attention_self_value, target

In [8]:
from chop import MaseGraph

# Build a graph from the model without explicit hf_input_names and draw it.
# NOTE(review): this rebinds `mg`, replacing the graph created with explicit
# hf_input_names earlier in the notebook — confirm that is intended.
mg = MaseGraph(model)

# Name the image after the active checkpoint rather than a hard-coded
# "bert-base-uncased"; "/" is replaced so hub ids such as
# "albert/albert-base-v2" do not turn into a nested path.
mg.draw(f"{checkpoint.replace('/', '-')}.svg")