# GenoMix Mamba 2

In [None]:
import sys

from transformers import Mamba2Config
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

sys.path.append('..')

from genomix.models.genomix_modeling.genomix_vanilla_mamba2 import GenomixMamba2


In [2]:
config = Mamba2Config()
config.d_model = 512
config.n_head = 8
config.n_layer = 24
config.vocab_size=16
config.hidden_size=512
config.intermediate_size=512
config.num_heads=8
config.num_layers=24
config.tie_word_embeddings=True

print(config)

Mamba2Config {
  "bos_token_id": 0,
  "chunk_size": 256,
  "conv_kernel": 4,
  "d_model": 512,
  "eos_token_id": 2,
  "expand": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 512,
  "initializer_range": 0.1,
  "intermediate_size": 512,
  "layer_norm_epsilon": 1e-05,
  "model_type": "mamba2",
  "n_groups": 8,
  "n_head": 8,
  "n_layer": 24,
  "num_heads": 8,
  "num_hidden_layers": 64,
  "num_layers": 24,
  "pad_token_id": 1,
  "rescale_prenorm_residual": false,
  "residual_in_fp32": true,
  "rms_norm": true,
  "state_size": 128,
  "tie_word_embeddings": true,
  "time_step_floor": 0.0001,
  "time_step_limit": [
    0.0,
    Infinity
  ],
  "time_step_max": 0.1,
  "time_step_min": 0.001,
  "time_step_rank": 256,
  "transformers_version": "4.46.3",
  "use_bias": false,
  "use_cache": true,
  "use_conv_bias": true,
  "vocab_size": 16
}



In [3]:
model = GenomixMamba2(config)



In [4]:
model

GenomixMamba2(
  (model): Mamba2ForCausalLM(
    (backbone): Mamba2Model(
      (embeddings): Embedding(16, 512)
      (layers): ModuleList(
        (0-63): 64 x Mamba2Block(
          (norm): Mamba2RMSNorm()
          (mixer): Mamba2Mixer(
            (act): SiLU()
            (conv1d): Conv1d(3072, 3072, kernel_size=(4,), stride=(1,), padding=(3,), groups=3072)
            (in_proj): Linear(in_features=512, out_features=4104, bias=False)
            (norm): MambaRMSNormGated()
            (out_proj): Linear(in_features=1024, out_features=512, bias=False)
          )
        )
      )
      (norm_f): Mamba2RMSNorm()
    )
    (lm_head): Linear(in_features=512, out_features=16, bias=False)
  )
)

In [5]:
input_id1 = id(model.model.backbone.embeddings.weight)
input_id2 = id(model.model.lm_head.weight)

print(input_id1==input_id2)

True


**NOTE**: if `safe_serialization=True` (default setting) in `model.save_pretrained`,
There will raise error:

```python
RuntimeError: The weights trying to be saved contained shared tensors [{'model.lm_head.weight', 'model.backbone.embeddings.weight'}] that are mismatching the transformers base configuration. Try saving using `safe_serialization=False` or remove this tensor sharing.
```

In [6]:

print(GenomixMamba2.config_class)
# model.save_pretrained('/home/share/huadjyin/home/baiyong01/projects/genomix/tmp/models', safe_serialization=False)

<class 'transformers.models.mamba2.configuration_mamba2.Mamba2Config'>


In [7]:
config = Mamba2Config.from_pretrained('/home/share/huadjyin/home/baiyong01/projects/genomix/tmp/models/config.json')

In [8]:
config

Mamba2Config {
  "_name_or_path": "genomix_mamba2",
  "architectures": [
    "GenomixMamba2"
  ],
  "bos_token_id": 0,
  "chunk_size": 256,
  "conv_kernel": 4,
  "d_model": 512,
  "eos_token_id": 2,
  "expand": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 512,
  "initializer_range": 0.1,
  "intermediate_size": 512,
  "layer_norm_epsilon": 1e-05,
  "model_type": "mamba2",
  "n_groups": 8,
  "n_head": 8,
  "n_layer": 24,
  "num_heads": 8,
  "num_hidden_layers": 64,
  "num_layers": 24,
  "pad_token_id": 1,
  "rescale_prenorm_residual": false,
  "residual_in_fp32": true,
  "rms_norm": true,
  "state_size": 128,
  "tie_word_embeddings": true,
  "time_step_floor": 0.0001,
  "time_step_limit": [
    0.0,
    Infinity
  ],
  "time_step_max": 0.1,
  "time_step_min": 0.001,
  "time_step_rank": 256,
  "torch_dtype": "float32",
  "transformers_version": "4.46.3",
  "use_bias": false,
  "use_cache": true,
  "use_conv_bias": true,
  "vocab_size": 16
}

In [9]:
model = GenomixMamba2.from_pretrained(
    '/home/share/huadjyin/home/baiyong01/projects/genomix/tmp/models',local_files_only=True)

In [9]:
model

GenomixMamba2(
  (model): Mamba2ForCausalLM(
    (backbone): Mamba2Model(
      (embeddings): Embedding(16, 512)
      (layers): ModuleList(
        (0-63): 64 x Mamba2Block(
          (norm): Mamba2RMSNorm()
          (mixer): Mamba2Mixer(
            (act): SiLU()
            (conv1d): Conv1d(3072, 3072, kernel_size=(4,), stride=(1,), padding=(3,), groups=3072)
            (in_proj): Linear(in_features=512, out_features=4104, bias=False)
            (norm): MambaRMSNormGated()
            (out_proj): Linear(in_features=1024, out_features=512, bias=False)
          )
        )
      )
      (norm_f): Mamba2RMSNorm()
    )
    (lm_head): Linear(in_features=512, out_features=16, bias=False)
  )
)

In [13]:
model.model.backbone.embeddings.weight.shape

torch.Size([16, 512])

In [17]:
model.base_model_prefix

'model.backbone'

In [21]:
from transformers import JetMoeConfig

jet_config = JetMoeConfig()
jet_config

JetMoeConfig {
  "activation_function": "silu",
  "attention_dropout": 0.0,
  "aux_loss_coef": 0.01,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_size": 2048,
  "initializer_range": 0.01,
  "intermediate_size": 5632,
  "kv_channels": 128,
  "max_position_embeddings": 4096,
  "model_type": "jetmoe",
  "num_attention_heads": 32,
  "num_experts_per_tok": 2,
  "num_hidden_layers": 12,
  "num_key_value_heads": 16,
  "num_local_experts": 8,
  "output_router_logits": false,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "transformers_version": "4.46.3",
  "use_cache": true,
  "vocab_size": 32000
}