In [1]:
import torch
from mamba_ssm import Mamba
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Smoke test of a single Mamba block: push one random batch through it and
# check that the output has the same shape as the input.
# NOTE: `model` is rebound to the pretrained language model in a later cell,
# so this block instance is only used within this cell.
batch, length, dim = 2, 64, 16
# Allocate the input directly on the GPU rather than building it on the CPU
# and copying it over afterwards.
x = torch.randn(batch, length, dim, device="cuda")
model = Mamba(
    # This module uses roughly 3 * expand * d_model^2 parameters
    d_model=dim, # Model dimension d_model
    d_state=16,  # SSM state expansion factor
    d_conv=4,    # Local convolution width
    expand=2,    # Block expansion factor
).to("cuda")
y = model(x)
assert y.shape == x.shape, (
    f"Mamba block changed the shape: in {tuple(x.shape)}, out {tuple(y.shape)}"
)

In [3]:
# Shared settings for the cells that follow.
#   repeats - not referenced in the cells visible here; presumably a repetition
#             count for a later benchmark (TODO confirm).
#   device  - passed to the model loader and used to place the token ids.
#   dtype   - passed to the model loader as the weight precision.
repeats, device, dtype = 3, "cuda", torch.float16

In [4]:
# Load the tokenizer and the pretrained Mamba language model.
# NOTE(review): the tokenizer and the weights come from two different hub
# repositories ("-hf" vs. the original); presumably they share one vocabulary
# -- confirm.
tokenizer_repo = "state-spaces/mamba-130m-hf"
weights_repo = "state-spaces/mamba-130m"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_repo)
# Rebinds `model`: from here on it is the LM-head model, not the bare Mamba
# block created in the earlier smoke-test cell.
model = MambaLMHeadModel.from_pretrained(
    weights_repo,
    device=device,
    dtype=dtype,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
# Generate a continuation for a short prompt and print the decoded text.
# NOTE(review): this cell carries execution count 15 while its neighbours are
# 4 and 5, i.e. it was run out of order; it needs `tokenizer`, `model` and
# `device` from the earlier cells.
prompt = "Hey how are you doing?"
tokens = tokenizer(prompt, return_tensors="pt")
input_ids = tokens.input_ids.to(device=device)

# max_length=100 bounds the generated sequence (presumably counted in tokens,
# prompt included -- confirm against the generate() implementation).
out = model.generate(input_ids, max_length=100)

decoded = tokenizer.batch_decode(out)
print(decoded)


["Hey how are you doing?\n\nI'm so glad you're here. I'm so glad you're here. I'm so glad you're here. I'm so glad you're here. I'm so glad you're here. I'm so glad you're here. I'm so glad you're here. I'm so glad you're here. I'm so glad you're here. I'm so glad you're here. I'm so glad you're here. I'm so glad"]


In [5]:
# Tokenize the sample prompt and move the resulting ids onto `device` so the
# following cells can feed them to the model's sub-modules.
prompt_text = "Hey how are you doing?"
tokens = tokenizer(prompt_text, return_tensors="pt")
input_ids = tokens["input_ids"].to(device)

In [6]:
# Run the token ids through the model's backbone only (no call to the full
# LM-head model here) and display the shape of what comes back.
# The recorded run showed torch.Size([1, 6, 768]); presumably
# (batch, tokens, hidden) -- confirm.
backbone_output = model.backbone(input_ids)
backbone_output.shape

torch.Size([1, 6, 768])

In [7]:
# Apply only the backbone's embedding module to the token ids.
embedding_layer = model.backbone.embedding
emb_vector = embedding_layer(input_ids)

In [8]:
# Display the shape of the embedding output (the recorded run showed
# torch.Size([1, 6, 768]), the same as the backbone output).
emb_vector.shape

torch.Size([1, 6, 768])