In [1]:
import torch
from mamba_ssm.modules.mamba_simple import Mamba

  from .autonotebook import tqdm as notebook_tqdm


In [2]:


batch, length, dim = 2, 64, 16
x = torch.randn(batch, length, dim).to("cuda")
model = Mamba(
    # This module uses roughly 3 * expand * d_model^2 parameters
    d_model=dim, # Model dimension d_model
    d_state=16,  # SSM state expansion factor
    d_conv=4,    # Local convolution width
    expand=2,    # Block expansion factor
).to("cuda")
y = model(x)
assert y.shape == x.shape

In [4]:
model

Mamba(
  (in_proj): Linear(in_features=16, out_features=64, bias=False)
  (conv1d): Conv1d(32, 32, kernel_size=(4,), stride=(1,), padding=(3,), groups=32)
  (act): SiLU()
  (x_proj): Linear(in_features=32, out_features=33, bias=False)
  (dt_proj): Linear(in_features=1, out_features=32, bias=True)
  (out_proj): Linear(in_features=32, out_features=16, bias=False)
)

In [6]:
# load pretrained mamba model
from transformers import MambaConfig, MambaForCausalLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
input_ids = tokenizer("Hey how are you doing?", return_tensors="pt")["input_ids"]

out = model.generate(input_ids, max_new_tokens=10)
print(tokenizer.batch_decode(out))


The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


["Hey how are you doing?\n\nI'm so glad you're here."]


In [8]:
def get_output(prompt):
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
    out = model.generate(input_ids, max_new_tokens=10)
    return tokenizer.batch_decode(out)[0]

get_output("Hey how are you doing?")

"Hey how are you doing?\n\nI'm so glad you're here."

In [10]:
get_output("1+1=")

'1+1=2$ and $1+1=3$'

In [None]:
import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel

def generate_text(model, tokenizer, prompt, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt")
    start_time = time.time()
    
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)
    
    end_time = time.time()
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generation_time = end_time - start_time
    
    return generated_text, generation_time

def main():
    prompt = "Once upon a time, in a land far away,"
    max_length = 100

    # Mamba 130M model
    mamba_model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
    mamba_tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")

    # Pythia-160M model
    pythia_model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-160m")
    pythia_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-160m")

    print("Generating text with Mamba 130M...")
    mamba_text, mamba_time = generate_text(mamba_model, mamba_tokenizer, prompt, max_length)
    
    print("Generating text with Pythia-160M...")
    pythia_text, pythia_time = generate_text(pythia_model, pythia_tokenizer, prompt, max_length)

    print("\nResults:")
    print(f"Mamba 130M generation time: {mamba_time:.4f} seconds")
    print(f"Pythia-160M generation time: {pythia_time:.4f} seconds")
    print(f"\nMamba 130M generated text:\n{mamba_text}, {len(mamba_text)}")
    print(f"\nPythia-160M generated text:\n{pythia_text}, {len(pythia_text)}")

if __name__ == "__main__":
    main()

In [17]:
pip install --no-cache-dir causal-conv1d>=1.2.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[111 lines of output][0m
  [31m   [0m 
  [31m   [0m 
  [31m   [0m torch.__version__  = 2.5.0+cu124
  [31m   [0m 
  [31m   [0m 
  [31m   [0m running bdist_wheel
  [31m   [0m Guessing wheel URL:  https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.4.0/causal_conv1d-1.4.0+cu122torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
  [31m   [0m Precompiled wheel not found. Building from source...
  [31m   [0m running build
  [31m   [0m running build_py
  [31m   [0m creating build/lib.linux-x86_64-cpython-310/causal_conv1d
  [31m   [0m copying causal_conv1d/__init__.py -> build/lib.linux-x86_64-cpython-310/causal_conv1d
  [31m   [0m copying causal_conv1d/causal_conv1d_interface.py -> build/lib.linux-x86_64-cpython-310/causal_conv1d
  [31m   [0m copying causal_conv1

In [None]:
import trans