In [5]:
from model_pytorch import Mamba, ModelArgs
from transformers import AutoTokenizer
import torch
import onnxruntime as ort
import numpy as np

In [12]:
# Checkpoint to load. One of:
#     'state-spaces/mamba-2.8b-slimpj'
#     'state-spaces/mamba-2.8b'
#     'state-spaces/mamba-1.4b'
#     'state-spaces/mamba-790m'
#     'state-spaces/mamba-370m'
#     'state-spaces/mamba-130m'
pretrained_model_name = 'state-spaces/mamba-130m'
dummy_input = "Harry Potter"

model = Mamba.from_pretrained(pretrained_model_name)
# Switch to inference mode right after loading, as the other model cells in
# this notebook do, so every later use of `model` (export, comparison,
# generation) sees the same mode.
model.eval()
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
# Token ids of the dummy prompt; reused below as the example input for export.
input_ids = tokenizer(dummy_input, return_tensors='pt').input_ids

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Export the loaded model to ONNX, then save the PyTorch module next to it.
export_name = "mamba_model_130m_cumsum_no_einsum"

# Input and output share the same dynamic axes: axis 0 is the batch size and
# axis 1 is the sequence length.
dynamic_axes = {
    tensor_name: {0: 'batch_size', 1: 'seq_length'}
    for tensor_name in ('input_ids', 'output')
}

torch.onnx.export(
    model,
    input_ids,
    f"{export_name}.onnx",
    export_params=True,
    do_constant_folding=True,
    input_names=['input_ids'],
    output_names=['output'],
    dynamic_axes=dynamic_axes,
)
torch.save(model, f"{export_name}.pt")

In [None]:
# Export minimal model

args = ModelArgs(
    d_model=5,
    n_layer=1,
    vocab_size=50277
)
# Keep this freshly constructed model (no pretrained weights are loaded here)
# under its own name. Rebinding `model` would replace the pretrained model that
# the comparison and generation cells further down rely on.
model_minimal = Mamba(args)
model_minimal.eval()
export_name = "mamba_minimal_1_layer_cumsum_no_einsum"

torch.save(model_minimal, f"{export_name}.pt")

torch.onnx.export(
    model_minimal,
    input_ids,
    f"{export_name}.onnx",
    export_params=True,
    do_constant_folding=True,
    input_names=['input_ids'],
    output_names=['output'],
    # Batch size (axis 0) and sequence length (axis 1) stay dynamic.
    dynamic_axes={
        'input_ids': {0: 'batch_size', 1: 'seq_length'},
        'output': {0: 'batch_size', 1: 'seq_length'}
    }
)

In [8]:
import onnx

# Load the ONNX model.
# Bound to `onnx_model` rather than `model`, so the PyTorch model stays
# callable in the cells below that run `model(...)`.
onnx_model = onnx.load("mamba_model_130m_cumsum_no_einsum.onnx")

# Check that the model is well formed
onnx.checker.check_model(onnx_model)

# Print a human readable representation of the graph
#print(onnx.helper.printable_graph(onnx_model.graph))

In [10]:
# Open the exported 130m model with ONNX Runtime.
ort_session = ort.InferenceSession('mamba_model_130m_cumsum_no_einsum.onnx')

# Tokenize a dummy prompt and turn the ids into a NumPy array for the session
dummy_prompt_1 = "Harry Potter test"
tokens_1 = tokenizer(dummy_prompt_1, return_tensors="pt")
input_ids_1 = tokens_1.input_ids.to("cpu")
input_ids_np = np.array(input_ids_1)
print(input_ids_np.shape)

# Inference: feed the ids under the name of the session's first input
first_input = ort_session.get_inputs()[0]
inputs = {first_input.name: input_ids_np}
onnx_out = ort_session.run(None, inputs)

# Output (shown as the cell result)
onnx_out


(1, 3)


[array([[[ -8.417368 , -22.110765 ,  -2.4200068, ..., -21.946516 ,
          -21.980217 , -21.913406 ],
         [  0.3511305, -26.0508   ,   1.545126 , ..., -25.823263 ,
          -26.040113 , -25.818874 ],
         [-40.83871  , -53.83088  , -38.9781   , ..., -53.959682 ,
          -53.70117  , -53.945526 ]]], dtype=float32)]

In [13]:
# Run the PyTorch model on the same ids that were fed to ONNX Runtime.
# Only the output values are needed for the comparison, so skip autograd
# bookkeeping for this forward pass.
with torch.no_grad():
    torch_out = model(input_ids_1)
print(torch_out)
# compare ONNX Runtime and PyTorch results
try:
    np.testing.assert_allclose(torch_out.detach().cpu().numpy(), onnx_out[0], rtol=1e-02, atol=1e-03)
    print("Exported model has been tested with ONNXRuntime, and the result looks good!")
except AssertionError as e:
    # Report the mismatch in the cell output instead of aborting the notebook run.
    print("AssertionError:", str(e))

tensor([[[ -8.4174, -22.1108,  -2.4200,  ..., -21.9465, -21.9803, -21.9134],
         [  0.3512, -26.0508,   1.5452,  ..., -25.8232, -26.0401, -25.8188],
         [-40.8387, -53.8309, -38.9781,  ..., -53.9597, -53.7011, -53.9456]]],
       grad_fn=<UnsafeViewBackward0>)
Exported model has been tested with ONNXRuntime, and the result looks good!


In [14]:
import torch
import torch.nn.functional as F


def generate(model,
             tokenizer,
             prompt: str,
             n_tokens_to_gen: int = 50,
             sample: bool = True,
             top_k: int = 40):
    """Extend `prompt` token by token using an ONNX model run with ONNX Runtime.

    Args:
        model: Value passed to `ort.InferenceSession` (the notebook passes the
            path of an exported `.onnx` file).
        tokenizer: Callable returning an object with `input_ids` when called as
            `tokenizer(prompt, return_tensors='pt')`, and providing `decode`.
        prompt: Text to continue.
        n_tokens_to_gen: Number of tokens to append to the prompt.
        sample: If True, draw the next token from the (optionally top-k
            filtered) distribution; otherwise take the most likely token.
        top_k: Keep only the `top_k` most likely tokens before choosing; None
            disables the filter. Values above the vocabulary size are clamped.

    Returns:
        The decoded text (prompt plus generated tokens) of the first sequence.
    """
    ort_session = ort.InferenceSession(model)
    # The name of the graph's first input does not change between steps.
    input_name = ort_session.get_inputs()[0].name

    input_ids = tokenizer(prompt, return_tensors='pt').input_ids

    for _ in range(n_tokens_to_gen):
        # Re-run the model on the whole sequence generated so far and take the
        # first graph output as the logits.
        logits = ort_session.run(None, {input_name: input_ids.numpy()})[0]
        # Keep only the logits at the last position of each sequence.
        next_token_logits = torch.from_numpy(logits)[:, -1]

        probs = F.softmax(next_token_logits, dim=-1)
        vocab_size = probs.shape[-1]

        if top_k is not None:
            # torch.topk raises if k is larger than the dimension size.
            k = min(top_k, vocab_size)
            top_values = torch.topk(probs, k=k).values
            # Zero everything below the k-th largest probability, then renormalize.
            probs[probs < top_values[:, -1, None]] = 0
            probs = probs / probs.sum(dim=1, keepdim=True)

        if sample:
            next_indices = torch.multinomial(probs, num_samples=1)
        else:
            next_indices = torch.argmax(probs, dim=-1)[:, None]

        input_ids = torch.cat([input_ids, next_indices], dim=1)

    # Only the first sequence of the batch is returned.
    return tokenizer.decode(input_ids[0].tolist())

In [15]:
print(generate(model='mamba_model_130m_cumsum_no_einsum.onnx', tokenizer=tokenizer, prompt="Harry Potter is", n_tokens_to_gen=20))

Harry Potter is not only written with a happy and happy mind. Its intent – to make us all feel better than


In [59]:
# Define the size of each axis
b, l, d, n = 2, 3, 4, 5

# Create example tensors with the appropriate dimensions
dt = torch.randn(b, l, d)  # shape (b, l, d)
A = torch.randn(d, n)      # shape (d, n)

print(dt.shape, A.shape, sep="\n")

# Multiply dt and A with torch.einsum following the given subscript rule
result = torch.einsum('bld,dn->bldn', dt, A)

# Show the shape of the result to confirm that it is (b, l, d, n)
print(result.shape)  # Should print: torch.Size([b, l, d, n])


torch.Size([2, 3, 4])
torch.Size([4, 5])
torch.Size([2, 3, 4, 5])


In [66]:
import torch

# Example dimensions
b, l, d, n = 2, 3, 4, 5

# Example tensors
dt = torch.randn(b, l, d)
A = torch.randn(d, n)

# einsum-free version of torch.einsum('bld,dn->bldn', dt, A)
# Step 1: reshape A for broadcasting, from (d, n) to (b, l, d, n)
A_expanded = A.view(1, 1, d, n).expand(b, l, d, n)

# Step 2: give dt a trailing singleton axis, (b, l, d, 1), so every dt value
# is multiplied with the matching row of A.
dt_expanded = dt.unsqueeze(-1)

# Element-wise multiplication
result = dt_expanded * A_expanded

# Check the result against the einsum it replaces instead of only eyeballing
# the printed shape.
assert result.shape == (b, l, d, n)
torch.testing.assert_close(result, torch.einsum('bld,dn->bldn', dt, A))
print(result.shape)  # Should be torch.Size([b, l, d, n])



torch.Size([2, 3, 4, 5])


In [69]:
import torch

# Example dimensions
b, l, d, n = 2, 3, 4, 5

# Example tensors
dt = torch.randn(b, l, d)
u = torch.randn(b, l, d)
B = torch.randn(b, l, n)

# einsum-free version of torch.einsum('bld,bln,bld->bldn', dt, B, u)
# Step 1: element-wise product of dt and u
dt_u_product = dt * u  # shape (b, l, d)

# Step 2: add a trailing singleton axis for broadcasting
dt_u_expanded = dt_u_product.unsqueeze(-1)  # shape (b, l, d, 1)

# Step 3: add a singleton axis to B in the third position for broadcasting
B_expanded = B.unsqueeze(2)  # shape (b, l, 1, n)

# Step 4: element-wise multiplication with broadcasting
deltaB_u = dt_u_expanded * B_expanded  # shape (b, l, d, n)

# Check the result against the einsum it replaces instead of only eyeballing
# the printed shape.
assert deltaB_u.shape == (b, l, d, n)
torch.testing.assert_close(deltaB_u, torch.einsum('bld,bln,bld->bldn', dt, B, u))
print(deltaB_u.shape)  # Should be torch.Size([b, l, d, n])


torch.Size([2, 3, 4, 5])


In [1]:
import torch

# Example dimensions
b, l, d, n = 2, 3, 4, 5

# Example tensors
x = torch.randn(b, l, d, n)
C = torch.randn(b, l, n)

# einsum-free version of torch.einsum('bldn,bln->bld', x, C)
# Expand C for broadcasting
C_expanded = C.unsqueeze(2)  # shape becomes (b, l, 1, n)

# Multiply x and C with broadcasting
product = torch.mul(x, C_expanded)

# Sum along the n axis to get the final shape (b, l, d)
y = torch.sum(product, dim=-1)

# Check the result against the einsum it replaces instead of only eyeballing
# the printed shape.
assert y.shape == (b, l, d)
torch.testing.assert_close(y, torch.einsum('bldn,bln->bld', x, C))
print(y.shape)  # Should be torch.Size([b, l, d])


torch.Size([2, 3, 4])


In [16]:
import torch
import torch.nn.functional as F


def generate(model,
             tokenizer,
             prompt: str,
             n_tokens_to_gen: int = 50,
             sample: bool = True,
             top_k: int = 40):
    """Extend `prompt` token by token using a PyTorch model.

    Args:
        model: Module that maps a tensor of token ids to logits; its output is
            indexed as `[:, -1]` to obtain the logits of the last position.
            It is switched to eval mode by this call.
        tokenizer: Callable returning an object with `input_ids` when called as
            `tokenizer(prompt, return_tensors='pt')`, and providing `decode`.
        prompt: Text to continue.
        n_tokens_to_gen: Number of tokens to append to the prompt.
        sample: If True, draw the next token from the (optionally top-k
            filtered) distribution; otherwise take the most likely token.
        top_k: Keep only the `top_k` most likely tokens before choosing; None
            disables the filter. Values above the vocabulary size are clamped.

    Returns:
        The decoded text (prompt plus generated tokens) of the first sequence.
    """
    model.eval()

    input_ids = tokenizer(prompt, return_tensors='pt').input_ids

    for _ in range(n_tokens_to_gen):
        # The model is re-run on the whole sequence generated so far; only the
        # logits at the last position are needed.
        with torch.no_grad():
            next_token_logits = model(input_ids)[:, -1]

        probs = F.softmax(next_token_logits, dim=-1)
        vocab_size = probs.shape[-1]

        if top_k is not None:
            # torch.topk raises if k is larger than the dimension size.
            k = min(top_k, vocab_size)
            top_values = torch.topk(probs, k=k).values
            # Zero everything below the k-th largest probability, then renormalize.
            probs[probs < top_values[:, -1, None]] = 0
            probs = probs / probs.sum(dim=1, keepdim=True)

        if sample:
            next_indices = torch.multinomial(probs, num_samples=1)
        else:
            next_indices = torch.argmax(probs, dim=-1)[:, None]

        input_ids = torch.cat([input_ids, next_indices], dim=1)

    # Only the first sequence of the batch is returned.
    return tokenizer.decode(input_ids[0].tolist())

In [17]:
print(generate(model=model, tokenizer=tokenizer, prompt="Harry Potter is", n_tokens_to_gen=20))

Harry Potter is a fantastic tool in making your day-to-day life a lot more interesting – and not the


In [None]:
# Export dynamic model (Beta)
# Builds a separate 1-layer model and exports it through
# torch.onnx.dynamo_export with dynamic shapes enabled.

args = ModelArgs(d_model=5, n_layer=1, vocab_size=50277)
model_dyn = Mamba(args)
model_dyn.eval()
export_name = "mamba_minimal_1_layer_dyn"

export_options = torch.onnx.ExportOptions(dynamic_shapes=True)
onnx_program = torch.onnx.dynamo_export(
    model_dyn,
    input_ids,
    export_options=export_options,
)
onnx_program.save(f"{export_name}.onnx")