## (1) Load model

In [None]:
from model import Mamba, ModelArgs
from transformers import AutoTokenizer
import torch
import onnxruntime as ort
import numpy as np

In [None]:
# One of:
#     'state-spaces/mamba-2.8b-slimpj'
#     'state-spaces/mamba-2.8b'
#     'state-spaces/mamba-1.4b'
#     'state-spaces/mamba-790m'
#     'state-spaces/mamba-370m'
#     'state-spaces/mamba-130m'
pretrained_model_name = 'state-spaces/mamba-130m'
dummy_input = "test"

model = Mamba.from_pretrained(pretrained_model_name)
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
input_ids = tokenizer(dummy_input, return_tensors='pt').input_ids

In [None]:
#Export model
export_name = "mamba_model"

torch.onnx.export(
    model,
    input_ids,  
    f"{export_name}.onnx",
    export_params=True,
    do_constant_folding=True,
    input_names=['input_ids'],
    output_names=['output'],
    dynamic_axes={
        'input_ids': {0: 'batch_size', 1: 'seq_length'},  
        'output': {0: 'batch_size', 1: 'seq_length'}
    }
)
torch.save(model, f"{export_name}.pt")

In [6]:
# Export minimal model

args = ModelArgs(
    d_model=5,
    n_layer=1,
    vocab_size=50277
)
model_1 = Mamba(args)
model_1.eval()
export_name = "mamba_minimal_1_layer"

torch.save(model_1, f"{export_name}.pt")

torch.onnx.export(
    model_1,
    input_ids,  
    f"{export_name}.onnx",
    export_params=True,
    do_constant_folding=True,
    input_names=['input_ids'],
    output_names=['output'],
    dynamic_axes={
        'input_ids': {0: 'batch_size', 1: 'seq_length'},  
        'output': {0: 'batch_size', 1: 'seq_length'}
    }
)

In [None]:
ort_session = ort.InferenceSession('mamba_minimal_1_layer.onnx')

# Generate a model dummy input
dummy_prompt_1 = "Harry test ciao bla bla"  
tokens_1 = tokenizer(dummy_prompt_1, return_tensors="pt")
input_ids_1 = tokens_1.input_ids.to(device="cpu")
input_ids_np = np.array(input_ids_1)
print(input_ids_np.shape)

# Inference
inputs = {ort_session.get_inputs()[0].name: input_ids_np}
out = ort_session.run(None, inputs)

# Output
print(input_ids_np.shape)
out


## Export to Keras

In [None]:
import nobuco
from nobuco import ChannelOrder, ChannelOrderingStrategy
import torch.nn.functional as F
import tensorflow as tf

In [None]:
location_link, source_code = nobuco.locate_converter(F.softplus)
print('Converter location:')
print(location_link)
print('Converter source code:')
print(source_code)

In [32]:
@nobuco.converter(F.softplus, channel_ordering_strategy=ChannelOrderingStrategy.MINIMUM_TRANSPOSITIONS)
def softplus(input: torch.Tensor):
    print(input)
    return lambda input: tf.keras.activations.softplus(input)

In [33]:
args = ModelArgs(
    d_model=5,
    n_layer=1,
    vocab_size=50277
)
model_1 = Mamba(args)
model_1.eval()
export_name = "mamba_minimal_1_layer"

keras_model = nobuco.pytorch_to_keras(
    model_1,
    args=[input_ids], kwargs=None,
    input_shapes={input_ids: (None, None)}, # Annotate dynamic axes with None
    inputs_channel_order=ChannelOrder.TENSORFLOW,
    outputs_channel_order=ChannelOrder.TENSORFLOW,
    constants_to_variables=False,
    trace_shape=True,
    # save_trace_html=True
)
keras_model.save(f'{export_name}.h5')

tensor([[[-0.8311, -1.0473,  0.0900,  0.1249,  0.7722, -0.5626, -0.0159,
          -0.8217, -0.1600,  0.6595]]])
Legend:
    [32mGreen[0m — conversion successful
    [33mYellow[0m — conversion imprecise
    [31mRed[0m — conversion failed
    [31m[7mRed[0m — no converter found
    [0m[1mBold[0m — conversion applied directly
    * — subgraph reused
    [7mTensor[0m — this output is not dependent on any of subgraph's input tensors
    [4mTensor[0m — this input is a parameter / constant
    [90mTensor[0m — this tensor is useless

[32mMamba[model][0m(int64_0<1,1>[0m) -> float32_87<1,1,50280>[0m
[32m │ [0m [32mEmbedding[torch.nn.modules.sparse][0m(int64_0<1,1>[0m) -> float32_2<1,1,5>[0m
[32m │ [0m [32m └·[0m [32m[1membedding[torch.nn.functional][0m(int64_0<1,1>[0m, [4mfloat32_1<50280,5>[0m, None, None, 2.0, False, False) -> float32_2<1,1,5>[0m
[32m │ [0m [32mResidualBlock[model][0m(float32_2<1,1,5>[0m) -> float32_79<1,1,5>[0m
[32m │ [0m [32m │ 

In [None]:
import tensorflow as tf
import numpy as np

# prompt
dummy_prompt_keras = "Harry"
input_ids_keras = tokenizer(dummy_prompt_keras, return_tensors='tf').input_ids  # Usa 'tf' per TensorFlow

# loading model
keras_model = tf.keras.models.load_model(f'{export_name}.h5')
#keras_model.summary()

# inference
out = keras_model.predict(input_ids_keras)

# output
print(out)


## (2) Generate Text

In [6]:
import torch
import torch.nn.functional as F


def generate(model,
             tokenizer,
             prompt: str,
             n_tokens_to_gen: int = 50,
             sample: bool = True,
             top_k: int = 40):
    model.eval()
    
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids
    
    for token_n in range(n_tokens_to_gen):
        with torch.no_grad():
            indices_to_input = input_ids
            next_token_logits = model(indices_to_input)[:, -1]
        
        probs = F.softmax(next_token_logits, dim=-1)
        (batch, vocab_size) = probs.shape
        
        if top_k is not None:
            (values, indices) = torch.topk(probs, k=top_k)
            probs[probs < values[:, -1, None]] = 0
            probs = probs / probs.sum(axis=1, keepdims=True)
        
        if sample:
            next_indices = torch.multinomial(probs, num_samples=1)
        else:
            next_indices = torch.argmax(probs, dim=-1)[:, None]
        
        input_ids = torch.cat([input_ids, next_indices], dim=1)

    output_completions = [tokenizer.decode(output.tolist()) for output in input_ids][0]
    
    return output_completions

In [7]:
print(generate(model, tokenizer, 'Mamba is the'))

Mamba is the only one of the four dragons to be discovered by humans, so he knows that the world is the enemy and he tries to protect them. He is the only one to find out what the truth of the world is and therefore is sent to the dragon


In [9]:
print(generate(model, tokenizer, 'John: Hi!\nSally:'))

John: Hi!
Sally: Hey!
John: So, when's the wedding?
Sally: We haven't decided.
John: It's in September.
Sally: Yeah, we were thinking July or
August.
John: I'm not too


In [8]:
print(generate(model, tokenizer, 'The meaning of life is '))

The meaning of life is 
just this: It is the best you can do.

--K.J.

And finally: How to handle your emotions. 

<|endoftext|>Q:

Error creating an EntityManager instance in JavaEE 7

This is


In [18]:
print(generate(model, tokenizer, 'Harry Potter'))

Harry Potter fan at me! I am sure it’ll go down very well!!

It is so exciting and I have my doubts about one thing. I would love to see the film but I think it will be over before it begins.<|endoftext|>/*
