In [1]:
from transformers import pipeline, AutoModelForCausalLM, AutoConfig, AutoTokenizer
from src.models.modelling_llama_skip import LlamaSkipConnectionForCausalLM
from src.models.configuration_llama_skip import LlamaSkipConnectionConfig
from transformers.models.llama import LlamaForCausalLM
import torch

# Register the custom model and config
AutoConfig.register("llama-skip", LlamaSkipConnectionConfig)
AutoModelForCausalLM.register(LlamaSkipConnectionConfig, LlamaSkipConnectionForCausalLM)

device = torch.device("cpu")

# Load base model and tokenizer
model_id = "vkkhare/llama-skip"
checkpoint = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint,  trust_remote_code=True)

# Create custom config and model
config = LlamaSkipConnectionConfig.from_pretrained(model_id)

# Load model without device_map
model = LlamaSkipConnectionForCausalLM.from_pretrained(
    checkpoint, 
    config=config,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
).to(device)

# Move all masks to the correct device
for module in model.modules():
    if hasattr(module, 'mask'):
        module.mask = module.mask.to(device)
model.eval()


Using /home/azureuser/.cache/torch_extensions/py39_cu124 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/azureuser/.cache/torch_extensions/py39_cu124/sparse_mlp/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module sparse_mlp...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module sparse_mlp...


ninja: no work to do.


Some weights of LlamaSkipConnectionForCausalLM were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['model.layers.0.mlp.lora_gate_proj.0.weight', 'model.layers.0.mlp.lora_gate_proj.1.weight', 'model.layers.1.mlp.lora_gate_proj.0.weight', 'model.layers.1.mlp.lora_gate_proj.1.weight', 'model.layers.10.mlp.lora_gate_proj.0.weight', 'model.layers.10.mlp.lora_gate_proj.1.weight', 'model.layers.11.mlp.lora_gate_proj.0.weight', 'model.layers.11.mlp.lora_gate_proj.1.weight', 'model.layers.12.mlp.lora_gate_proj.0.weight', 'model.layers.12.mlp.lora_gate_proj.1.weight', 'model.layers.13.mlp.lora_gate_proj.0.weight', 'model.layers.13.mlp.lora_gate_proj.1.weight', 'model.layers.14.mlp.lora_gate_proj.0.weight', 'model.layers.14.mlp.lora_gate_proj.1.weight', 'model.layers.15.mlp.lora_gate_proj.0.weight', 'model.layers.15.mlp.lora_gate_proj.1.weight', 'model.layers.2.mlp.lora_gate_proj.0.weight', 'model.layers.2.mlp.lora_gate_proj.1.weight', 'm

In [2]:
model.push_to_hub("vkkhare/llama-skip")
config.push_to_hub("vkkhare/llama-skip")

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/vkkhare/llama-skip/commit/390f414e73e4f7631af1f1b41410040aa46d2187', commit_message='Upload config', commit_description='', oid='390f414e73e4f7631af1f1b41410040aa46d2187', pr_url=None, repo_url=RepoUrl('https://huggingface.co/vkkhare/llama-skip', endpoint='https://huggingface.co', repo_type='model', repo_id='vkkhare/llama-skip'), pr_revision=None, pr_num=None)

In [3]:
# Generate text
sequence = "Give recipe of burrito including all the ingredients and their quantity."
inputs = tokenizer(
    sequence, 
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=512
)

# Explicitly move all input tensors to the same device as model
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

# Debug prints
print(f"Model device: {next(model.parameters()).device}")
print(f"Input IDs device: {input_ids.device}")
print(f"Attention Mask device: {attention_mask.device}")

LlamaSkipConnectionForCausalLM(
  (model): LlamaSkipConnectionModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaSkipDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaSkipMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
          (lora_gate_proj): Sequential(
            (0): Linear(in_features=2048, out_features=1638, bias=False)
            (1)

In [None]:
with torch.no_grad():
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=10,
        temperature=0.7,
        top_p=0.9,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

In [4]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens = 1000,
    eos_token_id=tokenizer.eos_token_id
)
# messages = [
#     {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
#     {"role": "user", "content": "Who are you?"},
# ]
out = pipe.model.generate(input["input_ids"], max_length=20)
tokenizer.decode(out[0])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


: 

In [5]:
standardPipe = pipeline(
    "text-generation",
    model=checkpoint,
    tokenizer=tokenizer,
    max_new_tokens = 1000,
    eos_token_id=tokenizer.eos_token_id
)

# max_length needs to be set as by default it is 20
out = standardPipe.model.generate(input["input_ids"], max_length=20)
tokenizer.decode(skip_special_tokens=True, token_ids=out[0])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'Give recipe of burrito including all the ingredients and their quantity. \nHere are the ingredients required'

In [7]:
import time

start1 = time.time()
for i in range(15):
    out = standardPipe.model.forward(input["input_ids"], use_cache=False)

start2 = time.time()
print("Time taken for 100 inferences.")
print("Standard pipeline time: ", start2 - start1)
for i in range(15):
    out = pipe.model.forward(input["input_ids"],use_cache=False)


start3 = time.time()

print("Pipeline time: ", start3 - start2)


Time taken for 100 inferences.
Standard pipeline time:  14.608644485473633
Pipeline time:  7.280247211456299


In [5]:
from src.models.configuration_llama_skip import LlamaOnnxConfig
from optimum.exporters.onnx import main_export

onnx_config = LlamaOnnxConfig(
    config=config,
    task="text-generation",
    use_past_in_inputs=False
)
onnx_config_with_past = LlamaOnnxConfig(config, task="text-generation", use_past=True)

def get_submodels(model):
    return {
        "decoder_model": model.model,
        "decoder_with_past_model": model.model
    }

custom_onnx_configs = {
    "decoder_model": onnx_config,
    "decoder_with_past_model": onnx_config_with_past,
}

main_export(
    model_id,
    output="mpt_onnx", 
    task="text-generation-with-past",
    trust_remote_code=True,
    custom_onnx_configs=custom_onnx_configs,
    fn_get_submodels=get_submodels,
    no_post_process=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

ONNX custom configs for: decoder_model, decoder_with_past_model
Submodels to export: model


ValueError: Trying to export a custom model, but could not find as many custom ONNX configs as the number of submodels to export. Please specifiy the fn_get_submodels argument, that should return a dictionary of submodules with as many items as the provided custom_export_configs dictionary.

In [None]:
# ! pip install executorch==0.5.0.dev20241016+cpu --extra-index-url https://download.pytorch.org/whl/nightly/cpu
import torch
from torch.export import export
from executorch.exir import to_edge, EdgeCompileConfig, to_edge_transform_and_lower
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.runtime import Runtime

# Get the operators supported by executorch
runtime = Runtime.get()
model = llamaSkipModel.eval() # turn into evaluation mode

exported_graph = torch.export.export(model.eval(), args=(input["input_ids"], input["attention_mask"]), strict=False) # Core Aten graph
print("torch.export.export done")
# Using compile_config=EdgeCompileConfig(_check_ir_validity=False) moves to next but then fails therat edge.to_backend()
edge_delegated = to_edge_transform_and_lower(exported_graph, partitioner=[XnnpackPartitioner()]) # Edge Dialect -- Failing here with torch._ops.aten.sym_constrain_range_for_size.default is not Aten Canonical
print("to_edge_transform_and_lower done")
executorch_program = edge_delegated.to_executorch() # ExecuTorch program
print("to_executorch done")
pte_path = "/home/azureuser/weight_caching/checkpoints/llamaskipmodel/llama_skip_model.pte"

with open(pte_path, "wb") as file:
    executorch_program.write_to_file(file) # Serializing into .pte file
print("File created")
program = runtime.load_program(pte_path)
print("load_program successfull")
method = program.load_method("forward")
print("load_method done")
output = method.execute([input])
print("method.execute done")

In [14]:
import torch
torch.save(pipe.model, "skipllama.pt")

In [15]:
pipe.model.save_pretrained("onnx/")

In [21]:
pipe.model.push_to_hub("llama-skip",private=True)

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/vkkhare/llama-skip/commit/23c1b3fdcf17cdb28e0752dd890672427a481eca', commit_message='Upload LlamaSkipConnectionForCausalLM', commit_description='', oid='23c1b3fdcf17cdb28e0752dd890672427a481eca', pr_url=None, repo_url=RepoUrl('https://huggingface.co/vkkhare/llama-skip', endpoint='https://huggingface.co', repo_type='model', repo_id='vkkhare/llama-skip'), pr_revision=None, pr_num=None)

In [5]:
pipe.tokenizer.save_pretrained("onnx/")

('onnx/tokenizer_config.json',
 'onnx/special_tokens_map.json',
 'onnx/tokenizer.json')

In [None]:
# Tokens/second benchmarking options
# 1. llama-bench
# 2. tiktoken
# 3. nvidia genai-perf

# Time taken for 15 inferences.
# Standard pipeline time:  14.072068691253662
# Pipeline time:  5.465141773223877 (2.5X)

# Time taken for 100 inferences.
# Standard pipeline time:  67.88692212104797
# Pipeline time:  56.4792857170105 (1.2X)

# Time taken for 1000 inferences.
# Standard pipeline time:  694.2122750282288
# Pipeline time:  545.006334066391 (1.2X)