In [1]:
import torch
from torch.testing import assert_close

import os

# Precision and device used for both the HF and the Nanotron model.
dtype = torch.bfloat16
device = torch.device("cuda")

# Environment variables describing a single-process run (world size 1, rank 0).
os.environ.update(
    {
        "WORLD_SIZE": "1",
        "RANK": "0",
        "MASTER_ADDR": "0.0.0.0",
        "MASTER_PORT": "6000",
    }
)

In [2]:
PATH_TO_LLAMA = "/mloscratch/homes/solergib/models/Meta-Llama-3-8B-Instruct"

In [3]:
from transformers import AutoModelForCausalLM
# Load the reference HF checkpoint in `dtype` with the flash_attention_2 attention
# implementation, then move it to `device`.
# NOTE(review): the recorded output warns that Flash Attention 2.0 is requested for a
# model not initialized on GPU; the trailing `.to(device)` is what moves it there.
hf_model = AutoModelForCausalLM.from_pretrained(PATH_TO_LLAMA, torch_dtype=dtype, attn_implementation="flash_attention_2").to(device)
# Debug helpers (uncomment to inspect the loaded model):
# print(hf_model)
# print(hf_model.config)
#print(hf_model.model.rotary_emb.ori_inv_freq.dtype)

  from .autonotebook import tqdm as notebook_tqdm
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 13.15it/s]


In [4]:
from transformers import LlamaConfig
# Read the HF config from the same checkpoint folder; its fields are used below to
# build the equivalent Nanotron config.
hf_config = LlamaConfig.from_pretrained(PATH_TO_LLAMA)
print(hf_config)

LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.0.dev0",
  "use_cache": true,
  "vocab_size": 128256
}



In [5]:
from nanotron.config import ParallelismArgs
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.engine import AllForwardAllBackwardPipelineEngine
from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode

# Data / pipeline / tensor parallel sizes: everything runs on a single rank.
DP, PP, TP = 1, 1, 1

parallel_config = ParallelismArgs(
    dp=DP,
    pp=PP,
    tp=TP,
    pp_engine=AllForwardAllBackwardPipelineEngine(),
    tp_mode=TensorParallelLinearMode.ALL_REDUCE,
    tp_linear_async_communication=False,
)

# Guard the two settings the rest of the notebook relies on.
assert parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE
assert parallel_config.tp_linear_async_communication is False

# The parallel context is sized from the values stored in the config object.
parallel_context = ParallelContext(
    tensor_parallel_size=parallel_config.tp,
    pipeline_parallel_size=parallel_config.pp,
    data_parallel_size=parallel_config.dp,
)

In [6]:
from nanotron.config.models_config import LlamaConfig as LlamaConfigNanotron

# Build the Nanotron config: most fields are copied verbatim from the HF config,
# while the remaining three are set explicitly.
nanotron_config = LlamaConfigNanotron(
    **{
        field_name: getattr(hf_config, field_name)
        for field_name in (
            "bos_token_id",
            "eos_token_id",
            "hidden_act",
            "hidden_size",
            "initializer_range",
            "intermediate_size",
            "max_position_embeddings",
            "num_attention_heads",
            "num_hidden_layers",
            "num_key_value_heads",
            "pretraining_tp",
            "rms_norm_eps",
            "rope_scaling",
            "rope_theta",
            "tie_word_embeddings",
            "use_cache",
            "vocab_size",
        )
    },
    # Values that are not read from the HF config.
    is_llama_config=True,
    pad_token_id=None,
    rope_interleaved=False,
)

In [7]:
from nanotron.models.llama_sft import LlamaForSFT
from nanotron.models import build_model

def _build_llama_for_sft():
    """Instantiate `LlamaForSFT` from the Nanotron config and parallel settings defined above."""
    return LlamaForSFT(
        config=nanotron_config,
        parallel_context=parallel_context,
        parallel_config=parallel_config,
        random_states=None,
    )


# `build_model` receives the builder callable plus the target dtype and device.
nanotron_model = build_model(
    model_builder=_build_llama_for_sft,
    parallel_context=parallel_context,
    dtype=dtype,
    device=device,
)
# print(nanotron_model)

torch.float32


In [8]:
from nanotron.trainer import mark_tied_parameters

# Let Nanotron mark the tied parameters of the freshly built model
# (see nanotron.trainer.mark_tied_parameters for which parameters it ties).
mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context)

In [9]:
# Inspect the sharding metadata attached to the token-embedding weight.
nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.get_sharded_info()

ShardedInfo(global_ranks=(0,), local_global_slices_pairs=(SlicesPair(local_slices=(slice(None, None, None), slice(None, None, None)), global_slices=(slice(0, 128256, None), slice(None, None, None))),), unsharded_shape=(128256, 4096))

In [10]:
# Check whether the token-embedding weight is flagged as a tied parameter.
nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.is_tied

False

In [11]:
# Final script
# TODO: add TP variables so the HF layer parameters can be split across tensor-parallel ranks
# TODO: load the HF model on CPU and copy from there


def _copy_weight(dst, src):
    """Copy `src` into `dst` in place after asserting that both tensors have the same shape.

    Must be called inside a `torch.no_grad()` scope, because `dst` is a model
    parameter that is modified in place.

    Raises:
        AssertionError: if the two shapes differ.
    """
    assert dst.shape == src.shape, f"shape mismatch: {tuple(dst.shape)} vs {tuple(src.shape)}"
    dst.copy_(src)


# A single no_grad scope covers the whole conversion. This also keeps the
# concatenations of the fused projections out of the autograd graph.
with torch.no_grad():
    # Token embeddings
    _copy_weight(
        nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight,
        hf_model.model.embed_tokens.weight,
    )

    # Decoder layers
    for i in range(nanotron_config.num_hidden_layers):
        hf_layer = hf_model.model.layers[i]
        nanotron_layer = nanotron_model.model.decoder[i].pp_block

        # Input layer norm
        _copy_weight(nanotron_layer.input_layernorm.weight, hf_layer.input_layernorm.weight)

        # Self attention
        ## QKV: Nanotron keeps one fused projection, so stack Q, K and V along dim 0
        _copy_weight(
            nanotron_layer.attn.qkv_proj.weight,
            torch.cat(
                [
                    hf_layer.self_attn.q_proj.weight,
                    hf_layer.self_attn.k_proj.weight,
                    hf_layer.self_attn.v_proj.weight,
                ],
                dim=0,
            ),
        )
        ## O
        _copy_weight(nanotron_layer.attn.o_proj.weight, hf_layer.self_attn.o_proj.weight)

        # MLP
        ## Gate Up Proj: fused as well, gate first and up second along dim 0
        _copy_weight(
            nanotron_layer.mlp.gate_up_proj.weight,
            torch.cat(
                [
                    hf_layer.mlp.gate_proj.weight,
                    hf_layer.mlp.up_proj.weight,
                ],
                dim=0,
            ),
        )
        ## Down Proj
        _copy_weight(nanotron_layer.mlp.down_proj.weight, hf_layer.mlp.down_proj.weight)

        # Post attention layer norm
        _copy_weight(
            nanotron_layer.post_attention_layernorm.weight,
            hf_layer.post_attention_layernorm.weight,
        )

    # Last layer norm
    _copy_weight(nanotron_model.model.final_layer_norm.pp_block.weight, hf_model.model.norm.weight)

    # LM head
    _copy_weight(nanotron_model.model.lm_head.pp_block.weight, hf_model.lm_head.weight)

In [12]:
"""
import importlib
import nanotron
importlib.reload(nanotron.data.chat_dataset)
importlib.reload(nanotron.data.collator)
"""

from nanotron.data.chat_dataset import ChatDataset
from nanotron.data.dataloader_builder import build_chat_dataloader

# Sequence length shared by the dataset and the dataloader; kept in one place so the
# two cannot drift apart.
SEQUENCE_LENGTH = 2048

# Chat dataset built from the SlimOrca conversations, tokenized with the Llama tokenizer.
train_dataset = ChatDataset(
    dataset_path="Open-Orca/SlimOrca",
    tokenizer_name_or_path=PATH_TO_LLAMA,
    sequence_length=SEQUENCE_LENGTH,
    train_on_completions_only=True,
    remove_cross_attention=True,
    split="train",
    conversation_column_name="conversations",
    dp_rank=parallel_context.dp_pg.rank(),
    dp_ranks_size=parallel_context.dp_pg.size(),
)

# Prepare dataloader
train_dataloader = build_chat_dataloader(
    dataset=train_dataset,
    sequence_length=SEQUENCE_LENGTH,
    parallel_context=parallel_context,
    input_pp_rank=0,
    output_pp_rank=0,
)

In [13]:
# Pull the first batch from the dataloader and display it.
batch = next(iter(train_dataloader))
batch

{'input_ids': tensor([[128000, 128006,  26380,  ...,  16686,     13, 128009]],
        dtype=torch.int32),
 'position_ids': tensor([[  0,   1,   2,  ..., 576, 577, 578]], dtype=torch.int32),
 'label_ids': tensor([[128006,  26380, 128007,  ...,     13, 128009, 128001]],
        dtype=torch.int32),
 'label_mask': tensor([[False, False, False,  ...,  True,  True,  True]])}

In [14]:
# Every tensor in the batch must have the same shape as `input_ids`.
for batch_key in ("label_ids", "position_ids", "label_mask"):
    assert batch["input_ids"].shape == batch[batch_key].shape

In [15]:
# TODO(tj.solergibert) Compare LlamaModel vs LlamaModel, not the causal-LM or SFT wrappers!
# TODO(tj.solergibert) OK, we were already doing that.
# TODO(tj.solergibert) Still need to review the LOSS: we will have to apply a reduction and use the PyTorch one
# TODO(tj.solergibert) to make sure everything is fine, AND ONLY THEN deal with the MASK.
# Switch both models to evaluation mode before comparing their outputs.
hf_model.eval()
nanotron_model.eval()

LlamaForSFT(
  (model): LlamaModel(
    (token_position_embeddings): PipelineBlock(
      pp_rank=0
      (pp_block): Embedding(
        (token_embedding): TensorParallelEmbedding(tp_rank=0, 128256, 4096, unsharded_num_embeddings=128256)
        (position_embedding): LlamaRotaryEmbedding()
      )
    )
    (decoder): ModuleList(
      (0-31): 32 x PipelineBlock(
        pp_rank=0
        (pp_block): LlamaDecoderLayer(
          (input_layernorm): TritonRMSNorm()
          (attn): CausalSelfAttention(
            (qkv_proj): TensorParallelColumnLinear(tp_rank=0, in_features=4096, out_features=6144, bias=False, unsharded_out_features=6144)
            (o_proj): TensorParallelRowLinear(tp_rank=0, in_features=4096, out_features=4096, bias=False, unsharded_in_features=4096)
          )
          (post_attention_layernorm): TritonRMSNorm()
          (mlp): MLP(
            (gate_up_proj): TensorParallelColumnLinear(tp_rank=0, in_features=4096, out_features=28672, bias=False, unsharded_out_f

## Layer-by-layer comparison (1 to 1)

In [16]:
# Move the two tensors fed to the models onto the GPU.
input_ids, position_ids = (batch[name].cuda() for name in ("input_ids", "position_ids"))

In [24]:
# Run only the Nanotron embedding block on the batch.
n_embedd = nanotron_model.model.token_position_embeddings(input_ids=input_ids, position_ids=position_ids)
# Rename the output key so the dict can be unpacked straight into a decoder layer call.
n_embedd["hidden_states"] = n_embedd.pop("input_embeds")

In [29]:
# HF counterpart: token embeddings first, then the rotary embeddings for the same positions.
hf_embedd = hf_model.model.embed_tokens(input_ids)
# Indexed below as [0] -> cos and [1] -> sin.
hf_position_embeddings = hf_model.model.rotary_emb(hf_embedd, position_ids)

In [30]:
# The Nanotron hidden states have their first two axes swapped relative to HF, hence the transpose.
assert_close(n_embedd["hidden_states"].transpose(0,1), hf_embedd) # NOTE(tj.solergibert) Embeddings are now equal!
# The rotary cos / sin tables must match as well.
assert_close(n_embedd["cos"], hf_position_embeddings[0])
assert_close(n_embedd["sin"], hf_position_embeddings[1])

In [36]:
# Feed the embedding outputs through the first Nanotron decoder layer only.
n_hidden_encoder_states = nanotron_model.model.decoder[0](**n_embedd)

tensor([   0,  164,  443,  935, 1208, 1318, 1897], device='cuda:0',
       dtype=torch.int32)
tensor(579, device='cuda:0', dtype=torch.int32)


In [37]:
# Display the dict returned by the first Nanotron decoder layer.
n_hidden_encoder_states

{'hidden_states': tensor([[[ 0.0014,  0.0040, -0.0050,  ...,  0.0093, -0.0007,  0.0005]],
 
         [[ 0.0065,  0.0144,  0.0079,  ..., -0.0157, -0.0422, -0.0073]],
 
         [[-0.0117, -0.0225,  0.0166,  ..., -0.0114, -0.0019,  0.0105]],
 
         ...,
 
         [[ 0.0205,  0.0003, -0.0043,  ..., -0.0337,  0.0027, -0.0114]],
 
         [[ 0.0017, -0.0008,  0.0084,  ...,  0.0054,  0.0016,  0.0060]],
 
         [[-0.0025, -0.0031, -0.0141,  ..., -0.0088,  0.0073,  0.0090]]],
        device='cuda:0', dtype=torch.bfloat16, grad_fn=<AddBackward0>),
 'position_ids': tensor([[  0,   1,   2,  ..., 576, 577, 578]], device='cuda:0',
        dtype=torch.int32),
 'cos': tensor([[[ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
          [ 0.5391,  0.6875,  0.7891,  ...,  1.0000,  1.0000,  1.0000],
          [-0.4160, -0.0583,  0.2412,  ...,  1.0000,  1.0000,  1.0000],
          ...,
          [-0.4629, -0.4336,  0.5078,  ...,  1.0000,  1.0000,  1.0000],
          [ 0.4941,  0.3574

In [38]:
# Same step on the HF side: first decoder layer, fed with the HF embeddings and rotary tables.
hf_hidden = hf_model.model.layers[0](hf_embedd, position_ids=position_ids, position_embeddings=hf_position_embeddings)

tensor(579, device='cuda:0', dtype=torch.int32)
tensor([   0,  164,  443,  935, 1208, 1318, 1897], device='cuda:0',
       dtype=torch.int32)


In [39]:
# Display the HF layer output (a tuple; element 0 holds the hidden states).
hf_hidden

(tensor([[[ 0.0014,  0.0040, -0.0050,  ...,  0.0093, -0.0007,  0.0005],
          [ 0.0064,  0.0146,  0.0078,  ..., -0.0157, -0.0425, -0.0073],
          [-0.0117, -0.0225,  0.0167,  ..., -0.0115, -0.0018,  0.0106],
          ...,
          [ 0.0205,  0.0004, -0.0043,  ..., -0.0334,  0.0027, -0.0114],
          [ 0.0017, -0.0008,  0.0084,  ...,  0.0054,  0.0016,  0.0061],
          [-0.0025, -0.0032, -0.0141,  ..., -0.0087,  0.0073,  0.0090]]],
        device='cuda:0', dtype=torch.bfloat16, grad_fn=<AddBackward0>),)

In [40]:
# Compare the first decoder layer outputs of both models.
# NOTE(review): with the default bfloat16 tolerances this check fails in the recorded
# output (greatest absolute difference 0.001953125) — consider passing explicit
# rtol/atol once an acceptable tolerance has been agreed on.
assert_close(n_hidden_encoder_states["hidden_states"].transpose(0,1), hf_hidden[0])

AssertionError: Tensor-likes are not close!

Mismatched elements: 1151415 / 7770112 (14.8%)
Greatest absolute difference: 0.001953125 at index (0, 442, 3824) (up to 1e-05 allowed)
Greatest relative difference: inf at index (0, 2, 2232) (up to 0.016 allowed)

In [59]:
# Nanotron first-layer hidden states, transposed so they can be read side by side with HF.
# NOTE(review): this cell's execution count (59) is out of sequence and the recorded
# values differ from the earlier display of the same variable — re-run after a kernel restart.
n_hidden_encoder_states["hidden_states"].transpose(0,1)

tensor([[[ 0.0014,  0.0040, -0.0050,  ...,  0.0093, -0.0007,  0.0005],
         [ 0.0060,  0.0125,  0.0074,  ..., -0.0181, -0.0356, -0.0070],
         [-0.0164, -0.0225,  0.0219,  ..., -0.0098, -0.0084,  0.0156],
         ...,
         [ 0.0121,  0.0106, -0.0149,  ..., -0.0229, -0.0056, -0.0021],
         [ 0.0065,  0.0256, -0.0107,  ..., -0.0027, -0.0085,  0.0192],
         [ 0.0025,  0.0199, -0.0267,  ..., -0.0056, -0.0045,  0.0182]]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<TransposeBackward0>)

In [60]:
# HF first-layer hidden states, for visual comparison with the cell above.
hf_hidden[0]

tensor([[[ 0.0014,  0.0040, -0.0050,  ...,  0.0093, -0.0007,  0.0005],
         [ 0.0064,  0.0146,  0.0078,  ..., -0.0157, -0.0425, -0.0073],
         [-0.0117, -0.0225,  0.0167,  ..., -0.0115, -0.0018,  0.0106],
         ...,
         [ 0.0205,  0.0004, -0.0043,  ..., -0.0334,  0.0027, -0.0114],
         [ 0.0017, -0.0008,  0.0084,  ...,  0.0054,  0.0016,  0.0061],
         [-0.0025, -0.0032, -0.0141,  ..., -0.0087,  0.0073,  0.0090]]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<AddBackward0>)

# Inference

In [41]:
# Full forward pass of the Nanotron model, with autograd disabled.
with torch.inference_mode():
    output_nanotron = nanotron_model.model(
        input_ids=batch["input_ids"].cuda(),
        position_ids=batch["position_ids"].cuda(),
    )

tensor([   0,  164,  443,  935, 1208, 1318, 1897], device='cuda:0',
       dtype=torch.int32)
tensor(579, device='cuda:0', dtype=torch.int32)
tensor([   0,  164,  443,  935, 1208, 1318, 1897], device='cuda:0',
       dtype=torch.int32)
tensor(579, device='cuda:0', dtype=torch.int32)
tensor([   0,  164,  443,  935, 1208, 1318, 1897], device='cuda:0',
       dtype=torch.int32)
tensor(579, device='cuda:0', dtype=torch.int32)
tensor([   0,  164,  443,  935, 1208, 1318, 1897], device='cuda:0',
       dtype=torch.int32)
tensor(579, device='cuda:0', dtype=torch.int32)
tensor([   0,  164,  443,  935, 1208, 1318, 1897], device='cuda:0',
       dtype=torch.int32)
tensor(579, device='cuda:0', dtype=torch.int32)
tensor([   0,  164,  443,  935, 1208, 1318, 1897], device='cuda:0',
       dtype=torch.int32)
tensor(579, device='cuda:0', dtype=torch.int32)
tensor([   0,  164,  443,  935, 1208, 1318, 1897], device='cuda:0',
       dtype=torch.int32)
tensor(579, device='cuda:0', dtype=torch.int32)
tensor

In [42]:
# Full forward pass of the HF reference model on the same inputs, with autograd disabled.
with torch.inference_mode():
    output_hf = hf_model(
        input_ids=batch["input_ids"].cuda(),
        position_ids=batch["position_ids"].cuda(),
    )

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


tensor(579, device='cuda:0', dtype=torch.int32)
tensor([   0,  164,  443,  935, 1208, 1318, 1897], device='cuda:0',
       dtype=torch.int32)
tensor(579, device='cuda:0', dtype=torch.int32)
tensor([   0,  164,  443,  935, 1208, 1318, 1897], device='cuda:0',
       dtype=torch.int32)
tensor(579, device='cuda:0', dtype=torch.int32)
tensor([   0,  164,  443,  935, 1208, 1318, 1897], device='cuda:0',
       dtype=torch.int32)
tensor(579, device='cuda:0', dtype=torch.int32)
tensor([   0,  164,  443,  935, 1208, 1318, 1897], device='cuda:0',
       dtype=torch.int32)
tensor(579, device='cuda:0', dtype=torch.int32)
tensor([   0,  164,  443,  935, 1208, 1318, 1897], device='cuda:0',
       dtype=torch.int32)
tensor(579, device='cuda:0', dtype=torch.int32)
tensor([   0,  164,  443,  935, 1208, 1318, 1897], device='cuda:0',
       dtype=torch.int32)
tensor(579, device='cuda:0', dtype=torch.int32)
tensor([   0,  164,  443,  935, 1208, 1318, 1897], device='cuda:0',
       dtype=torch.int32)
tensor

In [43]:
# HF logits at sequence position 0.
output_hf.logits[:,0,:]

tensor([[ 4.9688,  6.1562, 10.8750,  ..., -3.6406, -3.6406, -3.6406]],
       device='cuda:0')

In [44]:
# Nanotron logits at sequence position 0 (first two axes swapped to match the HF layout).
output_nanotron.transpose(0,1)[:,0,:]

tensor([[ 4.9375,  6.0938, 10.7500,  ..., -3.6719, -3.6719, -3.6719]],
       device='cuda:0')

In [45]:
from torch.testing import assert_close

# TODO(tj.solergibert) Careful: this test only covers position 0

# NOTE(review): even with rtol=atol=1e-1 the recorded run fails on 0.9% of the elements.
assert_close(output_hf.logits[:,0,:], output_nanotron.transpose(0,1)[:,0,:], rtol=1e-1, atol=1e-1)

AssertionError: Tensor-likes are not close!

Mismatched elements: 1143 / 128256 (0.9%)
Greatest absolute difference: 0.5859375 at index (0, 12592) (up to 0.1 allowed)
Greatest relative difference: 279.8438720703125 at index (0, 40526) (up to 0.1 allowed)

In [46]:
# Compare the logits at every position using assert_close's default tolerances.
# NOTE(review): this fails in the recorded output (89.4% of elements mismatched,
# greatest absolute difference 3.58984375); the defaults applied here are far
# tighter than the 1e-1 used for position 0 above.
assert_close(output_hf.logits, output_nanotron.transpose(0,1))

AssertionError: Tensor-likes are not close!

Mismatched elements: 217458927 / 243301632 (89.4%)
Greatest absolute difference: 3.58984375 at index (0, 373, 33435) (up to 1e-05 allowed)
Greatest relative difference: 1744897.0 at index (0, 1435, 64528) (up to 1.3e-06 allowed)

In [47]:
# Sequence position whose next-token distribution is inspected.
predicted_token = 34

# Softmax over the vocabulary at that position, then the 10 most likely tokens.
next_tokens_hf = torch.softmax(output_hf.logits[0, predicted_token, :], -1)
hf_topk_next_tokens = torch.topk(next_tokens_hf, 10)

print(
    "\n".join(
        f"[HF Model] Next token: {idx.item()}, probability: {prob}"
        for idx, prob in zip(hf_topk_next_tokens.indices, hf_topk_next_tokens.values)
    )
)

[HF Model] Next token: 11415, probability: 0.10412170737981796
[HF Model] Next token: 1523, probability: 0.04918361455202103
[HF Model] Next token: 47032, probability: 0.043404385447502136
[HF Model] Next token: 72514, probability: 0.03830423951148987
[HF Model] Next token: 3493, probability: 0.03830423951148987
[HF Model] Next token: 10477, probability: 0.03830423951148987
[HF Model] Next token: 16805, probability: 0.03175532445311546
[HF Model] Next token: 10552, probability: 0.026326090097427368
[HF Model] Next token: 7664, probability: 0.021825095638632774
[HF Model] Next token: 3041, probability: 0.018093638122081757


In [48]:
# Same inspection for the Nanotron model: softmax over the vocabulary at
# `predicted_token`, then the 10 most likely tokens.
next_tokens_nanotron = torch.softmax(output_nanotron.transpose(0,1)[0, predicted_token, :], -1)
nanotron_topk_next_tokens = torch.topk(next_tokens_nanotron, 10)

print(
    "\n".join(
        f"[Nanotron Model] Next token: {idx.item()}, probability: {prob}"
        for idx, prob in zip(nanotron_topk_next_tokens.indices, nanotron_topk_next_tokens.values)
    )
)

[Nanotron Model] Next token: 11415, probability: 0.10305546224117279
[Nanotron Model] Next token: 1523, probability: 0.048679955303668976
[Nanotron Model] Next token: 47032, probability: 0.04295990616083145
[Nanotron Model] Next token: 10477, probability: 0.04035709798336029
[Nanotron Model] Next token: 3493, probability: 0.04035709798336029
[Nanotron Model] Next token: 72514, probability: 0.03791198879480362
[Nanotron Model] Next token: 16805, probability: 0.031430136412382126
[Nanotron Model] Next token: 10552, probability: 0.027737000957131386
[Nanotron Model] Next token: 7664, probability: 0.02299478091299534
[Nanotron Model] Next token: 3041, probability: 0.017908351495862007


# Save the Nanotron model

In [97]:
from nanotron.parallel.parameters import sanity_check

# Run Nanotron's parameter sanity check on the converted model before saving it.
sanity_check(root_module=nanotron_model)

In [98]:
from pathlib import Path
from nanotron.serialize import save_meta, save_weights, TrainingMetadata
from nanotron.serialize.metadata import DataStageMetadata

# Destination folder of the converted checkpoint.
out_path = Path("/mloscratch/homes/solergib/converter/nanotron/n_c/first/")

# Write the model weights.
save_weights(model=nanotron_model, parallel_context=parallel_context, root_folder=out_path)

# Checkpoint metadata with all counters at zero and one placeholder data stage.
training_metadata = TrainingMetadata(
    last_train_step=0,
    consumed_train_samples=0,
    data_stages=[
        DataStageMetadata(name="Empty", consumed_train_samples=0, start_training_step=0),
    ],
)

save_meta(root_folder=out_path, parallel_context=parallel_context, training_metadata=training_metadata)


Saving weights: 100%|██████████| 195/195 [00:41<00:00,  4.67it/s]


In [99]:
import json 
import yaml
from nanotron.config import GeneralArgs, ModelArgs, TokenizerArgs, Config
from nanotron.config.models_config import ExistingCheckpointInit
from dataclasses import asdict

# Build both payloads before opening any file: opening with "w" creates or truncates
# the target, so a failure while constructing the config would otherwise leave an
# empty file behind.
config = Config(
    general=GeneralArgs(project="conversion", run="Llama3-8B"),
    parallelism=parallel_config,
    model=ModelArgs(
        init_method=ExistingCheckpointInit(out_path),
        model_config=nanotron_config,
    ),
    tokenizer=TokenizerArgs(PATH_TO_LLAMA),
)
config_dict = config.as_dict()
model_config_dict = asdict(nanotron_config)

# Full Nanotron config, in YAML.
with open(out_path / "config.yaml", "w") as f:
    print("Saving config ...")
    yaml.dump(config_dict, f)

# Model config only, in JSON.
with open(out_path / "model_config.json", "w") as f:
    print("Saving model config ...")
    json.dump(model_config_dict, f)

Saving config ...
Saving model config ...


In [11]:
import sys
sys.path.append("/mloscratch/homes/solergib/SFT/transformers")

import torch
from t_tests.models.llama.test_modeling_llama import LlamaModelTester

lmt = LlamaModelTester(parent=None)

_, inputs_dict = lmt.prepare_config_and_inputs_for_common()
dummy_attention_mask = inputs_dict["attention_mask"]
# Boolean view of the attention mask, reused for every selection below.
keep_mask = dummy_attention_mask.bool()
# Zero out the token ids at the masked-out positions.
inputs_dict["input_ids"][~keep_mask] = 0

# Keep only the attended positions of each input and restore a leading axis of size 1.
padfree_inputs_dict = {}
for k, v in inputs_dict.items():
    if k != "attention_mask":
        padfree_inputs_dict[k] = v[keep_mask].unsqueeze(0)

# Position ids restart from 0 for each sequence, whose length is the row sum of the mask.
sequence_lengths = dummy_attention_mask.sum(1).tolist()
per_sequence_positions = [torch.arange(length) for length in sequence_lengths]
padfree_inputs_dict["position_ids"] = torch.cat(per_sequence_positions).long().unsqueeze(0).to("cuda")

print(padfree_inputs_dict)



{'input_ids': tensor([[27, 22,  0, 97, 13, 49, 56, 35, 70, 91, 38, 30, 26, 94, 68, 46, 89, 32,
         70, 85, 50, 67, 70, 86, 66, 82, 18, 72, 27, 37, 91, 27, 60, 57, 23, 93,
         10, 80, 82, 26, 13, 50, 12, 68, 63, 85, 55,  1,  3, 61, 37, 70, 12, 97,
          1, 59, 90, 45, 74, 62, 66, 54, 94, 18, 54, 89, 49,  3, 66, 55]],
       device='cuda:0'), 'position_ids': tensor([[0, 0, 1, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5, 0, 1, 2,
         3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5,
         6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6]],
       device='cuda:0')}
