In [1]:
import torch
from torch.testing import assert_close

import os

# Precision and device used for both models in this notebook.
dtype = torch.bfloat16
device = torch.device("cuda")

# Distributed-launch environment variables for a single process
# (world size 1, rank 0, fixed master address and port).
os.environ.update(
    {
        "WORLD_SIZE": "1",
        "RANK": "0",
        "MASTER_ADDR": "0.0.0.0",
        "MASTER_PORT": "6000",
    }
)

In [2]:
from transformers import AutoModelForCausalLM
# Local directory holding the Hugging Face Llama-3-8B-Instruct checkpoint.
PATH_TO_LLAMA = "/mloscratch/homes/solergib/models/Meta-Llama-3-8B-Instruct"

# Load the reference HF model in `dtype` with the FlashAttention-2 attention
# implementation, then move it to `device`.
hf_model = AutoModelForCausalLM.from_pretrained(
    PATH_TO_LLAMA,
    torch_dtype=dtype,
    attn_implementation="flash_attention_2",
)
hf_model = hf_model.to(device)
# Uncomment to inspect the module tree or its configuration:
# print(hf_model)
# print(hf_model.config)

  from .autonotebook import tqdm as notebook_tqdm
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 13.70it/s]


In [3]:
from transformers import LlamaConfig
# Load the Llama configuration stored in the checkpoint directory and print it;
# its fields are reused further down to build the Nanotron config.
hf_config = LlamaConfig.from_pretrained(PATH_TO_LLAMA)
print(hf_config)

LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.0.dev0",
  "use_cache": true,
  "vocab_size": 128256
}



In [4]:
from nanotron.config import ParallelismArgs
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.engine import AllForwardAllBackwardPipelineEngine
from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode

# Data-, pipeline- and tensor-parallel degrees (all 1: single process).
DP, PP, TP = 1, 1, 1

parallel_config = ParallelismArgs(
    dp=DP,
    pp=PP,
    tp=TP,
    pp_engine=AllForwardAllBackwardPipelineEngine(),
    tp_mode=TensorParallelLinearMode.ALL_REDUCE,
    tp_linear_async_communication=False,
)
# Sanity check: the constructed config still carries the requested TP mode
# and has asynchronous TP communication disabled.
assert parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE
assert parallel_config.tp_linear_async_communication is False

# Process-group context built from the sizes stored in the config.
parallel_context = ParallelContext(
    tensor_parallel_size=parallel_config.tp,
    pipeline_parallel_size=parallel_config.pp,
    data_parallel_size=parallel_config.dp,
)

In [5]:
from nanotron.config.models_config import LlamaConfig as LlamaConfigNanotron

# Fields whose values are taken unchanged from the HF config.
_FIELDS_COPIED_FROM_HF = (
    "bos_token_id",
    "eos_token_id",
    "hidden_act",
    "hidden_size",
    "initializer_range",
    "intermediate_size",
    "max_position_embeddings",
    "num_attention_heads",
    "num_hidden_layers",
    "num_key_value_heads",
    "pretraining_tp",
    "rms_norm_eps",
    "rope_scaling",
    "rope_theta",
    "tie_word_embeddings",
    "use_cache",
    "vocab_size",
)

# Nanotron config = HF values for the shared fields + three fixed,
# Nanotron-specific settings.
nanotron_config = LlamaConfigNanotron(
    **{field: getattr(hf_config, field) for field in _FIELDS_COPIED_FROM_HF},
    is_llama_config=True,
    pad_token_id=None,
    rope_interleaved=False,
)

In [6]:
from nanotron.models.llama_sft import LlamaForSFT
from nanotron.models import build_model

def _build_llama_for_sft():
    """Instantiate LlamaForSFT from the notebook-level config and parallel context."""
    return LlamaForSFT(
        config=nanotron_config,
        parallel_context=parallel_context,
        parallel_config=parallel_config,
        random_states=None,
    )


# Let nanotron's build_model construct the model with the requested dtype/device.
nanotron_model = build_model(
    model_builder=_build_llama_for_sft,
    parallel_context=parallel_context,
    dtype=dtype,
    device=device,
)
# print(nanotron_model)
# print(nanotron_model)

In [7]:
from nanotron.trainer import mark_tied_parameters

# Have nanotron mark the model's tied parameters (the exact criteria live in
# nanotron.trainer.mark_tied_parameters — not visible from this notebook).
mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context)

In [8]:
# Show the sharding metadata attached to the token-embedding weight.
token_embedding = nanotron_model.model.token_position_embeddings.pp_block.token_embedding
token_embedding.weight.get_sharded_info()

ShardedInfo(global_ranks=(0,), local_global_slices_pairs=(SlicesPair(local_slices=(slice(None, None, None), slice(None, None, None)), global_slices=(slice(0, 128256, None), slice(None, None, None))),), unsharded_shape=(128256, 4096))

In [9]:
# Show whether the token-embedding weight is flagged as a tied parameter.
token_embedding_weight = nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight
token_embedding_weight.is_tied

False

In [10]:
# Final script
# TODO: Add TP variables to split the parameters of the HF layers
# TODO: Load the HF model on CPU and copy from there


@torch.no_grad()
def _copy_hf_weight(nanotron_param, hf_weight, name):
    """Copy ``hf_weight`` into ``nanotron_param`` in place.

    Args:
        nanotron_param: Destination tensor/parameter of the Nanotron model.
        hf_weight: Source tensor taken (or concatenated) from the HF model.
        name: Human-readable parameter name; used only in the error message.

    Raises:
        AssertionError: If the two tensors do not have the same shape.
    """
    assert nanotron_param.shape == hf_weight.shape, (
        f"Shape mismatch for {name}: "
        f"Nanotron {tuple(nanotron_param.shape)} vs HF {tuple(hf_weight.shape)}"
    )
    # The decorator disables grad tracking, so this in-place write is allowed
    # on parameters and is not recorded by autograd.
    nanotron_param.copy_(hf_weight)


hf_backbone = hf_model.model
nanotron_backbone = nanotron_model.model

# Token embeddings
_copy_hf_weight(
    nanotron_backbone.token_position_embeddings.pp_block.token_embedding.weight,
    hf_backbone.embed_tokens.weight,
    "token_embedding",
)

# Decoder layers
for i in range(nanotron_config.num_hidden_layers):
    hf_layer = hf_backbone.layers[i]
    nanotron_block = nanotron_backbone.decoder[i].pp_block

    # Input layer norm
    _copy_hf_weight(
        nanotron_block.input_layernorm.weight,
        hf_layer.input_layernorm.weight,
        f"decoder[{i}].input_layernorm",
    )

    # Self attention
    ## QKV: Nanotron keeps one fused projection, so stack Q, K, V along dim 0.
    ## Concatenate under no_grad so no autograd history is built for the copy.
    with torch.no_grad():
        fused_qkv = torch.cat(
            [
                hf_layer.self_attn.q_proj.weight,
                hf_layer.self_attn.k_proj.weight,
                hf_layer.self_attn.v_proj.weight,
            ],
            dim=0,
        )
    _copy_hf_weight(nanotron_block.attn.qkv_proj.weight, fused_qkv, f"decoder[{i}].attn.qkv_proj")

    ## O
    _copy_hf_weight(
        nanotron_block.attn.o_proj.weight,
        hf_layer.self_attn.o_proj.weight,
        f"decoder[{i}].attn.o_proj",
    )

    # MLP
    ## Gate Up Proj: fused as well, gate first and up second along dim 0.
    with torch.no_grad():
        fused_gate_up = torch.cat(
            [
                hf_layer.mlp.gate_proj.weight,
                hf_layer.mlp.up_proj.weight,
            ],
            dim=0,
        )
    _copy_hf_weight(nanotron_block.mlp.gate_up_proj.weight, fused_gate_up, f"decoder[{i}].mlp.gate_up_proj")

    ## Down Proj
    _copy_hf_weight(
        nanotron_block.mlp.down_proj.weight,
        hf_layer.mlp.down_proj.weight,
        f"decoder[{i}].mlp.down_proj",
    )

    # Post attn layer norm
    _copy_hf_weight(
        nanotron_block.post_attention_layernorm.weight,
        hf_layer.post_attention_layernorm.weight,
        f"decoder[{i}].post_attention_layernorm",
    )

# Last layer norm
_copy_hf_weight(nanotron_backbone.final_layer_norm.pp_block.weight, hf_backbone.norm.weight, "final_layer_norm")

# LM_Head
_copy_hf_weight(nanotron_backbone.lm_head.pp_block.weight, hf_model.lm_head.weight, "lm_head")

In [11]:
from nanotron.data.chat_dataset import ChatDataset
from nanotron.data.dataloader_builder import build_chat_dataloader

# One sequence length shared by the dataset and the dataloader.
SEQUENCE_LENGTH = 2048

# Data-parallel process group: its rank/size are handed to the dataset.
dp_group = parallel_context.dp_pg

train_dataset = ChatDataset(
    dataset_path="Open-Orca/SlimOrca",
    tokenizer_name_or_path=PATH_TO_LLAMA,
    sequence_length=SEQUENCE_LENGTH,
    train_on_completions_only=True,
    remove_cross_attention=True,
    split="train",
    conversation_column_name="conversations",
    dp_rank=dp_group.rank(),
    dp_ranks_size=dp_group.size(),
)

# Prepare dataloader (pipeline rank 0 is both the input and the output stage)
train_dataloader = build_chat_dataloader(
    dataset=train_dataset,
    sequence_length=SEQUENCE_LENGTH,
    parallel_context=parallel_context,
    input_pp_rank=0,
    output_pp_rank=0,
)

In [12]:
# Take the first batch produced by the chat dataloader.
batch = next(iter(train_dataloader))

In [31]:
# Display the last 150 positions of input_ids (the captured output shows the
# id 128009 repeated — presumably padding; confirm against the tokenizer).
batch["input_ids"][:, -150:]

tensor([[128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,
         128009, 128009, 128

In [32]:
# Display input_ids without the last 150 positions; this is the same slice
# that is fed to both models in the forward-pass cells.
batch["input_ids"][:, :-150]

tensor([[128000, 128006,  26380,  ...,     13, 128009, 128001]],
       dtype=torch.int32)

In [14]:
# Put both models in evaluation mode before comparing their outputs.
# The trailing expression makes the notebook display the HF module repr.
nanotron_model.eval()
hf_model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaFlashAttention2(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
    (rotary_e

In [36]:
# Forward pass through the Nanotron backbone on the batch with its last
# 150 positions dropped; inputs are moved to the GPU first.
with torch.no_grad():
    trimmed_input_ids = batch["input_ids"][:, :-150].cuda()
    trimmed_position_ids = batch["position_ids"][:, :-150].cuda()
    output_nanotron = nanotron_model.model(
        input_ids=trimmed_input_ids,
        position_ids=trimmed_position_ids,
    )

In [37]:
# Forward pass through the HF reference model on the same trimmed inputs
# (last 150 positions dropped), moved to the GPU first.
with torch.no_grad():
    trimmed_input_ids = batch["input_ids"][:, :-150].cuda()
    trimmed_position_ids = batch["position_ids"][:, :-150].cuda()
    output_hf = hf_model(
        input_ids=trimmed_input_ids,
        position_ids=trimmed_position_ids,
    )

PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE
PEPEPEPEPE


In [38]:
from torch.testing import assert_close

# Swap the first two dimensions of the Nanotron output so its layout lines up
# with the HF logits, then compare element-wise using assert_close's defaults.
nanotron_logits = output_nanotron.transpose(0, 1)
assert_close(output_hf.logits, nanotron_logits)

AssertionError: Tensor-likes are not close!

Mismatched elements: 243083431 / 243429888 (99.9%)
Greatest absolute difference: 46.65625 at index (0, 1125, 22) (up to 1e-05 allowed)
Greatest relative difference: 74448896.0 at index (0, 715, 31230) (up to 1.3e-06 allowed)

In [39]:
# Index (along the second logits dimension) whose next-token distribution is inspected.
predicted_token = 34

# Softmax over the last dimension, then the 10 highest-probability token ids.
next_tokens_hf = output_hf.logits[0, predicted_token, :].softmax(dim=-1)
hf_topk_next_tokens = next_tokens_hf.topk(10)

hf_report_lines = [
    f"[HF Model] Next token: {idx.item()}, probability: {prob}"
    for idx, prob in zip(hf_topk_next_tokens.indices, hf_topk_next_tokens.values)
]
print("\n".join(hf_report_lines))

[HF Model] Next token: 11415, probability: 0.10412170737981796
[HF Model] Next token: 1523, probability: 0.04918361455202103
[HF Model] Next token: 47032, probability: 0.043404385447502136
[HF Model] Next token: 72514, probability: 0.03830423951148987
[HF Model] Next token: 3493, probability: 0.03830423951148987
[HF Model] Next token: 10477, probability: 0.03830423951148987
[HF Model] Next token: 16805, probability: 0.03175532445311546
[HF Model] Next token: 10552, probability: 0.026326090097427368
[HF Model] Next token: 7664, probability: 0.021825095638632774
[HF Model] Next token: 3041, probability: 0.018093638122081757


In [40]:
# Same inspection for the Nanotron output: swap the first two dimensions,
# pick the `predicted_token` position, softmax over the last dimension,
# then keep the 10 highest-probability token ids.
nanotron_logits_at_position = output_nanotron.transpose(0, 1)[0, predicted_token, :]
next_tokens_nanotron = nanotron_logits_at_position.softmax(dim=-1)
nanotron_topk_next_tokens = next_tokens_nanotron.topk(10)

nanotron_report_lines = [
    f"[Nanotron Model] Next token: {idx.item()}, probability: {prob}"
    for idx, prob in zip(nanotron_topk_next_tokens.indices, nanotron_topk_next_tokens.values)
]
print("\n".join(nanotron_report_lines))

[Nanotron Model] Next token: 220, probability: 0.0804644376039505
[Nanotron Model] Next token: 994, probability: 0.029601214453577995
[Nanotron Model] Next token: 3639, probability: 0.02612297795712948
[Nanotron Model] Next token: 656, probability: 0.024540266022086143
[Nanotron Model] Next token: 279, probability: 0.024540266022086143
[Nanotron Model] Next token: 3277, probability: 0.021656708791851997
[Nanotron Model] Next token: 264, probability: 0.013982621021568775
[Nanotron Model] Next token: 1148, probability: 0.01022990420460701
[Nanotron Model] Next token: 507, probability: 0.01022990420460701
[Nanotron Model] Next token: 323, probability: 0.01022990420460701


Save the Nanotron model

In [97]:
from nanotron.parallel.parameters import sanity_check

# Run nanotron's parameter sanity check on the converted model before saving
# (what exactly is validated is defined in nanotron.parallel.parameters).
sanity_check(root_module=nanotron_model)

In [98]:
from pathlib import Path
from nanotron.serialize import save_meta, save_weights, TrainingMetadata
from nanotron.serialize.metadata import DataStageMetadata

# Destination directory of the converted Nanotron checkpoint.
out_path = Path("/mloscratch/homes/solergib/converter/nanotron/n_c/first/")

save_weights(model=nanotron_model, parallel_context=parallel_context, root_folder=out_path)

# Metadata describing a checkpoint with no training progress:
# step 0, zero consumed samples, and a single placeholder data stage.
empty_data_stage = DataStageMetadata(name="Empty", consumed_train_samples=0, start_training_step=0)
training_metadata = TrainingMetadata(
    last_train_step=0,
    consumed_train_samples=0,
    data_stages=[empty_data_stage],
)

save_meta(root_folder=out_path, parallel_context=parallel_context, training_metadata=training_metadata)


Saving weights: 100%|██████████| 195/195 [00:41<00:00,  4.67it/s]


In [99]:
import json 
import yaml
from nanotron.config import GeneralArgs, ModelArgs, TokenizerArgs, Config
from nanotron.config.models_config import ExistingCheckpointInit
from dataclasses import asdict

# Build and serialize each document BEFORE opening its target file.
# open(..., "w") truncates the file immediately, so constructing or
# serializing inside the `with` block would leave an empty or partial file
# behind if Config(...) or the dump raised.
config = Config(
    general=GeneralArgs(project="conversion", run="Llama3-8B"),
    parallelism=parallel_config,
    model=ModelArgs(
        # Point the model init at the checkpoint directory written above.
        init_method=ExistingCheckpointInit(out_path),
        model_config=nanotron_config,
    ),
    tokenizer=TokenizerArgs(PATH_TO_LLAMA),
)
# yaml.dump without a stream returns the YAML document as a string.
config_yaml = yaml.dump(config.as_dict())
with open(out_path / "config.yaml", "w") as f:
    print("Saving config ...")
    f.write(config_yaml)

# Same approach for the standalone model config: serialize first, then write.
model_config_json = json.dumps(asdict(nanotron_config))
with open(out_path / "model_config.json", "w") as f:
    print("Saving model config ...")
    f.write(model_config_json)

Saving config ...
Saving model config ...


In [11]:
import sys

# Make the local transformers checkout importable so its test helpers can be used.
sys.path.append("/mloscratch/homes/solergib/SFT/transformers")

import torch
from t_tests.models.llama.test_modeling_llama import LlamaModelTester

# Ask the HF Llama test helper for a set of dummy inputs.
lmt = LlamaModelTester(parent=None)
_, inputs_dict = lmt.prepare_config_and_inputs_for_common()

# Boolean view of the attention mask: True where a position is kept.
dummy_attention_mask = inputs_dict["attention_mask"]
keep_mask = dummy_attention_mask.bool()

# Zero out the token ids at masked-out positions (in place).
inputs_dict["input_ids"][~keep_mask] = 0

# Flatten every input except the mask itself down to the kept positions,
# with a leading dimension of size 1.
padfree_inputs_dict = {}
for key, value in inputs_dict.items():
    if key == "attention_mask":
        continue
    padfree_inputs_dict[key] = value[keep_mask].unsqueeze(0)

# Position ids restart from 0 for each row of the mask and run up to that
# row's number of kept positions; all rows are concatenated into one sequence.
kept_lengths = dummy_attention_mask.sum(1).tolist()
restarting_positions = torch.cat([torch.arange(length) for length in kept_lengths])
padfree_inputs_dict["position_ids"] = restarting_positions.long().unsqueeze(0).to("cuda")

print(padfree_inputs_dict)



{'input_ids': tensor([[27, 22,  0, 97, 13, 49, 56, 35, 70, 91, 38, 30, 26, 94, 68, 46, 89, 32,
         70, 85, 50, 67, 70, 86, 66, 82, 18, 72, 27, 37, 91, 27, 60, 57, 23, 93,
         10, 80, 82, 26, 13, 50, 12, 68, 63, 85, 55,  1,  3, 61, 37, 70, 12, 97,
          1, 59, 90, 45, 74, 62, 66, 54, 94, 18, 54, 89, 49,  3, 66, 55]],
       device='cuda:0'), 'position_ids': tensor([[0, 0, 1, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5, 0, 1, 2,
         3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5,
         6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6]],
       device='cuda:0')}
