In [None]:
import os
import sys

import torch
from transformers import AutoConfig
from transformers import AutoModelForCausalLM
from transformers import LlamaConfig

# Add the parent directory to the import path so the `src` package can be imported.
sys.path.append("..")
import src.model_utils as model_utils

In [None]:
# might need to pass config arguments to the get_model function
model_path = "final_model"
vocab_size = 15_000  # not referenced by the cells visible in this notebook

# Read only the configuration from the checkpoint directory. Going through
# AutoModelForCausalLM.from_pretrained(...).config would build the whole model
# and load its weights just to throw them away before the real load below.
config = AutoConfig.from_pretrained(model_path)
print(config)

In [None]:
# Load the checkpoint for inference (train=False) in bfloat16, without 4-bit
# quantisation or flash-attention 2. The single "" key in device_map targets
# the currently selected CUDA device (presumably get_model forwards it to the
# Hugging Face loader -- confirm in src/model_utils).
current_gpu = torch.cuda.current_device()
model = model_utils.get_model(
    model_name_or_path=model_path,
    config=config,
    torch_dtype=torch.bfloat16,
    device_map={"": current_gpu},
    load_in_4bit=False,
    _flash_attn_2_enabled=False,
    train=False,
)

# Overwrite the max sequence length with the one we are targeting (1024 or 4096).
# NOTE(review): the RoPE patch cell below uses seq_len_test=8192, which does not
# match the 4096 written here -- confirm which value is intended.
model.config.max_position_embeddings = 4096

In [4]:
# Settings handed to patch_model_with_rope. The log printed by this cell
# reports a position scale of 64, i.e. seq_len_test / seq_len_train.
rope_pi_settings = {
    "pos_emb_name": "scaled_rope",
    "seq_len_train": 128,
    "seq_len_test": 8192,
    "scale_power": 0,
}

# Rebinds `model` to the patched model returned by the helper.
model = model_utils.patch_model_with_rope(model=model, **rope_pi_settings)

[32m2023-11-30 16:12:44.277[0m | [1mINFO    [0m | [36msrc.pos_emb_classes[0m:[36mpatch[0m:[36m162[0m - [1mPostion Interpolation - Rotary Position Embedding hyperparameters[0m
[32m2023-11-30 16:12:44.278[0m | [1mINFO    [0m | [36msrc.pos_emb_classes[0m:[36mpatch[0m:[36m163[0m - [1mDimension: 128.0[0m
[32m2023-11-30 16:12:44.278[0m | [1mINFO    [0m | [36msrc.pos_emb_classes[0m:[36mpatch[0m:[36m164[0m - [1mScale (for the position): 64[0m
[32m2023-11-30 16:12:44.279[0m | [1mINFO    [0m | [36msrc.pos_emb_classes[0m:[36mpatch[0m:[36m165[0m - [1mScale Power: 0[0m
[32m2023-11-30 16:12:44.279[0m | [1mINFO    [0m | [36msrc.pos_emb_classes[0m:[36mpatch[0m:[36m166[0m - [1mMax Pos Emb: 8192[0m


In [5]:
# Echo the model so its module tree is displayed; after the RoPE patch, each
# attention layer's `rotary_emb` shows up as ScaledLlamaRotaryEmbedding.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(15000, 1024)
    (layers): ModuleList(
      (0-7): 8 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): ScaledLlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm

In [None]:
# `os` is imported in the import cell at the top of the notebook.
# The patched model is written to a "rope_pi" sub-directory of the checkpoint
# it was loaded from.
model_save_path = os.path.join(model_path, "rope_pi")
print(model_save_path)

model.save_pretrained(model_save_path)

## Apply RoPE position interpolation (PI) to all the final models

In [22]:
def apply_rope_pi(
    model_path,
    seq_len_train=128,
    seq_len_test=1024,
    scale_power=0,
    max_position_embeddings=None,
):
    """Patch the checkpoint at ``model_path`` with scaled RoPE and save the result.

    The model is loaded through ``model_utils.get_model`` (bfloat16, no 4-bit
    quantisation, flash-attention 2 disabled, ``train=False``, device map
    pointing at the current CUDA device), patched with
    ``model_utils.patch_model_with_rope(pos_emb_name="scaled_rope", ...)`` and
    written with ``save_pretrained`` to ``<model_path>/rope_pi``.

    Args:
        model_path: Directory of the checkpoint to load. The patched model is
            saved into its ``rope_pi`` sub-directory.
        seq_len_train: Passed to ``patch_model_with_rope`` as ``seq_len_train``.
        seq_len_test: Passed to ``patch_model_with_rope`` as ``seq_len_test``.
        scale_power: Passed to ``patch_model_with_rope`` as ``scale_power``.
        max_position_embeddings: Value written to
            ``model.config.max_position_embeddings`` before patching. ``None``
            (the default) uses ``seq_len_test``, which matches the previous
            hard-coded 1024/1024 pairing.

    Returns:
        None.
    """
    if max_position_embeddings is None:
        max_position_embeddings = seq_len_test

    # Only the configuration is needed here; the weights are loaded once, by
    # get_model below, instead of twice per checkpoint.
    config = AutoConfig.from_pretrained(model_path)

    model = model_utils.get_model(
        model_name_or_path=model_path,
        torch_dtype=torch.bfloat16,
        load_in_4bit=False,
        device_map={"": torch.cuda.current_device()},
        train=False,
        _flash_attn_2_enabled=False,
        config=config,
    )

    # Overwrite the max sequence length with the one we are targeting.
    model.config.max_position_embeddings = max_position_embeddings

    new_model = model_utils.patch_model_with_rope(
        pos_emb_name="scaled_rope",
        model=model,
        seq_len_train=seq_len_train,
        seq_len_test=seq_len_test,
        scale_power=scale_power,
    )

    model_save_path = os.path.join(model_path, "rope_pi")
    new_model.save_pretrained(model_save_path)

In [26]:
# Checkpoints are laid out as
#   <base_path>/hidden_<H>_num_layer_<L>_int_<I>/final_model
# `os` is imported in the import cell at the top of the notebook.
base_path = "unfiltered/"
hidden_size = [1024, 512, 256, 128, 64, 32]
num_layers = [8, 4, 2]
# int_size[i] is the value paired with hidden_size[i].
int_size = [4096, 2048, 1024, 512, 256, 128]
assert len(hidden_size) == len(int_size), "hidden_size and int_size must pair one-to-one"

# One entry per (hidden, int) pair and layer count, hidden sizes outermost.
# A comprehension keeps the loop variables out of the notebook namespace, so
# the `model_path` used by the single-model cells is not overwritten here.
model_paths = [
    os.path.join(base_path, f"hidden_{hidden}_num_layer_{layer}_int_{int_dim}", "final_model")
    for hidden, int_dim in zip(hidden_size, int_size)
    for layer in num_layers
]

In [None]:
# Patch and save every checkpoint in turn. The loop variable is deliberately not
# called `model_path`, so the notebook-level `model_path` used by the
# single-model cells is not silently replaced by the last entry of the list.
for final_model_dir in model_paths:
    print(final_model_dir)
    apply_rope_pi(final_model_dir)