<a href="https://colab.research.google.com/github/utkarshgupta04092003/notebooks/blob/main/pretrain-llms/2_Prepare_your_model_for_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [46]:
# Set a seed value for reproducibility
import torch
def fix_torch_seed(seed=32):
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

fix_torch_seed()

## 1. Model Configuration

In [62]:
from transformers import LlamaConfig
config = LlamaConfig()
config

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.56.1",
  "use_cache": true,
  "vocab_size": 32000
}

In [63]:
# Adjust parameters to reduce model size
config.vocab_size = 32000              # must match tokenizer
config.hidden_size = 1024              # embedding dimension
config.intermediate_size = 4096        # ~4x hidden_size
config.num_hidden_layers = 12          # number of transformer blocks
config.num_attention_heads = 8         # must divide hidden_size
config.num_key_value_heads = 8         # usually same as attention heads
config.max_position_embeddings = 2048  # reduce if needed
config.use_cache = False
config

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 8,
  "num_hidden_layers": 12,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.56.1",
  "use_cache": false,
  "vocab_size": 32000
}

## 2. Weight Initialization

### 2.1 Random weight initialization

In [64]:
from transformers import LlamaForCausalLM
model = LlamaForCausalLM(config)
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): LlamaRMSNorm((1024,), eps=1e-06)
    (rotary_emb)

In [66]:
# Calculate the total number of model parameters
def print_nparams(model):
  nparams = sum(p.numel() for p in  model.parameters())
  print(f'Number of parameters: {nparams}')


print_nparams(model) # 342385664 => 342M

Number of parameters: 266888192


In [67]:
# See the weight of randomly assigned to model
layer_name = 'model.layers.0.self_attn.q_proj.weight'
for name, param in model.named_parameters():
  if name == layer_name:
    print(f'First 30 weight of layer: {name}')
    print(param.data.view(-1)[:30])
    break

First 30 weight of layer: model.layers.0.self_attn.q_proj.weight
tensor([ 0.0025,  0.0228,  0.0313,  0.0041,  0.0171,  0.0195,  0.0103, -0.0034,
        -0.0058, -0.0161,  0.0032,  0.0046,  0.0168,  0.0275, -0.0451,  0.0009,
         0.0161, -0.0152, -0.0301, -0.0183, -0.0229, -0.0115,  0.0013, -0.0297,
         0.0235, -0.0275, -0.0398,  0.0039, -0.0172, -0.0306])


In [68]:
# what will be output of the model randomly initialized weight, not trained on any data
# Load the tokenizer from Upstage Solar, which is compatible with the Llama-2 tokenizer
from transformers import LlamaTokenizer
model_dir = 'upstage/SOLAR-10.7B-v1.0'
tokenizer = LlamaTokenizer.from_pretrained(model_dir)

# Run simple interface with prompt
from transformers import TextStreamer
prompt = 'I am an engineer, i love'

inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
streamer = TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=False

)

outputs = model.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,
    max_new_tokens=128,
    do_sample=False
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


rupal Product Product Product nodeណ recognisedណ recognisedណ recognisediele recognisedieleieleiable altered altered altered altered altered altered☼ frozen дивизи frozen frozen frozen sehricher Z frozen sehr sehr comes Z Z frozen sehr comes Zgz comes comes comeseclipsegetNamegetName comesgetName comesgetName comesgetName Integr comesgetName Integr comesgetName IntegrgetName IntegrgetName IntegrgetNameथgetNameथgetNameथgetNameथgetNameथथथथथथgetName IntegrथgetName Integr Integr Integr Integr Integr Integr Integr Integrренmilmilmilmilmilmilmilmilmilmilmilmilmilmilmilmilmilmilmilmilmilਰਰਰਰਰਰਰਰਰਰਰਰਰ


In [70]:
# release the memory to avoid crashing
import gc
del model
del streamer
del outputs
gc.collect()


610

### 2.2. Reuse general pretrained model weights

In [None]:
# Load the model
from transformers import AutoModelForCausalLM

model_name_or_path = 'upstage/TinySolar-248m-4k'
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='cpu', torch_dtype=torch.bfloat16)

model

In [72]:
del model
gc.collect()

225

## 3. Downsccaling from a general pretrained model
Not good for small models

In [None]:
from transformers import AutoTokenizer, AutoConfig

model_name_or_path = 'upstage/TinySolar-248m-4k'
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='cpu', torch_dtype=torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [74]:
# check how many layers and parameter does model have
print(model)
print_nparams(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=256, bias=False)
          (v_proj): Linear(in_features=1024, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): LlamaRMSNorm((1024,), eps=1e-06)
    (rotary_emb): 

Number of parameters: 248013824 => 248M


In [77]:
# Create smaller model by remove hidden layers
layers = model.model.layers
model.model.layers = layers[:5] + layers[-5:]

config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_hidden_layers = len(model.model.layers)
)

model.config = config
print_nparams(model)

Number of parameters: 217601024


Number of parameters: 217601024 => 217M


## 4. Depth Upscaling from a general pretrained model


In [78]:
config = LlamaConfig(
    num_hidden_layers=16,
    hidden_size=1024,
    intermediate_size=4096,
    num_attention_heads=32,
    num_key_value_heads=8,
    torch_dtype = 'bfloat16',
    use_cache=False,
)
config

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "bfloat16",
  "eos_token_id": 2,
  "head_dim": 32,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.56.1",
  "use_cache": false,
  "vocab_size": 32000
}

In [79]:
model = LlamaForCausalLM(config)
model = model.to(dtype=torch.bfloat16)
print_nparams(model)

Number of parameters: 308839424


Number of parameters: 308839424 => 308M

In [81]:
model_name_or_path = 'upstage/TinySolar-248m-4k'
pretrained_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='cpu', torch_dtype=torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
print_nparams(pretrained_model)

Number of parameters: 248013824


Number of parameters: 248013824 => 248M

In [82]:
from copy import deepcopy

model.model.layers = deepcopy(pretrained_model.model.layers[:-4]) + deepcopy(pretrained_model.model.layers[4:])
model.model.embed_tokens = deepcopy(pretrained_model.model.embed_tokens)
model.lm_head = deepcopy(pretrained_model.lm_head)

print(model.config)

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "bfloat16",
  "eos_token_id": 2,
  "head_dim": 32,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.56.1",
  "use_cache": false,
  "vocab_size": 32000
}



In [83]:
print_nparams(model)

Number of parameters: 308839424


Number of parameters: 308839424 => 308M (by deepcopy 2 248M models)

In [84]:
prompt = 'I am an engineer, i love'

inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
streamer = TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=False

)

outputs = model.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,
    max_new_tokens=128,
    do_sample=False
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


to use the word "miscellaneous" to describe the "miscellaneous" of a website.
I have been a long-time user of the site, and I have been a fan of it. I have been a fan of the site, and I have been a fan of the site's history. I have been a fan of the site's history, and I have been a fan of the site's history.
I have been a fan of the site's history, and I have been a fan of the site's history. I have been a fan of the site's


The model is initialized, but since it hasn't been pretrained, its responses aren't fluent. That's why pretraining is necessary after model preparation

In [85]:
# weight will be optimized at the time of pretraining, save the model for now
model.save_pretrained('TinySolar-308m-4k-init')