<a href="https://colab.research.google.com/github/utkarshgupta04092003/notebooks/blob/main/pretrain-llms/2_Train%2C_Test_and_Evaluate_Pretrain_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Set a seed value for reproducibility
import torch
def fix_torch_seed(seed=32):
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

fix_torch_seed()

## 1. Model Configuration

In [3]:
from transformers import LlamaConfig
config = LlamaConfig()
config

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.56.2",
  "use_cache": true,
  "vocab_size": 32000
}

In [4]:
# Adjust parameters to reduce model size
config.vocab_size = 32000              # must match tokenizer
config.hidden_size = 1024              # embedding dimension
config.intermediate_size = 4096        # ~4x hidden_size
config.num_hidden_layers = 12          # number of transformer blocks
config.num_attention_heads = 8         # must divide hidden_size
config.num_key_value_heads = 8         # usually same as attention heads
config.max_position_embeddings = 2048  # reduce if needed
config.use_cache = False
config

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 8,
  "num_hidden_layers": 12,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.56.2",
  "use_cache": false,
  "vocab_size": 32000
}

## 2. Weight Initialization

### 2.1 Random weight initialization

In [5]:
from transformers import LlamaForCausalLM
model = LlamaForCausalLM(config)
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): LlamaRMSNorm((1024,), eps=1e-06)
    (rotary_emb)

In [6]:
# Calculate the total number of model parameters
def print_nparams(model):
  nparams = sum(p.numel() for p in  model.parameters())
  print(f'Number of parameters: {nparams}')


print_nparams(model) # 342385664 => 342M

Number of parameters: 266888192


In [7]:
# See the weight of randomly assigned to model
layer_name = 'model.layers.0.self_attn.q_proj.weight'
for name, param in model.named_parameters():
  if name == layer_name:
    print(f'First 30 weight of layer: {name}')
    print(param.data.view(-1)[:30])
    break

First 30 weight of layer: model.layers.0.self_attn.q_proj.weight
tensor([-0.0223,  0.0217,  0.0219, -0.0016, -0.0030, -0.0191,  0.0135,  0.0194,
         0.0134, -0.0154, -0.0067,  0.0170, -0.0082, -0.0104,  0.0338,  0.0293,
         0.0101, -0.0118, -0.0038,  0.0047,  0.0164, -0.0023,  0.0143, -0.0099,
        -0.0117,  0.0167,  0.0182,  0.0135,  0.0076,  0.0169])


In [None]:
# what will be output of the model randomly initialized weight, not trained on any data
# Load the tokenizer from Upstage Solar, which is compatible with the Llama-2 tokenizer
from transformers import LlamaTokenizer
model_dir = 'upstage/SOLAR-10.7B-v1.0'
tokenizer = LlamaTokenizer.from_pretrained(model_dir)

# Run simple interface with prompt
from transformers import TextStreamer
prompt = 'I am an engineer, i love'

inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
streamer = TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=False

)

outputs = model.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,
    max_new_tokens=128,
    do_sample=False
)

In [9]:
# release the memory to avoid crashing
import gc
del model
del streamer
del outputs
gc.collect()


196

### 2.2. Reuse general pretrained model weights

In [None]:
# Load the model
from transformers import AutoModelForCausalLM

model_name_or_path = 'upstage/TinySolar-248m-4k'
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='cpu', torch_dtype=torch.bfloat16)

model

In [11]:
del model
gc.collect()

0

## 3. Downsccaling from a general pretrained model
Not good for small models

In [None]:
from transformers import AutoTokenizer, AutoConfig

model_name_or_path = 'upstage/TinySolar-248m-4k'
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='cpu', torch_dtype=torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [13]:
# check how many layers and parameter does model have
print(model)
print_nparams(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=256, bias=False)
          (v_proj): Linear(in_features=1024, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): LlamaRMSNorm((1024,), eps=1e-06)
    (rotary_emb): 

Number of parameters: 248013824 => 248M


In [14]:
# Create smaller model by remove hidden layers
layers = model.model.layers
model.model.layers = layers[:5] + layers[-5:]

config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_hidden_layers = len(model.model.layers)
)

model.config = config
print_nparams(model)

Number of parameters: 217601024


Number of parameters: 217601024 => 217M


## 4. Depth Upscaling from a general pretrained model


In [15]:
config = LlamaConfig(
    num_hidden_layers=16,
    hidden_size=1024,
    intermediate_size=4096,
    num_attention_heads=32,
    num_key_value_heads=8,
    torch_dtype = 'bfloat16',
    use_cache=False,
)
config

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "bfloat16",
  "eos_token_id": 2,
  "head_dim": 32,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.56.2",
  "use_cache": false,
  "vocab_size": 32000
}

In [16]:
model = LlamaForCausalLM(config)
model = model.to(dtype=torch.bfloat16)
print_nparams(model)

Number of parameters: 308839424


Number of parameters: 308839424 => 308M

In [17]:
model_name_or_path = 'upstage/TinySolar-248m-4k'
pretrained_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='cpu', torch_dtype=torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
print_nparams(pretrained_model)

Number of parameters: 248013824


Number of parameters: 248013824 => 248M

In [18]:
from copy import deepcopy

model.model.layers = deepcopy(pretrained_model.model.layers[:-4]) + deepcopy(pretrained_model.model.layers[4:])
model.model.embed_tokens = deepcopy(pretrained_model.model.embed_tokens)
model.lm_head = deepcopy(pretrained_model.lm_head)

print(model.config)

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "bfloat16",
  "eos_token_id": 2,
  "head_dim": 32,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.56.2",
  "use_cache": false,
  "vocab_size": 32000
}



In [19]:
print_nparams(model)

Number of parameters: 308839424


Number of parameters: 308839424 => 308M (by deepcopy 2 248M models)

In [20]:
prompt = 'I am an engineer, i love'

inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
streamer = TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=False

)

outputs = model.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,
    max_new_tokens=128,
    do_sample=False
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


to use the word "miscellaneous" to describe the "miscellaneous" of a website.
I have been a long-time user of the site, and I have been a fan of it. I have been a fan of the site, and I have been a fan of the site's history. I have been a fan of the site's history, and I have been a fan of the site's history.
I have been a fan of the site's history, and I have been a fan of the site's history. I have been a fan of the site's


The model is initialized, but since it hasn't been pretrained, its responses aren't fluent. That's why pretraining is necessary after model preparation

In [21]:
# weight will be optimized at the time of pretraining, save the model for now
model.save_pretrained('TinySolar-308m-4k-init')

# Training Model

## 1. Load Model

In [22]:
import torch
from transformers import AutoModelForCausalLM

pretrained_model = AutoModelForCausalLM.from_pretrained(
    '/content/TinySolar-308m-4k-init',
    device_map='cpu',
    torch_dtype=torch.bfloat16,
    use_cache=False
)
pretrained_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=256, bias=False)
          (v_proj): Linear(in_features=1024, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): LlamaRMSNorm((1024,), eps=1e-06)
    (rotary_emb): 

## 2. Load Dataset

In [23]:
import datasets
from torch.utils.data import Dataset

class CustomDataset(Dataset):
  def __init__(self, args, split='train'):
    self.args = args
    self.dataset = datasets.load_dataset(
        'parquet',
        data_files=args.dataset_name,
        split=split
    )
  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, idx):
    # convert the lists to a LongTensor for Pytorch
    input_ids = torch.LongTensor(self.dataset[idx]['input_ids'])
    labels = torch.LongTensor(self.dataset[idx]['input_ids'])
    # return the sample as a dictionary
    return {'input_ids': input_ids, 'labels': labels}

## 3. Configure Trainng Arguments

In [24]:
from dataclasses import dataclass, field
import transformers

@dataclass
class CustomArguments(transformers.TrainingArguments):
  # Dataset configuration
  dataset_name: str = field(default='/content/packged_pretrain_dataset.parquet')
  num_proc: int = field(default=1)
  max_seq_length: int = field(default = 32)

  #core training configurations
  optim: str = field(default='adamw_torch') # trial and test which optimizer will be good for your usecases
  max_steps: int = field(default=30)
  per_device_train_batch_size: int = field(default=2)

  # Other training configurations
  seed: int = field(default=0)
  learning_rate: float = field(default=1e-4)
  weight_decay: float = field(default=0)
  warmup_steps: int = field(default=10)
  lr_scheduler_type: str = field(default='linear')
  gradient_checkpointing: bool = field(default=True)
  dataloader_num_workers: int = field(default=2)
  bf16: bool = field(default=False)
  gradient_accumulation_steps: int = field(default=1)

  # Logging configuration
  logging_steps: int = field(default=3)
  report_to: str = field(default='none')

  # # Save configuration (save intermediate checkpoint of the model)
  # save_strategy: str = field(default='steps')
  # save_steps: int = field(default=3)
  # save_total_limit: int = field(default=2)


In [25]:
parser = transformers.HfArgumentParser(CustomArguments)
args,  = parser.parse_args_into_dataclasses(
    args=['--output_dir', 'output', '--fp16']
)

In [None]:
train_dataset = CustomDataset(args)

In [27]:
print("Input shape: ", train_dataset[0]['input_ids'].shape)

Input shape:  torch.Size([32])


## 4. Run the trainer and monitor the loss

In [28]:
from transformers import Trainer, TrainingArguments, TrainerCallback

# Define a custom callback to log the loss value
class LossLoggingCallback(TrainerCallback):
  def on_log(self, args, state, control, logs=None, **kwargs):
    if logs is not None:
      self.logs.append(logs)

  def __init__(self):
    self.logs = []

# Initialize the callback
loss_logging_callback = LossLoggingCallback()

In [29]:
trainer = Trainer(
    model = pretrained_model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=None,
    callbacks=[loss_logging_callback]
)
trainer.train()

Step,Training Loss
3,5.3311
6,5.1631
9,4.8775
12,5.0319
15,5.0012
18,4.765
21,5.2578
24,4.9414
27,4.6946
30,4.7301


TrainOutput(global_step=30, training_loss=4.979365571339925, metrics={'train_runtime': 3050.574, 'train_samples_per_second': 0.02, 'train_steps_per_second': 0.01, 'total_flos': 3180342804480.0, 'train_loss': 4.979365571339925, 'epoch': 0.0001575274491580158})

TrainOutput(global_step=30, training_loss=4.979365571339925, metrics={'train_runtime': 3050.574, 'train_samples_per_second': 0.02, 'train_steps_per_second': 0.01, 'total_flos': 3180342804480.0, 'train_loss': 4.979365571339925, 'epoch': 0.0001575274491580158})

In [46]:
# Save the model training checkpoint
trainer.save_model('output/checkpoint-10000')

In [47]:
# # Saving configuration
# save_strategy:str = field(default='steps')
# save_steps: int = field(default=3)
# save_total_limit: int = field(default=2)

In [48]:
from transformers import AutoTokenizer, TextStreamer
model_name_or_path = 'upstage/TinySolar-248m-4k'
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [49]:
from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM
import torch

model_name_or_path = '/content/output/checkpoint-10000'
model2 = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='cpu', torch_dtype=torch.bfloat16)
#

In [54]:
# Lets look at this output
prompt = "I am an engineer, i love"
inputs = tokenizer(prompt, return_tensors='pt').to(model2.device)

streamer = TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=False
)

outputs = model2.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,
    max_new_tokens=64,
    do_sample=True,
    temperature=1.0
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


my family , which turned to a person. During the year 20018 , the first album was released to Music and Music World . The third decade and album album and band started to come 2006 to be the first album album ever released , a first album that I used to play a


## Model Evaluation

In [55]:
# we will use open popular open source evaluation library called lm-evaluation-harness of eluetherAI
# harness serves many, many tasks for evaluation, among them we will choose to evaluate tiny solar on the `TruthfulQA MC2` task.
# The MC2 task comprised of multiple choice questions developed by the University of Oxford and OpenAI, and is one of the evaluation task includedin the HuggingFace open LLM leaderboard.
# MC2 task works as follow: Give an question and multiple true false reference answers, the score is the normalized  total probability assigned to the set of true answers.
# We will run this evaluation on CPU, as always and will run only 5 examples, so that we can end our evaluation within ten minutes.

!pip install -U git+http://github.com/EleutherAI/lm-evaluation-harness

Collecting git+http://github.com/EleutherAI/lm-evaluation-harness
  Cloning http://github.com/EleutherAI/lm-evaluation-harness to /tmp/pip-req-build-kpmd67cf
  Running command git clone --filter=blob:none --quiet http://github.com/EleutherAI/lm-evaluation-harness /tmp/pip-req-build-kpmd67cf
  Resolved http://github.com/EleutherAI/lm-evaluation-harness to commit c0fc717240032aec738c3199ae344887b5f34c23
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [56]:
# this will take quite along time because thelog likelihood is calculated for every candidate
!lm_eval --model hf \
  --model_args pretrained=upstage/tinySolar-248m-4k \
  --tasks truthfulqa_mc2 \
  --device cpu \
  --limit 5

2025-10-04 02:43:33.992736: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759545814.065488   21250 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759545814.088254   21250 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1759545814.163520   21250 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1759545814.163632   21250 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1759545814.163664   21250 computation_placer.cc:177] computation placer alr

You get score as around 0.4 and its unfair to compare this with another model that has billions of parameters.