In [None]:
import sys
print(sys.executable)
print(sys.version)


In [None]:
!nvidia-smi

In [None]:
from transformers import BitsAndBytesConfig
import os
from datasets import load_dataset

# Data

In [None]:
!pwd

In [None]:
trn_data = %pwd
trn_data = trn_data.replace('Notebooks','Files/training_set.jsonl')
val_data = trn_data.replace('training_set','validation_set')
print(trn_data)      
print(val_data)      
print(os.path.exists(trn_data))
print(os.path.exists(val_data))


In [None]:
## Convert to huggingface dataset

data = load_dataset("json", data_files={
    "train":trn_data,
    "valid":val_data
})
data
                    

In [None]:
data['train'][:5]

In [None]:
data['valid'][:5]

In [None]:
data['train'][0]['messages']

# Model

In [None]:
# https://huggingface.co/ibm-granite/granite-4.0-h-1b

In [None]:
from huggingface_hub import login
from dotenv import load_dotenv

In [None]:
load_dotenv('/app/cloned_repo/LLM-World/.env')

In [None]:
login(token=os.getenv('HF_TOKEN'))

In [None]:
file_dir = %pwd
outp_dir = file_dir.replace('Notebooks','Files/sm_output')
outp_dir

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType

In [None]:
model_id = "ibm-granite/granite-4.0-h-1b"

tokenizer = AutoTokenizer.from_pretrained(model_id) # Load Tokenizer
model = AutoModelForCausalLM.from_pretrained(model_id) # Load base model

In [None]:
## Serialization (dict -> str) for ability for model to read (flatten)
def serialize_message(input_example):
    chat_str = ""
    for message in input_example['messages']:
        role = message['role']
        content = message['content']
        if role == "system":
            chat_str += f"System: {content}\n"
        elif role == "user":
            chat_str += f"User: {content}\n"
        elif role == "assistant":
            chat_str += f"Assistant: {content}\n"
    input_example["text"] = chat_str
    return input_example
        

In [None]:
## Tokenize the data
def tokenize_text(example):
    return tokenizer(
        example['text'],
        padding='max_length',
        truncation=True,
        max_length=512)

In [None]:
data = data.map(serialize_message)
data

In [None]:
data['train'][0]

In [None]:
data = data.map(tokenize_text,batched=True)
data

In [None]:
data['train'][0]

In [None]:
# HugingFace Trainer (Basic setup)
from transformers import Trainer, TrainingArguments

train_args = TrainingArguments(
    output_dir=outp_dir,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    eval_strategy="steps",
    save_steps=500,
    logging_steps=100,
    num_train_epochs=3,
    learning_rate=2e-5,
    # bf16=True,
    no_cuda=True, # dont use gpu when setting up variables If not using LoRA
    # use_cpu=True, # dont use gpu when setting up variables If not using LoRA
    fp16=True,
    push_to_hub=False,
    hub_model_id=None,
    hub_token=None
)

# better for training 1B+ OOM error possible without
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj","v_proj"]
)

model = get_peft_model(model,lora_config)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=data['train'],
    eval_dataset=data['valid'],
    processing_class=tokenizer
)

In [None]:
save_dir = file_dir.replace('Notebooks','Files/sm_artifacts')
save_dir

In [None]:
trainer.train()
trainer.save_model(save_dir)
trainer.tokenizer.save_pretrained(save_dir)

## With TRL

In [1]:
from datasets import load_dataset
from loguru import logger

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
trn_data = %pwd
trn_data = trn_data.replace('Notebooks','Files/training_set.jsonl')
val_data = trn_data.replace('training_set','validation_set')

In [3]:
## Convert to huggingface dataset

data = load_dataset("json", data_files={
    "train":trn_data,
    "valid":val_data
})
data
                    

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 409
    })
    valid: Dataset({
        features: ['messages'],
        num_rows: 20
    })
})

In [26]:
print(len(data['train']['messages']))
data['train']['messages'][0] # Conversation 1

409


[{'role': 'system',
  'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'},
 {'role': 'user', 'content': 'What is a vector space in linear algebra?'},
 {'role': 'assistant',
  'content': 'A vector space is a collection of vectors where you can add them together and multiply them by scalars, following specific rules.'}]

In [None]:
## Serialization (dict -> str) for ability for model to read (flatten)
def serialize_message(input_example):
    chat_str = ""
    for message in input_example['messages']:
        role = message['role']
        content = message['content']
        if role == "system":
            chat_str += f"System: {content}\n"
        elif role == "user":
            chat_str += f"User: {content}\n"
        elif role == "assistant":
            chat_str += f"Assistant: {content}\n"
    input_example["text"] = chat_str
    return input_example
        

In [33]:
# Group by conversation so chat template can be used (alpac
flat_data = 

[{'conversation': [{'role': 'system',
    'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'},
   {'role': 'user', 'content': 'What is a vector space in linear algebra?'},
   {'role': 'assistant',
    'content': 'A vector space is a collection of vectors where you can add them together and multiply them by scalars, following specific rules.'}]},
 {'conversation': [{'role': 'system',
    'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'},
   {'role': 'user', 'content': 'Can you explain eigenvalues in simple terms?'},
   {'role': 'assistant',
    'content': 'Eigenvalues are special numbers that show how a matrix stretches or shrinks vectors along certain directions.'}]},
 {'conversation': [{'role': 'system',
    'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'},
   {'role': 'user',
    'content': 'What is the difference between variance and standard devia

In [34]:
from trl import apply_chat_template

# single strinh prompt for causal lm (needs a list of dictionaris)
train_data_lst = data['train'].to_list()

formatted_train = apply_chat_template(
    conversations,
    tokenizer=tokenizer,
    max_seq_length=2048,
    template_name="alpaca"
)

formatted_train

AttributeError: 'list' object has no attribute 'keys'

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [5]:
model_name = "ibm-granite/granite-4.0-h-1b"


logger.info("SFT with TRL")
logger.info(f"Loading model: {model_name}")

[32m2026-01-17 10:45:23.857[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mSFT with TRL[0m
[32m2026-01-17 10:45:23.858[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mLoading model: ibm-granite/granite-4.0-h-1b[0m


In [6]:
model_id = model_name

tokenizer = AutoTokenizer.from_pretrained(model_id) # Load Tokenizer
logger.info(f"pad_token:{tokenizer.pad_token} (allows tokens to have the same length)")

if tokenizer.pad_token is None:    
    tokenizer.pad_token = tokenizer.eos_token
    logger.info(f"set pad_token=eos_token:{tokenizer.eos_token} (allows tokens to have the same length)")

# model = AutoModelForCausalLM.from_pretrained(model_id,device_map="auto",dtype=torch.bfloat16) # Load base model
model = AutoModelForCausalLM.from_pretrained(model_id) # Load base model

[32m2026-01-17 10:45:24.193[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mpad_token:<|pad|> (allows tokens to have the same length)[0m
The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d


In [7]:
model

GraniteMoeHybridForCausalLM(
  (model): GraniteMoeHybridModel(
    (embed_tokens): Embedding(100352, 1536, padding_idx=100256)
    (layers): ModuleList(
      (0-4): 5 x GraniteMoeHybridDecoderLayer(
        (input_layernorm): GraniteMoeHybridRMSNorm((1536,), eps=1e-05)
        (post_attention_layernorm): GraniteMoeHybridRMSNorm((1536,), eps=1e-05)
        (shared_mlp): GraniteMoeHybridMLP(
          (activation): SiLUActivation()
          (input_linear): Linear(in_features=1536, out_features=8192, bias=False)
          (output_linear): Linear(in_features=4096, out_features=1536, bias=False)
        )
        (mamba): GraniteMoeHybridMambaLayer(
          (act): SiLUActivation()
          (conv1d): Conv1d(3328, 3328, kernel_size=(4,), stride=(1,), padding=(3,), groups=3328)
          (in_proj): Linear(in_features=1536, out_features=6448, bias=False)
          (norm): GraniteMoeHybridRMSNormGated()
          (out_proj): Linear(in_features=3072, out_features=1536, bias=False)
        )


In [8]:
model.config.use_cache = False # required for gradient checkpointing

In [9]:
from peft import LoraConfig, get_peft_model

In [10]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=[
        "q_proj", "k_proj","v_proj","o_proj"],
    bias="none",
    task_type="CAUSAL_LM"
)
pt_model = get_peft_model(model,peft_config)
pt_model.gradient_checkpointing_enable()
pt_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GraniteMoeHybridForCausalLM(
      (model): GraniteMoeHybridModel(
        (embed_tokens): Embedding(100352, 1536, padding_idx=100256)
        (layers): ModuleList(
          (0-4): 5 x GraniteMoeHybridDecoderLayer(
            (input_layernorm): GraniteMoeHybridRMSNorm((1536,), eps=1e-05)
            (post_attention_layernorm): GraniteMoeHybridRMSNorm((1536,), eps=1e-05)
            (shared_mlp): GraniteMoeHybridMLP(
              (activation): SiLUActivation()
              (input_linear): Linear(in_features=1536, out_features=8192, bias=False)
              (output_linear): Linear(in_features=4096, out_features=1536, bias=False)
            )
            (mamba): GraniteMoeHybridMambaLayer(
              (act): SiLUActivation()
              (conv1d): Conv1d(3328, 3328, kernel_size=(4,), stride=(1,), padding=(3,), groups=3328)
              (in_proj): Linear(in_features=1536, out_features=6448, bias=False)
              (

In [11]:
# trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in pt_model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in pt_model.parameters())


print(trainable_params)
print(total_params)

logger.info(f"Trainable parameters: {trainable_params:,}/{total_params:,} ({100*trainable_params/total_params:.2f}%)")

[32m2026-01-17 10:45:27.587[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mTrainable parameters: 655,360/1,462,193,728 (0.04%)[0m


655360
1462193728


In [12]:
from trl import SFTTrainer, SFTConfig

In [13]:
pwd_dir = %pwd
file_dir = pwd_dir.replace('Notebooks','Files/granite_ckp')
logger.info(f"output_dir={file_dir}")

[32m2026-01-17 10:45:30.580[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1moutput_dir=/app/home/marfok/LLM-World/Files/granite_ckp[0m


In [14]:
sft_training_args = SFTConfig(
    output_dir=str(file_dir),
    num_train_epochs=4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    lr_scheduler_type="cosine",
    learning_rate=2e-5,
    max_length=2048,
    warmup_ratio=0.1,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    bf16=True,
    optim="adamw_torch",
    seed=42,
    gradient_checkpointing=True,
    report_to="none",#"wandb",
    packing=False,
    assistant_only_loss=False
)


trainer = SFTTrainer(
    model = pt_model,
    processing_class=tokenizer,
    train_dataset=data['train'],
    # peft_config=peft_config,
    args=sft_training_args
)

In [15]:
trainer.train()

Step,Training Loss
10,4.9035
20,4.7899
30,4.4428
40,4.1826
50,3.9265
60,3.6556
70,3.4964
80,3.352
90,3.3462
100,3.3105


TrainOutput(global_step=104, training_loss=3.911779146928054, metrics={'train_runtime': 1051.0639, 'train_samples_per_second': 1.557, 'train_steps_per_second': 0.099, 'total_flos': 1362585787904640.0, 'train_loss': 3.911779146928054})

In [None]:
pt_model.save_pretrained(file_dir)
tokenizer.save_pretrained(file_dir)

In [None]:
checkpoint_info = {
    "model_name":model_name,
    "output_dir":file_dir,
    "num_epochs":epochs,
    "total_examples":len(dataset),
    "loar_config":{"r":LORA_R,"alpha":LORA_ALPHA, "target_modules":TARGET_MODULES},
    "training_config":{"batch_size":batch_size, "gradient_accumulation":gradient_accululation,"learning_rate":learning_rate,"max_length":max_length}
    }

info_path = file_dir/"checkpont_info.json"

with open(info_path,"w") as f:
    json.dump(checkpoint_info, f, indent=2)

logger.info(f"Checkpoint info saved to {info_path}")

logger.info(f"Model saved to {file_dir}")

In [None]:
import os
os.path.join(file_dir,'hi')

In [None]:
## Load model
from peft import PeftModel
chk_dir = pwd.replace
ft_model = 