In [1]:
import sys
print(sys.executable)
print(sys.version)


/app/home/marfok/LLM-World/.venv/bin/python
3.11.14 (main, Nov 19 2025, 22:47:14) [Clang 21.1.4 ]


In [2]:
!nvidia-smi

Fri Jan 16 22:33:22 2026       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.6     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          On  | 00000000:9D:00.0 Off |                    0 |
| N/A   29C    P0              71W / 700W |      4MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
from transformers import BitsAndBytesConfig
import os
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


# Data

In [4]:
!pwd

/app/home/marfok/LLM-World/Notebooks


In [5]:
trn_data = %pwd
trn_data = trn_data.replace('Notebooks','Files/training_set.jsonl')
val_data = trn_data.replace('training_set','validation_set')
print(trn_data)      
print(val_data)      
print(os.path.exists(trn_data))
print(os.path.exists(val_data))


/app/home/marfok/LLM-World/Files/training_set.jsonl
/app/home/marfok/LLM-World/Files/validation_set.jsonl
True
True


In [6]:
## Convert to huggingface dataset

data = load_dataset("json", data_files={
    "train":trn_data,
    "valid":val_data
})
data
                    

Generating train split: 409 examples [00:00, 37803.18 examples/s]
Generating valid split: 20 examples [00:00, 13598.00 examples/s]


DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 409
    })
    valid: Dataset({
        features: ['messages'],
        num_rows: 20
    })
})

In [7]:
data['train'][:5]

{'messages': [[{'role': 'system',
    'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'},
   {'role': 'user', 'content': 'What is a vector space in linear algebra?'},
   {'role': 'assistant',
    'content': 'A vector space is a collection of vectors where you can add them together and multiply them by scalars, following specific rules.'}],
  [{'role': 'system',
    'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'},
   {'role': 'user', 'content': 'Can you explain eigenvalues in simple terms?'},
   {'role': 'assistant',
    'content': 'Eigenvalues are special numbers that show how a matrix stretches or shrinks vectors along certain directions.'}],
  [{'role': 'system',
    'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'},
   {'role': 'user',
    'content': 'What is the difference between variance and standard deviation?'},
   {'role': 'assistant',
    

In [8]:
data['valid'][:5]

{'messages': [[{'role': 'system',
    'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'},
   {'role': 'user',
    'content': 'What is the difference between matrix multiplication and element-wise multiplication?'},
   {'role': 'assistant',
    'content': 'Matrix multiplication involves dot products between rows and columns, while element-wise multiplication multiplies corresponding entries directly.'}],
  [{'role': 'system',
    'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'},
   {'role': 'user', 'content': 'What is an orthogonal matrix?'},
   {'role': 'assistant',
    'content': 'An orthogonal matrix is a square matrix whose rows and columns are orthonormal vectors. Its inverse is the same as its transpose.'}],
  [{'role': 'system',
    'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'},
   {'role': 'user', 'content': 'What is covariance in statisti

In [9]:
data['train'][0]['messages']

[{'role': 'system',
  'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'},
 {'role': 'user', 'content': 'What is a vector space in linear algebra?'},
 {'role': 'assistant',
  'content': 'A vector space is a collection of vectors where you can add them together and multiply them by scalars, following specific rules.'}]

# Model

In [10]:
# https://huggingface.co/ibm-granite/granite-4.0-h-1b

In [11]:
from huggingface_hub import login
from dotenv import load_dotenv

In [12]:
load_dotenv('/app/cloned_repo/LLM-World/.env')

True

In [13]:
login(token=os.getenv('HF_TOKEN'))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [10]:
file_dir = %pwd
outp_dir = file_dir.replace('Notebooks','Files/sm_output')
outp_dir

'/app/home/marfok/LLM-World/Files/sm_output'

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType

In [16]:
model_id = "ibm-granite/granite-4.0-h-1b"

tokenizer = AutoTokenizer.from_pretrained(model_id) # Load Tokenizer
model = AutoModelForCausalLM.from_pretrained(model_id) # Load base model

The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d


In [17]:
## Serialization (dict -> str) for ability for model to read
def serialize_message(input_example):
    chat_str = ""
    for message in input_example['messages']:
        role = message['role']
        content = message['content']
        if role == "system":
            chat_str += f"System: {content}\n"
        elif role == "user":
            chat_str += f"User: {content}\n"
        elif role == "assistant":
            chat_str += f"Assistant: {content}\n"
    input_example["text"] = chat_str
    return input_example
        

In [18]:
## Tokenize the data
def tokenize_text(example):
    return tokenizer(
        example['text'],
        padding='max_length',
        truncation=True,
        max_length=512)

In [19]:
data = data.map(serialize_message)
data

DatasetDict({
    train: Dataset({
        features: ['messages', 'text'],
        num_rows: 409
    })
    valid: Dataset({
        features: ['messages', 'text'],
        num_rows: 20
    })
})

In [20]:
data['train'][0]

{'messages': [{'role': 'system',
   'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'},
  {'role': 'user', 'content': 'What is a vector space in linear algebra?'},
  {'role': 'assistant',
   'content': 'A vector space is a collection of vectors where you can add them together and multiply them by scalars, following specific rules.'}],
 'text': 'System: You are a mathematician who is specialized in linear algebra and also statistics.\nUser: What is a vector space in linear algebra?\nAssistant: A vector space is a collection of vectors where you can add them together and multiply them by scalars, following specific rules.\n'}

In [21]:
data = data.map(tokenize_text,batched=True)
data

DatasetDict({
    train: Dataset({
        features: ['messages', 'text', 'input_ids', 'attention_mask'],
        num_rows: 409
    })
    valid: Dataset({
        features: ['messages', 'text', 'input_ids', 'attention_mask'],
        num_rows: 20
    })
})

In [22]:
data['train'][0]

{'messages': [{'role': 'system',
   'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'},
  {'role': 'user', 'content': 'What is a vector space in linear algebra?'},
  {'role': 'assistant',
   'content': 'A vector space is a collection of vectors where you can add them together and multiply them by scalars, following specific rules.'}],
 'text': 'System: You are a mathematician who is specialized in linear algebra and also statistics.\nUser: What is a vector space in linear algebra?\nAssistant: A vector space is a collection of vectors where you can add them together and multiply them by scalars, following specific rules.\n',
 'input_ids': [100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,
  100256,


In [23]:
# HugingFace Trainer (Basic setup)
from transformers import Trainer, TrainingArguments

train_args = TrainingArguments(
    output_dir=outp_dir,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    eval_strategy="steps",
    save_steps=500,
    logging_steps=100,
    num_train_epochs=3,
    learning_rate=2e-5,
    # bf16=True,
    no_cuda=True, # dont use gpu when setting up variables If not using LoRA
    # use_cpu=True, # dont use gpu when setting up variables If not using LoRA
    fp16=True,
    push_to_hub=False,
    hub_model_id=None,
    hub_token=None
)

# better for training 1B+ OOM error possible without
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj","v_proj"]
)

model = get_peft_model(model,lora_config)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=data['train'],
    eval_dataset=data['valid'],
    processing_class=tokenizer
)



In [24]:
save_dir = file_dir.replace('Notebooks','Files/sm_artifacts')
save_dir

'/app/cloned_repo/LLM-World/Files/sm_artifacts'

In [None]:
trainer.train()
trainer.save_model(save_dir)
trainer.tokenizer.save_pretrained(save_dir)

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

  3


[34m[1mwandb[0m: You chose "Don't visualize my results"


[34m[1mwandb[0m: Detected [huggingface_hub.inference, mcp] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


* Trackio project initialized: huggingface
* Trackio metrics will be synced to Hugging Face Dataset: marfok/trackio-dataset
* Creating new space: https://huggingface.co/spaces/marfok/trackio
* View dashboard by going to: https://marfok-trackio.hf.space/


* Created new run: marfok-1767974757


GraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. Because one was not provided, no cache will be returned.


## With TRL

In [3]:
from datasets import load_dataset
from loguru import logger

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
trn_data = %pwd
trn_data = trn_data.replace('Notebooks','Files/training_set.jsonl')
val_data = trn_data.replace('training_set','validation_set')

In [5]:
## Convert to huggingface dataset

data = load_dataset("json", data_files={
    "train":trn_data,
    "valid":val_data
})
data
                    

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 409
    })
    valid: Dataset({
        features: ['messages'],
        num_rows: 20
    })
})

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [7]:
model_name = "ibm-granite/granite-4.0-h-1b"


logger.info("SFT with TRL")
logger.info(f"Loading model: {model_name}")

[32m2026-01-17 08:07:35.224[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mSFT with TRL[0m
[32m2026-01-17 08:07:35.225[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mLoading model: ibm-granite/granite-4.0-h-1b[0m


In [8]:
model_id = model_name

tokenizer = AutoTokenizer.from_pretrained(model_id) # Load Tokenizer
logger.info(f"pad_token:{tokenizer.pad_token} (allows tokens to have the same length)")

if tokenizer.pad_token is None:    
    tokenizer.pad_token = tokenizer.eos_token
    logger.info(f"set pad_token=eos_token:{tokenizer.eos_token} (allows tokens to have the same length)")

# model = AutoModelForCausalLM.from_pretrained(model_id,device_map="auto",dtype=torch.bfloat16) # Load base model
model = AutoModelForCausalLM.from_pretrained(model_id) # Load base model

[32m2026-01-17 08:07:35.829[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mpad_token:<|pad|> (allows tokens to have the same length)[0m
The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d


In [9]:
model

GraniteMoeHybridForCausalLM(
  (model): GraniteMoeHybridModel(
    (embed_tokens): Embedding(100352, 1536, padding_idx=100256)
    (layers): ModuleList(
      (0-4): 5 x GraniteMoeHybridDecoderLayer(
        (input_layernorm): GraniteMoeHybridRMSNorm((1536,), eps=1e-05)
        (post_attention_layernorm): GraniteMoeHybridRMSNorm((1536,), eps=1e-05)
        (shared_mlp): GraniteMoeHybridMLP(
          (activation): SiLUActivation()
          (input_linear): Linear(in_features=1536, out_features=8192, bias=False)
          (output_linear): Linear(in_features=4096, out_features=1536, bias=False)
        )
        (mamba): GraniteMoeHybridMambaLayer(
          (act): SiLUActivation()
          (conv1d): Conv1d(3328, 3328, kernel_size=(4,), stride=(1,), padding=(3,), groups=3328)
          (in_proj): Linear(in_features=1536, out_features=6448, bias=False)
          (norm): GraniteMoeHybridRMSNormGated()
          (out_proj): Linear(in_features=3072, out_features=1536, bias=False)
        )


In [10]:
from peft import LoraConfig, get_peft_model

In [15]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "q_proj", "k_proj","v_proj","o_proj"],
    task_type="CAUSAL_LM"
)
pt_model = get_peft_model(model,peft_config)
pt_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GraniteMoeHybridForCausalLM(
      (model): GraniteMoeHybridModel(
        (embed_tokens): Embedding(100352, 1536, padding_idx=100256)
        (layers): ModuleList(
          (0-4): 5 x GraniteMoeHybridDecoderLayer(
            (input_layernorm): GraniteMoeHybridRMSNorm((1536,), eps=1e-05)
            (post_attention_layernorm): GraniteMoeHybridRMSNorm((1536,), eps=1e-05)
            (shared_mlp): GraniteMoeHybridMLP(
              (activation): SiLUActivation()
              (input_linear): Linear(in_features=1536, out_features=8192, bias=False)
              (output_linear): Linear(in_features=4096, out_features=1536, bias=False)
            )
            (mamba): GraniteMoeHybridMambaLayer(
              (act): SiLUActivation()
              (conv1d): Conv1d(3328, 3328, kernel_size=(4,), stride=(1,), padding=(3,), groups=3328)
              (in_proj): Linear(in_features=1536, out_features=6448, bias=False)
              (

In [27]:
# trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in pt_model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in pt_model.parameters())


print(trainable_params)
print(total_params)

logger.info(f"Trainable parameters: {trainable_params:,}/{total_params:,} ({100*trainable_params/total_params:.2f}%)")

[32m2026-01-17 08:32:12.749[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mTrainable parameters: 655,360/1,462,193,728 (0.04%)[0m


655360
1462193728


In [12]:
from trl import SFTTrainer, SFTConfig

In [13]:
pwd_dir = %pwd
file_dir = pwd_dir.replace('Notebooks','Files')
logger.info(f"output_dir={file_dir}")

[32m2026-01-17 08:07:37.607[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1moutput_dir=/app/home/marfok/LLM-World/Files[0m


In [29]:
sft_training_args = SFTConfig(
    output_dir=str(file_dir),
    num_train_epochs=4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    max_length=2048,
    assistant_only_loss=True,
    warmup_ratio=0.1,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    bf16=True,
    optim="adamw_torch",
    seed=42,
    gradient_checkpointing=True,
    # report_to="wandb",
    packing=False
)


trainer = SFTTrainer(
    model = model_name,
    processing_class=tokenizer,
    train_dataset=data['train'],
    peft_config=peft_config,
    args=sft_training_args
)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (1133212435.py, line 24)

In [10]:
trainer.train()

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

  3


[34m[1mwandb[0m: You chose "Don't visualize my results"
[34m[1mwandb[0m: Using W&B in offline mode.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, mcp] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


LocalTokenNotFoundError: You must be logged in to Hugging Face locally when `space_id` is provided to deploy to a Space. Token is required (`token=True`), but no token found. You need to provide a token or be logged in to Hugging Face with `hf auth login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.