# Welcome to Yuna Ai LLM trainer

In [None]:
# Install the training stack. Most lines pass --no-deps so pip installs only the
# named packages and leaves their transitive dependencies untouched.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
!pip install --no-deps unsloth
!pip install --no-deps git+https://github.com/huggingface/transformers.git # Only for Gemma 3N
!pip install --no-deps --upgrade timm # Only for Gemma 3N
# NOTE(review): this pinned install runs last, so it presumably replaces the git
# build of transformers installed above - confirm that this is intended.
!pip install transformers==4.53.0

# Initialize the Model

In [None]:
from unsloth import FastModel

# Maximum sequence length handed to the loader; the trainer cell reuses this value.
max_seq_length = 16384

# Load the checkpoint together with its tokenizer through Unsloth.
model, tokenizer = FastModel.from_pretrained(
    model_name="modelpath",
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=True,
    # load_in_8bit=True,
    full_finetuning=True,  # use without LoRA
)

In [None]:
# The model-loading cell only imports FastModel, so FastLanguageModel must be
# imported here; without this the cell fails with a NameError on a fresh kernel.
from unsloth import FastLanguageModel

# Wrap the loaded model with LoRA adapters on the attention and MLP projections,
# plus the input embeddings and the output head.
model = FastLanguageModel.get_peft_model(
    model,
    r = 128, # 128 or 256
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",
                      "embed_tokens", "lm_head"],
    lora_alpha = 128, # 128 or 256
    lora_dropout = 0,
    bias = "all", # "all" or "lora_only"
    use_gradient_checkpointing = "unsloth",
    random_state = 8,
    use_rslora = True,
    loftq_config = None, # And LoftQ
)

In [None]:
# Disable gradients for every parameter that belongs to the vision tower or the
# multi-modal projector, so those parameters are not trained.
frozen_name_parts = ("vision_tower", "multi_modal_projector")
for param_name, weight in model.named_parameters():
    if any(part in param_name for part in frozen_name_parts):
        weight.requires_grad_(False)

# Bug Fixes

In [None]:
# Fallback for when the run does not work as-is: resize the model's token
# embeddings to a fixed vocabulary size.
# NOTE(review): 262164 is presumably the tokenizer length after the extra special
# tokens were added - confirm against len(tokenizer).
resized_vocab_size = 262164
model.resize_token_embeddings(resized_vocab_size)

# Training time

In [None]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
from datasets import load_dataset

# Read the JSON Lines training file; the trainer cell consumes its "train" split.
dataset = load_dataset(path="json", data_files="datapath.jsonl")

In [None]:
# Build the training arguments, create the trainer, and run the fine-tuning job.
from unsloth import UnslothTrainer, UnslothTrainingArguments

training_args = UnslothTrainingArguments(
    output_dir="OutputFolder",
    seed=8,
    # Batching and schedule length.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=50,
    # Optimizer and learning-rate schedule.
    optim="adamw_8bit",
    learning_rate=1e-5,
    embedding_learning_rate=1e-6,  # 2-10x smaller than learning_rate
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    max_grad_norm=2.0,
    # Precision and memory.
    bf16=True,
    gradient_checkpointing=True,
    # Logging and checkpointing.
    logging_steps=1,
    report_to="tensorboard",
    save_strategy="epoch",
    save_steps=1,
    save_safetensors=False,  # checkpoints are not written in safetensors format
)

trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=None,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=False,  # only for tiny datasets
    args=training_args,
)

trainer_stats = trainer.train()

# Inference

In [None]:
from transformers import Gemma3ForConditionalGeneration, AutoTokenizer
import torch

# Load the model and tokenizer
model = Gemma3ForConditionalGeneration.from_pretrained("modelpath", trust_remote_code=True, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("modelpath", trust_remote_code=True)
prompt = "Once upon a time,"  # Your input prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Sample the first new token by hand: run one forward pass, scale the logits of
# the last position by the temperature, and draw from the resulting distribution.
temperature = 0.7
with torch.no_grad():
    # Plain forward pass. Options such as max_new_tokens and return_dict_in_generate
    # belong to generate(), not to the forward call, so they are not passed here;
    # hidden states are not requested because nothing below reads them.
    outputs = model(**inputs)
    logits = outputs.logits[:, -1, :]
    probs = torch.softmax(logits / temperature, dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)

# Append the sampled token to the input IDs and continue generation.
# The attention mask is extended by one position so that generate() receives a
# mask matching the longer input instead of having to infer one.
input_ids = torch.cat([inputs["input_ids"], next_token], dim=-1)
attention_mask = torch.cat([inputs["attention_mask"], torch.ones_like(next_token)], dim=-1)
generated_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=49,  # one token was already sampled above; adjust accordingly
)

generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
print(generated_text)

# Export

In [None]:
import torch
from transformers import Gemma3ForConditionalGeneration, AutoTokenizer
from peft import PeftModel

# Load the base model in FP16 on the CPU, passing tie_word_embeddings=False.
base_model = Gemma3ForConditionalGeneration.from_pretrained(
    "modelpath",
    torch_dtype=torch.float16,
    device_map="cpu",
    low_cpu_mem_usage=True,
    return_dict=True,
    tie_word_embeddings=False,
)

# Give lm_head its own copy of the input-embedding weights.
input_embedding_weights = base_model.get_input_embeddings().weight.data
base_model.lm_head.weight.data = input_embedding_weights.clone()

# Attach the trained PEFT checkpoint and merge its weights into the base model.
model = PeftModel.from_pretrained(base_model, "checkpointpath")
merged_model = model.merge_and_unload()

# Sampling defaults stored in the model's generation config.
generation_config = merged_model.generation_config
generation_config.do_sample = True
generation_config.temperature = 0.7
generation_config.top_p = 1.0

# The tokenizer is taken from the checkpoint directory and set to pad on the right.
tokenizer = AutoTokenizer.from_pretrained("checkpointpath", trust_remote_code=True)
tokenizer.padding_side = "right"

# Write the merged model (non-safetensors format) and the tokenizer to one folder.
export_dir = "outputpath"
merged_model.save_pretrained(export_dir, max_shard_size="40GB", safe_serialization=False)
tokenizer.save_pretrained(export_dir)

In [None]:
# Check the folder size: print the total disk usage of the folder as a single
# summary line in human-readable units.

!du -sh folderpath

In [None]:
# Upload the local "modelfolder" directory to the Hugging Face repository
# username/model; the trailing "." is the destination path inside the repository.
!huggingface-cli upload username/model "modelfolder" .

# Create a base model with additional special tokens

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os

# 1. Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    "modelpath",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    trust_remote_code=True
)

# 2. Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("modelpath", trust_remote_code=True)

# Retrieve existing special tokens
existing_special_tokens = tokenizer.special_tokens_map.get("additional_special_tokens", [])

# Define new special tokens
new_special_tokens = [
    "<memory>", "</memory>", "<shujinko>", "</shujinko>",
    "<kanojo>", "</kanojo>", "<dialog>", "<yuki>", "</yuki>",
    "<yuna>", "</yuna>", "<hito>", "</hito>", "<qt>", "</qt>",
    "<action>", "</action>", "<data>", "</data>"]

# Combine existing and new special tokens, ensuring no duplicates.
# dict.fromkeys keeps first-seen order, whereas set() ordering of strings changes
# from one Python process to the next; a stable order keeps the IDs assigned to
# the new tokens identical across runs.
all_special_tokens = list(dict.fromkeys(list(existing_special_tokens) + new_special_tokens))

# Add them to the tokenizer
tokenizer.add_special_tokens({"additional_special_tokens": all_special_tokens})

print(f"New vocab size: {len(tokenizer)}")

# 3. Resize model embeddings so there is one row per tokenizer entry
base_model.resize_token_embeddings(len(tokenizer))

# 4. Save the updated model and tokenizer
base_model.save_pretrained("new_model_dir", max_shard_size="40GB", safe_serialization=False)
tokenizer.save_pretrained("new_model_dir")

print("New model with additional special tokens created and saved.")

In [None]:
# HF conversion: reload the model, give lm_head its own weights, and re-save
# everything in safetensors format.
import torch
from transformers import Gemma3ForConditionalGeneration, AutoTokenizer

source_dir = "modelpath"
target_dir = "outputpath"

# Step 1: load the PyTorch model in FP16 on the CPU
print("Reloading model to untie weights...")
untied_model = Gemma3ForConditionalGeneration.from_pretrained(
    source_dir,
    torch_dtype=torch.float16,
    device_map="cpu",
    low_cpu_mem_usage=True,
)

# Step 2: load the tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(source_dir, use_fast=True)

# Step 3: replace lm_head's weight with an independent copy of the input embeddings
embedding_copy = untied_model.language_model.embed_tokens.weight.clone()
untied_model.lm_head.weight = torch.nn.Parameter(embedding_copy)

# Step 4: save model and tokenizer in safetensors format
print("Converting to safetensors format...")
untied_model.save_pretrained(target_dir, max_shard_size="40GB", safe_serialization=True)
tokenizer.save_pretrained(target_dir)

print("Conversion complete!")