In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
from datasets import Dataset
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def create_model(
    lm_model_name = "google/gemma-3-1b-pt", 
    device = 'cuda' if torch.cuda.is_available() else 'cpu',
):
    tokenizer = AutoTokenizer.from_pretrained(lm_model_name)
    model = AutoModelForCausalLM.from_pretrained(
        lm_model_name,
        device_map="auto",
        attn_implementation="eager",
    ).to(device).train()
    # model.config.use_cache = False

    return model, tokenizer

In [3]:
def preprocess_function(
    examples
):
    out = tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    labels = out["input_ids"].copy()
    if "attention_mask" in out:
        for i, mask in enumerate(out["attention_mask"]):
            labels[i] = [
                (tok if m == 1 else -100) for tok, m in zip(labels[i], mask)
            ]
    else:
        pad_id = tokenizer.pad_token_id
        labels = [[(tok if tok != pad_id else -100) for tok in row] for row in labels]
    out["labels"] = labels
    return out

def create_dataset(
    documents_path: str, 
    tokenizer, 

):
    custom_data = []
    with open(documents_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line:
                data = json.loads(line)
                custom_data.append(data)

    dataset = Dataset.from_list(custom_data)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    tokenized_dataset = dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=dataset.column_names
    )
    train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
    return train_test_split["train"], train_test_split["test"]

In [4]:
model, tokenizer = create_model(
    lm_model_name="google/gemma-3-1b-pt"
)


In [5]:
documents_path = './dataset/ncku_wikipedia_2510080406.jsonl'

train_dataset, eval_dataset = create_dataset(
    documents_path=documents_path, 
    tokenizer=tokenizer, 
)

Map: 100%|██████████| 20/20 [00:00<00:00, 2699.91 examples/s]


In [6]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "q_proj",
        "k_proj", 
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 13,045,760 || all params: 1,012,931,712 || trainable%: 1.2879


In [7]:
training_args = TrainingArguments(
    output_dir="./gemma-3-1b-pt-lora",
    num_train_epochs=100,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    logging_steps=10,
    save_steps=100,
    eval_steps=100,
    save_total_limit=3,
    optim="adamw_torch_fused",
    warmup_steps=50,
    report_to="none",
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

trainer.train()

trainer.save_model("./gemma-3-1b-pt-lora-final")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,2.3558
20,2.0626
30,1.8244
40,1.2496
50,0.5236
60,0.1624
70,0.075
80,0.0448
90,0.0336
100,0.0316
