In [None]:
!pip install -q wandb --upgrade
!pip install -q numpy --upgrade

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading configuration. These names are module-level on purpose:
# `max_seq_length` is reused when the trainer is built further down.
max_seq_length = 2048  # free to choose; upstream comment says RoPE scaling is handled internally
dtype = None           # None = auto-detect; float16 for Tesla T4/V100, bfloat16 for Ampere+
load_in_4bit = True    # the checkpoint below is a bitsandbytes 4-bit build

# Collect the keyword arguments first so the load call itself stays a one-liner.
pretrained_kwargs = dict(
    model_name = "unsloth/Qwen2.5-3B-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**pretrained_kwargs)

Thêm `embed_tokens` và `lm_head` cho phép mô hình học phân phối của dữ liệu

In [None]:
# Modules that receive LoRA adapters, grouped by role. The embedding and
# output-head layers are included in addition to the attention/MLP projections.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]
embedding_layers = ["embed_tokens", "lm_head"]

# Wrap the base model with LoRA adapters (rebinds `model` to the PEFT-wrapped model).
model = FastLanguageModel.get_peft_model(
    model,
    r = 128,
    target_modules = attention_projections + mlp_projections + embedding_layers,
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    # Accepts True or "unsloth"; upstream notes "unsloth" uses ~30% less VRAM
    # and is intended for very long contexts.
    use_gradient_checkpointing = "unsloth",
    random_state = 2512,
    use_rslora = True,    # rank-stabilized LoRA
    loftq_config = None,  # LoftQ is not used
)

# Data Preparation

In [5]:
# End-of-sequence token of the loaded tokenizer; formatting_prompts_func
# appends it to every rendered training example.
EOS_TOKEN = tokenizer.eos_token
# Training text template with two positional placeholders, filled by
# formatting_prompts_func as (title, content) in that order.
# The leading/trailing newlines are part of the template text.
prompt = """
Tiêu đề: {}

Nội dung: {}
"""

def formatting_prompts_func(examples):
    """Render a batch of title/content pairs into training texts.

    Args:
        examples: Batched mapping with parallel "title" and "content"
            sequences (as passed by ``dataset.map(..., batched=True)``).

    Returns:
        A dict with a single "text" key holding one rendered string per
        input pair: the module-level ``prompt`` template filled with the
        title and content, followed by ``EOS_TOKEN``.
    """
    rendered = [
        # The EOS token marks where an example ends, so generation can stop.
        prompt.format(heading, body) + EOS_TOKEN
        for heading, body in zip(examples["title"], examples["content"])
    ]
    return {"text": rendered}

In [None]:
# Load the chunked raw corpus from the Hugging Face Hub and render every row
# into a training text.
import os

from datasets import load_dataset
# On Kaggle the token could instead come from the secrets store:
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()

# Access tokens must never be embedded in a notebook: read the token from the
# HF_TOKEN environment variable. When it is unset, `token` is None and
# `datasets` falls back to any locally stored login.
# NOTE(review): a token that was ever committed in this cell should be revoked.
token = os.environ.get("HF_TOKEN")
dataset = load_dataset("NCKH-2425/raw_data_chunked", token=token, split="train")

# Add the "text" column produced by formatting_prompts_func (batched for speed).
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [7]:
# Authenticate with Weights & Biases without embedding the API key in the
# notebook (keys on a shell command line also leak into saved outputs).
# wandb.login() uses the WANDB_API_KEY environment variable when it is set
# and otherwise prompts for the key interactively.
import wandb

wandb.login();

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /teamspace/studios/this_studio/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [8]:
import os

# Weights & Biases settings, exported as environment variables in one step.
# Presumably consumed by the trainer's W&B reporting (`report_to="wandb"`):
# the project to log into and the model-logging mode — TODO confirm.
os.environ.update(
    {
        "WANDB_PROJECT": "NCKH_2425_Qwen_pretrained",
        "WANDB_LOG_MODEL": "checkpoint",
    }
)

In [None]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

# Choose the mixed-precision mode once: bf16 when the GPU supports it,
# fp16 otherwise (exactly one of the two flags ends up True).
use_bf16 = is_bfloat16_supported()

# Hyper-parameters for the run, kept separate from the trainer wiring below.
training_args = UnslothTrainingArguments(
    # 64 sequences per device x 4 accumulation steps = 256 sequences per optimizer step.
    per_device_train_batch_size = 64,
    gradient_accumulation_steps = 4,

    # Schedule: fixed warm-up and epoch count (step/ratio variants left disabled).
    warmup_steps = 50,
    # max_steps = 500,
    # warmup_ratio = 0.1,
    num_train_epochs = 20,

    # The embedding matrices get a smaller learning rate than the rest
    # (upstream guidance: 2x to 10x smaller; here 5x).
    learning_rate = 5e-5,
    embedding_learning_rate = 1e-5,

    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 25,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 2512,
    output_dir = "outputs",
    report_to = "wandb",  # send metrics to Weights & Biases
    save_strategy = "steps",
    save_steps = 50,
    run_name = "NCKH_2425_Qwen_pretrained",  # optional display name for the run
)

# Wire model, tokenizer and the formatted dataset ("text" column) into the trainer.
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    args = training_args,
)

In [None]:
#@title Show current memory stats
# Report the name and total memory of GPU 0 together with the peak memory
# reserved by PyTorch so far; sizes are converted from bytes to GiB.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
import wandb
import numpy as np

# Resume training from a checkpoint stored as a W&B model artifact.
checkpoint_artifact = "voluyen-tdtu/NCKH_2425_Qwen_pretrained/model-NCKH_2425_Qwen_pretrained:v2"

# Start a run, declare the artifact as an input of that run, and download it locally.
run = wandb.init()
artifact = run.use_artifact(checkpoint_artifact, type='model')
artifact_dir = artifact.download()

# Allow-list NumPy's array reconstructor for torch's safe (restricted) unpickling.
# NOTE(review): presumably required because the checkpoint's saved state contains
# pickled NumPy arrays — confirm; `np.core` is a legacy alias on NumPy 2.x.
torch.serialization.add_safe_globals([np.core.multiarray._reconstruct])

# Continue training from the downloaded checkpoint directory.
trainer_stats = trainer.train(resume_from_checkpoint=artifact_dir)

In [None]:
# To train from scratch instead of resuming from the W&B checkpoint above, run:
# trainer_stats = trainer.train()

<a name="Inference"></a>
### Suy luận

You can also use a `TextStreamer` for continuous inference, so you can watch the generation token by token instead of waiting for the whole output.

In [None]:
# Switch the model into Unsloth's inference mode, then let it continue a bare title.
FastLanguageModel.for_inference(model)

title_prompt = "Các kiểu chồng lười"
inputs = tokenizer([title_prompt], return_tensors = "pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens = 1024, use_cache = True)
tokenizer.batch_decode(outputs)
# Streaming alternative (prints tokens as they are generated):
# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 2048)

In [None]:
# Generate a continuation for a second title (batch of one, up to 1024 new tokens).
title_prompt = "Kỹ năng chung sống trong gia đình"
inputs = tokenizer([title_prompt], return_tensors = "pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens = 1024, use_cache = True)
tokenizer.batch_decode(outputs)

In [None]:
# Generate a continuation for a third title (batch of one, up to 1024 new tokens).
title_prompt = "5 điều phụ nữ không nên làm khi chồng ngoại tình"
inputs = tokenizer([title_prompt], return_tensors = "pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens = 1024, use_cache = True)
tokenizer.batch_decode(outputs)

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# Merge to 16bit and push the merged model plus tokenizer to the Hugging Face Hub.
import os

# The write token is read from the HF_TOKEN environment variable instead of
# being embedded in the notebook; a KeyError here means the variable is unset.
# The token needs write access to the target repository.
# NOTE(review): a token that was ever committed in this cell should be revoked.
model.push_to_hub_merged(
    "baesad/Pretrained-Qwen2.5-3B-NCKH",
    tokenizer,
    save_method = "merged_16bit",
    token = os.environ["HF_TOKEN"],
)