# Setup environment

## Environment variables

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Only use 1 GPU
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Get secrets

In [2]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
WANDB_API_KEY = user_secrets.get_secret("WANDB_API_KEY")

## Import modules

In [3]:
!pip install -qU transformers accelerate bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    get_linear_schedule_with_warmup
)
from datasets import load_dataset

import wandb
import numpy as np
from datetime import datetime
import json
from tqdm.auto import tqdm
import gc
import math
import time

## Random seed & device

In [5]:
# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# Finetune config

In [6]:
class Config:
    # Model configuration
    model_name = "Qwen/Qwen3-0.6B"
    # model_name = "Qwen/Qwen3-1.7B"
    dataset_name = "vietgpt/wikipedia_vi"
    use_quantization = False
    
    # Training configuration
    output_dir = "./qwen-vietnamese-wiki-finetuned"
    # output_dir = "./qwen-vietnamese-wiki-finetuned-2"
    num_train_epochs = 5
    per_device_train_batch_size = 2
    per_device_valid_batch_size = 2
    gradient_accumulation_steps = 8
    learning_rate = 5e-5
    weight_decay = 0.01
    warmup_ratio = 0.1
    max_length = 128

    # Optimization settings
    adam_epsilon = 1e-8
    max_grad_norm = 1.0
    
    # Logging and saving
    logging_steps = 40
    save_strategy = "epoch"
    valid_strategy = "epoch"
    
    # Other settings
    fp16 = True
    num_workers = os.cpu_count()
    
    # W&B configuration
    use_wandb = True
    wandb_run_id = None
    wandb_project = "PARADIS-Qwen3_0.6B"
    # wandb_project = "PARADIS-Qwen3_1.7B"
    wandb_run_name = "1GPU"

    # HuggingFace configuration
    use_hf = True
    hf_repo = "h9art/PARADIS-Qwen3_0.6B-10kWikiVi-1GPU"
    # hf_repo = "h9art/PARADIS-Qwen3_1.7B-10kWikiVi-1GPU"
    
    # Dataset
    train_size = 10000
    valid_size = 10000
    test_size = 5000
    min_text_length = 50
    random_seed = 42

config = Config()

In [7]:
config_dict = {k: v for k, v in Config.__dict__.items() if not k.startswith("__") and not callable(v)}
config_dict

{'model_name': 'Qwen/Qwen3-0.6B',
 'dataset_name': 'vietgpt/wikipedia_vi',
 'use_quantization': False,
 'output_dir': './qwen-vietnamese-wiki-finetuned',
 'num_train_epochs': 5,
 'per_device_train_batch_size': 2,
 'per_device_valid_batch_size': 2,
 'gradient_accumulation_steps': 8,
 'learning_rate': 5e-05,
 'weight_decay': 0.01,
 'warmup_ratio': 0.1,
 'max_length': 128,
 'adam_epsilon': 1e-08,
 'max_grad_norm': 1.0,
 'logging_steps': 40,
 'save_strategy': 'epoch',
 'valid_strategy': 'epoch',
 'fp16': True,
 'num_workers': 4,
 'use_wandb': True,
 'wandb_run_id': None,
 'wandb_project': 'PARADIS-Qwen3_0.6B',
 'wandb_run_name': '1GPU',
 'use_hf': True,
 'hf_repo': 'h9art/PARADIS-Qwen3_0.6B-10kWikiVi-1GPU',
 'train_size': 10000,
 'valid_size': 10000,
 'test_size': 5000,
 'min_text_length': 50,
 'random_seed': 42}

# Setup wandb

In [8]:
wandb.login(key=WANDB_API_KEY)
if config.use_wandb:
    if config.wandb_run_id is None:
        wandb.init( # New run
            project=config.wandb_project,
            name=config.wandb_run_name,
            config=config_dict,
        )
    else:
        wandb.init( # Resume to created run
            project=config.wandb_project,
            id=config.wandb_run_id,
            resume='allow',
        )

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtqhb2502[0m ([33mtqhb2502-hanoi-university-of-science-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.19.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250621_074238-h7cnq6zi[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m1GPU[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/tqhb2502-hanoi-university-of-science-and-technology/PARADIS-Qwen3_0.6B[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/tqhb2502-hanoi-university-of-science-a

# Setup HuggingFace

In [9]:
if config.use_hf:
    from huggingface_hub import login, HfApi
    login(HF_TOKEN)
    hf_api = HfApi()

# Model and tokenizer

## Download and quantization

In [10]:
print("Loading tokenizer and model...")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    config.model_name,
    trust_remote_code=True,
    padding_side="right"
)

# Add pad token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Cấu hình 4-bit quantization
quantization_config = BitsAndBytesConfig(load_in_4bit=True) if config.use_quantization else None

# Load model
model = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    device_map="auto", # automatically move to correct device
    quantization_config=quantization_config,
    torch_dtype=torch.float32,
    trust_remote_code=True
)

# Turn on gradient checkpointing to save memory
model.config.use_cache = False
model.gradient_checkpointing_enable()

# Num parameters
print(f"Model loaded. Parameters: {model.num_parameters():,}")

Loading tokenizer and model...


tokenizer_config.json:   0%|          | 0.00/9.73k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

2025-06-21 07:42:49.102616: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750491769.261990      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750491769.308099      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Model loaded. Parameters: 596,049,920


In [11]:
print(model)

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwe

## Generation function

In [12]:
def generate_text(
    prompt,
    max_length=config.max_length,
    temperature=0.7,
    top_p=0.9,
    top_k=50
):
    """Generate text using the model."""
    
    model.eval()
    
    # Tokenize input
    inputs = tokenizer.encode(prompt, return_tensors='pt').to(device)
    
    with torch.no_grad():
        # Generate
        outputs = model.generate(
            inputs,
            max_length=max_length,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1
        )
    
    # Decode generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Dataset

## Custom dataset

In [13]:
class WikiViDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Get data
        item = self.dataset[idx]
        combined_text = f"Tiêu đề: {item['title']}\n\nNội dung: {item['text']}"

        # Tokenize data
        tokenized_text = self.tokenizer(
            combined_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # # Print a tokenized sample
        # print(tokenized_text)

        # Prepare data from tokenizer output
        input_ids = tokenized_text["input_ids"].squeeze()
        attention_mask = tokenized_text["attention_mask"].squeeze()
        labels = input_ids.clone() # In causal LM, labels is the same with input_ids
        labels[attention_mask == 0] = -100 # Do not calculate loss on padding tokens

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

## Load wikipedia_vi dataset

In [14]:
print("Loading dataset...")
dataset = load_dataset(config.dataset_name, split="train")
print(f"Dataset loaded. Total samples: {len(dataset)}")

Loading dataset...


README.md:   0%|          | 0.00/632 [00:00<?, ?B/s]

(…)-00000-of-00003-6218d2963e302058.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

(…)-00001-of-00003-12e6c4fadbec91d4.parquet:   0%|          | 0.00/55.2M [00:00<?, ?B/s]

(…)-00002-of-00003-175fcfe1c45b0b85.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1284930 [00:00<?, ? examples/s]

Dataset loaded. Total samples: 1284930


In [15]:
dataset[0]

{'id': 2,
 'revid': '90949',
 'url': 'https://vi.wikipedia.org/wiki?curid=2',
 'title': 'Trang Chính',
 'text': '&lt;templatestyles src="Wiki2021/styles.css" /&gt;__NOEDITSECTION__\n \n \n \n '}

## Preprocess data

In [16]:
# keep only title and text column
dataset = dataset.select_columns(['title', 'text'])

In [17]:
dataset[0]

{'title': 'Trang Chính',
 'text': '&lt;templatestyles src="Wiki2021/styles.css" /&gt;__NOEDITSECTION__\n \n \n \n '}

In [18]:
def filter_function(example):
    """Filter out empty or very short texts"""
    
    return (
        example['text'] is not None and 
        example['title'] is not None and
        len(example['text'].strip()) > config.min_text_length
    )

dataset = dataset.filter(filter_function)
print(f"After filtering: {len(dataset)} samples")

Filter:   0%|          | 0/1284930 [00:00<?, ? examples/s]

After filtering: 1263196 samples


## Create splits

In [19]:
dataset = dataset.shuffle(seed=config.random_seed)

train_split = dataset.select(range(
    config.train_size
))

valid_split = dataset.select(range(
    config.train_size,
    config.train_size + config.valid_size
))

test_split = dataset.select(range(
    config.train_size + config.valid_size,
    config.train_size + config.valid_size + config.test_size
))

print(f'train split: {len(train_split)} samples')
print(f'valid split: {len(valid_split)} samples')
print(f'test split: {len(test_split)} samples')

train split: 10000 samples
valid split: 10000 samples
test split: 5000 samples


In [20]:
train_ds = WikiViDataset(train_split, tokenizer, config.max_length)
valid_ds = WikiViDataset(valid_split, tokenizer, config.max_length)
test_ds = WikiViDataset(test_split, tokenizer, config.max_length)

In [21]:
# # Display a sample
# train_ds[0]

## Data loader

In [22]:
train_dataloader = DataLoader(
    train_ds,
    batch_size=config.per_device_train_batch_size,
    shuffle=True,
    num_workers=config.num_workers,
    pin_memory=True,
)

valid_dataloader = DataLoader(
    valid_ds,
    batch_size=config.per_device_valid_batch_size,
    shuffle=True,
    num_workers=config.num_workers,
    pin_memory=True,
)

In [23]:
print(f"Train batches: {len(train_dataloader)}")
print(f"Valid batches: {len(valid_dataloader)}")

Train batches: 5000
Valid batches: 5000


# Optimizer & scheduler

In [24]:
total_steps = len(train_dataloader) * config.num_train_epochs // config.gradient_accumulation_steps
warmup_steps = int(total_steps * config.warmup_ratio)

print(f"Total training steps: {total_steps}")
print(f"Warmup steps: {warmup_steps}")

Total training steps: 3125
Warmup steps: 312


In [25]:
# Setup optimizer
optimizer = optim.AdamW(
    model.parameters(),
    lr=config.learning_rate,
    weight_decay=config.weight_decay,
    eps=config.adam_epsilon
)

# Setup learning rate scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

# Setup gradient scaler for mixed precision training
scaler = torch.amp.GradScaler(device) if config.fp16 else None

# Training function

In [26]:
def train_epoch(model, dataloader, optimizer, scheduler, scaler, epoch):
    """Train for one epoch."""
    
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    
    progress_bar = tqdm(dataloader, desc=f"Training Epoch {epoch + 1}")
    
    for step, batch in enumerate(progress_bar):
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Forward pass with mixed precision
        if config.fp16:
            # For mixed precision
            with torch.autocast(device_type=device.type):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                # Chia loss cho gradient_accumulation_steps
                # Nếu không nhận được loss sẽ gấp <gradient_accumulation_steps> lần loss thực sự
                loss = outputs.loss / config.gradient_accumulation_steps
        else:
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss / config.gradient_accumulation_steps
        
        # Backward pass
        if config.fp16:
            scaler.scale(loss).backward()
        else:
            loss.backward()
        
        total_loss += loss.item()
        
        # Update weights every gradient_accumulation_steps
        if (step + 1) % config.gradient_accumulation_steps == 0:
            if config.fp16:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                scaler.step(optimizer)
                scaler.update()
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                optimizer.step()
            
            scheduler.step()
            optimizer.zero_grad()
        
        # Update progress bar
        progress_bar.set_postfix({
            'loss': f"{loss.item() * config.gradient_accumulation_steps:.4f}",
            'lr': f"{scheduler.get_last_lr()[0]:.2e}"
        })
        
        # Logging
        if (step + 1) % config.logging_steps == 0:
            
            avg_loss = total_loss / (step + 1) * config.gradient_accumulation_steps
            print(f"Step {step + 1}/{len(dataloader)}, Loss: {avg_loss:.4f}, LR: {scheduler.get_last_lr()[0]:.2e}")

            if config.use_wandb:
                wandb.log({
                    "train_loss": avg_loss,
                    "learning_rate": scheduler.get_last_lr()[0],
                    "train_step": epoch * len(dataloader) + step + 1
                })
    
    return total_loss / len(dataloader) * config.gradient_accumulation_steps

# Validation function

In [27]:
def validate(model, dataloader):
    """Validate the model."""
    
    model.eval()
    total_loss = 0
    total_steps = 0
    
    with torch.no_grad():
        progress_bar = tqdm(dataloader, desc="Validating")
        
        for batch in progress_bar:
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            # Forward pass
            if config.fp16:
                with torch.autocast(device_type=device.type):
                    outputs = model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )
            else:
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
            
            loss = outputs.loss
            total_loss += loss.item()
            total_steps += 1
            
            progress_bar.set_postfix({'valid_loss': f"{loss.item():.4f}"})
    
    avg_loss = total_loss / total_steps
    perplexity = math.exp(avg_loss)
    
    return avg_loss, perplexity

# Training loop

## Test before training

In [28]:
test_prompts = [
    "Việt Nam là một quốc gia",
    "Tiêu đề: Hà Nội\n\nNội dung:",
    "Lịch sử Việt Nam bắt đầu từ",
    "Văn hóa truyền thống của người Việt",
    "Tiêu đề: Phở\n\nNội dung: Phở là"
]

print("\n" + "=" * 50)
print("TESTING THE ORIGINAL MODEL")
print("=" * 50)

for i, prompt in enumerate(test_prompts, 1):
    print(f"\n--- Test {i} ---")
    print(f"Prompt: {prompt}")
    print("-" * 40)
    
    generated = generate_text(prompt, max_length=150, temperature=0.7)
    print(f"Generated: {generated}")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



TESTING THE ORIGINAL MODEL

--- Test 1 ---
Prompt: Việt Nam là một quốc gia
----------------------------------------
Generated: Việt Nam là một quốc gia có nhiều yếu tố bùng nổ trong phát triển kinh tế, và có thể ảnh hưởng đến các vùng liên bang như Việt Nam. Hãy phân tích tại sao có sự khác biệt trong mức độ ổn định của các khu vực này?

**A. Vì sự cạnh tranh giữa các nền kinh tế địa phương và quốc tế**

B. Vì sự cạnh tranh giữa các nền kinh tế địa phương và quốc tế không đều bị đánh giá cao**

C. Vì sự cạnh tranh giữa các nền kinh tế địa phương và quốc tế trong một số khu vực**

D. Vì sự cạnh tranh giữa các nền kinh tế địa phương và quốc tế trong các khu vực không

--- Test 2 ---
Prompt: Tiêu đề: Hà Nội

Nội dung:
----------------------------------------
Generated: Tiêu đề: Hà Nội

Nội dung: Hãy cho biết về người nổi tiếng nhất trong lịch sử các nước phương Tây, có thể là một quốc gia hay một nhân dân tộc thiểu số?

Trả lời: ?

Giải thích: ... Vì sao?

Câu trả lời:

Hà Nội là một th

## Main loop

In [29]:
print("Starting training...")

# Create output directory
os.makedirs(config.output_dir, exist_ok=True)

# Training history
training_history = {
    'train_losses': [],
    'train_times': [],
    'valid_losses': [],
    'valid_perplexities': [],
    'valid_times': [],
    'learning_rates': []
}

best_valid_loss = float('inf')
step_count = 0

for epoch in range(config.num_train_epochs):
    print(f"\n{'=' * 50}")
    print(f"Epoch {epoch + 1}/{config.num_train_epochs}")
    print(f"{'=' * 50}")
    
    # Training
    start_time = time.time()
    train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, scaler, epoch)
    end_time = time.time()
    
    elapsed_time = end_time - start_time
    train_mins, train_secs = divmod(elapsed_time, 60)
    training_history['train_times'].append(train_mins)
    print(f"Training Time: {int(train_mins)} mins {int(train_secs)} seconds")
    
    training_history['train_losses'].append(train_loss)
    print(f"Training Loss: {train_loss:.4f}")
    
    # Validation
    start_time = time.time()
    valid_loss, perplexity = validate(model, valid_dataloader)
    end_time = time.time()
    
    elapsed_time = end_time - start_time
    valid_mins, valid_secs = divmod(elapsed_time, 60)
    training_history['valid_times'].append(valid_mins)
    print(f"Training Time: {int(valid_mins)} mins {int(valid_secs)} seconds")
    
    training_history['valid_losses'].append(valid_loss)
    training_history['valid_perplexities'].append(perplexity)
    print(f"Validation Loss: {valid_loss:.4f}")
    print(f"Perplexity: {perplexity:.2f}")
    
    # Log to wandb
    if config.use_wandb:
        wandb.log({
            "epoch": epoch + 1,
            "train_time (m)": train_mins,
            "valid_time (m)": valid_mins,
            "valid_loss": valid_loss,
            "perplexity": perplexity,
        })
    
    # Save best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        
        model.save_pretrained(config.output_dir)
        tokenizer.save_pretrained(config.output_dir)
        print(f"New best model! Saved to {config.output_dir}")
        
        if config.use_hf:
            model.push_to_hub(config.hf_repo)
            tokenizer.push_to_hub(config.hf_repo)
            print(f"Also saved to repo {config.hf_repo}")
        
    # Save training state
    torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'best_valid_loss': best_valid_loss,
        'training_history': training_history
    }, os.path.join(config.output_dir, 'training_state.pt'))
    print(f"Training state saved to {config.output_dir}!")

    if config.use_hf:
        hf_api.upload_file(
            path_or_fileobj=os.path.join(config.output_dir, 'training_state.pt'),
            path_in_repo="training_state.pt",
            repo_id=config.hf_repo,
            repo_type="model",
        )
    print(f"Training state pushed to repo {config.hf_repo}!")
    
    # Clean up GPU memory
    torch.cuda.empty_cache()
    gc.collect()

Starting training...

Epoch 1/5


Training Epoch 1:   0%|          | 0/5000 [00:00<?, ?it/s]

Step 40/5000, Loss: 2.9772, LR: 8.01e-07
Step 80/5000, Loss: 2.9300, LR: 1.60e-06
Step 120/5000, Loss: 2.7960, LR: 2.40e-06
Step 160/5000, Loss: 2.6377, LR: 3.21e-06
Step 200/5000, Loss: 2.5421, LR: 4.01e-06
Step 240/5000, Loss: 2.4431, LR: 4.81e-06
Step 280/5000, Loss: 2.3852, LR: 5.61e-06
Step 320/5000, Loss: 2.3035, LR: 6.41e-06
Step 360/5000, Loss: 2.2501, LR: 7.21e-06
Step 400/5000, Loss: 2.1892, LR: 8.01e-06
Step 440/5000, Loss: 2.1509, LR: 8.81e-06
Step 480/5000, Loss: 2.0961, LR: 9.62e-06
Step 520/5000, Loss: 2.0647, LR: 1.04e-05
Step 560/5000, Loss: 2.0323, LR: 1.12e-05
Step 600/5000, Loss: 2.0074, LR: 1.20e-05
Step 640/5000, Loss: 1.9844, LR: 1.28e-05
Step 680/5000, Loss: 1.9614, LR: 1.36e-05
Step 720/5000, Loss: 1.9424, LR: 1.44e-05
Step 760/5000, Loss: 1.9239, LR: 1.52e-05
Step 800/5000, Loss: 1.9086, LR: 1.60e-05
Step 840/5000, Loss: 1.8966, LR: 1.68e-05
Step 880/5000, Loss: 1.8783, LR: 1.76e-05
Step 920/5000, Loss: 1.8626, LR: 1.84e-05
Step 960/5000, Loss: 1.8498, LR: 1.9

Validating:   0%|          | 0/5000 [00:00<?, ?it/s]

Training Time: 4 mins 55 seconds
Validation Loss: 1.4762
Perplexity: 4.38
New best model! Saved to ./qwen-vietnamese-wiki-finetuned


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.38G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Also saved to repo h9art/PARADIS-Qwen3_0.6B-10kWikiVi-1GPU
Training state saved to ./qwen-vietnamese-wiki-finetuned!


training_state.pt:   0%|          | 0.00/7.15G [00:00<?, ?B/s]

Training state pushed to repo h9art/PARADIS-Qwen3_0.6B-10kWikiVi-1GPU!

Epoch 2/5


Training Epoch 2:   0%|          | 0/5000 [00:00<?, ?it/s]

Step 40/5000, Loss: 1.2121, LR: 4.43e-05
Step 80/5000, Loss: 1.1833, LR: 4.43e-05
Step 120/5000, Loss: 1.2089, LR: 4.42e-05
Step 160/5000, Loss: 1.1850, LR: 4.41e-05
Step 200/5000, Loss: 1.1826, LR: 4.40e-05
Step 240/5000, Loss: 1.1689, LR: 4.39e-05
Step 280/5000, Loss: 1.1654, LR: 4.38e-05
Step 320/5000, Loss: 1.1567, LR: 4.37e-05
Step 360/5000, Loss: 1.1619, LR: 4.36e-05
Step 400/5000, Loss: 1.1635, LR: 4.35e-05
Step 440/5000, Loss: 1.1709, LR: 4.35e-05
Step 480/5000, Loss: 1.1732, LR: 4.34e-05
Step 520/5000, Loss: 1.1713, LR: 4.33e-05
Step 560/5000, Loss: 1.1623, LR: 4.32e-05
Step 600/5000, Loss: 1.1532, LR: 4.31e-05
Step 640/5000, Loss: 1.1454, LR: 4.30e-05
Step 680/5000, Loss: 1.1385, LR: 4.29e-05
Step 720/5000, Loss: 1.1474, LR: 4.28e-05
Step 760/5000, Loss: 1.1465, LR: 4.27e-05
Step 800/5000, Loss: 1.1475, LR: 4.27e-05
Step 840/5000, Loss: 1.1429, LR: 4.26e-05
Step 880/5000, Loss: 1.1434, LR: 4.25e-05
Step 920/5000, Loss: 1.1381, LR: 4.24e-05
Step 960/5000, Loss: 1.1415, LR: 4.2

Validating:   0%|          | 0/5000 [00:00<?, ?it/s]

Training Time: 4 mins 57 seconds
Validation Loss: 1.4178
Perplexity: 4.13
New best model! Saved to ./qwen-vietnamese-wiki-finetuned


model.safetensors:   0%|          | 0.00/2.38G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Also saved to repo h9art/PARADIS-Qwen3_0.6B-10kWikiVi-1GPU
Training state saved to ./qwen-vietnamese-wiki-finetuned!


training_state.pt:   0%|          | 0.00/7.15G [00:00<?, ?B/s]

Training state pushed to repo h9art/PARADIS-Qwen3_0.6B-10kWikiVi-1GPU!

Epoch 3/5


Training Epoch 3:   0%|          | 0/5000 [00:00<?, ?it/s]

Step 40/5000, Loss: 0.7559, LR: 3.32e-05
Step 80/5000, Loss: 0.7570, LR: 3.31e-05
Step 120/5000, Loss: 0.7659, LR: 3.31e-05
Step 160/5000, Loss: 0.7618, LR: 3.30e-05
Step 200/5000, Loss: 0.7811, LR: 3.29e-05
Step 240/5000, Loss: 0.7777, LR: 3.28e-05
Step 280/5000, Loss: 0.7775, LR: 3.27e-05
Step 320/5000, Loss: 0.7773, LR: 3.26e-05
Step 360/5000, Loss: 0.7757, LR: 3.25e-05
Step 400/5000, Loss: 0.7695, LR: 3.24e-05
Step 440/5000, Loss: 0.7726, LR: 3.23e-05
Step 480/5000, Loss: 0.7716, LR: 3.23e-05
Step 520/5000, Loss: 0.7724, LR: 3.22e-05
Step 560/5000, Loss: 0.7791, LR: 3.21e-05
Step 600/5000, Loss: 0.7772, LR: 3.20e-05
Step 640/5000, Loss: 0.7786, LR: 3.19e-05
Step 680/5000, Loss: 0.7792, LR: 3.18e-05
Step 720/5000, Loss: 0.7796, LR: 3.17e-05
Step 760/5000, Loss: 0.7809, LR: 3.16e-05
Step 800/5000, Loss: 0.7802, LR: 3.15e-05
Step 840/5000, Loss: 0.7793, LR: 3.15e-05
Step 880/5000, Loss: 0.7786, LR: 3.14e-05
Step 920/5000, Loss: 0.7783, LR: 3.13e-05
Step 960/5000, Loss: 0.7814, LR: 3.1

Validating:   0%|          | 0/5000 [00:00<?, ?it/s]

Training Time: 4 mins 56 seconds
Validation Loss: 1.4680
Perplexity: 4.34
Training state saved to ./qwen-vietnamese-wiki-finetuned!


training_state.pt:   0%|          | 0.00/7.15G [00:00<?, ?B/s]

Training state pushed to repo h9art/PARADIS-Qwen3_0.6B-10kWikiVi-1GPU!

Epoch 4/5


Training Epoch 4:   0%|          | 0/5000 [00:00<?, ?it/s]

Step 40/5000, Loss: 0.5109, LR: 2.21e-05
Step 80/5000, Loss: 0.5108, LR: 2.20e-05
Step 120/5000, Loss: 0.5012, LR: 2.20e-05
Step 160/5000, Loss: 0.4995, LR: 2.19e-05
Step 200/5000, Loss: 0.5013, LR: 2.18e-05
Step 240/5000, Loss: 0.5021, LR: 2.17e-05
Step 280/5000, Loss: 0.5013, LR: 2.16e-05
Step 320/5000, Loss: 0.5002, LR: 2.15e-05
Step 360/5000, Loss: 0.4999, LR: 2.14e-05
Step 400/5000, Loss: 0.5016, LR: 2.13e-05
Step 440/5000, Loss: 0.4993, LR: 2.12e-05
Step 480/5000, Loss: 0.4999, LR: 2.12e-05
Step 520/5000, Loss: 0.4985, LR: 2.11e-05
Step 560/5000, Loss: 0.4974, LR: 2.10e-05
Step 600/5000, Loss: 0.4976, LR: 2.09e-05
Step 640/5000, Loss: 0.4989, LR: 2.08e-05
Step 680/5000, Loss: 0.4989, LR: 2.07e-05
Step 720/5000, Loss: 0.4985, LR: 2.06e-05
Step 760/5000, Loss: 0.4985, LR: 2.05e-05
Step 800/5000, Loss: 0.4985, LR: 2.04e-05
Step 840/5000, Loss: 0.4975, LR: 2.04e-05
Step 880/5000, Loss: 0.4985, LR: 2.03e-05
Step 920/5000, Loss: 0.4991, LR: 2.02e-05
Step 960/5000, Loss: 0.4987, LR: 2.0

Validating:   0%|          | 0/5000 [00:00<?, ?it/s]

Training Time: 4 mins 57 seconds
Validation Loss: 1.6295
Perplexity: 5.10
Training state saved to ./qwen-vietnamese-wiki-finetuned!


training_state.pt:   0%|          | 0.00/7.15G [00:00<?, ?B/s]

Training state pushed to repo h9art/PARADIS-Qwen3_0.6B-10kWikiVi-1GPU!

Epoch 5/5


Training Epoch 5:   0%|          | 0/5000 [00:00<?, ?it/s]

Step 40/5000, Loss: 0.3492, LR: 1.10e-05
Step 80/5000, Loss: 0.3350, LR: 1.09e-05
Step 120/5000, Loss: 0.3391, LR: 1.08e-05
Step 160/5000, Loss: 0.3359, LR: 1.08e-05
Step 200/5000, Loss: 0.3311, LR: 1.07e-05
Step 240/5000, Loss: 0.3278, LR: 1.06e-05
Step 280/5000, Loss: 0.3234, LR: 1.05e-05
Step 320/5000, Loss: 0.3239, LR: 1.04e-05
Step 360/5000, Loss: 0.3217, LR: 1.03e-05
Step 400/5000, Loss: 0.3211, LR: 1.02e-05
Step 440/5000, Loss: 0.3197, LR: 1.01e-05
Step 480/5000, Loss: 0.3195, LR: 1.00e-05
Step 520/5000, Loss: 0.3189, LR: 9.95e-06
Step 560/5000, Loss: 0.3182, LR: 9.86e-06
Step 600/5000, Loss: 0.3181, LR: 9.78e-06
Step 640/5000, Loss: 0.3189, LR: 9.69e-06
Step 680/5000, Loss: 0.3194, LR: 9.60e-06
Step 720/5000, Loss: 0.3186, LR: 9.51e-06
Step 760/5000, Loss: 0.3186, LR: 9.42e-06
Step 800/5000, Loss: 0.3186, LR: 9.33e-06
Step 840/5000, Loss: 0.3186, LR: 9.24e-06
Step 880/5000, Loss: 0.3181, LR: 9.15e-06
Step 920/5000, Loss: 0.3180, LR: 9.07e-06
Step 960/5000, Loss: 0.3174, LR: 8.9

Validating:   0%|          | 0/5000 [00:00<?, ?it/s]

Training Time: 4 mins 57 seconds
Validation Loss: 1.7830
Perplexity: 5.95
Training state saved to ./qwen-vietnamese-wiki-finetuned!


training_state.pt:   0%|          | 0.00/7.15G [00:00<?, ?B/s]

Training state pushed to repo h9art/PARADIS-Qwen3_0.6B-10kWikiVi-1GPU!


# After training

## Test after training

In [30]:
test_prompts = [
    "Việt Nam là một quốc gia",
    "Tiêu đề: Hà Nội\n\nNội dung:",
    "Lịch sử Việt Nam bắt đầu từ",
    "Văn hóa truyền thống của người Việt",
    "Tiêu đề: Phở\n\nNội dung: Phở là"
]

print("\n" + "=" * 60)
print("TESTING THE FINE-TUNED MODEL")
print("=" * 60)

for i, prompt in enumerate(test_prompts, 1):
    print(f"\n--- Test {i} ---")
    print(f"Prompt: {prompt}")
    print("-" * 40)
    
    generated = generate_text(prompt, max_length=150, temperature=0.7)
    print(f"Generated: {generated}")


TESTING THE FINE-TUNED MODEL

--- Test 1 ---
Prompt: Việt Nam là một quốc gia
----------------------------------------
Generated: Việt Nam là một quốc gia có chủ quyền tại Mặt trận Thái Bình Dương. Quyền kiểm soát mặt trận thuộc về Việt Nam, và hiện nay đang bị các nước Ả Rập Xê Út và Đài Loan kiểm soát.
Bối cảnh.
Trong phạm vi quản lý, Mặt trận Thái Bình Dương được xem như là một phần của dải biển Thái Bình Dương, và do đó, việc quản lý theo Luật Liên Hợp Quốc về Sequestration của Mặt trận Thái Bình Dương là một vấn đề pháp lý có tranh cãi giữa Việt Nam, Đài Loan và Trung Quốc. Hiện tại, trên thực tế, Quân đội nhân dân Việt Nam đã điều hành kiểm soát

--- Test 2 ---
Prompt: Tiêu đề: Hà Nội

Nội dung:
----------------------------------------
Generated: Tiêu đề: Hà Nội

Nội dung: Hà Nội () là một thành phố và khu đô thị của tỉnh Hà Nội, Việt Nam.
Địa lý.
Hà Nội nằm ở trung tâm của tỉnh, có vị trí địa lý:
Nhân khẩu.
Trong năm 2019, Hà Nội có dân số 38.754 người, trong đó có 14.068 nam v

## Save training log

In [31]:
# Save comprehensive training log
training_log = {
    'config': vars(config),
    'model_info': {
        'model_name': config.model_name,
        'num_parameters': model.num_parameters(),
        'dataset_name': config.dataset_name,
        'train_samples': len(train_ds),
        'valid_samples': len(valid_ds)
    },
    'training_results': {
        'best_valid_loss': best_valid_loss,
        'final_perplexity': training_history['valid_perplexities'][-1],
        'total_epochs': config.num_train_epochs,
        'total_steps': total_steps
    },
    'training_history': training_history,
    'training_date': datetime.now().isoformat()
}

with open(os.path.join(config.output_dir, 'training_log.json'), 'w', encoding='utf-8') as f:
    json.dump(training_log, f, indent=2, ensure_ascii=False)
print(f"\nTraining log saved to {config.output_dir}/training_log.json")

if config.use_hf:
    hf_api.upload_file(
        path_or_fileobj=os.path.join(config.output_dir, 'training_log.json'),
        path_in_repo="training_log.json",
        repo_id=config.hf_repo,
        repo_type="model",
    )
print(f"\nTraining log pushed to repo {config.hf_repo}")


Training log saved to ./qwen-vietnamese-wiki-finetuned/training_log.json

Training log pushed to repo h9art/PARADIS-Qwen3_0.6B-10kWikiVi-1GPU


## Clean up

In [32]:
if config.use_wandb:
    wandb.finish()

[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:          epoch ▁▃▅▆█
[34m[1mwandb[0m:  learning_rate ▁▂▂▃███▇▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▂▂▂▂▂▂▂▂▁▁▁
[34m[1mwandb[0m:     perplexity ▂▁▂▅█
[34m[1mwandb[0m:     train_loss █████▆▅▅▅▅▅▅▅▅▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁
[34m[1mwandb[0m:     train_step ▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇█
[34m[1mwandb[0m: train_time (m) ▁▁▁▁▁
[34m[1mwandb[0m:     valid_loss ▂▁▂▅█
[34m[1mwandb[0m: valid_time (m) ▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:          epoch 5
[34m[1mwandb[0m:  learning_rate 0
[34m[1mwandb[0m:     perplexity 5.9479
[34m[1mwandb[0m:     train_loss 0.31824
[34m[1mwandb[0m:     train_step 25000
[34m[1mwandb[0m: train_time (m) 21
[34m[1mwandb[0m:     valid_loss 1.78304
[34m[1mwandb[0m: valid_time (m) 4
[34m[1mwandb[0m: 
[34m[1mwandb[0m: 