In [None]:
# Read the Weights & Biases API key from the Kaggle secrets client so the key
# is never hard-coded in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
WANDB_API_KEY = user_secrets.get_secret("WANDB_API_KEY")

In [None]:
# Log in to Weights & Biases using the WANDB_API_KEY value defined earlier in
# this notebook.
import wandb
wandb.login(key=WANDB_API_KEY)

In [None]:
%%writefile train_single_gpu.py

import torch
import time
import math
import argparse
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_scheduler
from torch.optim import AdamW
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

def set_seed(seed):
    """Seed PyTorch's random number generators for reproducible runs.

    Args:
        seed: Integer seed passed to the CPU generator and, when CUDA is
            available, to the generators of all CUDA devices.
    """
    torch.manual_seed(seed)
    # Nothing more to seed on a CPU-only machine.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

def evaluate_model(model, dataloader, device):
    """Evaluate the model on a single device and report loss and perplexity.

    Args:
        model: Model exposing ``eval()`` and ``train()``. Calling it with a
            batch unpacked as keyword arguments must return an object whose
            ``loss`` attribute is a scalar tensor.
        dataloader: Iterable of dict batches mapping argument names to tensors.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        A tuple ``(eval_loss, perplexity, eval_time)``:
        ``eval_loss`` is the unweighted mean of the per-batch losses,
        ``perplexity`` is ``exp(eval_loss)`` and ``eval_time`` is the elapsed
        wall-clock time in seconds. When ``dataloader`` yields no batches both
        metrics are ``nan`` instead of raising ``ZeroDivisionError``. When the
        exponential overflows only ``perplexity`` becomes ``inf``; the finite
        ``eval_loss`` is kept.
    """
    model.eval()
    losses = []
    eval_start_time = time.time()

    # Gradients are not needed while evaluating.
    with torch.no_grad():
        for batch in dataloader:
            # Move the batch to the target device.
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            losses.append(outputs.loss.item())

    if losses:
        eval_loss = sum(losses) / len(losses)
        try:
            perplexity = math.exp(eval_loss)
        except OverflowError:
            # The loss itself is still meaningful; only its exponential
            # exceeds the float range.
            perplexity = float("inf")
    else:
        # Nothing was evaluated, so there is no loss to average.
        eval_loss = float("nan")
        perplexity = float("nan")

    eval_time = time.time() - eval_start_time
    model.train()  # Put the model back into training mode for the caller.
    return eval_loss, perplexity, eval_time

def main():
    """Fine-tune a causal language model on a single device with W&B logging.

    The function parses command-line options, loads the tokenizer, model and
    dataset, tokenizes the text and packs it into blocks of ``--block_size``
    tokens, then runs a plain PyTorch training loop (AdamW with a linear
    schedule and warmup). Training metrics are logged every
    ``--logging_steps`` steps and the validation split is evaluated every
    ``--eval_steps`` steps; a value of 0 disables the respective action. A
    final evaluation is printed after the last epoch.

    ``--max_train_samples`` and ``--max_eval_samples`` cap how many raw rows
    of the train/validation splits are used (defaults 1000 and 100). Values
    larger than a split are clamped to the split size.
    """
    # Imported locally so the function does not depend on a file-level import.
    from itertools import chain

    # --- Argument configuration ---
    parser = argparse.ArgumentParser(description="Finetune BLOOM on a single GPU")

    parser.add_argument("--model_name", type=str, default="bigscience/bloom-560m")
    parser.add_argument("--dataset_name", type=str, default="Salesforce/wikitext")
    parser.add_argument("--dataset_config", type=str, default="wikitext-2-raw-v1")

    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--num_epochs", type=int, default=2)
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--block_size", type=int, default=128)

    # Caps on the number of raw rows taken from each split.
    parser.add_argument("--max_train_samples", type=int, default=1000)
    parser.add_argument("--max_eval_samples", type=int, default=100)

    # Optimizer and scheduler options.
    parser.add_argument("--lr", type=float, default=3e-6)
    parser.add_argument("--weight_decay", type=float, default=0.01)
    parser.add_argument("--warmup_steps", type=int, default=20)

    parser.add_argument("--logging_steps", type=int, default=5)
    parser.add_argument("--eval_steps", type=int, default=20)
    parser.add_argument("--wandb_project", type=str, default="single_gpu_finetune")
    parser.add_argument("--wandb_run_name", type=str, default=None)

    args = parser.parse_args()

    # Append a timestamp so repeated runs with the same name stay distinct.
    if args.wandb_run_name:
        args.wandb_run_name += f"-{int(time.time())}"

    # --- General setup ---
    set_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Đang sử dụng device: {device}")

    # Initialise W&B with the parsed arguments as the run config.
    wandb.init(project=args.wandb_project, name=args.wandb_run_name, config=vars(args))

    # --- Load model and tokenizer ---
    print("Đang tải tokenizer và model...")
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    model = AutoModelForCausalLM.from_pretrained(args.model_name)
    model.to(device)  # Move the model to the selected device.

    # Fall back to the EOS token when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

    # --- Load and preprocess the dataset ---
    print("Đang tải và xử lý dữ liệu...")
    raw_datasets = load_dataset(args.dataset_name, args.dataset_config)
    # Clamp the subset sizes so splits smaller than the cap do not raise.
    num_train_rows = min(args.max_train_samples, len(raw_datasets["train"]))
    num_eval_rows = min(args.max_eval_samples, len(raw_datasets["validation"]))
    raw_datasets['train'] = raw_datasets['train'].select(range(num_train_rows))
    raw_datasets['validation'] = raw_datasets['validation'].select(range(num_eval_rows))
    # The test split is unused; tolerate datasets that do not ship one.
    raw_datasets.pop('test', None)

    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    def tokenize_function(examples):
        """Tokenize the text column of a batch of examples."""
        return tokenizer(examples[text_column_name])

    tokenized_datasets = raw_datasets.map(
        tokenize_function, batched=True, remove_columns=column_names, desc="Running tokenizer on dataset"
    )

    def group_texts(examples):
        """Concatenate a batch of tokenized texts and cut it into fixed-size blocks."""
        # chain.from_iterable flattens in linear time; sum(lists, []) re-copies
        # the accumulated list on every addition.
        concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # Drop the trailing remainder that does not fill a whole block.
        total_length = (total_length // args.block_size) * args.block_size
        result = {
            k: [t[i : i + args.block_size] for i in range(0, total_length, args.block_size)]
            for k, t in concatenated_examples.items()
        }
        # Labels are a copy of the input ids.
        result["labels"] = result["input_ids"].copy()
        return result

    lm_datasets = tokenized_datasets.map(
        group_texts, batched=True, desc=f"Grouping texts in chunks of {args.block_size}"
    )
    print(lm_datasets)

    train_dataset = lm_datasets["train"]
    eval_dataset = lm_datasets["validation"]

    data_collator = default_data_collator
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size, collate_fn=data_collator)
    eval_dataloader = DataLoader(eval_dataset, batch_size=args.batch_size, collate_fn=data_collator)

    # --- Optimizer and scheduler ---
    optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    num_training_steps = args.num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=num_training_steps,
    )

    # --- Training loop ---
    print("\n*** Bắt đầu huấn luyện ***")
    global_step = 0
    start_training_time = time.time()

    for epoch in range(args.num_epochs):
        model.train()
        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{args.num_epochs}")
        start_epoch_time = time.time()

        for batch in progress_bar:
            # Move the batch to the selected device.
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            loss = outputs.loss

            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            global_step += 1
            # Read the scalar once and reuse it for logging and the progress bar.
            loss_value = loss.item()

            # Logging (skipped entirely when logging_steps is 0).
            if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                log_metrics = {
                    "train_loss": loss_value,
                    "learning_rate": lr_scheduler.get_last_lr()[0],
                    "epoch": global_step / len(train_dataloader),
                }
                wandb.log(log_metrics, step=global_step)

            # Periodic evaluation (skipped entirely when eval_steps is 0).
            if args.eval_steps > 0 and global_step % args.eval_steps == 0:
                eval_loss, perplexity, eval_time = evaluate_model(model, eval_dataloader, device)
                log_metrics = {
                    "eval_loss": eval_loss,
                    "perplexity": perplexity,
                    "eval_time (s)": eval_time,
                }
                wandb.log(log_metrics, step=global_step)
                print(f"\nStep {global_step}: eval_loss = {eval_loss:.2f}")

            progress_bar.set_postfix({"loss": loss_value})

        # Log the wall-clock duration of each epoch.
        epoch_time = time.time() - start_epoch_time
        log_metrics = {
            "epoch_time (s)": epoch_time,
            "epoch": epoch + 1,
        }
        wandb.log(log_metrics, step=global_step)
        print(f"--- Epoch {epoch+1} hoàn tất trong {epoch_time:.2f} giây ---")

    # --- Wrap-up ---
    total_training_time = time.time() - start_training_time
    print(f"*** Huấn luyện hoàn tất trong: {total_training_time:.2f} giây ***\n")

    print("*** Bắt đầu đánh giá cuối cùng ***")
    final_eval_loss, final_perplexity, _ = evaluate_model(model, eval_dataloader, device)

    print(f"*** Kết quả đánh giá cuối cùng trên tập validation ***")
    print(f"Epoch: {args.num_epochs}")
    print(f"Loss: {final_eval_loss:.4f}")
    print(f"Perplexity: {final_perplexity:.4f}")

    wandb.finish()

# Run the training only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()

In [None]:
# Launch train_single_gpu.py, overriding the default model, batch size, epoch
# count, logging/evaluation intervals and W&B project/run names.
!python train_single_gpu.py \
    --model_name "bigscience/bloom-1b7" \
    --batch_size 4 \
    --num_epochs 3 \
    --logging_steps 2 \
    --eval_steps 10 \
    --wandb_project "PARADIS-bloom_1b7" \
    --wandb_run_name "1GPU"