# vAGI L-KAN train tren Google Colab (NVIDIA T4)

Notebook nay duoc thiet ke de train model L-KAN bang binary `train_lkan` (package `vagi-kernel`) tren Colab GPU.

## Tham khao chinh
- Candle installation guide: https://huggingface.github.io/candle/guide/installation.html
- Candle CUDA feature flags (crate): https://docs.rs/crate/candle-core/0.8.4/features
- Rust toolchain (rustup): https://rustup.rs/
- TinyShakespeare corpus: https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

## Output
- Checkpoint se duoc luu tai: `models/lkan-genesis.safetensors` (duong dan tuong doi so voi thu muc source `/content/vagi`)


In [None]:
import os
import pathlib
import re
import subprocess
import sys
import urllib.request

def run(cmd: str, check: bool = True):
    """Run a shell command, echoing it and streaming its output via ``print``.

    The command goes through the shell because callers pass pipelines
    (e.g. ``curl ... | sh``). stdout and stderr are merged and forwarded
    line by line through ``print`` so the output lands wherever
    ``sys.stdout`` points (the notebook cell) instead of depending on the
    front-end capturing the child's inherited file descriptors.

    Args:
        cmd: Shell command line to execute.
        check: When true, raise if the command exits with a non-zero status.

    Returns:
        A ``subprocess.CompletedProcess`` holding ``cmd`` and the exit code.
        Its ``stdout``/``stderr`` are ``None`` because the output has
        already been printed.

    Raises:
        subprocess.CalledProcessError: If ``check`` is true and the exit
            status is non-zero.
    """
    print(f"$ {cmd}", flush=True)
    with subprocess.Popen(
        cmd,
        shell=True,
        text=True,
        # Tool output is not guaranteed to be valid in the locale encoding;
        # never let a stray byte abort a long build/training run.
        errors="replace",
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    ) as proc:
        for line in proc.stdout:
            print(line, end="", flush=True)
    # Leaving the ``with`` block waits for the child, so returncode is set.
    if check and proc.returncode != 0:
        raise subprocess.CalledProcessError(proc.returncode, cmd)
    return subprocess.CompletedProcess(cmd, proc.returncode)

# Report the interpreter version, then query the GPU; `run` raises when
# `nvidia-smi` exits with a non-zero status.
print(f"Python: {sys.version}")
run("nvidia-smi")


In [None]:
# If the source is already uploaded to /content/vagi, leave this as is.
# Otherwise update REPO_URL and then run the clone cell below.
WORKDIR = pathlib.Path("/content/vagi")
REPO_URL = "https://github.com/<YOUR_GITHUB_USER>/vagi.git"
BRANCH = "main"

# Recommended training configuration for a T4 (balances speed and stability).
MODEL_OUT = "models/lkan-genesis.safetensors"
TRAIN_STEPS = 5000
BATCH_SIZE = 32
SEQ_LEN = 64
HIDDEN_DIM = 128

# Echo the key settings so the cell output records what this run used.
print(f"WORKDIR: {WORKDIR}")
print(f"MODEL_OUT: {MODEL_OUT}")
print(f"TRAIN_STEPS: {TRAIN_STEPS}")


In [None]:
# Clone only when the working directory is missing, and refuse to clone while
# REPO_URL still contains the placeholder user name.
needs_clone = not WORKDIR.exists()
if needs_clone and "<YOUR_GITHUB_USER>" in REPO_URL:
    raise ValueError("Hay cap nhat REPO_URL hoac upload source vao /content/vagi truoc khi chay.")
if needs_clone:
    clone_cmd = f"git clone --depth 1 --branch {BRANCH} {REPO_URL} {WORKDIR}"
    run(clone_cmd)

os.chdir(WORKDIR)
# check=False: a failure here (e.g. source uploaded without git metadata)
# must not abort the notebook.
run("git rev-parse --short HEAD", check=False)
print("Current dir:", pathlib.Path.cwd())


In [None]:
# Install the system packages needed to build the Rust workspace.
run("apt-get -y update")
run("apt-get -y install build-essential pkg-config libssl-dev curl git")

cargo_bin = pathlib.Path("/root/.cargo/bin")
rustup_path = cargo_bin / "rustup"
if not rustup_path.exists():
    # Restrict curl to HTTPS with TLS >= 1.2, as in the official rustup
    # install command, before piping the script into a shell.
    run("curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal")
    # The shell reports only the last pipeline stage's exit status, so a
    # failed curl can go unnoticed; verify the installation explicitly.
    if not rustup_path.exists():
        raise RuntimeError(f"Cai dat rustup that bai: khong tim thay {rustup_path}")

# Put cargo's bin directory first on PATH exactly once, so re-running this
# cell does not keep growing PATH with duplicate entries.
path_entries = [p for p in os.environ["PATH"].split(os.pathsep) if p != str(cargo_bin)]
os.environ["PATH"] = os.pathsep.join([str(cargo_bin), *path_entries])
run("rustup default stable")
run("rustc --version")
run("cargo --version")


In [None]:
# Configure the CUDA environment for Colab.
os.environ["CUDA_HOME"] = "/usr/local/cuda"
os.environ["PATH"] = f"/usr/local/cuda/bin:{os.environ['PATH']}"
# Only append the previous value when there is one: a trailing ':' creates an
# empty LD_LIBRARY_PATH entry, which the dynamic loader interprets as the
# current working directory.
previous_ld_path = os.environ.get("LD_LIBRARY_PATH", "")
if previous_ld_path:
    os.environ["LD_LIBRARY_PATH"] = f"/usr/local/cuda/lib64:{previous_ld_path}"
else:
    os.environ["LD_LIBRARY_PATH"] = "/usr/local/cuda/lib64"

# Informational only (check=False): show which nvcc, if any, is visible.
run("which nvcc", check=False)
run("nvcc --version", check=False)


In [None]:
data_path = WORKDIR / "data" / "input.txt"
data_path.parent.mkdir(parents=True, exist_ok=True)

if not data_path.exists():
    url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
    print("Downloading dataset from:", url)
    # Read with a timeout so a stalled connection cannot hang the cell.
    with urllib.request.urlopen(url, timeout=60) as response:
        payload = response.read()
    if not payload:
        raise RuntimeError("Dataset tai ve bi rong: " + url)
    # Write to a temporary sibling and rename only after a complete read, so
    # an interrupted download never leaves a truncated input.txt that the
    # `exists()` check above would accept on the next run.
    partial_path = data_path.with_name(data_path.name + ".part")
    partial_path.write_bytes(payload)
    partial_path.replace(data_path)

print("Dataset:", data_path)
print("Size (bytes):", data_path.stat().st_size)


In [None]:
# Patch the sources so that:
# 1) CUDA is enabled for candle-core.
# 2) Device::new_cuda(0) is used (falling back to CPU when CUDA is missing).
# 3) The training config is in sync so the current runtime can load the checkpoint.


def _enable_cuda_feature(dep_line: str) -> str:
    """Return a ``candle-core = ...`` dependency line with "cuda" enabled.

    Handles the bare-version form (``candle-core = "0.8.4"``) and the inline
    table form. For inline tables every existing key (``version``, ``git``,
    ``default-features``, ``workspace``, ...) and every existing feature is
    kept; "cuda" is only added. A trailing ``# comment`` is dropped.

    Args:
        dep_line: The full dependency line as found in Cargo.toml.

    Returns:
        ``dep_line`` unchanged when "cuda" is already listed, otherwise the
        rewritten line.

    Raises:
        RuntimeError: If the right-hand side is neither a quoted version nor
            a single-line inline table.
    """
    spec = dep_line.split("=", 1)[1].strip()

    # Short form: candle-core = "0.8.4"
    bare = re.fullmatch(r'"([^"]+)"\s*(?:#.*)?', spec)
    if bare:
        return f'candle-core = {{ version = "{bare.group(1)}", features = ["cuda"] }}'

    # Inline table form: candle-core = { ... }
    table = re.fullmatch(r'\{(.*?)\}\s*(?:#.*)?', spec)
    if not table:
        raise RuntimeError("Khong doc duoc version tu candle-core line: " + dep_line)
    body = table.group(1).strip()

    # The lookbehind keeps `default-features` from being mistaken for `features`.
    feats = re.search(r'(?<![\w-])features\s*=\s*\[([^\]]*)\]', body)
    if feats is None:
        body = f'{body.rstrip(",").rstrip()}, features = ["cuda"]' if body else 'features = ["cuda"]'
    else:
        names = re.findall(r'''["']([^"']+)["']''', feats.group(1))
        if "cuda" in names:
            return dep_line
        merged = ", ".join(f'"{feature}"' for feature in [*names, "cuda"])
        body = f'{body[:feats.start()]}features = [{merged}]{body[feats.end():]}'
    return f'candle-core = {{ {body} }}'


kernel_cargo = WORKDIR / "kernel" / "Cargo.toml"
cargo_text = kernel_cargo.read_text(encoding="utf-8")

dep_re = re.compile(r'^candle-core\s*=\s*(.+)$', re.MULTILINE)
m = dep_re.search(cargo_text)
if not m:
    raise RuntimeError("Khong tim thay dependency candle-core trong kernel/Cargo.toml")

current_line = m.group(0)
new_line = _enable_cuda_feature(current_line)
if new_line != current_line:
    # Splice at the matched span so exactly this dependency entry is rewritten.
    cargo_text = cargo_text[:m.start()] + new_line + cargo_text[m.end():]
    kernel_cargo.write_text(cargo_text, encoding="utf-8")
    print("Patched kernel/Cargo.toml:", new_line)
else:
    print("kernel/Cargo.toml da bat CUDA cho candle-core:", current_line)

# Load the training binary's source; the following statements patch it.
train_rs = WORKDIR / "kernel" / "src" / "bin" / "train_lkan.rs"
src = train_rs.read_text(encoding="utf-8")

def replace_const(text: str, name: str, value: str) -> str:
    """Rewrite the first ``const NAME: <type> = <expr>;`` item in Rust source.

    ``OUTPUT_PATH`` is emitted as a ``&str`` string literal; every other
    constant is emitted as ``usize`` with ``value`` inserted verbatim.

    Args:
        text: Rust source code to patch.
        name: Identifier of the constant to replace.
        value: New value. For ``OUTPUT_PATH`` this is the raw path, which is
            escaped for a Rust string literal; otherwise a Rust expression.

    Returns:
        The source with the first matching constant replaced.

    Raises:
        RuntimeError: If no constant named ``name`` is found.
    """
    if name == "OUTPUT_PATH":
        # Escape backslashes and quotes so the path survives as a valid Rust
        # string literal.
        literal = value.replace("\\", "\\\\").replace('"', '\\"')
        repl = f'const OUTPUT_PATH: &str = "{literal}";'
    else:
        repl = f'const {name}: usize = {value};'
    pattern = rf'const {re.escape(name)}: [^=]+ = [^;]+;'
    # A callable replacement is inserted literally; a plain string would have
    # its backslashes interpreted as regex escapes/group references.
    new_text, n = re.subn(pattern, lambda _match: repl, text, count=1)
    if n == 0:
        raise RuntimeError(f"Khong tim thay const {name} trong train_lkan.rs")
    return new_text

# Sync the compile-time training constants with the notebook configuration.
src = replace_const(src, "OUTPUT_PATH", MODEL_OUT)
src = replace_const(src, "BATCH_SIZE", str(BATCH_SIZE))
src = replace_const(src, "SEQ_LEN", str(SEQ_LEN))
src = replace_const(src, "TRAIN_STEPS", str(TRAIN_STEPS))

# Re-point the struct fields hard-coded to 192 at HIDDEN_DIM. Each field is
# handled once; if neither the stock value nor the requested one is present,
# say so instead of silently training with a different width.
for field in ("hidden_dim", "in_dim", "out_dim"):
    old_entry = f"{field}: 192,"
    new_entry = f"{field}: {HIDDEN_DIM},"
    if old_entry in src:
        src = src.replace(old_entry, new_entry)
    elif new_entry not in src:
        print(f"Canh bao: khong tim thay `{old_entry}` trong train_lkan.rs; HIDDEN_DIM chua duoc ap dung cho {field}.")

# Swap the CPU-only device for a CUDA device with CPU fallback, unless the
# source already requests CUDA.
if "Device::new_cuda(0)" not in src:
    cpu_line = "let device = Device::Cpu;"
    cuda_block = """let device = match Device::new_cuda(0) {
        Ok(dev) => {
            println!(\"using CUDA device 0\");
            dev
        }
        Err(err) => {
            println!(\"CUDA unavailable ({err}), fallback to CPU\");
            Device::Cpu
        }
    };"""
    if cpu_line not in src:
        raise RuntimeError("Khong tim thay `let device = Device::Cpu;` de patch")
    src = src.replace(cpu_line, cuda_block)

train_rs.write_text(src, encoding="utf-8")
print("Patched:", train_rs)


In [None]:
os.chdir(WORKDIR)
# setdefault keeps any CARGO_BUILD_JOBS value that is already set.
os.environ.setdefault("CARGO_BUILD_JOBS", "2")

# Create the checkpoint directory before the long run. Whether train_lkan
# creates it itself is not visible from this notebook, and a missing
# directory would only surface when the checkpoint is saved at the very end.
(WORKDIR / MODEL_OUT).parent.mkdir(parents=True, exist_ok=True)

# Build + train (release). Per the original note, the loss log is printed by
# train_lkan.rs.
run("cargo run -p vagi-kernel --release --bin train_lkan")


In [None]:
# Confirm that training produced the checkpoint and report its size.
checkpoint = WORKDIR / MODEL_OUT
if checkpoint.exists():
    print("Checkpoint:", checkpoint)
    size_mb = checkpoint.stat().st_size / (1024 * 1024)
    print("Size (MB):", round(size_mb, 2))
    print("Done.")
else:
    raise FileNotFoundError(f"Khong tim thay checkpoint: {checkpoint}")
