In [1]:
# ðŸŸ¢Â CellÂ 1Â â€”Â Runtime & GPU fingerprint
import torch, bitsandbytes as bnb, subprocess, platform, datetime

print("="*38, "SYSTEM", "="*38)
print("Timestamp:", datetime.datetime.utcnow().isoformat(), "UTC")
print("Hostname :", platform.node())
print("Python   :", platform.python_version())
print("Torch    :", torch.__version__)
print("CUDAÂ rt  :", torch.version.cuda)
print("CUDAÂ drv :", subprocess.check_output(
        ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"]
    ).decode().strip())

print("\n== GPU(s) ==")
for i in range(torch.cuda.device_count()):
    cap = torch.cuda.get_device_capability(i)
    print(f"GPU{i}: {torch.cuda.get_device_name(i)} | CC {cap} | "
          f"arch list has sm_120? {'sm_120' in torch.cuda.get_arch_list()}")

print("\n== bitsandbytes ==")
print("bnb lib path:", bnb.__file__)


Timestamp: 2025-04-21T16:33:39.636853 UTC
Hostname : 07d88fdcb59f
Python   : 3.10.12
Torch    : 2.8.0.dev20250421+cu128
CUDAÂ rt  : 12.8
CUDAÂ drv : 572.83

== GPU(s) ==
GPU0: NVIDIA GeForce RTX 5090 | CC (12, 0) | arch list has sm_120? True

== bitsandbytes ==
bnb lib path: /opt/bitsandbytes/bitsandbytes/__init__.py


In [2]:
# ðŸŸ¢Â CellÂ 2Â â€”Â Kernel smokeâ€‘test (tensor ops + bnb optimiser)
import torch, bitsandbytes as bnb
p = torch.nn.Parameter(torch.rand(256, 256, device="cuda"))
loss = (torch.rand_like(p) * p).sum()
loss.backward()
opt = bnb.optim.Adam8bit([p])
opt.step()
print(">>> bitsandbytes sm_120 kernel appears OK âœ…")


>>> bitsandbytes sm_120 kernel appears OK âœ…


In [8]:
# ðŸŸ¢Â CellÂ 3Â â€”Â Pure PyTorch + bitsandbytes 4â€‘bit sanity test
import torch, bitsandbytes as bnb, gc
from bitsandbytes.nn import Linear4bit

torch.manual_seed(42)

# --- a tiny 2â€‘layer "model" in 4â€‘bit ---
class Tiny4Bit(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = Linear4bit(64, 64, bias=False, compute_dtype=torch.bfloat16)
        self.fc2 = Linear4bit(64, 64, bias=False, compute_dtype=torch.bfloat16)
    def forward(self, x):               # simple MLP block
        return self.fc2(torch.nn.functional.relu(self.fc1(x)))

model = Tiny4Bit().cuda()               # weights already Q4 NFâ€‘4
print("Main weight device:", next(model.parameters()).device)

# fake LoRA adapter: just another 4â€‘bit layer we treat as extension
lora = Linear4bit(64, 64, bias=False, compute_dtype=torch.bfloat16).cuda()

# input
x = torch.rand(8, 64, device="cuda").requires_grad_()
out = model(x) + lora(x)                # combine base + tiny "LoRA"
loss = out.sum()
loss.backward()

# 8â€‘bit Adam step
opt = bnb.optim.Adam8bit(
    list(model.parameters()) + list(lora.parameters()), lr=1e-3
)
opt.step()

print("LoRAâ€‘style 4â€‘bit forward/backward âœ…  |  loss:", loss.item())

del model, lora; gc.collect(); torch.cuda.empty_cache()




Main weight device: cuda:0
LoRAâ€‘style 4â€‘bit forward/backward âœ…  |  loss: -8.329385757446289


In [9]:
# ðŸŸ¢Â CellÂ 4Â â€”Â Exit criteria
print("All three cells ran without errors â†’ GPU environment is validated ðŸŽ‰")



All three cells ran without errors â†’ GPU environment is validated ðŸŽ‰
