Gemma2 #709

Merged · 349 commits · Jul 3, 2024
Commits
d32e972
Update mapper.py
danielhanchen Jun 9, 2024
e121fa5
Update loader.py
danielhanchen Jun 9, 2024
5eaa10f
Update llama.py
danielhanchen Jun 9, 2024
f57d28d
Update tokenizer_utils.py
danielhanchen Jun 10, 2024
8937507
info
danielhanchen Jun 11, 2024
8982edb
edits
danielhanchen Jun 11, 2024
8904605
Create chat template
danielhanchen Jun 11, 2024
2a374c2
Fix tokenizer
danielhanchen Jun 12, 2024
d704b73
Merge branch 'main' into nightly
danielhanchen Jun 13, 2024
8176155
Update tokenizer_utils.py
danielhanchen Jun 13, 2024
21a99f1
fix case where gguf saving fails due to first_conversion dtype (#630)
chrehall68 Jun 13, 2024
dbf2dcf
Support revision parameter in FastLanguageModel.from_pretrained (#629)
chrehall68 Jun 13, 2024
9016171
clears any selected_adapters before calling internal_model.save_pretr…
neph1 Jun 13, 2024
0428920
Update __init__.py (#602)
xyangk Jun 13, 2024
9fdd847
Fixed unsloth/tokenizer_utils.py for chat training (#604)
Oseltamivir Jun 13, 2024
b5fc6aa
Add GGML saving option to Unsloth for easier Ollama model creation an…
mahiatlinux Jun 13, 2024
3fafbf7
docs: Add LoraConfig parameters documentation (#619)
sebdg Jun 13, 2024
273a871
llama.cpp failing (#371)
bet0x Jun 13, 2024
b312b3f
fix libcuda_dirs import for triton 3.0 (#227)
t-vi Jun 13, 2024
1601dca
Update save.py
danielhanchen Jun 13, 2024
26dc502
Update __init__.py
danielhanchen Jun 13, 2024
6a51657
Update fast_lora.py
danielhanchen Jun 13, 2024
4a8ba90
Update save.py
danielhanchen Jun 13, 2024
0abb5ba
Update save.py
danielhanchen Jun 13, 2024
b24dd05
Update save.py
danielhanchen Jun 13, 2024
48c6d6d
Update loader.py
danielhanchen Jun 13, 2024
e35f608
Update save.py
danielhanchen Jun 13, 2024
4822eae
Update save.py
danielhanchen Jun 13, 2024
7d847ed
quantize now llama-quantize
danielhanchen Jun 13, 2024
82f10cb
Update chat_templates.py
danielhanchen Jun 13, 2024
08424f0
Update loader.py
danielhanchen Jun 13, 2024
eb906d0
Update mapper.py
danielhanchen Jun 13, 2024
0a304ae
Update __init__.py
danielhanchen Jun 13, 2024
71edc42
embedding size
danielhanchen Jun 13, 2024
411b881
Merge branch 'main' into nightly
danielhanchen Jun 13, 2024
b74e321
Update qwen2.py
danielhanchen Jun 13, 2024
9c6d415
Merge branch 'main' into nightly
danielhanchen Jun 14, 2024
b82277f
docs
danielhanchen Jun 14, 2024
d98e45e
Update README.md
danielhanchen Jun 14, 2024
b6f0fdb
Update qwen2.py
danielhanchen Jun 14, 2024
6c031e4
README: Fix minor typo. (#559)
shaper Jun 14, 2024
2401dee
Update mistral.py
danielhanchen Jun 14, 2024
1b93d7e
Update qwen2.py
danielhanchen Jun 14, 2024
3581037
Update qwen2.py
danielhanchen Jun 14, 2024
b56b8b8
Update qwen2.py
danielhanchen Jun 14, 2024
fe8c064
Update llama.py
danielhanchen Jun 14, 2024
d8d332a
Update llama.py
danielhanchen Jun 14, 2024
cdb1dbb
Update llama.py
danielhanchen Jun 14, 2024
e8b3cf0
Update README.md
danielhanchen Jun 14, 2024
7e6f000
FastMistralModel
danielhanchen Jun 14, 2024
28995ab
Update mistral.py
danielhanchen Jun 14, 2024
515b1ae
Update mistral.py
danielhanchen Jun 14, 2024
7f28209
Update mistral.py
danielhanchen Jun 14, 2024
453cc48
Update mistral.py
danielhanchen Jun 14, 2024
6633d4a
Update mistral.py
danielhanchen Jun 14, 2024
e5bf125
Auto check rope scaling
danielhanchen Jun 14, 2024
d4f4bce
Merge branch 'main' into nightly
danielhanchen Jun 14, 2024
341565b
Update llama.py
danielhanchen Jun 14, 2024
dd3c6b1
Update llama.py
danielhanchen Jun 15, 2024
6d1ae23
Update llama.py
danielhanchen Jun 15, 2024
d855ef9
GPU support
danielhanchen Jun 15, 2024
da1fe76
Merge branch 'main' into nightly
danielhanchen Jun 15, 2024
6656446
Typo
danielhanchen Jun 15, 2024
9bd5fad
Update gemma.py
danielhanchen Jun 15, 2024
a3061b6
gpu
danielhanchen Jun 15, 2024
7e5155d
Merge branch 'main' into nightly
danielhanchen Jun 15, 2024
513bd4d
Multiple GGUF saving
danielhanchen Jun 15, 2024
fb54fbb
Update save.py
danielhanchen Jun 15, 2024
4cba3e2
Update save.py
danielhanchen Jun 15, 2024
979bb22
Merge branch 'main' into nightly
danielhanchen Jun 15, 2024
31811cf
check PEFT and base
danielhanchen Jun 15, 2024
a0232a7
Update llama.py
danielhanchen Jun 15, 2024
c4c4ff4
Update llama.py
danielhanchen Jun 15, 2024
80e82a2
Update llama.py
danielhanchen Jun 15, 2024
f62237d
Update llama.py
danielhanchen Jun 15, 2024
7f864dc
Update llama.py
danielhanchen Jun 15, 2024
4dda039
Update chat_templates.py
danielhanchen Jun 15, 2024
30605de
Fix breaking bug in save.py with interpreting quantization_method as …
ArcadaLabs-Jason Jun 16, 2024
1ba18ac
Merge branch 'main' into nightly
danielhanchen Jun 16, 2024
e2b2083
Revert "Fix breaking bug in save.py with interpreting quantization_me…
danielhanchen Jun 16, 2024
aeda849
Merge branch 'nightly' of https://github.com/unslothai/unsloth into n…
danielhanchen Jun 16, 2024
0938ab8
Revert "Revert "Fix breaking bug in save.py with interpreting quantiz…
danielhanchen Jun 16, 2024
46517a9
Merge branch 'main' into nightly
danielhanchen Jun 16, 2024
99f0397
Update llama.py
danielhanchen Jun 16, 2024
b824627
peft
danielhanchen Jun 16, 2024
e4f2263
patch
danielhanchen Jun 16, 2024
7aadceb
Update loader.py
danielhanchen Jun 16, 2024
17a9fb3
retrain
danielhanchen Jun 16, 2024
25cb17e
Update llama.py
danielhanchen Jun 16, 2024
3fba0f5
Update llama.py
danielhanchen Jun 16, 2024
798aa1e
Update llama.py
danielhanchen Jun 16, 2024
c6142d0
Update llama.py
danielhanchen Jun 16, 2024
7236dfc
Update llama.py
danielhanchen Jun 16, 2024
37f9abd
Update llama.py
danielhanchen Jun 16, 2024
7618197
Update llama.py
danielhanchen Jun 16, 2024
b4907f3
Update llama.py
danielhanchen Jun 16, 2024
2714a8b
Update llama.py
danielhanchen Jun 16, 2024
771a0d0
Update llama.py
danielhanchen Jun 16, 2024
af88eda
Merge branch 'main' into nightly
danielhanchen Jun 17, 2024
92c7d58
offload
danielhanchen Jun 17, 2024
2eacd2d
Update llama.py
danielhanchen Jun 17, 2024
b957061
Create a starter script for command-line training to integrate in ML …
sebdg Jun 18, 2024
4dba6c5
Update chat_templates.py
danielhanchen Jun 18, 2024
f2e4b83
Ollama
danielhanchen Jun 18, 2024
a5367a1
Update chat_templates.py
danielhanchen Jun 18, 2024
85062ff
Update chat_templates.py
danielhanchen Jun 18, 2024
12bd3cf
Update chat_templates.py
danielhanchen Jun 18, 2024
4417417
Update chat_templates.py
danielhanchen Jun 18, 2024
be02d97
Update chat_templates.py
danielhanchen Jun 18, 2024
0a1ee7a
Update chat_templates.py
danielhanchen Jun 18, 2024
7195152
Update chat_templates.py
danielhanchen Jun 18, 2024
676b20b
Update chat_templates.py
danielhanchen Jun 18, 2024
89b7807
Update chat_templates.py
danielhanchen Jun 18, 2024
563afa9
Update chat_templates.py
danielhanchen Jun 18, 2024
7bcaa23
Merge branch 'main' into nightly
danielhanchen Jun 19, 2024
bbfa77c
Ollama
danielhanchen Jun 19, 2024
9a88ac1
Update chat_templates.py
danielhanchen Jun 19, 2024
89e3027
ollama
danielhanchen Jun 19, 2024
412a7c9
Update mapper.py
danielhanchen Jun 19, 2024
44039e5
Update chat_templates.py
danielhanchen Jun 19, 2024
fc9ec40
Update save.py
danielhanchen Jun 19, 2024
7be6d88
Update save.py
danielhanchen Jun 19, 2024
5530710
Update save.py
danielhanchen Jun 19, 2024
d5b8d41
Update save.py
danielhanchen Jun 19, 2024
d1b3fac
Update save.py
danielhanchen Jun 19, 2024
99b68ff
Update save.py
danielhanchen Jun 19, 2024
a25dea9
Update save.py
danielhanchen Jun 19, 2024
bee81d1
Merge branch 'main' into nightly
danielhanchen Jun 20, 2024
e544c97
Update chat_templates.py
danielhanchen Jun 20, 2024
3f27de2
Update chat_templates.py
danielhanchen Jun 20, 2024
34c71b5
Update chat_templates.py
danielhanchen Jun 20, 2024
ae2924f
Update chat_templates.py
danielhanchen Jun 20, 2024
15e4e10
Merge branch 'main' into nightly
danielhanchen Jun 20, 2024
a5cbf3e
Update llama.py
danielhanchen Jun 20, 2024
b5888ba
Fixes
danielhanchen Jun 20, 2024
c37132b
Merge branch 'main' into nightly
danielhanchen Jun 21, 2024
f1076e8
clearer messages
danielhanchen Jun 21, 2024
ec26e40
Update tokenizer_utils.py
danielhanchen Jun 21, 2024
0e4189d
Update tokenizer_utils.py
danielhanchen Jun 21, 2024
0bc45bd
Update llama.py
danielhanchen Jun 21, 2024
586d72e
Update llama.py
danielhanchen Jun 21, 2024
0a52e24
Update llama.py
danielhanchen Jun 21, 2024
cf31aeb
log
danielhanchen Jun 21, 2024
6956277
Update __init__.py
danielhanchen Jun 21, 2024
ffe5eb4
Update llama.py
danielhanchen Jun 21, 2024
21b544a
Update __init__.py
danielhanchen Jun 21, 2024
a2a86b3
Merge branch 'main' into nightly
danielhanchen Jun 21, 2024
4fa816d
Create Merge.png
danielhanchen Jun 21, 2024
9b19f35
Create ollama.png
danielhanchen Jun 21, 2024
f331863
Gemma2
danielhanchen Jun 30, 2024
3510795
Update llama.py
danielhanchen Jun 30, 2024
80c32c7
Update loader.py
danielhanchen Jun 30, 2024
f9fdfba
Update pyproject.toml
danielhanchen Jun 30, 2024
e8ab057
Update pyproject.toml
danielhanchen Jun 30, 2024
0fb680a
Update llama.py
danielhanchen Jun 30, 2024
5ad42d6
Update llama.py
danielhanchen Jun 30, 2024
c0c9a67
Update llama.py
danielhanchen Jun 30, 2024
71f6afe
Update llama.py
danielhanchen Jun 30, 2024
8448a77
Update _utils.py
danielhanchen Jun 30, 2024
15b751d
Revert Gemma2
danielhanchen Jun 30, 2024
7c5d0ef
Update gemma2.py
danielhanchen Jun 30, 2024
3a6573f
Update gemma2.py
danielhanchen Jun 30, 2024
fae7957
Update gemma2.py
danielhanchen Jun 30, 2024
22160d6
Update gemma2.py
danielhanchen Jun 30, 2024
f2777ed
Update gemma2.py
danielhanchen Jun 30, 2024
73148cd
Update gemma2.py
danielhanchen Jun 30, 2024
876d9b6
Update gemma2.py
danielhanchen Jun 30, 2024
9649611
Update gemma2.py
danielhanchen Jun 30, 2024
b2e6e3c
Update rms_layernorm.py
danielhanchen Jun 30, 2024
5278ab1
Update gemma2.py
danielhanchen Jul 2, 2024
a76e5ec
logit softcapping
danielhanchen Jul 2, 2024
5511b76
Update cross_entropy_loss.py
danielhanchen Jul 2, 2024
e0f2da4
Update llama.py
danielhanchen Jul 2, 2024
648c985
Update llama.py
danielhanchen Jul 2, 2024
bba2ac9
Update gemma2.py
danielhanchen Jul 2, 2024
95c934d
Update gemma2.py
danielhanchen Jul 2, 2024
66f35fd
Update cross_entropy_loss.py
danielhanchen Jul 2, 2024
acf9771
Update llama.py
danielhanchen Jul 2, 2024
b6d83ef
Update llama.py
danielhanchen Jul 2, 2024
436b2a7
Update cross_entropy_loss.py
danielhanchen Jul 2, 2024
0271a7f
Update cross_entropy_loss.py
danielhanchen Jul 2, 2024
35d8200
Update llama.py
danielhanchen Jul 2, 2024
80a3f61
Update cross_entropy_loss.py
danielhanchen Jul 2, 2024
a46c839
Update cross_entropy_loss.py
danielhanchen Jul 2, 2024
47b0c2a
Update gemma2.py
danielhanchen Jul 2, 2024
6b8a1ae
Update gemma2.py
danielhanchen Jul 2, 2024
d834223
Update gemma2.py
danielhanchen Jul 2, 2024
69d6f53
Update gemma2.py
danielhanchen Jul 2, 2024
03af2ee
Update gemma2.py
danielhanchen Jul 2, 2024
03ded35
Update gemma2.py
danielhanchen Jul 2, 2024
f16479d
Update gemma2.py
danielhanchen Jul 2, 2024
af98ffb
Update gemma2.py
danielhanchen Jul 2, 2024
41f38c8
Update gemma2.py
danielhanchen Jul 2, 2024
0244e2f
Update gemma2.py
danielhanchen Jul 2, 2024
0c08818
Update llama.py
danielhanchen Jul 2, 2024
5b91c90
Update gemma2.py
danielhanchen Jul 2, 2024
96be4bb
Update llama.py
danielhanchen Jul 2, 2024
f2388f2
Update llama.py
danielhanchen Jul 2, 2024
74d9301
Update gemma2.py
danielhanchen Jul 2, 2024
980937b
Update gemma2.py
danielhanchen Jul 2, 2024
8984422
Update llama.py
danielhanchen Jul 2, 2024
4b5a242
Update gemma2.py
danielhanchen Jul 2, 2024
673f546
Update gemma2.py
danielhanchen Jul 2, 2024
2fb91b1
Update gemma2.py
danielhanchen Jul 2, 2024
320d92c
Update gemma2.py
danielhanchen Jul 2, 2024
b27c010
Update gemma2.py
danielhanchen Jul 2, 2024
e6dcc39
Update gemma2.py
danielhanchen Jul 2, 2024
79d73b4
Update gemma2.py
danielhanchen Jul 2, 2024
d724f83
Update gemma2.py
danielhanchen Jul 2, 2024
f2cf1e9
Update gemma2.py
danielhanchen Jul 2, 2024
a5dac5c
Update gemma2.py
danielhanchen Jul 2, 2024
cc7692d
Update gemma2.py
danielhanchen Jul 2, 2024
32da093
Update gemma2.py
danielhanchen Jul 2, 2024
f859faf
Update gemma2.py
danielhanchen Jul 2, 2024
9246a8b
Update gemma2.py
danielhanchen Jul 2, 2024
a210034
Update gemma2.py
danielhanchen Jul 2, 2024
b7d4338
Update gemma2.py
danielhanchen Jul 2, 2024
11036fe
Update gemma2.py
danielhanchen Jul 2, 2024
4b3d589
Update gemma2.py
danielhanchen Jul 2, 2024
4bfbdd5
Update _utils.py
danielhanchen Jul 2, 2024
3d0a5bc
Update _utils.py
danielhanchen Jul 2, 2024
478ef22
Update gemma2.py
danielhanchen Jul 2, 2024
2847d6d
compile flags
danielhanchen Jul 2, 2024
af2630d
Update _utils.py
danielhanchen Jul 2, 2024
4b5f8bd
Update _utils.py
danielhanchen Jul 2, 2024
1f3cee6
Update _utils.py
danielhanchen Jul 2, 2024
cf68600
Update _utils.py
danielhanchen Jul 2, 2024
edab699
Update _utils.py
danielhanchen Jul 2, 2024
546ddfb
Update _utils.py
danielhanchen Jul 2, 2024
97301fa
Update _utils.py
danielhanchen Jul 2, 2024
4516437
Update _utils.py
danielhanchen Jul 2, 2024
40926e0
Update _utils.py
danielhanchen Jul 2, 2024
d4616d8
Update gemma2.py
danielhanchen Jul 3, 2024
58f016a
Update gemma2.py
danielhanchen Jul 3, 2024
7cea034
fixes
danielhanchen Jul 3, 2024
f468976
Update _utils.py
danielhanchen Jul 3, 2024
c266d6b
Fix generation
danielhanchen Jul 3, 2024
30d3e7c
Update llama.py
danielhanchen Jul 3, 2024
9b2beba
Update llama.py
danielhanchen Jul 3, 2024
8526c3e
Update _utils.py
danielhanchen Jul 3, 2024
647f894
Update _utils.py
danielhanchen Jul 3, 2024
018d632
Update _utils.py
danielhanchen Jul 3, 2024
f8213cf
pad token
danielhanchen Jul 3, 2024
5e5f05d
Update gemma2.py
danielhanchen Jul 3, 2024
62a66d6
pad token
danielhanchen Jul 3, 2024
e67f335
Update _utils.py
danielhanchen Jul 3, 2024
505a017
Update llama.py
danielhanchen Jul 3, 2024
e46dab7
Update gemma2.py
danielhanchen Jul 3, 2024
9d83ced
edit warning
danielhanchen Jul 3, 2024
90d3b65
Update tokenizer_utils.py
danielhanchen Jul 3, 2024
Binary file added images/Merge.png
Binary file added images/ollama.png
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -34,7 +34,7 @@ exclude = ["images*"]
[project.optional-dependencies]
huggingface = [
"tyro",
"transformers>=4.38.2",
"transformers>=4.42.3",
"datasets>=2.16.0",
"sentencepiece>=0.2.0",
"tqdm",
@@ -185,9 +185,9 @@ colab-ampere-torch220 = [
]
colab-new = [
"tyro",
"transformers>=4.38.2",
"transformers>=4.42.3",
"datasets>=2.16.0",
"sentencepiece",
"sentencepiece>=0.2.0",
"tqdm",
"psutil",
"wheel>=0.42.0",
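The dependency bumps track the Gemma 2 work: transformers moves to >=4.42.3 (the 4.42 line appears to be the first with Gemma 2 support) and the sentencepiece floor becomes consistent across the extras. A minimal sketch, not part of the PR, for checking an existing environment against the new pins (assumes importlib.metadata and the packaging library are available):

# Hypothetical sanity check for the bumped pins; not unsloth code.
from importlib.metadata import version
from packaging.version import Version

assert Version(version("transformers")) >= Version("4.42.3")
assert Version(version("sentencepiece")) >= Version("0.2.0")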
102 changes: 73 additions & 29 deletions unsloth/kernels/cross_entropy_loss.py
@@ -19,14 +19,17 @@
from transformers.models.llama.modeling_llama import logger


@triton.heuristics({"DO_SOFTCAPPING": lambda args: args["DO_SOFTCAPPING"],})
@triton.jit
def _cross_entropy_forward(
logits_ptr, logits_row_stride,
loss_ptr,
logsumexp_ptr,
labels_ptr,
VOCAB_SIZE : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
VOCAB_SIZE : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
DO_SOFTCAPPING : tl.constexpr,
SOFTCAP : tl.constexpr,
):
"""
Cross Entropy Loss = 1/n sum [ -yi log(Pi) ]
@@ -58,29 +61,38 @@ def _cross_entropy_forward(
mask = col_offsets < VOCAB_SIZE

label_idx = tl.load(labels_ptr).to(tl.int32)
logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf")).to(tl.float32)
logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf"))
# Do logit softcapping for Gemma 2: t * tanh(1/t * x)
if DO_SOFTCAPPING: logits = SOFTCAP * tl.math.tanh(logits / SOFTCAP)

logits = logits.to(tl.float32)
c = tl.max(logits, 0)
logsumexp = c + tl.log(tl.sum(tl.exp(logits - c), 0))

if label_idx != -100:
x = tl.load(logits_ptr + label_idx).to(tl.float32)
loss = logsumexp - x
x = tl.load(logits_ptr + label_idx)
# Do logit softcapping for Gemma 2: t * tanh(1/t * x)
if DO_SOFTCAPPING: x = SOFTCAP * tl.math.tanh(x / SOFTCAP)
loss = logsumexp - x.to(tl.float32)
else:
loss = 0.0
tl.store(logsumexp_ptr, logsumexp)
tl.store(loss_ptr, loss)
pass
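
Restating the forward kernel above (not part of the diff): with DO_SOFTCAPPING enabled, each logit is capped before the loss is formed,

    z'_i = SOFTCAP * tanh(z_i / SOFTCAP)
    loss  = logsumexp(z') - z'_label        (loss = 0 when label == -100)

so every logit is bounded to (-SOFTCAP, SOFTCAP) before the logsumexp, which is why the label logit is re-capped separately before subtraction.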


@triton.heuristics({"DO_SOFTCAPPING": lambda args: args["DO_SOFTCAPPING"],})
@triton.jit
def _chunked_cross_entropy_forward(
logits_ptr, logits_row_stride,
loss_ptr,
logsumexp_ptr,
labels_ptr,
VOCAB_SIZE : tl.constexpr,
N_CHUNKS : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
VOCAB_SIZE : tl.constexpr,
N_CHUNKS : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
DO_SOFTCAPPING : tl.constexpr,
SOFTCAP : tl.constexpr,
):
"""
256K vocab divided in 4 chunks
Expand Down Expand Up @@ -117,7 +129,11 @@ def _chunked_cross_entropy_forward(
mask = col_offsets < VOCAB_SIZE

label_idx = tl.load(labels_ptr).to(tl.int32)
logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf")).to(tl.float32)
logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf"))
# Do logit softcapping for Gemma 2: t * tanh(1/t * x)
if DO_SOFTCAPPING: logits = SOFTCAP * tl.math.tanh(logits / SOFTCAP)

logits = logits.to(tl.float32)
c = tl.max(logits, 0)
logsumexp = c + tl.log(tl.sum(tl.exp(logits - c), 0))

@@ -126,7 +142,9 @@
# Do the -x separately
if label_idx != -100:
x = tl.load(logits_ptr + label_idx).to(tl.float32)
loss = -1.0 * x
# Do logit softcapping for Gemma 2: t * tanh(1/t * x)
if DO_SOFTCAPPING: x = SOFTCAP * tl.math.tanh(x / SOFTCAP)
loss = -1.0 * x.to(tl.float32)
else:
loss = 0.0
tl.store(loss_ptr, loss)
pass


@triton.heuristics({"DO_SOFTCAPPING": lambda args: args["DO_SOFTCAPPING"],})
@triton.jit
def _cross_entropy_backward(
logits_ptr, logits_row_stride,
dloss_ptr, dloss_row_stride,
logsumexp_ptr,
labels_ptr,
VOCAB_SIZE : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
VOCAB_SIZE : tl.constexpr,
BLOCK_SIZE : tl.constexpr,
DO_SOFTCAPPING : tl.constexpr,
SOFTCAP : tl.constexpr,
):
"""
CE_i = -y log(P) = y * (log[sum(exp(x))] - x)
@@ -173,15 +194,27 @@ def _cross_entropy_backward(
else:
dloss = 0.0

x = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf")).to(tl.float32)
x = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf"))
# Do logit softcapping for Gemma 2: t * tanh(1/t * x)
if DO_SOFTCAPPING:
# d/dx [t * tanh(1/t * x)] = 1 - tanh^2(1/t * x)
partial = tl.math.tanh(x / SOFTCAP)
x = SOFTCAP * partial
pass

logsumexp = tl.load(logsumexp_ptr + row_idx)
y = tl.exp(x - logsumexp)
y = tl.exp(x.to(tl.float32) - logsumexp)
y = tl.where(
col_offsets == label_idx,
y - 1.0, # exp(x - logsumexp) - 1
y, # exp(x - logsumexp)
)

if DO_SOFTCAPPING:
# d/dx [t * tanh(1/t * x)] = 1 - tanh^2(1/t * x)
y = y * (1.0 - partial*partial)
pass

# If y == 0: dC/dx = 0 ==> we already masked it to be = 0, so dloss = 0.
tl.store(logits_ptr + col_offsets, dloss * y, mask = mask)
pass
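Restating the backward kernel above: with softcapping, the stored gradient is the usual softmax-minus-one-hot term chained through the cap's derivative,

    dL/dz_j = (softmax(z')_j - 1[j == label]) * (1 - tanh^2(z_j / SOFTCAP))

which is exactly the extra `y * (1.0 - partial*partial)` factor written back into logits_ptr; without softcapping the second factor is simply 1.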

class Fast_CrossEntropyLoss(torch.autograd.Function):
@staticmethod
def forward(ctx, logits, labels):
def forward(ctx, logits, labels, logit_softcapping = 0):
n_rows, vocab_size = logits.shape

div, mod = divmod(vocab_size, MAX_FUSED_SIZE)
n_chunks = div + (mod != 0)
losses = torch.empty(n_rows, dtype = torch.float32, device = "cuda")
losses = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")

DO_SOFTCAPPING = (logit_softcapping != 0)

if n_chunks == 1:
# For small vocabs <= 65536 like Llama, Mistral
BLOCK_SIZE, num_warps = calculate_settings(vocab_size)
logsumexp = torch.empty(n_rows, dtype = torch.float32, device = "cuda")
logsumexp = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")

_cross_entropy_forward[(n_rows,)](
logits, logits.stride(0),
losses,
logsumexp,
labels,
VOCAB_SIZE = vocab_size,
BLOCK_SIZE = BLOCK_SIZE,
num_warps = num_warps,
VOCAB_SIZE = vocab_size,
BLOCK_SIZE = BLOCK_SIZE,
DO_SOFTCAPPING = DO_SOFTCAPPING,
SOFTCAP = logit_softcapping,
num_warps = num_warps,
)
else:
# For large vocabs > 65536 like Gemma 256K
logsumexp = torch.empty((n_rows, n_chunks,), dtype = torch.float32, device = "cuda")
logsumexp = torch.empty((n_rows, n_chunks,), dtype = torch.float32, device = "cuda:0")

_chunked_cross_entropy_forward[(n_rows, n_chunks,)](
logits, logits.stride(0),
losses,
logsumexp,
labels,
VOCAB_SIZE = vocab_size,
N_CHUNKS = n_chunks,
BLOCK_SIZE = MAX_FUSED_SIZE,
num_warps = 32,
VOCAB_SIZE = vocab_size,
N_CHUNKS = n_chunks,
BLOCK_SIZE = MAX_FUSED_SIZE,
DO_SOFTCAPPING = DO_SOFTCAPPING,
SOFTCAP = logit_softcapping,
num_warps = 32,
)
# logsumexp(chunked_logsumexp) - x
# Do the -x separately
pass

ctx.save_for_backward(logits, logsumexp, labels)
ctx.DO_SOFTCAPPING = DO_SOFTCAPPING
ctx.logit_softcapping = logit_softcapping
return losses
pass

dlosses, dlosses.stride(0),
logsumexp,
labels,
VOCAB_SIZE = vocab_size,
BLOCK_SIZE = BLOCK_SIZE,
num_warps = 8,
VOCAB_SIZE = vocab_size,
BLOCK_SIZE = BLOCK_SIZE,
DO_SOFTCAPPING = ctx.DO_SOFTCAPPING,
SOFTCAP = ctx.logit_softcapping,
num_warps = 8,
)
return logits, None, None,
pass
pass


def fast_cross_entropy_loss(logits, labels):
def fast_cross_entropy_loss(logits, labels, logit_softcapping = 0):
"""
Arguments:
logits: (batch, seq_len, vocab_size)
loss = Fast_CrossEntropyLoss.apply(
logits.view(batch*seq_len, d),
labels.view(-1),
logit_softcapping,
)
n_items = torch.count_nonzero(labels != -100)
return loss.sum() / n_items
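For readers cross-checking the Triton path, a minimal eager-mode sketch (not from this PR; the comparison below is illustrative) of what fast_cross_entropy_loss is intended to compute — mean cross entropy over non-masked (-100) positions, with optional Gemma 2-style logit softcapping:

import torch
import torch.nn.functional as F

def reference_softcapped_ce(logits, labels, logit_softcapping = 0.0):
    # logits: (batch, seq_len, vocab); labels: (batch, seq_len) with -100 at masked positions.
    batch, seq_len, vocab = logits.shape
    logits = logits.float().view(-1, vocab)
    labels = labels.view(-1)
    if logit_softcapping != 0:
        logits = logit_softcapping * torch.tanh(logits / logit_softcapping)
    total = F.cross_entropy(logits, labels, ignore_index = -100, reduction = "sum")
    return total / (labels != -100).sum()

# Illustrative comparison (30.0 is often cited as Gemma 2's final-logit softcap):
# torch.testing.assert_close(
#     fast_cross_entropy_loss(logits, labels, logit_softcapping = 30.0),
#     reference_softcapped_ce(logits, labels, 30.0),
#     rtol = 1e-3, atol = 1e-3,
# )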
4 changes: 2 additions & 2 deletions unsloth/kernels/geglu.py
@@ -41,7 +41,7 @@ def _exact_forward_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,):
def geglu_exact_forward_kernel(gate, up):
batch, seq_len, hd = gate.shape
n_elements = gate.numel()
out = torch.empty((batch, seq_len, hd), dtype = gate.dtype, device = "cuda")
out = torch.empty((batch, seq_len, hd), dtype = gate.dtype, device = "cuda:0")
grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
_exact_forward_kernel[grid](gate, up, out, n_elements, BLOCK_SIZE = 1024,)
return out
@@ -133,7 +133,7 @@ def _approx_forward_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,):
def geglu_approx_forward_kernel(gate, up):
batch, seq_len, hd = gate.shape
n_elements = gate.numel()
out = torch.empty((batch, seq_len, hd), dtype = gate.dtype, device = "cuda")
out = torch.empty((batch, seq_len, hd), dtype = gate.dtype, device = "cuda:0")
grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
_approx_forward_kernel[grid](gate, up, out, n_elements, BLOCK_SIZE = 1024,)
return out
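Only the output buffer's device changes here, but as a reference for what the two fused kernels compute (my reading of the surrounding code, not part of the diff): exact GEGLU applies erf-based GELU to the gate, the approximate variant uses the tanh approximation, and both multiply elementwise by the up projection.

import torch
import torch.nn.functional as F

def geglu_exact_reference(gate, up):
    # Exact (erf) GELU on the gate, then elementwise product with up.
    return F.gelu(gate, approximate = "none") * up

def geglu_approx_reference(gate, up):
    # Tanh-approximated GELU, matching the "approx" kernel's formulation.
    return F.gelu(gate, approximate = "tanh") * up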
6 changes: 3 additions & 3 deletions unsloth/kernels/rms_layernorm.py
@@ -119,7 +119,7 @@ def _gemma_rms_layernorm_forward(
W_row = tl.load(W + col_offsets, mask = mask, other = 0).to(tl.float32)

row_var = tl.sum(X_row * X_row, axis = 0) / n_cols
inv_var = 1.0 / tl.sqrt(row_var + eps) # Must be 1/sqrt to match Deepmind's impl
inv_var = tl.math.rsqrt(row_var + eps)
tl.store(r, inv_var)
normed = X_row * inv_var
output = normed * (W_row + 1.0)
@@ -137,8 +137,8 @@ def forward(ctx, X, W, eps, gemma = False):
n_rows, n_cols = X.shape
BLOCK_SIZE, num_warps = calculate_settings(n_cols)

Y = torch.empty((n_rows, n_cols), dtype = X.dtype, device = "cuda")
r = torch.empty(n_rows, dtype = torch.float32, device = "cuda")
Y = torch.empty((n_rows, n_cols), dtype = X.dtype, device = "cuda:0")
r = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")

fx = _gemma_rms_layernorm_forward if gemma else _rms_layernorm_forward
fx[(n_rows,)](
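The Gemma RMSNorm forward shown above normalizes in float32 and scales by (weight + 1); replacing 1.0 / tl.sqrt(...) with tl.math.rsqrt(...) gives the same value up to float32 rounding while using the fused reciprocal square root. A small eager sketch of the same computation (my restatement, not unsloth code; the eps default is a placeholder — the real value comes from the model config):

import torch

def gemma_rms_layernorm_reference(X, W, eps = 1e-6):
    # Row-wise RMS norm in float32 with Gemma's (1 + weight) scaling.
    X32 = X.float()
    inv_var = torch.rsqrt(X32.pow(2).mean(dim = -1, keepdim = True) + eps)
    return (X32 * inv_var * (W.float() + 1.0)).to(X.dtype)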
2 changes: 1 addition & 1 deletion unsloth/kernels/swiglu.py
@@ -41,7 +41,7 @@ def _fg_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,):
def swiglu_fg_kernel(e, g):
batch, seq_len, hd = e.shape
n_elements = e.numel()
h = torch.empty((batch, seq_len, hd), dtype = e.dtype, device = "cuda")
h = torch.empty((batch, seq_len, hd), dtype = e.dtype, device = "cuda:0")
grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
_fg_kernel[grid](e, g, h, n_elements, BLOCK_SIZE = 1024,)
return h
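As with GEGLU, only the buffer device changes here; for reference, the fused _fg_kernel computes SwiGLU — silu(e) * g, i.e. e * sigmoid(e) * g (my reading of the kernel, shown as an eager sketch, not part of the diff):

import torch
import torch.nn.functional as F

def swiglu_reference(e, g):
    # h = silu(e) * g = e * sigmoid(e) * g
    return F.silu(e) * g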
8 changes: 4 additions & 4 deletions unsloth/kernels/utils.py
@@ -105,14 +105,14 @@ def fast_dequantize(W, quant_state = None, out = None):

# Create weight matrix
if out is None:
out = torch.empty(shape, dtype = dtype, device = "cuda")
out = torch.empty(shape, dtype = dtype, device = "cuda:0")
else:
assert(out.shape == shape)
assert(out.dtype == dtype)

# NF4 dequantization of statistics
n_elements_absmax = absmax.numel()
out_absmax = torch.empty(n_elements_absmax, dtype = torch.float32, device = "cuda")
out_absmax = torch.empty(n_elements_absmax, dtype = torch.float32, device = "cuda:0")

# Do dequantization
ptr_out_absmax = get_ptr(out_absmax)
@@ -161,7 +161,7 @@ def fast_gemv(X, W, quant_state, out = None):
bout = shape[0]

if out is None:
out = torch.empty((1, 1, bout,), dtype = dtype, device = "cuda")
out = torch.empty((1, 1, bout,), dtype = dtype, device = "cuda:0")
# else:
# assert(out.shape == (1, 1, bout,))
# pass
ldb = ctypes.c_int32(ldb)
ldc = ctypes.c_int32(ldc)

df = torch.empty(absmax.shape, dtype = torch.float32, device = "cuda")
df = torch.empty(absmax.shape, dtype = torch.float32, device = "cuda:0")
cdequantize_blockwise_fp32(
get_ptr(code2), get_ptr(absmax), get_ptr(absmax2), get_ptr(df),
ctypes.c_int(blocksize2), ctypes.c_int(df.numel()),
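A recurring change across these kernel files is pinning temporary buffers to device = "cuda:0" rather than the generic "cuda" string. My reading (the diff itself does not explain it): "cuda" resolves to whichever CUDA device is current, so scratch buffers could land on a different GPU than the kernel's inputs in multi-GPU setups, while "cuda:0" always means the first visible device. A tiny illustration:

import torch

# "cuda" follows torch.cuda.current_device(); "cuda:0" is always index 0
# (CUDA_VISIBLE_DEVICES still controls which physical GPU that is).
a = torch.empty(4, device = "cuda")    # allocated on the current device
b = torch.empty(4, device = "cuda:0")  # always allocated on device index 0
print(a.device, b.device)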