
Commit 33f6064

Merge branch 'develop' of https://github.com/gzy19990617/PaddleNLP into develop

2 parents: 131bb5b + d3ee14f

8 files changed: +429 −273 lines

csrc/gpu/set_preids_token_penalty_multi_scores.cu

Lines changed: 5 additions & 5 deletions

```diff
@@ -41,11 +41,11 @@ __global__ void set_preids_token_penalty_multi_scores_kernel(const bool *stop_flags
   T *logits_now = logits + bi * length;
   int tid = threadIdx.x;
 
-  if (tid < bs && !stop_flags[tid]) {
-    int64_t *pre_ids_now = pre_ids + tid * length_id;
-    const int64_t *input_ids_now = input_ids + tid * length_input_ids;
-    const int seq_len_dec = seq_lens_decoder[tid];
-    const int seq_len_enc = seq_lens_encoder[tid];
+  if (bi < bs && !stop_flags[bi]) {
+    int64_t *pre_ids_now = pre_ids + bi * length_id;
+    const int64_t *input_ids_now = input_ids + bi * length_input_ids;
+    const int seq_len_dec = seq_lens_decoder[bi];
+    const int seq_len_enc = seq_lens_encoder[bi];
     if (seq_len_dec == 0 && seq_len_enc == 0) return;  // stopped
 
     const int step_idx_now = step_idx[bi];
```
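For context: per-sequence arrays in this kernel must be indexed by the batch index `bi` (set earlier in the kernel, as `logits_now = logits + bi * length` suggests), not by `threadIdx.x`; guarding on `tid < bs` also made every thread in the block repeat the update against the wrong rows. A minimal Python sketch of the intended per-row semantics (function and argument names mirror the kernel's; the penalty bookkeeping body is elided):

```python
def set_pre_ids_rowwise(pre_ids, input_ids, seq_lens_decoder, seq_lens_encoder, stop_flags):
    """Reference semantics: every per-sequence array is indexed by the
    batch row bi, never by a thread index, so rows stay independent."""
    bs = pre_ids.shape[0]
    for bi in range(bs):  # bi plays the role of the kernel's per-block batch index
        if stop_flags[bi]:
            continue
        row_pre_ids = pre_ids[bi]      # pre_ids + bi * length_id
        row_input_ids = input_ids[bi]  # input_ids + bi * length_input_ids
        if seq_lens_decoder[bi] == 0 and seq_lens_encoder[bi] == 0:
            continue  # row already stopped
        # ... per-row penalty bookkeeping reads row_input_ids and writes row_pre_ids ...
```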

llm/auto_parallel/llama/run_pretrain_auto.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -539,6 +539,7 @@ def main():
     config.tensor_parallel_degree = training_args.tensor_parallel_degree
     config.tensor_parallel_rank = training_args.tensor_parallel_rank
     config.sharding_parallel_degree = training_args.sharding_parallel_degree
+    config.to_static = training_args.to_static
 
     if training_args.strategy.pipeline.enable and config.virtual_pp_degree > 1:
         pipeline = training_args.strategy.pipeline
@@ -556,6 +557,11 @@ def main():
 
     print("Final pre-training config:", config)
 
+    if "replace_with_parallel_cross_entropy" in training_args.tensor_parallel_config and config.tensor_parallel_degree > 1 and config.to_static is False:
+        from llm.utils.replace_ops import replace_cross_entropy
+
+        replace_cross_entropy()
+
     # # Set the dtype for loading model
     # dtype = "float32"
     # if training_args.fp16_opt_level == "O2":
```
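The gate requires all three conditions because `replace_cross_entropy()` rebinds a Python-level module attribute at runtime: it can only affect dynamic-graph runs (`to_static is False`), and it is only useful when the logits are actually sharded across an mp group. A minimal sketch of the binding caveat, assuming the module path from this diff; only call sites that look the function up through the module after the patch see the replacement:

```python
import paddle.nn.functional as F
from paddle.nn.functional import cross_entropy  # bound by name *before* the patch

from llm.utils.replace_ops import replace_cross_entropy

replace_cross_entropy()

assert F.cross_entropy.__name__ == "parallel_cross_entropy"  # module lookup sees the patch
assert cross_entropy.__name__ == "cross_entropy"             # the earlier binding does not
```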

llm/predict/predictor.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -1311,6 +1311,8 @@ def insert_task(self, pos, task_id, repeat_num):
         self.model_inputs["stop_flags"][pos] = False
         self.model_inputs["result_id"][pos][0] = task_id
         self.model_inputs["step_idx"][pos, 0] = 1
+        self.model_inputs["pre_ids"][pos][0] = self.input_ids[query_id][-1]
+        self.model_inputs["pre_ids"][pos][1:] = -1
         self.model_inputs["not_need_stop"][0] = True
 
         num_prefill_blocks = length // self.block_size
```
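The two added lines re-seed the `pre_ids` buffer when a batch slot is reused: position 0 gets the last prompt token of the incoming query (`query_id` comes from the surrounding method, not shown here) and the remainder is cleared to -1, so the token-penalty kernel above no longer sees the previous task's decode history. A hedged NumPy sketch of the reset, with illustrative buffer shapes:

```python
import numpy as np

max_batch, max_dec_len = 8, 1024
pre_ids = np.full((max_batch, max_dec_len), -1, dtype=np.int64)

def reset_slot(pos, prompt_ids):
    pre_ids[pos, 0] = prompt_ids[-1]  # seed with the last prompt token
    pre_ids[pos, 1:] = -1             # wipe stale history from the prior task

reset_slot(3, np.array([101, 2023, 42], dtype=np.int64))
```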

llm/utils/replace_ops.py

Lines changed: 240 additions & 0 deletions (new file)

```python
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import Literal, TypeAlias

import paddle
from paddle import Tensor, _C_ops
from paddle.distributed import collective
from paddle.distributed.fleet.base import topology as tp
from paddle.tensor.manipulation import reshape

_ReduceMode: TypeAlias = Literal["mean", "sum", "none"]


# TODO: this function is rewritten from paddle.nn.functional.cross_entropy;
# it would be better to merge the two into one.
def parallel_cross_entropy(
    input: Tensor,
    label: Tensor,
    weight: Tensor | None = None,
    ignore_index: int = -100,
    reduction: _ReduceMode = "mean",
    soft_label: bool = False,
    axis: int = -1,
    use_softmax: bool = True,
    label_smoothing: float = 0.0,
    name: str | None = None,
) -> Tensor:
    if reduction not in ["sum", "mean", "none"]:
        raise ValueError(
            "The value of 'reduction' in softmax_cross_entropy "
            f"should be 'sum', 'mean' or 'none', but received {reduction}, which is not allowed."
        )
    if ignore_index > 0 and soft_label:
        raise ValueError(
            "When soft_label == True, the value of 'ignore_index' in softmax_cross_entropy "
            f"should be '-100', but received {ignore_index}, which is not allowed."
        )

    input_dims = len(list(input.shape))
    if input_dims == 0:
        raise ValueError("The dimension of input should be larger than zero!")

    label_dims = len(list(label.shape))
    if input_dims - 1 == label_dims:
        label = paddle.unsqueeze(label, axis=axis)

    if input_dims - 1 != label_dims and input_dims != label_dims:
        raise ValueError(
            f"Expected input_dims - 1 == label_dims or input_dims == label_dims "
            f"(got input_dims {input_dims}, label_dims {label_dims})"
        )

    if label_smoothing > 0.0:
        soft_label = True
        # Convert the label to one-hot encoding:
        # 1d case: label's shape goes from [N] to [N, C];
        # 2d case: label's shape goes from [N, d_1, ..., d_k] to [N, d_1, ..., d_k, C].
        if input_dims - 1 == label_dims:
            label = paddle.squeeze(label, axis=axis)
            label = paddle.nn.functional.one_hot(label, input.shape[-1])

        label = paddle.nn.functional.label_smooth(label, epsilon=label_smoothing)
        label = label.astype(input.dtype)
        label_dims = len(list(label.shape))

    if not soft_label:
        valid_label = paddle.cast(label != ignore_index, dtype=label.dtype) * label

    if not soft_label and is_tensor_sharded(input):
        group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group()
        ring_id = group.id
        nranks = group.nranks
        global_rank = collective._get_global_env().rank
        rank = group.get_group_rank(global_rank)
        _, out = _C_ops.c_softmax_with_cross_entropy(input, label, ignore_index, ring_id, rank, nranks)
    else:
        from paddlenlp.utils.log import logger

        logger.warning(
            "Failed to replace CrossEntropyLoss with ParallelCrossEntropyLoss. Please ensure: \n"
            "1. soft_label=False is set for parallel computation (current value: {}) \n"
            "2. Input tensor is properly sharded (current sharding status: {}) \n".format(
                soft_label,
                input.placements if input.is_dist() else None,
            )
        )

        _, out = _C_ops.cross_entropy_with_softmax(input, label, soft_label, use_softmax, True, ignore_index, axis)

    if weight is not None:
        # Translate weight from per-class to per-sample, shape [N] or [N, H, W]
        # for the 1d and 2d cases respectively.
        if soft_label:
            # weight's shape is [C], where C is the class count.
            # 1d case: label's shape is [N, C], weight_gather's shape is [N].
            # 2d case: label's shape is [N, H, W, C], weight_gather's shape is [N, H, W].
            weight_gather = paddle.matmul(
                x=paddle.cast(label, weight.dtype),
                y=weight,
                transpose_x=False,
                transpose_y=True,
            )
            out_shape = list(out.shape)
            weight_gather_reshape = reshape(weight_gather, shape=out_shape)
            out = paddle.cast(out, weight_gather_reshape.dtype)

            out = _C_ops.multiply(out, weight_gather_reshape)
        else:
            if input.shape[axis] != weight.shape[-1]:
                raise ValueError(
                    f"input's class_dimension({input.shape[axis]}) must equal to "
                    f"weight's class_dimension({weight.shape[-1]}) "
                    "when weight is provided"
                )

            ignore_weight_mask = paddle.cast((label != ignore_index), out.dtype)
            if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[axis] == 1:
                # TODO: temporarily use squeeze instead of squeeze_
                ignore_weight_mask = paddle.squeeze(ignore_weight_mask, axis)
            if axis != -1 and axis != valid_label.ndim - 1:
                temp_perm = (
                    list(range(axis % valid_label.ndim))
                    + list(range((axis % valid_label.ndim + 1), valid_label.ndim))
                    + [axis % valid_label.ndim]
                )
                weight_gather = _C_ops.gather_nd(weight, valid_label.transpose(temp_perm))
            else:
                weight_gather = _C_ops.gather_nd(weight, valid_label)
            weight_gather = _C_ops.multiply(weight_gather, ignore_weight_mask)
            input_shape = list(label.shape)
            weight_gather_reshape = reshape(weight_gather, shape=input_shape)
            out = paddle.cast(out, weight_gather_reshape.dtype)
            out = _C_ops.multiply(out, weight_gather_reshape)

    if reduction == "sum":
        # The loss of any sample with class_index == ignore_index is already 0
        # in the op's output, so a plain reduce_sum is correct.
        return _C_ops.sum(out, [], None, False)
    elif reduction == "mean":
        # 1. weight is None:
        #    numerator: reduce_sum over all losses (safe, see the note above);
        #    denominator: count of samples with class_index != ignore_index.
        # 2. otherwise:
        #    numerator: weighted sum of losses;
        #    denominator: sum of weights where class_index != ignore_index.
        if ignore_index >= 0:  # ignore label
            out_sum = _C_ops.sum(out, [], None, False)
            # mask[i] = 0 if label[i] == ignore_index, else 1
            mask = label != ignore_index
            if weight is None:
                mask = paddle.cast(mask, dtype=out_sum.dtype)
                count = _C_ops.sum(mask, [], None, False)
                ret = out_sum / (count + (count == 0.0).astype(count.dtype))
            else:
                mask = paddle.cast(mask, weight_gather_reshape.dtype)
                weight_ignored = _C_ops.multiply(mask, weight_gather_reshape)
                weight_sum = _C_ops.sum(weight_ignored, [], None, False)
                ret = out_sum / (weight_sum + (weight_sum == 0.0).astype(weight_sum.dtype))
            return ret
        elif weight is not None:
            out_sum = _C_ops.sum(out, [], None, False)
            total_weight = _C_ops.sum(weight_gather_reshape, [], None, False)
            return out_sum / (total_weight + (total_weight == 0.0).astype(total_weight.dtype))
        else:
            return _C_ops.mean_all(out)

    else:  # reduction == "none"
        if input_dims - 1 == label_dims:
            out = paddle.squeeze(out, axis=axis)
        return out


# TODO: placements[1] may not be the mp axis.
def is_tensor_sharded(tensor):
    if not tensor.is_dist():
        return False

    placements = tensor.placements
    return placements[1].is_shard()


def replace_cross_entropy():
    paddle.nn.functional.cross_entropy = parallel_cross_entropy
```
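A hedged usage sketch of the new helper: after `replace_cross_entropy()`, lookups of `paddle.nn.functional.cross_entropy` resolve to `parallel_cross_entropy`, which uses the mp-sharded `c_softmax_with_cross_entropy` kernel when the logits are a dist tensor sharded along the class axis, and otherwise logs a warning and falls back to the regular kernel. The single-process fallback path can be exercised like this (shapes are illustrative):

```python
import paddle
import paddle.nn.functional as F

from llm.utils.replace_ops import replace_cross_entropy

replace_cross_entropy()  # rebinds the module attribute

logits = paddle.randn([4, 128])          # dense, non-distributed logits
labels = paddle.randint(0, 128, [4])
loss = F.cross_entropy(logits, labels)   # warns, then uses the regular kernel
print(float(loss))
```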

paddlenlp/experimental/transformers/fused_transformer_layers.py

Lines changed: 7 additions & 9 deletions

```diff
@@ -1131,13 +1131,13 @@ def compute_qkv_linear(self, ln_out, i, latent_cache=None, **kwargs):
             qkv_out = paddle.add(qkv_out, self.qkv_biases[i])
         return qkv_out
 
-    def compute_qkv(self, src, residual_input, i):
+    def compute_qkv(self, src, residual_input, i, **kwargs):
         ln_out = self.compute_layernorm_before_qkv(src, i)
 
         if self.config.mla_config.use_absorb():
             qkv_out = ln_out
         else:
-            qkv_out = self.compute_qkv_linear(ln_out, i)
+            qkv_out = self.compute_qkv_linear(ln_out, i, **kwargs)
 
         return qkv_out, residual_input
@@ -1523,7 +1523,7 @@ def forward(
 
         residual_input = src
         for i in range(self.num_layers):
-            qkv_out, residual_input = self.compute_qkv(src, residual_input, i)
+            qkv_out, residual_input = self.compute_qkv(src, residual_input, i, **kwargs)
             fmha_out = self.compute_attn(
                 time_step,
                 qkv_out,
@@ -1596,7 +1596,7 @@ class FusedMultiTransformerPostLayernorm(FusedMultiTransformerBase):
     def __init__(self, config: FusedMultiTransformerConfig):
         super().__init__(config)
 
-    def compute_qkv(self, src, residual_input, i):
+    def compute_qkv(self, src, residual_input, i, **kwargs):
         qkv_out = self.compute_qkv_linear(src, i)
         return qkv_out, src
@@ -2055,9 +2055,7 @@ def compute_qkv_linear(self, ln_out, i, latent_cache=None, **kwargs):
             epsilon=self._epsilon,
             begin_norm_axis=1,
         )[0]
-        query_pe, key_pe = self.config.rotary_emb(
-            self.position_ids[0 : kwargs.get("seq_lens_encoder", None).sum()], query_pe, key_pe
-        )
+        query_pe, key_pe = self.config.rotary_emb(self.position_ids, query_pe, key_pe)
 
         if self.config.mla_config.use_absorb():
             from paddlenlp_ops import prefill_mla_write_cache
@@ -2689,7 +2687,7 @@ def compute_layernorm_before_qkv(self, src, i):
 
         return ln_out
 
-    def compute_qkv_linear(self, ln_out, i):
+    def compute_qkv_linear(self, ln_out, i, **kwargs):
         if self.config.mla_config.use_mla():
             raise NotImplementedError("Not support MLA yet.")
         else:
@@ -5140,7 +5138,7 @@ def compute_layernorm_before_qkv(self, src, i):
 
         return ln_out
 
-    def compute_qkv_linear(self, ln_out, i):
+    def compute_qkv_linear(self, ln_out, i, **kwargs):
         if self.config.mla_config.use_mla():
             raise NotImplementedError("Not support MLA yet.")
         else:
```
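The recurring signature change threads the caller's `**kwargs` (for example `seq_lens_encoder`) from `forward` through `compute_qkv` into the `compute_qkv_linear` overrides; separately, the MLA rotary-embedding call now passes the full `self.position_ids` instead of slicing it by `kwargs.get("seq_lens_encoder", None).sum()`. A minimal sketch of the plumbing pattern, with illustrative class and method names rather than the actual PaddleNLP classes:

```python
class FusedLayersSketch:
    """Illustrative only: shows **kwargs flowing from forward()
    through compute_qkv() into compute_qkv_linear() overrides."""

    num_layers = 2

    def forward(self, src, **kwargs):
        residual = src
        out = src
        for i in range(self.num_layers):
            out, residual = self.compute_qkv(src, residual, i, **kwargs)
        return out

    def compute_qkv(self, src, residual, i, **kwargs):
        # accept and forward kwargs unchanged so subclasses overriding
        # compute_qkv_linear can consume extras without signature churn
        return self.compute_qkv_linear(src, i, **kwargs), residual

    def compute_qkv_linear(self, ln_out, i, **kwargs):
        seq_lens = kwargs.get("seq_lens_encoder")  # optional per-request info
        return (ln_out, seq_lens)
```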
