Merged
4 changes: 2 additions & 2 deletions paddlenlp/data/vocab.py
@@ -27,8 +27,8 @@ class Vocab(object):
store/load functions.

Args:
- counter (collections.Counter, optional): A Counter intance describes
- the tokens and their frequencies. Its keys will be indexed accroding
+ counter (collections.Counter, optional): A Counter instance describes
+ the tokens and their frequencies. Its keys will be indexed according
to the order of frequency sorting to construct mapping relationship.
If None, `token_to_idx` must be provided as the mapping relationship.
Default: None.
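
As a side note for readers of this hunk, a minimal usage sketch of the documented behaviour (toy data; the to_indices/to_tokens method names are recalled from PaddleNLP's Vocab API and worth double-checking):

from collections import Counter
from paddlenlp.data import Vocab

# Keys of the Counter are indexed according to frequency order, as the docstring says.
counter = Counter("the cat sat on the mat and the dog sat".split())
vocab = Vocab(counter=counter, unk_token="[UNK]")
print(vocab.to_indices(["the", "cat"]))  # token -> index
print(vocab.to_tokens(1))                # index -> token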
2 changes: 1 addition & 1 deletion paddlenlp/datasets/dataset.py
@@ -570,7 +570,7 @@ def remove_if_exit(filepath):
datasets = DatasetTuple(splits)
parallel_env = dist.ParallelEnv()
unique_endpoints = _get_unique_endpoints(parallel_env.trainer_endpoints[:])
- # move register hook to first and register togather
+ # move register hook to first and register together
lock_files = []
for split in splits:
lock_file = os.path.join(DATA_HOME, self.__class__.__name__)
2 changes: 1 addition & 1 deletion paddlenlp/experimental/autonlp/README_en.md
@@ -6,7 +6,7 @@

**The AutoNLP APIs are subjective to significant changes until formal release**

- **AutoNLP** is an experimental project by PaddleNLP to democratize NLP for everyone. Delivering a successful NLP project is not easy, as it requires deep domain knowledge. Time after time, we have seen people struggle to make NLP work on their dataset, for their projects, which is why we are building **AutoNLP**. Compared with the traditional AutoML approach of massive paid compute for State-of-the-Art model performance, we have a different philosphy:
+ **AutoNLP** is an experimental project by PaddleNLP to democratize NLP for everyone. Delivering a successful NLP project is not easy, as it requires deep domain knowledge. Time after time, we have seen people struggle to make NLP work on their dataset, for their projects, which is why we are building **AutoNLP**. Compared with the traditional AutoML approach of massive paid compute for State-of-the-Art model performance, we have a different philosophy:


1. Instead of training State-of-the-Art models on huge datasets running on huge clusters, our goal is to deliver **decent models under limited compute**. We assume our users have a few GPUs at most and want to get decent models under 8 hours on their own in-house datasets. Note that you can get this level of compute for FREE on [Baidu AI Studio](https://aistudio.baidu.com/aistudio).
2 changes: 1 addition & 1 deletion paddlenlp/experimental/faster_tokenizer.py
@@ -46,7 +46,7 @@ def to_vocab_buffer(vocab_dict, name):
NOTICE: The value will be held in the cpu place.

Args:
- vocab_dict(dict): The value will be setted to the tensor.
+ vocab_dict(dict): The value will be set to the tensor.
The key is token and the value is the token index.
name(string): The name of the tensor.
"""
2 changes: 1 addition & 1 deletion paddlenlp/experimental/transformers/qwen/modeling.py
@@ -499,7 +499,7 @@ def forward(
hidden_states = outputs[0]

# if labels is None,means we need full output, instead of tensor_parallel_output
- # tensor_parallel_output is togather with ParallelCrossEntropy
+ # tensor_parallel_output is together with ParallelCrossEntropy
tensor_parallel_output = (
self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1
)
2 changes: 1 addition & 1 deletion paddlenlp/experimental/transformers/qwen2/modeling.py
@@ -1210,7 +1210,7 @@ def forward(
hidden_states = outputs[0]

# if labels is None,means we need full output, instead of tensor_parallel_output
- # tensor_parallel_output is togather with ParallelCrossEntropy
+ # tensor_parallel_output is together with ParallelCrossEntropy
tensor_parallel_output = (
self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1
)
2 changes: 1 addition & 1 deletion paddlenlp/generation/logits_process.py
@@ -439,7 +439,7 @@ def __init__(self, sequence_bias: Dict[Tuple[int], float]):
self._validate_arguments()

# Bias variables that will be populated on the first call (for retrocompatibility purposes, the vocabulary size
- # is infered in the first usage, which inhibits initializing here)
+ # is inferred in the first usage, which inhibits initializing here)
self.length_1_bias = None
self.prepared_bias_variables = False

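For context on the comment fixed here, a framework-free sketch (hypothetical class, NumPy only) of the lazy-initialization pattern it describes: the bias row can only be materialized once the first scores tensor reveals the vocabulary size.

import numpy as np

class LazySequenceBias:
    # Toy stand-in for the processor above, not the PaddleNLP implementation.
    def __init__(self, sequence_bias):
        self.sequence_bias = sequence_bias   # {(token_id,): bias_value}
        self.length_1_bias = None            # populated on the first call
        self.prepared_bias_variables = False

    def __call__(self, scores):
        if not self.prepared_bias_variables:
            vocab_size = scores.shape[-1]    # inferred at first usage, not in __init__
            self.length_1_bias = np.zeros(vocab_size, dtype=scores.dtype)
            for (token_id,), bias in self.sequence_bias.items():
                self.length_1_bias[token_id] = bias
            self.prepared_bias_variables = True
        return scores + self.length_1_bias
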
2 changes: 1 addition & 1 deletion paddlenlp/layers/crf.py
@@ -248,7 +248,7 @@ def __init__(self, crf):
self.crf = crf
if isinstance(crf, paddle.Tensor):
raise ValueError(
"From paddlenlp >= 2.0.0b4, the first param of LinearChainCrfLoss shoule be a LinearChainCrf object. For input parameter 'crf.transitions', you can remove '.transitions' to 'crf'"
"From paddlenlp >= 2.0.0b4, the first param of LinearChainCrfLoss should be a LinearChainCrf object. For input parameter 'crf.transitions', you can remove '.transitions' to 'crf'"
)

def forward(self, inputs, lengths, labels, old_version_labels=None):
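
A hedged usage sketch of what the corrected message asks for (constructor arguments recalled from memory and may differ between versions): pass the LinearChainCrf layer itself, not its transitions tensor.

from paddlenlp.layers import LinearChainCrf, LinearChainCrfLoss

num_labels = 7                                   # hypothetical label count
crf = LinearChainCrf(num_labels)
loss_fn = LinearChainCrfLoss(crf)                # correct: the layer object
# loss_fn = LinearChainCrfLoss(crf.transitions)  # pre-2.0.0b4 style, now rejected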
8 changes: 4 additions & 4 deletions paddlenlp/ops/distributed/parallel.py
@@ -191,8 +191,8 @@ def __init__(self, size, num_partitions=1, gather_out=True, param_attr=None, bia
main_block = paddle.static.default_main_program().global_block()
startup_block.vars[weight.name].is_distributed = True
main_block.vars[weight.name].is_distributed = True
- # set is_distributed for splited bias
- # if a linear layer is splited by col, the bias would also be split into each rank as its weight
+ # set is_distributed for split bias
+ # if a linear layer is split by col, the bias would also be split into each rank as its weight
if self.linear._bias_attr:
startup_block.vars[self.linear.bias.name].is_distributed = True
main_block.vars[self.linear.bias.name].is_distributed = True
@@ -285,8 +285,8 @@ def __init__(self, size, num_partitions=1, input_is_parallel=False, param_attr=N
main_block = paddle.static.default_main_program().global_block()
startup_block.vars[weight.name].is_distributed = True
main_block.vars[weight.name].is_distributed = True
- # set is_distributed for splited bias
- # if a linear layer is splited by row, each rank would hold a complete bias
+ # set is_distributed for split bias
+ # if a linear layer is split by row, each rank would hold a complete bias

if bias_attr is not False:
self.bias = self.create_parameter(shape=[num_cols], attr=bias_attr, dtype=self._dtype, is_bias=True)
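
To make the two comments in this file concrete, a plain NumPy illustration (toy shapes, no Paddle) of why a column-split linear also splits its bias while a row-split linear keeps the complete bias on every rank:

import numpy as np

x = np.random.rand(2, 4)                  # [batch, in_features]
W = np.random.rand(4, 6)                  # [in_features, out_features]
b = np.random.rand(6)

# Column parallel: each rank owns a slice of the output features,
# so the bias is split the same way as the weight columns.
W0, W1 = W[:, :3], W[:, 3:]
b0, b1 = b[:3], b[3:]
y_col = np.concatenate([x @ W0 + b0, x @ W1 + b1], axis=1)

# Row parallel: each rank owns a slice of the input features and produces a
# partial sum over all output features, so every rank holds the full bias,
# added once after the partial results are summed (the all-reduce).
x0, x1 = x[:, :2], x[:, 2:]
y_row = x0 @ W[:2, :] + x1 @ W[2:, :] + b

assert np.allclose(y_col, x @ W + b)
assert np.allclose(y_row, x @ W + b)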
14 changes: 7 additions & 7 deletions paddlenlp/quantization/checkpoint_quantization_utils.py
@@ -63,7 +63,7 @@ def group_wise_quant_dequant(
tp_degree (`int`):
Tensor parallel world size.
use_pd (`bool`):
- Whether to use paddle caculation. If False will use numpy.
+ Whether to use paddle calculation. If False will use numpy.
symmetry (`bool`):
Whether to use symmetry quantization.
"""
@@ -201,7 +201,7 @@ def cal_abs_min_max_channel(inputs, quant_axis=1):
inputs (`numpy.array`):
input tensor for quantization.
quant_axis (`int`):
- dimension where calulating inputs' abs min and max scales on.
+ dimension where calculating inputs' abs min and max scales on.
"""
eps = 1e-8
reduce_axis = tuple([i for i in range(len(inputs.shape)) if i != quant_axis])
@@ -227,7 +227,7 @@ def asymmetry_qdq_weight(
quant_bits (`int`):
Quantization bits.
quant_axis (`int`):
- Scales caculation axis.
+ Scales calculation axis.
mins (`paddle.Tensor`):
Min scales tensor in asymmetry quantization.
maxs (`paddle.Tensor`):
@@ -239,7 +239,7 @@ def asymmetry_qdq_weight(
tp_degree (`int`):
Model parallel world size.
use_pd (`bool`):
- Whether to use paddle caculation. If False will use numpy.
+ Whether to use paddle calculation. If False will use numpy.
"""

if mins is None:
@@ -288,7 +288,7 @@ def cal_abs_max_channel(inputs, quant_axis=1):
inputs (`numpy.array`):
input tensor for quantization.
quant_axis (`int`):
- dimension where calulating inputs' abs max scales on.
+ dimension where calculating inputs' abs max scales on.
"""
epsilon = 1e-8
reduce_axis = tuple([i for i in range(len(inputs.shape)) if i != quant_axis])
@@ -311,7 +311,7 @@ def qdq_weight(x, quant_bit=8, quant_axis=-1, scales=None, dequant=False, tp_ran
quant_bits (`int`):
Quantization bits.
quant_axis (`int`):
- Scales caculation axis.
+ Scales calculation axis.
scales (`paddle.Tensor`):
Abs max scales tensor in symmetry quantization.
dequant (`bool`):
@@ -321,7 +321,7 @@ def qdq_weight(x, quant_bit=8, quant_axis=-1, scales=None, dequant=False, tp_ran
tp_degree (`int`):
Model parallel world size.
use_pd (`bool`):
- Whether to use paddle caculation. If False will use numpy.
+ Whether to use paddle calculation. If False will use numpy.
"""

if scales is None:
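
Several of the docstrings above refer to the same per-channel abs-max scale computation; here is a small, independent NumPy sketch of that idea (not the functions in this file):

import numpy as np

def abs_max_channel(x, quant_axis=1, eps=1e-8):
    # abs-max over every axis except quant_axis, one scale per channel
    reduce_axis = tuple(i for i in range(x.ndim) if i != quant_axis)
    return np.maximum(np.abs(x).max(axis=reduce_axis), eps)

def qdq_symmetric(x, quant_bit=8, quant_axis=1):
    # symmetric per-channel quantize-dequantize round trip
    scales = abs_max_channel(x, quant_axis)
    bnt = (1 << (quant_bit - 1)) - 1               # 127 for int8
    shape = [1] * x.ndim
    shape[quant_axis] = -1
    s = scales.reshape(shape)
    q = np.clip(np.round(x / s * bnt), -bnt - 1, bnt)
    return q * s / bnt

w = np.random.randn(8, 4).astype("float32")
print(np.abs(w - qdq_symmetric(w)).max())          # small round-trip error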
6 changes: 3 additions & 3 deletions paddlenlp/quantization/quantization_linear.py
@@ -250,7 +250,7 @@ def __init__(
self.quant_dtype, self.quant_weight_bit = QuantMapping[self.weight_quantize_algo]
self.state = 0

- # PaddlePaddle dosen't support 4bit data type, one 8bit data represents two 4bit data.
+ # PaddlePaddle doesn't support 4bit data type, one 8bit data represents two 4bit data.
# paddle.nn.quant.weight_quantize will transpose in_features and out_features.
if self.weight_quantize_algo in [
"weight_only_int8",
@@ -405,7 +405,7 @@ def __init__(
if self.sequence_parallel and self.gather_output:
raise ValueError("Sequence parallel does not support gather_output")

- # PaddlePaddle dosen't support Int4 data type, one Int8 data represents two Int4 data.
+ # PaddlePaddle doesn't support Int4 data type, one Int8 data represents two Int4 data.
if self.weight_quantize_algo in [
"weight_only_int8",
"weight_only_int4",
@@ -542,7 +542,7 @@ def __init__(
if not self.input_is_parallel and self.sequence_parallel:
raise ValueError("Sequence parallel only support input_is_parallel.")

- # PaddlePaddle dosen't support Int4 data type, one Int8 data represents two Int4 data.
+ # PaddlePaddle doesn't support Int4 data type, one Int8 data represents two Int4 data.
# paddle.nn.quant.weight_quantize will transpose in_features and out_features.
if self.weight_quantize_algo in [
"weight_only_int8",
4 changes: 2 additions & 2 deletions paddlenlp/rl/models/ppo_model_utils.py
@@ -455,11 +455,11 @@ def forward(
vocab_size=self.config.vocab_size,
tensor_parallel_degree=self.config.tensor_parallel_degree,
tensor_parallel_output=self.config.tensor_parallel_output,
- pg_loss_coeff=self.pg_loss_coeff, # donot use this
+ pg_loss_coeff=self.pg_loss_coeff, # do not use this
clip_range_ratio=self.clip_range_ratio,
clip_range_ratio_low=self.clip_range_ratio_low,
clip_range_ratio_high=self.clip_range_ratio_high,
- entropy_coeff=self.entropy_coeff, # donot support this
+ entropy_coeff=self.entropy_coeff, # do not support this
clip_range_score=self.clip_range_score,
kl_loss_coeff=self.kl_loss_coeff,
loop_chunk_size=1024,
10 changes: 5 additions & 5 deletions paddlenlp/rl/trainer/rl_trainer.py
@@ -674,7 +674,7 @@ def get_train_step_vars(self, vars: Optional[Dict] = None) -> Dict:
if paddle.distributed.get_world_size() > 1:
assert self.model is not self.model_wrapped
self.train_step_vars = {
- # meaningless vars can pass from outter, dummy value is enough
+ # meaningless vars can pass from outer, dummy value is enough
"epoch": 0, # meaningless for step training
"step": 0, # meaningless for step training
"steps_in_epoch": 100000, # meaningless for step training
@@ -718,15 +718,15 @@ def full_training_step(self, **inputs) -> paddle.Tensor:
# trainer.train use `tr_loss` as loss var to accumulate loss.
# NOTE: `tr_loss` in trainer.train not only accumulate mean loss for
# steps in one `gradient_accumulation_steps`, but also accumulate for
- # one logging intervel which may contains more than one accumulated steps.
+ # one logging interval which may contains more than one accumulated steps.
# However, in RLTrainer we only want to use `tr_loss` to accumulate
# mean loss for steps in a `gradient_accumulation_steps` range. As for
- # logging intervel loss accumulation is not take into account here and
- # should be considered in outter.
+ # logging interval loss accumulation is not take into account here and
+ # should be considered in outer.
if loss_var is None: # the first step of current loss type
loss_var = paddle.to_tensor(0.0)
train_step_vars[loss_name] = loss_var
- elif self.is_accumulation_step: # begin a new accumulation step intervel
+ elif self.is_accumulation_step: # begin a new accumulation step interval
for name in self.loss_names:
train_step_vars[name] = paddle.to_tensor(0.0)
loss_var = train_step_vars[loss_name]
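
For readers new to this code path, a schematic pure-Python sketch (made-up numbers and names) of the accumulation rule these comments describe: the loss variable only averages within one gradient_accumulation_steps window and is reset when the next window begins, whereas trainer.train keeps accumulating across a whole logging interval.

gradient_accumulation_steps = 4
micro_losses = [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]   # hypothetical per-micro-batch losses

tr_loss = 0.0
for step, loss in enumerate(micro_losses):
    if step % gradient_accumulation_steps == 0:
        tr_loss = 0.0                                     # new accumulation window: reset
    tr_loss += loss / gradient_accumulation_steps
    if (step + 1) % gradient_accumulation_steps == 0:
        print(f"optimizer step, window-mean loss = {tr_loss:.3f}")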
2 changes: 1 addition & 1 deletion paddlenlp/taskflow/knowledge_mining.py
@@ -146,7 +146,7 @@
class WordTagTask(Task):
"""
This the NER(Named Entity Recognition) task that convert the raw text to entities. And the task with the `wordtag`
- model will link the more meesage with the entity.
+ model will link the more message with the entity.
Args:
task(string): The name of task.
model(string): The model name in the task.
2 changes: 1 addition & 1 deletion paddlenlp/taskflow/lexical_analysis.py
@@ -68,7 +68,7 @@ def load_vocab(dict_path):

class LacTask(Task):
"""
- Lexical analysis of Chinese task to segement the chinese sentence.
+ Lexical analysis of Chinese task to segment the chinese sentence.
Args:
task(string): The name of task.
model(string): The model name in the task.
2 changes: 1 addition & 1 deletion paddlenlp/taskflow/named_entity_recognition.py
@@ -74,7 +74,7 @@
class NERWordTagTask(WordTagTask):
"""
This the NER(Named Entity Recognition) task that convert the raw text to entities. And the task with the `wordtag`
- model will link the more meesage with the entity.
+ model will link the more message with the entity.
Args:
task(string): The name of task.
model(string): The model name in the task.
2 changes: 1 addition & 1 deletion paddlenlp/trainer/auto_trainer.py
@@ -109,7 +109,7 @@ def parallel_model(cls, model, training_args: AutoTrainingArguments):
model (paddle.nn.Layer): the model to be parallelized.
training_args (AutoTrainingArguments) : Training arguments which contain distributed information
Returns:
- the model after parallelize and config conatins distributed strategy
+ the model after parallelize and config contains distributed strategy
"""
if not training_args.use_intermediate_api:
return model, None
2 changes: 1 addition & 1 deletion paddlenlp/trainer/plugins/npu_plugin.py
@@ -68,7 +68,7 @@ def _flatten_param_grads(optimizer, params_grads):
g.persistable = True
if getattr(p, "need_clip", True) is False or getattr(p, "regularizer", None) is not None:
logger.warning(
f"flatten_param_grads=True will be discarded since paramter {p.name}'s need_clip is False or "
f"flatten_param_grads=True will be discarded since parameter {p.name}'s need_clip is False or "
"the regularizer is set."
)
return params_grads
2 changes: 1 addition & 1 deletion paddlenlp/trainer/plugins/timer.py
@@ -26,7 +26,7 @@


class _Timer:
"""Profile Timer for recording time taken by forward/ bacward/ reduce/ step."""
"""Profile Timer for recording time taken by forward/ backward/ reduce/ step."""

def __init__(self, name):
self.name = name
4 changes: 2 additions & 2 deletions paddlenlp/trainer/trainer_compress.py
@@ -700,10 +700,10 @@ def _quant_aware_training_dynamic(self, input_dir):
args.output_filename_prefix = "int8"

quant_config = {
- # It defauts to None, which means that no preprocessing is performed
+ # It defaults to None, which means that no preprocessing is performed
# on the active value."
"activation_preprocess_type": "PACT" if args.use_pact else None,
- # It defauts to None, which means that no preprocessing is performed
+ # It defaults to None, which means that no preprocessing is performed
# on weights.
"weight_preprocess_type": "PACT" if args.use_pact else None,
"weight_quantize_type": args.weight_quantize_type,
2 changes: 1 addition & 1 deletion paddlenlp/trainer/trainer_utils.py
@@ -84,7 +84,7 @@ def _get_distributed_seeds(seed: int = 1234, topo: Topology = None):
"""

# NOTE: For parameter init seed:
- # seed: dp/mp_undistributed_paramter/sharding is same; others is different
+ # seed: dp/mp_undistributed_parameter/sharding is same; others is different
# For compute seed(dropout):
# global seed: only mp group is same.
# local seed: all groups are different
2 changes: 1 addition & 1 deletion paddlenlp/trainer/utils/reshard/pp_reshard.py
@@ -220,7 +220,7 @@ def layers(self):
class PipeLineStage:
def __init__(self):
self._rename_mgr = LayerReNamingManager()
- # map segement start index to segment
+ # map segment start index to segment
self._segments = OrderedDict()
self._layer_to_segment = OrderedDict()
self._param_to_tname = OrderedDict()
4 changes: 2 additions & 2 deletions paddlenlp/trainer/utils/sharding_io.py
@@ -97,7 +97,7 @@ def filter_sharded_params(state_dict, optimizer, sharding_group):
return filtered_state_dict


- def exclude_paramters_in_state_dict(
+ def exclude_parameters_in_state_dict(
model_state_dict, param_names_in_master_weights, sharding_group, should_save_sharding_stage1_model=True
):
assert sharding_group is not None
@@ -399,7 +399,7 @@ def manipulate_state_dict_and_config(self, model_to_save, merge_tensor_parallel=
optimzier_state_dict = self.optimizer.state_dict()
assert "master_weights" in optimzier_state_dict
param_names_in_master_weights = list(optimzier_state_dict["master_weights"].keys())
- state_dict = exclude_paramters_in_state_dict(
+ state_dict = exclude_parameters_in_state_dict(
state_dict, param_names_in_master_weights, self.sharding_group
)
logger.info(
2 changes: 1 addition & 1 deletion paddlenlp/trainer/utils/zero_cost_checkpoint.py
@@ -539,7 +539,7 @@ def update_zcc_workers(self, new_version, dynamic_objecs, static_object, global_
self.global_step = global_step
assert self.current_worker is None, "[ZCC manager] current_worker must be None"
task = (ZCCTaskType.UPDATE, [self.cache_version, dynamic_objecs, static_object])
logger.info(f"[ZCC manager] updating zcc workers, verison: {self.cache_version}")
logger.info(f"[ZCC manager] updating zcc workers, version: {self.cache_version}")
for worker in self.workers:
worker.task_queue.put(task)
logger.info("[ZCC manager] waiting workers update done")
4 changes: 2 additions & 2 deletions paddlenlp/transformers/auto/modeling.py
@@ -857,7 +857,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
model_class = getattr(import_class, model_class_name)

# It may return a new model class, like LlamaForCausalLMAvxInferenceModel
- # Some model have different inference model class in deifferent execution divice
+ # Some model have different inference model class in different execution device
# LlamaForCausalLMAvxInferenceModel is used in cpu execution device with avx instruction set
model_class = model_class.confirm_inference_model(predictor_args=predictor_args)

@@ -931,7 +931,7 @@ def from_config(cls, config, *model_args, **kwargs):
model_class = getattr(import_class, model_class_name)

# It may return a new model class, like LlamaForCausalLMAvxInferenceModel
- # Some model have different inference model class in deifferent execution divice
+ # Some model have different inference model class in different execution device
# LlamaForCausalLMAvxInferenceModel is used in cpu execution device with avx instruction set
model_class = model_class.confirm_inference_model(predictor_args=predictor_args)
