Merged
4 changes: 2 additions & 2 deletions paddlenlp/data/vocab.py
@@ -27,8 +27,8 @@ class Vocab(object):
store/load functions.

Args:
- counter (collections.Counter, optional): A Counter intance describes
- the tokens and their frequencies. Its keys will be indexed accroding
+ counter (collections.Counter, optional): A Counter instance describes
+ the tokens and their frequencies. Its keys will be indexed according
to the order of frequency sorting to construct mapping relationship.
If None, `token_to_idx` must be provided as the mapping relationship.
Default: None.
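
As a side note for readers of this hunk, a minimal usage sketch of the documented behaviour (toy data; the to_indices/to_tokens method names are recalled from PaddleNLP's Vocab API and worth double-checking):

from collections import Counter
from paddlenlp.data import Vocab

# Keys of the Counter are indexed according to frequency order, as the docstring says.
counter = Counter("the cat sat on the mat and the dog sat".split())
vocab = Vocab(counter=counter, unk_token="[UNK]")
print(vocab.to_indices(["the", "cat"]))  # token -> index
print(vocab.to_tokens(1))                # index -> token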
2 changes: 1 addition & 1 deletion paddlenlp/datasets/dataset.py
@@ -570,7 +570,7 @@ def remove_if_exit(filepath):
datasets = DatasetTuple(splits)
parallel_env = dist.ParallelEnv()
unique_endpoints = _get_unique_endpoints(parallel_env.trainer_endpoints[:])
- # move register hook to first and register togather
+ # move register hook to first and register together
lock_files = []
for split in splits:
lock_file = os.path.join(DATA_HOME, self.__class__.__name__)
2 changes: 1 addition & 1 deletion paddlenlp/experimental/autonlp/README_en.md
@@ -6,7 +6,7 @@

**The AutoNLP APIs are subjective to significant changes until formal release**

- **AutoNLP** is an experimental project by PaddleNLP to democratize NLP for everyone. Delivering a successful NLP project is not easy, as it requires deep domain knowledge. Time after time, we have seen people struggle to make NLP work on their dataset, for their projects, which is why we are building **AutoNLP**. Compared with the traditional AutoML approach of massive paid compute for State-of-the-Art model performance, we have a different philosphy:
+ **AutoNLP** is an experimental project by PaddleNLP to democratize NLP for everyone. Delivering a successful NLP project is not easy, as it requires deep domain knowledge. Time after time, we have seen people struggle to make NLP work on their dataset, for their projects, which is why we are building **AutoNLP**. Compared with the traditional AutoML approach of massive paid compute for State-of-the-Art model performance, we have a different philosophy:


1. Instead of training State-of-the-Art models on huge datasets running on huge clusters, our goal is to deliver **decent models under limited compute**. We assume our users have a few GPUs at most and want to get decent models under 8 hours on their own in-house datasets. Note that you can get this level of compute for FREE on [Baidu AI Studio](https://aistudio.baidu.com/aistudio).
2 changes: 1 addition & 1 deletion paddlenlp/experimental/faster_tokenizer.py
@@ -46,7 +46,7 @@ def to_vocab_buffer(vocab_dict, name):
NOTICE: The value will be held in the cpu place.

Args:
- vocab_dict(dict): The value will be setted to the tensor.
+ vocab_dict(dict): The value will be set to the tensor.
The key is token and the value is the token index.
name(string): The name of the tensor.
"""
2 changes: 1 addition & 1 deletion paddlenlp/experimental/transformers/qwen/modeling.py
@@ -499,7 +499,7 @@ def forward(
hidden_states = outputs[0]

# if labels is None,means we need full output, instead of tensor_parallel_output
- # tensor_parallel_output is togather with ParallelCrossEntropy
+ # tensor_parallel_output is together with ParallelCrossEntropy
tensor_parallel_output = (
self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1
)
2 changes: 1 addition & 1 deletion paddlenlp/experimental/transformers/qwen2/modeling.py
@@ -1210,7 +1210,7 @@ def forward(
hidden_states = outputs[0]

# if labels is None,means we need full output, instead of tensor_parallel_output
- # tensor_parallel_output is togather with ParallelCrossEntropy
+ # tensor_parallel_output is together with ParallelCrossEntropy
tensor_parallel_output = (
self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1
)
2 changes: 1 addition & 1 deletion paddlenlp/generation/logits_process.py
@@ -439,7 +439,7 @@ def __init__(self, sequence_bias: Dict[Tuple[int], float]):
self._validate_arguments()

# Bias variables that will be populated on the first call (for retrocompatibility purposes, the vocabulary size
- # is infered in the first usage, which inhibits initializing here)
+ # is inferred in the first usage, which inhibits initializing here)
self.length_1_bias = None
self.prepared_bias_variables = False

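For context on the comment fixed here, a framework-free sketch (hypothetical class, NumPy only) of the lazy-initialization pattern it describes: the bias row can only be materialized once the first scores tensor reveals the vocabulary size.

import numpy as np

class LazySequenceBias:
    # Toy stand-in for the processor above, not the PaddleNLP implementation.
    def __init__(self, sequence_bias):
        self.sequence_bias = sequence_bias   # {(token_id,): bias_value}
        self.length_1_bias = None            # populated on the first call
        self.prepared_bias_variables = False

    def __call__(self, scores):
        if not self.prepared_bias_variables:
            vocab_size = scores.shape[-1]    # inferred at first usage, not in __init__
            self.length_1_bias = np.zeros(vocab_size, dtype=scores.dtype)
            for (token_id,), bias in self.sequence_bias.items():
                self.length_1_bias[token_id] = bias
            self.prepared_bias_variables = True
        return scores + self.length_1_bias
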
2 changes: 1 addition & 1 deletion paddlenlp/layers/crf.py
@@ -248,7 +248,7 @@ def __init__(self, crf):
self.crf = crf
if isinstance(crf, paddle.Tensor):
raise ValueError(
"From paddlenlp >= 2.0.0b4, the first param of LinearChainCrfLoss shoule be a LinearChainCrf object. For input parameter 'crf.transitions', you can remove '.transitions' to 'crf'"
"From paddlenlp >= 2.0.0b4, the first param of LinearChainCrfLoss should be a LinearChainCrf object. For input parameter 'crf.transitions', you can remove '.transitions' to 'crf'"
)

def forward(self, inputs, lengths, labels, old_version_labels=None):
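
A hedged usage sketch of what the corrected message asks for (constructor arguments recalled from memory and may differ between versions): pass the LinearChainCrf layer itself, not its transitions tensor.

from paddlenlp.layers import LinearChainCrf, LinearChainCrfLoss

num_labels = 7                                   # hypothetical label count
crf = LinearChainCrf(num_labels)
loss_fn = LinearChainCrfLoss(crf)                # correct: the layer object
# loss_fn = LinearChainCrfLoss(crf.transitions)  # pre-2.0.0b4 style, now rejected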
8 changes: 4 additions & 4 deletions paddlenlp/ops/distributed/parallel.py
@@ -191,8 +191,8 @@ def __init__(self, size, num_partitions=1, gather_out=True, param_attr=None, bia
main_block = paddle.static.default_main_program().global_block()
startup_block.vars[weight.name].is_distributed = True
main_block.vars[weight.name].is_distributed = True
- # set is_distributed for splited bias
- # if a linear layer is splited by col, the bias would also be split into each rank as its weight
+ # set is_distributed for split bias
+ # if a linear layer is split by col, the bias would also be split into each rank as its weight
if self.linear._bias_attr:
startup_block.vars[self.linear.bias.name].is_distributed = True
main_block.vars[self.linear.bias.name].is_distributed = True
@@ -285,8 +285,8 @@ def __init__(self, size, num_partitions=1, input_is_parallel=False, param_attr=N
main_block = paddle.static.default_main_program().global_block()
startup_block.vars[weight.name].is_distributed = True
main_block.vars[weight.name].is_distributed = True
- # set is_distributed for splited bias
- # if a linear layer is splited by row, each rank would hold a complete bias
+ # set is_distributed for split bias
+ # if a linear layer is split by row, each rank would hold a complete bias

if bias_attr is not False:
self.bias = self.create_parameter(shape=[num_cols], attr=bias_attr, dtype=self._dtype, is_bias=True)
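
To make the two comments in this file concrete, a plain NumPy illustration (toy shapes, no Paddle) of why a column-split linear also splits its bias while a row-split linear keeps the complete bias on every rank:

import numpy as np

x = np.random.rand(2, 4)                  # [batch, in_features]
W = np.random.rand(4, 6)                  # [in_features, out_features]
b = np.random.rand(6)

# Column parallel: each rank owns a slice of the output features,
# so the bias is split the same way as the weight columns.
W0, W1 = W[:, :3], W[:, 3:]
b0, b1 = b[:3], b[3:]
y_col = np.concatenate([x @ W0 + b0, x @ W1 + b1], axis=1)

# Row parallel: each rank owns a slice of the input features and produces a
# partial sum over all output features, so every rank holds the full bias,
# added once after the partial results are summed (the all-reduce).
x0, x1 = x[:, :2], x[:, 2:]
y_row = x0 @ W[:2, :] + x1 @ W[2:, :] + b

assert np.allclose(y_col, x @ W + b)
assert np.allclose(y_row, x @ W + b)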
14 changes: 7 additions & 7 deletions paddlenlp/quantization/checkpoint_quantization_utils.py
@@ -63,7 +63,7 @@ def group_wise_quant_dequant(
tp_degree (`int`):
Tensor parallel world size.
use_pd (`bool`):
- Whether to use paddle caculation. If False will use numpy.
+ Whether to use paddle calculation. If False will use numpy.
symmetry (`bool`):
Whether to use symmetry quantization.
"""
@@ -201,7 +201,7 @@ def cal_abs_min_max_channel(inputs, quant_axis=1):
inputs (`numpy.array`):
input tensor for quantization.
quant_axis (`int`):
- dimension where calulating inputs' abs min and max scales on.
+ dimension where calculating inputs' abs min and max scales on.
"""
eps = 1e-8
reduce_axis = tuple([i for i in range(len(inputs.shape)) if i != quant_axis])
@@ -227,7 +227,7 @@ def asymmetry_qdq_weight(
quant_bits (`int`):
Quantization bits.
quant_axis (`int`):
- Scales caculation axis.
+ Scales calculation axis.
mins (`paddle.Tensor`):
Min scales tensor in asymmetry quantization.
maxs (`paddle.Tensor`):
@@ -239,7 +239,7 @@ def asymmetry_qdq_weight(
tp_degree (`int`):
Model parallel world size.
use_pd (`bool`):
- Whether to use paddle caculation. If False will use numpy.
+ Whether to use paddle calculation. If False will use numpy.
"""

if mins is None:
@@ -288,7 +288,7 @@ def cal_abs_max_channel(inputs, quant_axis=1):
inputs (`numpy.array`):
input tensor for quantization.
quant_axis (`int`):
- dimension where calulating inputs' abs max scales on.
+ dimension where calculating inputs' abs max scales on.
"""
epsilon = 1e-8
reduce_axis = tuple([i for i in range(len(inputs.shape)) if i != quant_axis])
@@ -311,7 +311,7 @@ def qdq_weight(x, quant_bit=8, quant_axis=-1, scales=None, dequant=False, tp_ran
quant_bits (`int`):
Quantization bits.
quant_axis (`int`):
- Scales caculation axis.
+ Scales calculation axis.
scales (`paddle.Tensor`):
Abs max scales tensor in symmetry quantization.
dequant (`bool`):
@@ -321,7 +321,7 @@ def qdq_weight(x, quant_bit=8, quant_axis=-1, scales=None, dequant=False, tp_ran
tp_degree (`int`):
Model parallel world size.
use_pd (`bool`):
- Whether to use paddle caculation. If False will use numpy.
+ Whether to use paddle calculation. If False will use numpy.
"""

if scales is None:
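
Several of the docstrings above refer to the same per-channel abs-max scale computation; here is a small, independent NumPy sketch of that idea (not the functions in this file):

import numpy as np

def abs_max_channel(x, quant_axis=1, eps=1e-8):
    # abs-max over every axis except quant_axis, one scale per channel
    reduce_axis = tuple(i for i in range(x.ndim) if i != quant_axis)
    return np.maximum(np.abs(x).max(axis=reduce_axis), eps)

def qdq_symmetric(x, quant_bit=8, quant_axis=1):
    # symmetric per-channel quantize-dequantize round trip
    scales = abs_max_channel(x, quant_axis)
    bnt = (1 << (quant_bit - 1)) - 1               # 127 for int8
    shape = [1] * x.ndim
    shape[quant_axis] = -1
    s = scales.reshape(shape)
    q = np.clip(np.round(x / s * bnt), -bnt - 1, bnt)
    return q * s / bnt

w = np.random.randn(8, 4).astype("float32")
print(np.abs(w - qdq_symmetric(w)).max())          # small round-trip error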
6 changes: 3 additions & 3 deletions paddlenlp/quantization/quantization_linear.py
@@ -250,7 +250,7 @@ def __init__(
self.quant_dtype, self.quant_weight_bit = QuantMapping[self.weight_quantize_algo]
self.state = 0

- # PaddlePaddle dosen't support 4bit data type, one 8bit data represents two 4bit data.
+ # PaddlePaddle doesn't support 4bit data type, one 8bit data represents two 4bit data.
# paddle.nn.quant.weight_quantize will transpose in_features and out_features.
if self.weight_quantize_algo in [
"weight_only_int8",
@@ -405,7 +405,7 @@ def __init__(
if self.sequence_parallel and self.gather_output:
raise ValueError("Sequence parallel does not support gather_output")

- # PaddlePaddle dosen't support Int4 data type, one Int8 data represents two Int4 data.
+ # PaddlePaddle doesn't support Int4 data type, one Int8 data represents two Int4 data.
if self.weight_quantize_algo in [
"weight_only_int8",
"weight_only_int4",
@@ -542,7 +542,7 @@ def __init__(
if not self.input_is_parallel and self.sequence_parallel:
raise ValueError("Sequence parallel only support input_is_parallel.")

- # PaddlePaddle dosen't support Int4 data type, one Int8 data represents two Int4 data.
+ # PaddlePaddle doesn't support Int4 data type, one Int8 data represents two Int4 data.
# paddle.nn.quant.weight_quantize will transpose in_features and out_features.
if self.weight_quantize_algo in [
"weight_only_int8",
4 changes: 2 additions & 2 deletions paddlenlp/rl/models/ppo_model_utils.py
@@ -455,11 +455,11 @@ def forward(
vocab_size=self.config.vocab_size,
tensor_parallel_degree=self.config.tensor_parallel_degree,
tensor_parallel_output=self.config.tensor_parallel_output,
- pg_loss_coeff=self.pg_loss_coeff, # donot use this
+ pg_loss_coeff=self.pg_loss_coeff, # do not use this
clip_range_ratio=self.clip_range_ratio,
clip_range_ratio_low=self.clip_range_ratio_low,
clip_range_ratio_high=self.clip_range_ratio_high,
- entropy_coeff=self.entropy_coeff, # donot support this
+ entropy_coeff=self.entropy_coeff, # do not support this
clip_range_score=self.clip_range_score,
kl_loss_coeff=self.kl_loss_coeff,
loop_chunk_size=1024,
10 changes: 5 additions & 5 deletions paddlenlp/rl/trainer/rl_trainer.py
@@ -674,7 +674,7 @@ def get_train_step_vars(self, vars: Optional[Dict] = None) -> Dict:
if paddle.distributed.get_world_size() > 1:
assert self.model is not self.model_wrapped
self.train_step_vars = {
- # meaningless vars can pass from outter, dummy value is enough
+ # meaningless vars can pass from outer, dummy value is enough
"epoch": 0, # meaningless for step training
"step": 0, # meaningless for step training
"steps_in_epoch": 100000, # meaningless for step training
@@ -718,15 +718,15 @@ def full_training_step(self, **inputs) -> paddle.Tensor:
# trainer.train use `tr_loss` as loss var to accumulate loss.
# NOTE: `tr_loss` in trainer.train not only accumulate mean loss for
# steps in one `gradient_accumulation_steps`, but also accumulate for
- # one logging intervel which may contains more than one accumulated steps.
+ # one logging interval which may contains more than one accumulated steps.
# However, in RLTrainer we only want to use `tr_loss` to accumulate
# mean loss for steps in a `gradient_accumulation_steps` range. As for
- # logging intervel loss accumulation is not take into account here and
- # should be considered in outter.
+ # logging interval loss accumulation is not take into account here and
+ # should be considered in outer.
if loss_var is None: # the first step of current loss type
loss_var = paddle.to_tensor(0.0)
train_step_vars[loss_name] = loss_var
- elif self.is_accumulation_step: # begin a new accumulation step intervel
+ elif self.is_accumulation_step: # begin a new accumulation step interval
for name in self.loss_names:
train_step_vars[name] = paddle.to_tensor(0.0)
loss_var = train_step_vars[loss_name]
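
For readers new to this code path, a schematic pure-Python sketch (made-up numbers and names) of the accumulation rule these comments describe: the loss variable only averages within one gradient_accumulation_steps window and is reset when the next window begins, whereas trainer.train keeps accumulating across a whole logging interval.

gradient_accumulation_steps = 4
micro_losses = [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]   # hypothetical per-micro-batch losses

tr_loss = 0.0
for step, loss in enumerate(micro_losses):
    if step % gradient_accumulation_steps == 0:
        tr_loss = 0.0                                     # new accumulation window: reset
    tr_loss += loss / gradient_accumulation_steps
    if (step + 1) % gradient_accumulation_steps == 0:
        print(f"optimizer step, window-mean loss = {tr_loss:.3f}")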
2 changes: 1 addition & 1 deletion paddlenlp/taskflow/knowledge_mining.py
@@ -146,7 +146,7 @@
class WordTagTask(Task):
"""
This the NER(Named Entity Recognition) task that convert the raw text to entities. And the task with the `wordtag`
- model will link the more meesage with the entity.
+ model will link the more message with the entity.
Args:
task(string): The name of task.
model(string): The model name in the task.
2 changes: 1 addition & 1 deletion paddlenlp/taskflow/lexical_analysis.py
@@ -68,7 +68,7 @@ def load_vocab(dict_path):

class LacTask(Task):
"""
- Lexical analysis of Chinese task to segement the chinese sentence.
+ Lexical analysis of Chinese task to segment the chinese sentence.
Args:
task(string): The name of task.
model(string): The model name in the task.
2 changes: 1 addition & 1 deletion paddlenlp/taskflow/named_entity_recognition.py
@@ -74,7 +74,7 @@
class NERWordTagTask(WordTagTask):
"""
This the NER(Named Entity Recognition) task that convert the raw text to entities. And the task with the `wordtag`
- model will link the more meesage with the entity.
+ model will link the more message with the entity.
Args:
task(string): The name of task.
model(string): The model name in the task.
2 changes: 1 addition & 1 deletion paddlenlp/trainer/auto_trainer.py
@@ -109,7 +109,7 @@ def parallel_model(cls, model, training_args: AutoTrainingArguments):
model (paddle.nn.Layer): the model to be parallelized.
training_args (AutoTrainingArguments) : Training arguments which contain distributed information
Returns:
- the model after parallelize and config conatins distributed strategy
+ the model after parallelize and config contains distributed strategy
"""
if not training_args.use_intermediate_api:
return model, None
2 changes: 1 addition & 1 deletion paddlenlp/trainer/plugins/npu_plugin.py
@@ -68,7 +68,7 @@ def _flatten_param_grads(optimizer, params_grads):
g.persistable = True
if getattr(p, "need_clip", True) is False or getattr(p, "regularizer", None) is not None:
logger.warning(
f"flatten_param_grads=True will be discarded since paramter {p.name}'s need_clip is False or "
f"flatten_param_grads=True will be discarded since parameter {p.name}'s need_clip is False or "
"the regularizer is set."
)
return params_grads
2 changes: 1 addition & 1 deletion paddlenlp/trainer/plugins/timer.py
@@ -26,7 +26,7 @@


class _Timer:
"""Profile Timer for recording time taken by forward/ bacward/ reduce/ step."""
"""Profile Timer for recording time taken by forward/ backward/ reduce/ step."""

def __init__(self, name):
self.name = name
4 changes: 2 additions & 2 deletions paddlenlp/trainer/trainer_compress.py
@@ -700,10 +700,10 @@ def _quant_aware_training_dynamic(self, input_dir):
args.output_filename_prefix = "int8"

quant_config = {
- # It defauts to None, which means that no preprocessing is performed
+ # It defaults to None, which means that no preprocessing is performed
# on the active value."
"activation_preprocess_type": "PACT" if args.use_pact else None,
- # It defauts to None, which means that no preprocessing is performed
+ # It defaults to None, which means that no preprocessing is performed
# on weights.
"weight_preprocess_type": "PACT" if args.use_pact else None,
"weight_quantize_type": args.weight_quantize_type,
2 changes: 1 addition & 1 deletion paddlenlp/trainer/trainer_utils.py
@@ -84,7 +84,7 @@ def _get_distributed_seeds(seed: int = 1234, topo: Topology = None):
"""

# NOTE: For parameter init seed:
- # seed: dp/mp_undistributed_paramter/sharding is same; others is different
+ # seed: dp/mp_undistributed_parameter/sharding is same; others is different
# For compute seed(dropout):
# global seed: only mp group is same.
# local seed: all groups are different
2 changes: 1 addition & 1 deletion paddlenlp/trainer/utils/reshard/pp_reshard.py
@@ -220,7 +220,7 @@ def layers(self):
class PipeLineStage:
def __init__(self):
self._rename_mgr = LayerReNamingManager()
- # map segement start index to segment
+ # map segment start index to segment
self._segments = OrderedDict()
self._layer_to_segment = OrderedDict()
self._param_to_tname = OrderedDict()
4 changes: 2 additions & 2 deletions paddlenlp/trainer/utils/sharding_io.py
@@ -97,7 +97,7 @@ def filter_sharded_params(state_dict, optimizer, sharding_group):
return filtered_state_dict


- def exclude_paramters_in_state_dict(
+ def exclude_parameters_in_state_dict(
model_state_dict, param_names_in_master_weights, sharding_group, should_save_sharding_stage1_model=True
):
assert sharding_group is not None
@@ -399,7 +399,7 @@ def manipulate_state_dict_and_config(self, model_to_save, merge_tensor_parallel=
optimzier_state_dict = self.optimizer.state_dict()
assert "master_weights" in optimzier_state_dict
param_names_in_master_weights = list(optimzier_state_dict["master_weights"].keys())
- state_dict = exclude_paramters_in_state_dict(
+ state_dict = exclude_parameters_in_state_dict(
state_dict, param_names_in_master_weights, self.sharding_group
)
logger.info(
2 changes: 1 addition & 1 deletion paddlenlp/trainer/utils/zero_cost_checkpoint.py
@@ -539,7 +539,7 @@ def update_zcc_workers(self, new_version, dynamic_objecs, static_object, global_
self.global_step = global_step
assert self.current_worker is None, "[ZCC manager] current_worker must be None"
task = (ZCCTaskType.UPDATE, [self.cache_version, dynamic_objecs, static_object])
logger.info(f"[ZCC manager] updating zcc workers, verison: {self.cache_version}")
logger.info(f"[ZCC manager] updating zcc workers, version: {self.cache_version}")
for worker in self.workers:
worker.task_queue.put(task)
logger.info("[ZCC manager] waiting workers update done")
4 changes: 2 additions & 2 deletions paddlenlp/transformers/auto/modeling.py
@@ -857,7 +857,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
model_class = getattr(import_class, model_class_name)

# It may return a new model class, like LlamaForCausalLMAvxInferenceModel
- # Some model have different inference model class in deifferent execution divice
+ # Some model have different inference model class in different execution device
# LlamaForCausalLMAvxInferenceModel is used in cpu execution device with avx instruction set
model_class = model_class.confirm_inference_model(predictor_args=predictor_args)

@@ -931,7 +931,7 @@ def from_config(cls, config, *model_args, **kwargs):
model_class = getattr(import_class, model_class_name)

# It may return a new model class, like LlamaForCausalLMAvxInferenceModel
- # Some model have different inference model class in deifferent execution divice
+ # Some model have different inference model class in different execution device
# LlamaForCausalLMAvxInferenceModel is used in cpu execution device with avx instruction set
model_class = model_class.confirm_inference_model(predictor_args=predictor_args)
