Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -276,7 +276,7 @@ msgid "word of current node."
msgstr ""

#: of paddlenlp.taskflow.utils.BurkhardKellerTree:1
msgid "Implementataion of BK-Tree"
msgid "Implementation of BK-Tree"
msgstr ""

#: of paddlenlp.taskflow.utils.BurkhardKellerTree.add:1
@@ -300,7 +300,7 @@ msgid "similar words."
msgstr ""

#: of paddlenlp.taskflow.utils.TriedTree:1
msgid "Implementataion of TriedTree"
msgid "Implementation of TriedTree"
msgstr ""

#: of paddlenlp.taskflow.utils.TriedTree.add_word:1
Original file line number Diff line number Diff line change
@@ -50,7 +50,7 @@ msgid ""
msgstr ""

#: of paddlenlp.transformers.squeezebert.tokenizer.SqueezeBertTokenizer:11
msgid "The special token for unkown words. Default: \"[UNK]\"."
msgid "The special token for unknown words. Default: \"[UNK]\"."
msgstr ""

#: of paddlenlp.transformers.squeezebert.tokenizer.SqueezeBertTokenizer:13
8 changes: 4 additions & 4 deletions llm/experimental/ernie-3.5-se/modeling.py
Original file line number Diff line number Diff line change
@@ -1380,12 +1380,12 @@ def forward(
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

def progressive_seq(x, y):
globel_step = int(os.getenv("TRAINER_GLOBAL_STEP", "0"))
if globel_step < 500:
global_step = int(os.getenv("TRAINER_GLOBAL_STEP", "0"))
if global_step < 500:
return x[:, :512], y[:, :512]
if globel_step < 1000:
if global_step < 1000:
return x[:, :1024], y[:, :1024]
if globel_step < 1500:
if global_step < 1500:
return x[:, :2048], y[:, :2048]
return x, y

2 changes: 1 addition & 1 deletion paddlenlp/data/blendable_dataset.py
Original file line number Diff line number Diff line change
@@ -43,7 +43,7 @@ def __init__(self, datasets, weights, size, share_folder, *, data_cache_path=Non
assert sum_weights > 0.0
weights /= sum_weights

# Build indicies.
# Build indices.
def _build_indices():
start_time = time.time()

2 changes: 1 addition & 1 deletion paddlenlp/data/causal_dataset.py
Original file line number Diff line number Diff line change
@@ -677,7 +677,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch):
doc_offset += remaining_seq_length + doc_length - 1
remaining_seq_length = 0
else:
# Otherwise, start from the begining of the next document.
# Otherwise, start from the beginning of the next document.
doc_idx_index += 1
doc_offset = 0
# Record the sequence.
4 changes: 2 additions & 2 deletions paddlenlp/data/indexed_dataset.py
Original file line number Diff line number Diff line change
@@ -55,7 +55,7 @@

def make_dataset(path, impl, skip_warmup=False):
if CompatibleIndexedDataset.exists(path):
print("Using old dataet (.npy & .npz)")
print("Using old dataset (.npy & .npz)")

Check warning on line 58 in paddlenlp/data/indexed_dataset.py

Codecov / codecov/patch

paddlenlp/data/indexed_dataset.py#L58

Added line #L58 was not covered by tests
return CompatibleIndexedDataset(path)
elif not IndexedDataset.exists(path):
print(f"Dataset does not exist: {path}")
@@ -903,7 +903,7 @@

self._path = path

# All documment ids, extend as 1-D array.
# All document ids, extend as 1-D array.
self._token_ids = np.load(path + "_ids.npy", mmap_mode="r", allow_pickle=True)
process_data = np.load(path + "_idx.npz")
self._sizes = process_data["lens"]
4 changes: 2 additions & 2 deletions paddlenlp/data/tokenizer.py
Original file line number Diff line number Diff line change
@@ -58,7 +58,7 @@ def cut(self, sentence, cut_all=False, use_hmm=True):
The method used to cut the text to tokens.

Args:
sentence(str): The text that needs to be cuted.
sentence(str): The text that needs to be cut.
cut_all(bool, optional): Whether to use the full mode. If True,
using full mode that gets all the possible words from the
sentence, which is fast but not accurate. If False, using
@@ -97,7 +97,7 @@ def encode(self, sentence, cut_all=False, use_hmm=True):
ids using `vocab`.

Args:
sentence(str): The text that needs to be cuted.
sentence(str): The text that needs to be cut.
cut_all(bool, optional): Whether to use the full mode. If True,
using full mode that gets all the possible words from the
sentence, which is fast but not accurate. If False, using
10 changes: 5 additions & 5 deletions paddlenlp/data/vocab.py
Original file line number Diff line number Diff line change
@@ -40,7 +40,7 @@ class Vocab(object):
between tokens and indices to be used. If provided, adjust the tokens
and indices mapping according to it. If None, counter must be provided.
Default: None.
unk_token (str, optional): Special token for unknow token. If no need,
unk_token (str, optional): Special token for unknown token. If no need,
it also could be None. Default: None.
pad_token (str, optional): Special token for padding token. If no need,
it also could be None. Default: None.
@@ -214,7 +214,7 @@ def to_tokens(self, indices):
for idx in indices:
if not isinstance(idx, (int, np.integer)):
warnings.warn(
"The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly transfered to `int`. "
"The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly transferred to `int`. "
)
idx = int(idx)

@@ -382,7 +382,7 @@ def from_dict(cls, token_to_idx, unk_token=None, pad_token=None, bos_token=None,
Args:
token_to_idx (dict): A dict describes the mapping relationship between
tokens and indices.
unk_token (str, optional): The special token for unknow token. If
unk_token (str, optional): The special token for unknown token. If
no need, it also could be None. Default: None.
pad_token (str, optional): The special token for padding token. If
no need, it also could be None. Default: None.
@@ -440,7 +440,7 @@ def build_vocab(
**kwargs
):
"""
Builds the :class:`Vocab` accoring to given iterator and other
Builds the :class:`Vocab` according to given iterator and other
information. Firstly, iterate over the `iterator` to construct a
:class:`collections.Counter` and used to init the as :class:`Vocab`.

@@ -455,7 +455,7 @@ def build_vocab(
relationship between tokens and indices to be used. If provided,
adjust the tokens and indices mapping according to it. If None,
counter must be provided. Default: None.
unk_token (str, optional): The special token for unknow token
unk_token (str, optional): The special token for unknown token
'<unk>'. If no need, it also could be None. Default: None.
pad_token (str, optional): The special token for padding token
'<pad>'. If no need, it also could be None. Default: None.
4 changes: 2 additions & 2 deletions paddlenlp/datasets/dataset.py
Original file line number Diff line number Diff line change
@@ -448,7 +448,7 @@
num_samples += 1
else:
if inspect.isgenerator(self.data):
warnings.warn("Reciving generator as data source, data can only be iterated once")
warnings.warn("Receiving generator as data source, data can only be iterated once")

Check warning on line 451 in paddlenlp/datasets/dataset.py

Codecov / codecov/patch

paddlenlp/datasets/dataset.py#L451

Added line #L451 was not covered by tests
for example in self.data:
if (not self._filter_pipline or self._filter(self._filter_pipline)) and self._shard_filter(
num_samples=num_samples
@@ -580,7 +580,7 @@
lock_files.append(lock_file)
# Must register to all procs to make the lock file can be removed
# when any proc breaks. Otherwise, the single registered proc may
# not receive proper singal send by the parent proc to exit.
# not receive proper signal send by the parent proc to exit.
atexit.register(lambda: remove_if_exit(lock_files))
for split in splits:
filename = self._get_data(split)
6 changes: 3 additions & 3 deletions paddlenlp/datasets/hf_datasets/docvqa_zh.py
Original file line number Diff line number Diff line change
@@ -16,17 +16,17 @@

# Lint as: python3

import os
import json
import hashlib
import json
import os

Check warning on line 21 in paddlenlp/datasets/hf_datasets/docvqa_zh.py

Codecov / codecov/patch

paddlenlp/datasets/hf_datasets/docvqa_zh.py#L20-L21

Added lines #L20 - L21 were not covered by tests

import datasets

logger = datasets.logging.get_logger(__name__)

_DESCRIPTION = """\
The training set from the competition of Insurance DocVQA organized by China Pacific Insurance. \
The submission is now closed so we split original dataset into three parts for model evluation. \
The submission is now closed so we split original dataset into three parts for model evaluation. \
There are 4,187 training images, 500 validation images, and 500 test images.
"""

2 changes: 1 addition & 1 deletion paddlenlp/datasets/rlhf_datasets/protocol.py
Original file line number Diff line number Diff line change
@@ -393,7 +393,7 @@ def pop(self, batch_keys=None, non_tensor_batch_keys=None, meta_info_keys=None)
meta_info_keys (list, optional): a list of keys indicating the meta info to pop

Returns:
DataProto: the DataProto with the poped batch_keys and meta_info_keys
DataProto: the DataProto with the popped batch_keys and meta_info_keys
"""
assert batch_keys is not None
if meta_info_keys is None:
2 changes: 1 addition & 1 deletion paddlenlp/datasets/rlhf_datasets/rl_dataset.py
Original file line number Diff line number Diff line change
@@ -40,7 +40,7 @@ def padding_batch_data(
input_dict = {}

input_ids = [sample["input_ids"] for sample in samples]
# TODO(drownfish19): confim if this is correct
# TODO(drownfish19): confirm if this is correct
# attention_mask = [np.ones(input_id.shape, dtype=bool) for input_id in input_ids]
input_dict["input_ids"] = left_padding(input_ids, padding_value=pad_token_id, max_length=max_prompt_len)
# input_dict["attention_mask"] = left_padding(attention_mask, padding_value=0)
2 changes: 1 addition & 1 deletion paddlenlp/datasets/thucnews.py
Original file line number Diff line number Diff line change
@@ -24,7 +24,7 @@
class THUCNews(DatasetBuilder):
"""
A subset of THUCNews dataset. THUCNews is a text classification dataset.
See descrition about this subset version at https://github.com/gaussic/text-classification-cnn-rnn#%E6%95%B0%E6%8D%AE%E9%9B%86
See description about this subset version at https://github.com/gaussic/text-classification-cnn-rnn#%E6%95%B0%E6%8D%AE%E9%9B%86
The whole dataset can be downloaded at https://thunlp.oss-cn-qingdao.aliyuncs.com/THUCNews.zip
"""

2 changes: 1 addition & 1 deletion paddlenlp/datasets/xnli_cn.py
Original file line number Diff line number Diff line change
@@ -29,7 +29,7 @@ class XNLI_CN(DatasetBuilder):
XNLI dataset for chinese.

XNLI is an evaluation corpus for language transfer and cross-lingual
sentence classification in 15 languages. Here, XNLI only contrains
sentence classification in 15 languages. Here, XNLI only contains
chinese corpus.

For more information, please visit https://github.com/facebookresearch/XNLI
4 changes: 2 additions & 2 deletions paddlenlp/datasets/zero_padding_dataset.py
Original file line number Diff line number Diff line change
@@ -88,7 +88,7 @@ def _pad_batch_records(cls, batch_records):
attention_mask = record.get("attention_mask", np.tril(np.ones([seq_length, seq_length], dtype=bool)))
batched_features["attention_mask"].append(attention_mask)
# NOTE: position_ids is optional and not required by every model
# We append instead of extend here to accomodate 2D position ids
# We append instead of extend here to accommodate 2D position ids
if "position_ids" in record:
batched_features["position_ids"].append(record["position_ids"])
sequence_sum += seq_length
@@ -98,7 +98,7 @@ def _pad_batch_records(cls, batch_records):
# convert to 3-D [batch_size(1), seq_length, seq_length]
batched_features["attention_mask"] = np.expand_dims(block_attention_mask, axis=0)
if "position_ids" in batched_features:
# Accomodate both 1D and 2D position ids
# Accommodate both 1D and 2D position ids
batched_features["position_ids"] = np.concatenate(batched_features["position_ids"], axis=-1).tolist()
return batched_features

4 changes: 2 additions & 2 deletions paddlenlp/ops/distributed/utils/topo.py
Original file line number Diff line number Diff line change
@@ -49,8 +49,8 @@
self.world = GroupInfo(size=world_size, rank=device_rank, world=list(range(0, world_size)))
worlds = []
for i in range(len(ranks)):
indexs = tuple(ranks[:i] + [slice(None)] + ranks[(i + 1) :])
worlds.append(arr[indexs])
indexes = tuple(ranks[:i] + [slice(None)] + ranks[(i + 1) :])
worlds.append(arr[indexes])

Check warning on line 53 in paddlenlp/ops/distributed/utils/topo.py

Codecov / codecov/patch

paddlenlp/ops/distributed/utils/topo.py#L52-L53

Added lines #L52 - L53 were not covered by tests

for i, key in enumerate(self.order):
if key == "dp":
2 changes: 1 addition & 1 deletion paddlenlp/ops/triton_ops/triton_utils.py
Original file line number Diff line number Diff line change
@@ -621,7 +621,7 @@ def decorator(*args, **kwargs):
op_dict = {"op_name": op_name, "reset_zero_when_tune": ""}
op_dict["triton_kernel_args"] = ",".join(modified_arg_exclude_constexpr)
op_dict["key"] = ",".join(self.key_args)
# when tunning, we need to reset the out to zero.
# when tuning, we need to reset the out to zero.
if "reset_zero_when_tune" in other_config.keys():
op_dict["reset_zero_when_tune"] = other_config["reset_zero_when_tune"]

2 changes: 1 addition & 1 deletion paddlenlp/quantization/quantization_config.py
Original file line number Diff line number Diff line change
@@ -30,7 +30,7 @@ class QuantizationConfig:
This is the configuration class to store quantization configuration.
Args:
weight_quantize_algo: Weight quantization algorithm.
quant_type: Quantization type appplied to weight and activation, weight may still keep in float tensor.
quant_type: Quantization type applied to weight and activation, weight may still keep in float tensor.
shift: Whether the model applied the shift strategy.
smooth: Whether the model applied the smooth strategy.
shift_smooth_all_linears: Whether the model applied shift or smooth strategy for all linears.
2 changes: 1 addition & 1 deletion paddlenlp/rl/trainer/ppo_trainer.py
Original file line number Diff line number Diff line change
@@ -267,7 +267,7 @@ def __init__(
"pipeline_parallel_degree": 1, # workaround for pipeline parallel model check
},
):
# just used to create trival attrs might be used in the training
# just used to create trivial attrs might be used in the training
# process of trainer, while changing some args to avoid model usage
# in __init__ such as recompute and AMP-O2
super().__init__(
2 changes: 1 addition & 1 deletion paddlenlp/taskflow/dialogue.py
Original file line number Diff line number Diff line change
@@ -155,7 +155,7 @@
inputs = [list(self.context)]
return inputs
else:
raise ValueError("In the interactive mode, the input data shold be a string")
raise ValueError("In the interactive mode, the input data should be a string")

Check warning on line 158 in paddlenlp/taskflow/dialogue.py

Codecov / codecov/patch

paddlenlp/taskflow/dialogue.py#L158

Added line #L158 was not covered by tests
elif not isinstance(inputs[0], list):
raise ValueError("If not in the interactive mode, the input data should be a list.")
return inputs
2 changes: 1 addition & 1 deletion paddlenlp/taskflow/document_intelligence.py
Original file line number Diff line number Diff line change
@@ -50,7 +50,7 @@

class DocPromptTask(Task):
"""
The document intelligence model, give the querys and predict the answers.
The document intelligence model, give the queries and predict the answers.
Args:
task(string): The name of task.
model(string): The model name in the task.
2 changes: 1 addition & 1 deletion paddlenlp/taskflow/task.py
Original file line number Diff line number Diff line change
@@ -33,7 +33,7 @@

class Task(metaclass=abc.ABCMeta):
"""
The meta classs of task in Taskflow. The meta class has the five abstract function,
The meta class of task in Taskflow. The meta class has the five abstract function,
the subclass need to inherit from the meta class.
Args:
task(string): The name of task.
10 changes: 5 additions & 5 deletions paddlenlp/taskflow/text_classification.py
Original file line number Diff line number Diff line change
@@ -108,7 +108,7 @@ def softmax(x, axis=None):

class TextClassificationTask(Task):
"""
The text classfication model to classify text.
The text classification model to classify text.
NOTE: This task is different from all other tasks that it has no out-of-box zero-shot capabilities.
Instead, it's used as a simple inference pipeline.

@@ -122,7 +122,7 @@ class TextClassificationTask(Task):
multilabel_threshold (float): The probability threshold used for the multi_label setup. Only effective if model = "multi_label". Defaults to 0.5.
max_length (int): Maximum number of tokens for the model.
precision (int): Select among ["fp32", "fp16"]. Default to "fp32".
plm_model_name (str): Pretrained langugae model name for PromptModel.
plm_model_name (str): Pretrained language model name for PromptModel.
input_spec [list]: Specify the tensor information for each input parameter of the forward function.
id2label(dict(int,string)): The dictionary to map the predictions from class ids to class names.
batch_size(int): The sample number of a mini-batch.
@@ -171,7 +171,7 @@ def _construct_input_spec(self):
init_class = json.load(fb)["architectures"].pop()
else:
raise IOError(
f"Model configuration file dosen't exist.[task_path] should inclue {LEGACY_CONFIG_NAME} or {CONFIG_NAME}"
f"Model configuration file doesn't exist.[task_path] should include {LEGACY_CONFIG_NAME} or {CONFIG_NAME}"
)

if init_class in ["ErnieMForSequenceClassification"]:
@@ -286,7 +286,7 @@ def _run_model(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
"""
Run the task model from the outputs of the `_tokenize` function.
"""
# TODO: support hierachical classification
# TODO: support hierarchical classification
outputs = {}
outputs["text"] = inputs["text"]
outputs["batch_logits"] = []
@@ -326,7 +326,7 @@ def _postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
"""
This function converts the model logits output to class score and predictions
"""
# TODO: support hierachical classification
# TODO: support hierarchical classification
postprocessed_outputs = []
for logits in inputs["batch_logits"]:
if self.problem_type == "multi_class":
2 changes: 1 addition & 1 deletion paddlenlp/taskflow/text_feature_extraction.py
Original file line number Diff line number Diff line change
@@ -424,7 +424,7 @@ def _parse_batch(batch_examples, max_seq_len=None):
)
return tokenized_inputs

# Seperates data into some batches.
# Separates data into some batches.
one_batch = []
self.length_sorted_idx = np.argsort([-text_length(sen) for sen in data])
sentences_sorted = [data[idx] for idx in self.length_sorted_idx]
Loading
Oops, something went wrong.
Loading
Oops, something went wrong.