From ff17c362c0120dae8b5defe9ed41e326ec30db85 Mon Sep 17 00:00:00 2001
From: Jiaqi Liu
Date: Fri, 4 Jun 2021 10:58:11 +0800
Subject: [PATCH] Remove F.softmax_with_cross_entropy in paddlenlp (#484)

* upgrade F.cross_entropy usage

* fix sample code bug

* fix ppl shape error
---
 examples/machine_translation/seq2seq/seq2seq_attn.py |  4 ++--
 paddlenlp/metrics/perplexity.py                      |  7 ++++---
 paddlenlp/transformers/bert/modeling.py              | 11 +++++++----
 paddlenlp/transformers/bigbird/modeling.py           | 11 +++++++----
 paddlenlp/transformers/ernie/modeling.py             | 12 ++++++++----
 paddlenlp/transformers/ernie_gen/modeling.py         |  7 +++++--
 paddlenlp/transformers/transformer/modeling.py       |  7 ++++---
 7 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/examples/machine_translation/seq2seq/seq2seq_attn.py b/examples/machine_translation/seq2seq/seq2seq_attn.py
index f97a101a187fb..1b4cfe174e966 100644
--- a/examples/machine_translation/seq2seq/seq2seq_attn.py
+++ b/examples/machine_translation/seq2seq/seq2seq_attn.py
@@ -25,8 +25,8 @@ def __init__(self):
         super(CrossEntropyCriterion, self).__init__()
 
     def forward(self, predict, label, trg_mask):
-        cost = F.softmax_with_cross_entropy(
-            logits=predict, label=label, soft_label=False)
+        cost = F.cross_entropy(
+            input=predict, label=label, soft_label=False, reduction='none')
         cost = paddle.squeeze(cost, axis=[2])
         masked_cost = cost * trg_mask
         batch_mean_cost = paddle.mean(masked_cost, axis=[0])
diff --git a/paddlenlp/metrics/perplexity.py b/paddlenlp/metrics/perplexity.py
index 6928afc2b685b..d7d05f983b8cb 100644
--- a/paddlenlp/metrics/perplexity.py
+++ b/paddlenlp/metrics/perplexity.py
@@ -46,9 +46,10 @@ def __init__(self, name='Perplexity', *args, **kwargs):
         self.total_word_num = 0
 
     def compute(self, pred, label, seq_mask=None):
-        label = paddle.unsqueeze(label, axis=2)
-        ce = F.softmax_with_cross_entropy(
-            logits=pred, label=label, soft_label=False)
+        if label.dim() == 2:
+            label = paddle.unsqueeze(label, axis=2)
+        ce = F.cross_entropy(
+            input=pred, label=label, reduction='none', soft_label=False)
         ce = paddle.squeeze(ce, axis=[2])
         if seq_mask is not None:
             ce = ce * seq_mask
diff --git a/paddlenlp/transformers/bert/modeling.py b/paddlenlp/transformers/bert/modeling.py
index 567cfeac7d59d..386cbe621d53a 100644
--- a/paddlenlp/transformers/bert/modeling.py
+++ b/paddlenlp/transformers/bert/modeling.py
@@ -535,9 +535,12 @@ def __init__(self, vocab_size):
     def forward(self, prediction_scores, seq_relationship_score,
                 masked_lm_labels, next_sentence_labels, masked_lm_scale):
         with paddle.static.amp.fp16_guard():
-            masked_lm_loss = paddle.nn.functional.softmax_with_cross_entropy(
-                prediction_scores, masked_lm_labels, ignore_index=-1)
+            masked_lm_loss = F.cross_entropy(
+                prediction_scores,
+                masked_lm_labels,
+                reduction='none',
+                ignore_index=-1)
             masked_lm_loss = masked_lm_loss / masked_lm_scale
-            next_sentence_loss = paddle.nn.functional.softmax_with_cross_entropy(
-                seq_relationship_score, next_sentence_labels)
+            next_sentence_loss = F.cross_entropy(
+                seq_relationship_score, next_sentence_labels, reduction='none')
         return paddle.sum(masked_lm_loss) + paddle.mean(next_sentence_loss)
diff --git a/paddlenlp/transformers/bigbird/modeling.py b/paddlenlp/transformers/bigbird/modeling.py
index 2b82ddf2c601d..d78aefb3e74c8 100644
--- a/paddlenlp/transformers/bigbird/modeling.py
+++ b/paddlenlp/transformers/bigbird/modeling.py
@@ -862,14 +862,17 @@ def forward(self, prediction_scores, seq_relationship_score,
                     masked_lm_scale, masked_lm_weights)
                 print(loss)
         """
-        masked_lm_loss = paddle.nn.functional.softmax_with_cross_entropy(
-            prediction_scores, masked_lm_labels, ignore_index=self.ignore_index)
+        masked_lm_loss = F.cross_entropy(
+            prediction_scores,
+            masked_lm_labels,
+            ignore_index=self.ignore_index,
+            reduction='none')
         masked_lm_loss = paddle.transpose(masked_lm_loss, [1, 0])
         masked_lm_loss = paddle.sum(masked_lm_loss * masked_lm_weights) / (
             paddle.sum(masked_lm_weights) + 1e-5)
         scale = 1.0
         if not self.use_nsp:
             scale = 0.0
-        next_sentence_loss = paddle.nn.functional.softmax_with_cross_entropy(
-            seq_relationship_score, next_sentence_labels)
+        next_sentence_loss = F.cross_entropy(
+            seq_relationship_score, next_sentence_labels, reduction='none')
         return masked_lm_loss + paddle.mean(next_sentence_loss) * scale
diff --git a/paddlenlp/transformers/ernie/modeling.py b/paddlenlp/transformers/ernie/modeling.py
index edcb4a1744972..a878e5cf4849a 100644
--- a/paddlenlp/transformers/ernie/modeling.py
+++ b/paddlenlp/transformers/ernie/modeling.py
@@ -14,6 +14,7 @@
 
 import paddle
 import paddle.nn as nn
+import paddle.nn.functional as F
 
 from .. import PretrainedModel, register_base_model
 
@@ -772,9 +773,12 @@ def __init__(self, vocab_size):
     def forward(self, prediction_scores, seq_relationship_score,
                 masked_lm_labels, next_sentence_labels, masked_lm_scale):
         with paddle.static.amp.fp16_guard():
-            masked_lm_loss = paddle.nn.functional.softmax_with_cross_entropy(
-                prediction_scores, masked_lm_labels, ignore_index=-1)
+            masked_lm_loss = F.cross_entropy(
+                prediction_scores,
+                masked_lm_labels,
+                ignore_index=-1,
+                reduction='none')
             masked_lm_loss = masked_lm_loss / masked_lm_scale
-            next_sentence_loss = paddle.nn.functional.softmax_with_cross_entropy(
-                seq_relationship_score, next_sentence_labels)
+            next_sentence_loss = F.cross_entropy(
+                seq_relationship_score, next_sentence_labels, reduction='none')
         return paddle.sum(masked_lm_loss) + paddle.mean(next_sentence_loss)
diff --git a/paddlenlp/transformers/ernie_gen/modeling.py b/paddlenlp/transformers/ernie_gen/modeling.py
index 67272447d2029..6604c9d2d1972 100644
--- a/paddlenlp/transformers/ernie_gen/modeling.py
+++ b/paddlenlp/transformers/ernie_gen/modeling.py
@@ -607,7 +607,10 @@ def forward(self, *args, **kwargs):
 
         if len(tgt_labels.shape) == 1:
             tgt_labels = paddle.reshape(tgt_labels, [-1, 1])
-        loss = paddle.nn.functional.cross_entropy(
-            logits_2d, tgt_labels, soft_label=(tgt_labels.shape[-1] != 1))
+        loss = F.cross_entropy(
+            logits_2d,
+            tgt_labels,
+            reduction="none",
+            soft_label=(tgt_labels.shape[-1] != 1))
 
         return loss, logits_2d, info
diff --git a/paddlenlp/transformers/transformer/modeling.py b/paddlenlp/transformers/transformer/modeling.py
index 98b0a8c3a07a0..2f2bf1dd084fe 100644
--- a/paddlenlp/transformers/transformer/modeling.py
+++ b/paddlenlp/transformers/transformer/modeling.py
@@ -252,7 +252,7 @@ def forward(self, predict, label):
                 label = paddle.randint(
                     low=3,
                     high=vocab_size,
-                    shape=[batch_size, seq_len, vocab_size])
+                    shape=[batch_size, seq_len, 1])
 
                 criterion(predict, label)
         """
@@ -265,9 +265,10 @@ def forward(self, predict, label):
                     x=label, num_classes=predict.shape[-1]),
                 epsilon=self.label_smooth_eps)
 
-        cost = F.softmax_with_cross_entropy(
-            logits=predict,
+        cost = F.cross_entropy(
+            input=predict,
             label=label,
+            reduction='none',
             soft_label=True if self.label_smooth_eps else False)
         weighted_cost = cost * weights
         sum_cost = paddle.sum(weighted_cost)
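
--
For anyone applying the same migration outside the files touched here, a minimal standalone sketch of the pattern this diff uses, assuming Paddle 2.x: F.cross_entropy averages the loss by default, so reduction='none' is passed to keep the per-token loss that F.softmax_with_cross_entropy used to return. The shapes mirror the seq2seq criterion above; the names logits, labels, and cost are illustrative only.

    import paddle
    import paddle.nn.functional as F

    paddle.seed(2021)

    # Toy inputs shaped like the seq2seq case above:
    # logits [batch_size, seq_len, vocab_size], hard labels [batch_size, seq_len, 1].
    logits = paddle.randn([2, 5, 100])
    labels = paddle.randint(low=0, high=100, shape=[2, 5, 1])

    # Removed API (fused softmax + cross entropy, per-token loss, no reduction):
    # cost = F.softmax_with_cross_entropy(logits=logits, label=labels, soft_label=False)

    # Replacement: same per-token loss, but reduction='none' must be passed
    # explicitly because F.cross_entropy reduces to a mean by default.
    cost = F.cross_entropy(
        input=logits, label=labels, soft_label=False, reduction='none')
    print(cost.shape)  # per-token loss, e.g. [2, 5, 1] here

The downstream code (squeeze, masking, sum/mean) is unchanged by the swap, which is why each hunk only rewrites the loss call itself.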