# Load dataset.

## Go to the dataset directory.

In [1]:
%cd dataset/
!ls

[WinError 2] 系统找不到指定的文件。: 'dataset/'
C:\Users\wangy\NaturalLanguageProcessing\Epoch3-learningrate0.4


'ls' is not recognized as an internal or external command,
operable program or batch file.


## Import the pandas library and perform data reading for analysis.

In [2]:
import pandas as pd
train = pd.read_csv('train.csv', sep='\t', header=None)
test = pd.read_csv('test.csv', sep='\t', header=None)

## Adding column names to data.

In [3]:
train.columns = ["text",'labels']
test.columns = ["text",'labels']

## The first 10 texts of the training set are read in the format "text, labels". The labels may contain multiple sentiment categories, each sentiment is separated by ','.

In [4]:
train.head(10)

Unnamed: 0,text,labels
0,My favourite food is anything I didn't have to...,27
1,"Now if he does off himself, everyone will thin...",27
2,WHY THE FUCK IS BAYLESS ISOING,2
3,To make her feel threatened,14
4,Dirty Southern Wankers,3
5,OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe...,26
6,Yes I heard abt the f bombs! That has to be wh...,15
7,We need more boards and to create a bit more s...,820
8,Damn youtube and outrage drama is super lucrat...,0
9,It might be linked to the trust factor of your...,27


In [5]:
test.head(10)

Unnamed: 0,text,labels
0,I’m really sorry about your situation :( Altho...,25
1,It's wonderful because it's awful. At not with.,0
2,"Kings fan here, good luck to you guys! Will be...",13
3,"I didn't know that, thank you for teaching me ...",15
4,They got bored from haunting earth for thousan...,27
5,Thank you for asking questions and recognizing...,15
6,You’re welcome,15
7,100%! Congrats on your job too!,15
8,I’m sorry to hear that friend :(. It’s for the...,24
9,"Girlfriend weak as well, that jump was pathetic.",25


# Install and import related libraries.

In [6]:
!pip install --upgrade paddlenlp



In [7]:
!pip install paddlepaddle



In [8]:
import os
import paddle
import paddlenlp

  from .autonotebook import tqdm as notebook_tqdm


# Data pre-processing

## For the 28 micro-sentiment multi-label classification scenario, where a sentence may correspond to multiple sentiment category labels, the sentiment labels of the dataset need to be transformed using One-Hot coding first, with "0" indicating absence and "1" indicating presence for each sentiment.

### Create sentiment label mapping relationships.

In [9]:
label_vocab = {
    0: "admiration",
    1: "amusement",
    2: "anger",
    3: "annoyance",
    4: "approval",
    5: "caring",
    6: "confusion",
    7: "curiosity",
    8: "desire",
    9: "disappointment",
    10: "disapproval",
    11: "disgust",
    12: "embarrassment",
    13: "excitement",
    14: "fear",
    15: "gratitude",
    16: "grief",
    17: "joy",
    18: "love",
    19: "nervousness",
    20: "optimism",
    21: "pride",
    22: "realization",
    23: "relief",
    24: "remorse",
    25: "sadness",
    26: "surprise",
    27: "neutral"
}

### Customize the dataset, read the data file, create the dataset and define the data type as MapDataset.

In [10]:
import re

from paddlenlp.datasets import load_dataset

# Clear invalid characters
def clean_text(text):
    text = text.replace("\r", "").replace("\n", "")
    text = re.sub(r"\\n\n", ".", text)
    return text

# Define the read data set function
def read_custom_data(filepath, is_one_hot=True):
    f = open(filepath)
    while True:
        line = f.readline()
        if not line:
            break
        data = line.strip().split('\t')
        # One-hot processing for 28 types of micro sentiment tags
        if is_one_hot:
            labels = [float(1) if str(i) in data[1].split(',') else float(0) for i in range(28)]  # 28 types
        else:
            labels = [int(d) for d in data[1].split(',')]
        yield {"text": clean_text(data[0]), "labels": labels}
    f.close()

In [11]:
# load_dataset() to Create dataset.
# lazy=False，The dataset is returned as a MapDataset type.
# Pre-processing of training and validation sets.
train_ds = load_dataset(read_custom_data, filepath='train.csv', lazy=False) 
test_ds = load_dataset(read_custom_data, filepath='test.csv', lazy=False)

### Print dataset.

In [12]:
print("datatype:", type(train_ds))
print("training dataset example:", train_ds[0])
print("testing dataset example:", test_ds[0])

datatype: <class 'paddlenlp.datasets.dataset.MapDataset'>
training dataset example: {'text': "My favourite food is anything I didn't have to cook myself.", 'labels': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]}
testing dataset example: {'text': 'I’m really sorry about your situation :( Although I love the names Sapphira, Cirilla, and Scarlett!', 'labels': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]}


## Load Chinese ERNIE 3.0 pre-training model and word splitter

In [13]:
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "ernie-3.0-medium-zh"   # ERNIE3.0 model
num_classes = 28  # 28 classification mission
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_classes=num_classes)
tokenizer = AutoTokenizer.from_pretrained(model_name)

[32m[2023-04-22 16:57:55,264] [    INFO][0m - We are using <class 'paddlenlp.transformers.ernie.modeling.ErnieForSequenceClassification'> to load 'ernie-3.0-medium-zh'.[0m
[32m[2023-04-22 16:57:55,265] [    INFO][0m - Model config ErnieConfig {
  "attention_probs_dropout_prob": 0.1,
  "enable_recompute": false,
  "fuse": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",

## Process the raw data into a model-acceptable format.

In [14]:
import functools
import numpy as np

from paddle.io import DataLoader, BatchSampler
from paddlenlp.data import DataCollatorWithPadding

# Data pre-processing function to convert text into integer sequences using a word splitter.
def preprocess_function(examples, tokenizer, max_seq_length):
    result = tokenizer(text=examples["text"], max_seq_len=max_seq_length)
    result["labels"] = examples["labels"]
    return result

trans_func = functools.partial(preprocess_function, tokenizer=tokenizer, max_seq_length=64)
train_ds = train_ds.map(trans_func)
test_ds = test_ds.map(trans_func)

# function is constructed to extend the different length sequences to the maximum length of the data in the batch, and then stack the data.
collate_fn = DataCollatorWithPadding(tokenizer)

# Define the BatchSampler, select the batch size and whether to randomly jumble the DataLoader.
train_batch_sampler = BatchSampler(train_ds, batch_size=32, shuffle=True)
test_batch_sampler = BatchSampler(test_ds, batch_size=16, shuffle=False)
train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=collate_fn)
test_data_loader = DataLoader(dataset=test_ds, batch_sampler=test_batch_sampler, collate_fn=collate_fn)

## Define model validation metrics.

In [15]:
import numpy as np
import sklearn
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from paddle.metric import Metric

# Customize MultiLabelReport evaluation metrics.
class MultiLabelReport(Metric):
    """
    AUC and F1 Score for multi-label text classification task.
    """

    def __init__(self, name='MultiLabelReport', average='micro'):
        super(MultiLabelReport, self).__init__()
        self.average = average
        self._name = name
        self.reset()

    def f1_score(self, y_prob):
        '''
        Returns the f1 score by searching the best threshhold
        '''
        best_score = 0
        for threshold in [i * 0.01 for i in range(100)]:
            self.y_pred = y_prob > threshold
            score = sklearn.metrics.f1_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
            if score > best_score:
                best_score = score
                precison = precision_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
                recall = recall_score(y_pred=self.y_pred, y_true=self.y_true, average=self.average)
        return best_score, precison, recall

    def reset(self):
        """
        Resets all of the metric state.
        """
        self.y_prob = None
        self.y_true = None

    def update(self, probs, labels):
        if self.y_prob is not None:
            self.y_prob = np.append(self.y_prob, probs.numpy(), axis=0)
        else:
            self.y_prob = probs.numpy()
        if self.y_true is not None:
            self.y_true = np.append(self.y_true, labels.numpy(), axis=0)
        else:
            self.y_true = labels.numpy()

    def accumulate(self):
        auc = roc_auc_score(
            y_score=self.y_prob, y_true=self.y_true, average=self.average)
        f1_score, precison, recall = self.f1_score(y_prob=self.y_prob)
        return auc, f1_score, precison, recall

    def name(self):
        """
        Returns metric name
        """
        return self._name

# Building the training model.

## Select an optimization strategy and run configuration.

In [16]:
import time
import paddle.nn.functional as F

# AdamW optimizer, cross-entropy loss function, custom MultiLabelReport evaluation metrics.
optimizer = paddle.optimizer.AdamW(learning_rate=8e-2, parameters=model.parameters(), weight_decay=0.01)
criterion = paddle.nn.BCEWithLogitsLoss()
metric = MultiLabelReport()

## Model training and validation.

In [17]:
import paddle
import numpy as np
import paddle.nn.functional as F

# Build the validation set evaluate function.
@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader, label_vocab, if_return_results=True):
    model.eval()
    metric.reset()
    losses = []
    results = []
    for batch in data_loader:
        input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']
        logits = model(input_ids, token_type_ids)
        loss = criterion(logits, labels)
        probs = F.sigmoid(logits)
        losses.append(loss.numpy())
        metric.update(probs, labels)
        if if_return_results:
            probs = probs.tolist()
            for prob in probs:
                result = []
                for c, pred in enumerate(prob):
                    if pred > 0.5:
                        result.append(label_vocab[c])
                results.append(','.join(result))

    auc, f1_score, precison, recall = metric.accumulate()
    print("eval loss: %.5f, auc: %.5f, f1 score: %.5f, precison: %.5f, recall: %.5f" %
          (np.mean(losses), auc, f1_score, precison, recall))
    model.train()
    metric.reset()
    if if_return_results:
        return results
    else:
        return f1_score

In [18]:
epochs = 3 # training times
ckpt_dir = "ernie_ckpt" # Folder for saving model parameters during training

global_step = 0  # Number of iterations
tic_train = time.time()
best_f1_score = 0

# Model Training
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        input_ids, token_type_ids, labels = batch['input_ids'], batch['token_type_ids'], batch['labels']

        # Calculate model output, loss function value, classification probability value, accuracy, f1 score.
        logits = model(input_ids, token_type_ids)
        loss = criterion(logits, labels)
        probs = F.sigmoid(logits)
        metric.update(probs, labels)
        auc, f1_score, _, _ = metric.accumulate()

        # Print the loss function value, accuracy, f1 score, and computation speed for each 10 iterations.
        global_step += 1
        if global_step % 10 == 0:
            print(
                "global step %d, epoch: %d, batch: %d, loss: %.5f, auc: %.5f, f1 score: %.5f, speed: %.2f step/s"
                % (global_step, epoch, step, loss, auc, f1_score,
                    10 / (time.time() - tic_train)))
            tic_train = time.time()
        
        # Reverse gradient passback with updated parameters.
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()
        
        # Every 40 iterations, evaluate the current trained model, save the current best model parameters and word list of the word splitter, etc.
        if global_step % 40 == 0:
            save_dir = ckpt_dir
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            eval_f1_score = evaluate(model, criterion, metric, test_data_loader, label_vocab, if_return_results=False)
            if eval_f1_score > best_f1_score:
                best_f1_score = eval_f1_score
                model.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)

global step 10, epoch: 1, batch: 10, loss: 4.33618, auc: 0.58050, f1 score: 0.10225, speed: 0.03 step/s
global step 20, epoch: 1, batch: 20, loss: 2.53671, auc: 0.62999, f1 score: 0.12991, speed: 0.11 step/s
global step 30, epoch: 1, batch: 30, loss: 1.81855, auc: 0.66159, f1 score: 0.13337, speed: 0.22 step/s
global step 40, epoch: 1, batch: 40, loss: 1.70928, auc: 0.67132, f1 score: 0.12981, speed: 0.17 step/s


[32m[2023-04-22 17:10:50,074] [    INFO][0m - Configuration saved in ernie_ckpt\config.json[0m


eval loss: 1.70651, auc: 0.70595, f1 score: 0.12011, precison: 0.06473, recall: 0.83252


[32m[2023-04-22 17:10:50,435] [    INFO][0m - tokenizer config file saved in ernie_ckpt\tokenizer_config.json[0m
[32m[2023-04-22 17:10:50,436] [    INFO][0m - Special tokens file saved in ernie_ckpt\special_tokens_map.json[0m


global step 50, epoch: 1, batch: 50, loss: 1.63013, auc: 0.70410, f1 score: 0.13570, speed: 0.04 step/s
global step 60, epoch: 1, batch: 60, loss: 1.18606, auc: 0.71633, f1 score: 0.16174, speed: 0.22 step/s
global step 70, epoch: 1, batch: 70, loss: 1.51051, auc: 0.70533, f1 score: 0.16618, speed: 0.20 step/s
global step 80, epoch: 1, batch: 80, loss: 1.19676, auc: 0.70704, f1 score: 0.18201, speed: 0.19 step/s


[32m[2023-04-22 17:17:28,748] [    INFO][0m - Configuration saved in ernie_ckpt\config.json[0m


eval loss: 1.26067, auc: 0.68788, f1 score: 0.24885, precison: 0.19698, recall: 0.33781


[32m[2023-04-22 17:17:29,543] [    INFO][0m - tokenizer config file saved in ernie_ckpt\tokenizer_config.json[0m
[32m[2023-04-22 17:17:29,545] [    INFO][0m - Special tokens file saved in ernie_ckpt\special_tokens_map.json[0m


global step 90, epoch: 1, batch: 90, loss: 1.04914, auc: 0.67381, f1 score: 0.16757, speed: 0.04 step/s
global step 100, epoch: 1, batch: 100, loss: 2.15968, auc: 0.68476, f1 score: 0.15137, speed: 0.22 step/s
global step 110, epoch: 1, batch: 110, loss: 1.45255, auc: 0.69158, f1 score: 0.16245, speed: 0.18 step/s
global step 120, epoch: 1, batch: 120, loss: 1.70332, auc: 0.68285, f1 score: 0.15302, speed: 0.21 step/s


[32m[2023-04-22 17:24:11,065] [    INFO][0m - Configuration saved in ernie_ckpt\config.json[0m


eval loss: 1.57957, auc: 0.69805, f1 score: 0.24897, precison: 0.19707, recall: 0.33797


[32m[2023-04-22 17:24:11,395] [    INFO][0m - tokenizer config file saved in ernie_ckpt\tokenizer_config.json[0m
[32m[2023-04-22 17:24:11,395] [    INFO][0m - Special tokens file saved in ernie_ckpt\special_tokens_map.json[0m


global step 130, epoch: 1, batch: 130, loss: 1.26797, auc: 0.67156, f1 score: 0.12668, speed: 0.04 step/s
global step 140, epoch: 1, batch: 140, loss: 1.22536, auc: 0.67692, f1 score: 0.14152, speed: 0.23 step/s
global step 150, epoch: 1, batch: 150, loss: 1.17240, auc: 0.67330, f1 score: 0.14876, speed: 0.18 step/s
global step 160, epoch: 1, batch: 160, loss: 1.49970, auc: 0.65644, f1 score: 0.13615, speed: 0.18 step/s
eval loss: 1.33765, auc: 0.64224, f1 score: 0.22627, precison: 0.15712, recall: 0.40417
global step 170, epoch: 1, batch: 170, loss: 0.58357, auc: 0.63529, f1 score: 0.18531, speed: 0.04 step/s
global step 180, epoch: 1, batch: 180, loss: 1.39769, auc: 0.64400, f1 score: 0.18552, speed: 0.13 step/s
global step 190, epoch: 1, batch: 190, loss: 1.28365, auc: 0.63202, f1 score: 0.16202, speed: 0.12 step/s
global step 200, epoch: 1, batch: 200, loss: 1.08167, auc: 0.63832, f1 score: 0.15956, speed: 0.03 step/s
eval loss: 1.30272, auc: 0.64033, f1 score: 0.10402, precison: 0

[32m[2023-04-22 18:09:22,616] [    INFO][0m - Configuration saved in ernie_ckpt\config.json[0m


eval loss: 1.80687, auc: 0.68974, f1 score: 0.30401, precison: 0.32928, recall: 0.28235


[32m[2023-04-22 18:09:22,889] [    INFO][0m - tokenizer config file saved in ernie_ckpt\tokenizer_config.json[0m
[32m[2023-04-22 18:09:22,891] [    INFO][0m - Special tokens file saved in ernie_ckpt\special_tokens_map.json[0m


global step 370, epoch: 1, batch: 370, loss: 2.48351, auc: 0.70497, f1 score: 0.15979, speed: 0.04 step/s
global step 380, epoch: 1, batch: 380, loss: 1.84890, auc: 0.70468, f1 score: 0.15737, speed: 0.22 step/s
global step 390, epoch: 1, batch: 390, loss: 2.50951, auc: 0.70094, f1 score: 0.16000, speed: 0.21 step/s
global step 400, epoch: 1, batch: 400, loss: 1.07874, auc: 0.69300, f1 score: 0.15389, speed: 0.22 step/s
eval loss: 1.29577, auc: 0.68790, f1 score: 0.20509, precison: 0.13244, recall: 0.45426
global step 410, epoch: 1, batch: 410, loss: 1.49718, auc: 0.67517, f1 score: 0.15931, speed: 0.04 step/s
global step 420, epoch: 1, batch: 420, loss: 1.91719, auc: 0.67427, f1 score: 0.14970, speed: 0.23 step/s
global step 430, epoch: 1, batch: 430, loss: 1.99744, auc: 0.68293, f1 score: 0.14358, speed: 0.23 step/s
global step 440, epoch: 1, batch: 440, loss: 1.49702, auc: 0.68409, f1 score: 0.13525, speed: 0.22 step/s
eval loss: 1.59091, auc: 0.69521, f1 score: 0.10316, precison: 0

global step 1010, epoch: 1, batch: 1010, loss: 1.68829, auc: 0.70929, f1 score: 0.16851, speed: 0.04 step/s
global step 1020, epoch: 1, batch: 1020, loss: 1.43395, auc: 0.69562, f1 score: 0.15392, speed: 0.20 step/s
global step 1030, epoch: 1, batch: 1030, loss: 1.89944, auc: 0.68937, f1 score: 0.16561, speed: 0.19 step/s
global step 1040, epoch: 1, batch: 1040, loss: 2.19611, auc: 0.68854, f1 score: 0.16507, speed: 0.19 step/s
eval loss: 1.78390, auc: 0.67928, f1 score: 0.10620, precison: 0.05674, recall: 0.82714
global step 1050, epoch: 1, batch: 1050, loss: 1.30353, auc: 0.70524, f1 score: 0.14286, speed: 0.04 step/s
global step 1060, epoch: 1, batch: 1060, loss: 2.15563, auc: 0.71161, f1 score: 0.14170, speed: 0.19 step/s
global step 1070, epoch: 1, batch: 1070, loss: 2.83493, auc: 0.70382, f1 score: 0.14104, speed: 0.19 step/s
global step 1080, epoch: 1, batch: 1080, loss: 0.93225, auc: 0.69643, f1 score: 0.13667, speed: 0.19 step/s
eval loss: 1.32807, auc: 0.66598, f1 score: 0.10

eval loss: 1.07281, auc: 0.65248, f1 score: 0.08118, precison: 0.04248, recall: 0.91073
global step 1650, epoch: 2, batch: 123, loss: 1.17655, auc: 0.62378, f1 score: 0.12859, speed: 0.04 step/s
global step 1660, epoch: 2, batch: 133, loss: 1.63375, auc: 0.62827, f1 score: 0.14571, speed: 0.22 step/s
global step 1670, epoch: 2, batch: 143, loss: 0.58679, auc: 0.64036, f1 score: 0.16002, speed: 0.20 step/s
global step 1680, epoch: 2, batch: 153, loss: 1.10753, auc: 0.64216, f1 score: 0.16458, speed: 0.21 step/s
eval loss: 1.01600, auc: 0.66455, f1 score: 0.30401, precison: 0.32928, recall: 0.28235
global step 1690, epoch: 2, batch: 163, loss: 0.77930, auc: 0.64700, f1 score: 0.15796, speed: 0.04 step/s
global step 1700, epoch: 2, batch: 173, loss: 1.16540, auc: 0.63564, f1 score: 0.15942, speed: 0.22 step/s
global step 1710, epoch: 2, batch: 183, loss: 1.17594, auc: 0.62668, f1 score: 0.16698, speed: 0.21 step/s
global step 1720, epoch: 2, batch: 193, loss: 0.90422, auc: 0.62306, f1 sco

eval loss: 1.64124, auc: 0.68665, f1 score: 0.24897, precison: 0.19707, recall: 0.33797
global step 2290, epoch: 2, batch: 763, loss: 2.17038, auc: 0.65884, f1 score: 0.15033, speed: 0.03 step/s
global step 2300, epoch: 2, batch: 773, loss: 1.52434, auc: 0.63880, f1 score: 0.16860, speed: 0.11 step/s
global step 2310, epoch: 2, batch: 783, loss: 1.80534, auc: 0.62875, f1 score: 0.15167, speed: 0.11 step/s
global step 2320, epoch: 2, batch: 793, loss: 0.69989, auc: 0.63399, f1 score: 0.16259, speed: 0.11 step/s
eval loss: 1.16746, auc: 0.66330, f1 score: 0.30401, precison: 0.32928, recall: 0.28235
global step 2330, epoch: 2, batch: 803, loss: 1.04422, auc: 0.65693, f1 score: 0.20559, speed: 0.03 step/s
global step 2340, epoch: 2, batch: 813, loss: 0.94903, auc: 0.65983, f1 score: 0.20131, speed: 0.11 step/s
global step 2350, epoch: 2, batch: 823, loss: 1.09749, auc: 0.66512, f1 score: 0.20080, speed: 0.11 step/s
global step 2360, epoch: 2, batch: 833, loss: 1.29398, auc: 0.67023, f1 sco

eval loss: 2.07879, auc: 0.66158, f1 score: 0.24897, precison: 0.19707, recall: 0.33797
global step 2930, epoch: 2, batch: 1403, loss: 1.31198, auc: 0.65742, f1 score: 0.19628, speed: 0.03 step/s
global step 2940, epoch: 2, batch: 1413, loss: 1.22165, auc: 0.66683, f1 score: 0.19264, speed: 0.13 step/s
global step 2950, epoch: 2, batch: 1423, loss: 2.10137, auc: 0.67986, f1 score: 0.19042, speed: 0.10 step/s
global step 2960, epoch: 2, batch: 1433, loss: 2.23471, auc: 0.66762, f1 score: 0.17757, speed: 0.06 step/s
eval loss: 2.23256, auc: 0.67923, f1 score: 0.30401, precison: 0.32928, recall: 0.28235
global step 2970, epoch: 2, batch: 1443, loss: 1.58005, auc: 0.65610, f1 score: 0.17016, speed: 0.02 step/s
global step 2980, epoch: 2, batch: 1453, loss: 2.43131, auc: 0.66906, f1 score: 0.16000, speed: 0.17 step/s
global step 2990, epoch: 2, batch: 1463, loss: 2.79432, auc: 0.66998, f1 score: 0.15680, speed: 0.13 step/s
global step 3000, epoch: 2, batch: 1473, loss: 2.29253, auc: 0.66987

eval loss: 1.66154, auc: 0.70338, f1 score: 0.30401, precison: 0.32928, recall: 0.28235
global step 3570, epoch: 3, batch: 516, loss: 1.20458, auc: 0.69786, f1 score: 0.18364, speed: 0.02 step/s
global step 3580, epoch: 3, batch: 526, loss: 1.95458, auc: 0.69241, f1 score: 0.19709, speed: 0.02 step/s
global step 3590, epoch: 3, batch: 536, loss: 2.66109, auc: 0.68766, f1 score: 0.19594, speed: 0.02 step/s
global step 3600, epoch: 3, batch: 546, loss: 1.82703, auc: 0.69068, f1 score: 0.20476, speed: 0.02 step/s
eval loss: 2.00741, auc: 0.69625, f1 score: 0.21998, precison: 0.17413, recall: 0.29863
global step 3610, epoch: 3, batch: 556, loss: 2.59920, auc: 0.70902, f1 score: 0.20826, speed: 0.01 step/s
global step 3620, epoch: 3, batch: 566, loss: 1.32230, auc: 0.70817, f1 score: 0.22133, speed: 0.02 step/s
global step 3630, epoch: 3, batch: 576, loss: 1.94144, auc: 0.69928, f1 score: 0.20783, speed: 0.02 step/s
global step 3640, epoch: 3, batch: 586, loss: 1.67114, auc: 0.69460, f1 sco

eval loss: 1.03750, auc: 0.67199, f1 score: 0.08717, precison: 0.04579, recall: 0.90314
global step 4210, epoch: 3, batch: 1156, loss: 1.55302, auc: 0.66711, f1 score: 0.15838, speed: 0.03 step/s
global step 4220, epoch: 3, batch: 1166, loss: 1.36608, auc: 0.65177, f1 score: 0.14349, speed: 0.10 step/s
global step 4230, epoch: 3, batch: 1176, loss: 1.93105, auc: 0.66683, f1 score: 0.15897, speed: 0.18 step/s
global step 4240, epoch: 3, batch: 1186, loss: 1.37028, auc: 0.66518, f1 score: 0.15964, speed: 0.18 step/s
eval loss: 1.26746, auc: 0.70573, f1 score: 0.08898, precison: 0.04675, recall: 0.92195
global step 4250, epoch: 3, batch: 1196, loss: 1.67942, auc: 0.64555, f1 score: 0.14979, speed: 0.04 step/s
global step 4260, epoch: 3, batch: 1206, loss: 1.95832, auc: 0.66736, f1 score: 0.16238, speed: 0.17 step/s
global step 4270, epoch: 3, batch: 1216, loss: 1.41604, auc: 0.67073, f1 score: 0.17482, speed: 0.16 step/s
global step 4280, epoch: 3, batch: 1226, loss: 0.92663, auc: 0.67203

## Model Validation Performance Results

In [19]:
# Load the optimal parameters of the trained model.
model.set_dict(paddle.load('ernie_ckpt/model_state.pdparams'))

# Load the parameters of the previously trained model.
# model.set_dict(paddle.load('/home/aistudio/work/model_state.pdparams'))

# Model Validation.
print("ERNIE 3.0 performance on GoEmotions micro-emotion 28 classification test set：", end= " ")
results = evaluate(model, criterion, metric, test_data_loader, label_vocab)

ERNIE 3.0 performance on GoEmotions micro-emotion 28 classification test set： eval loss: 1.80687, auc: 0.68974, f1 score: 0.30401, precison: 0.32928, recall: 0.28235


## 28 Multi-Label Groups of "Micro" Emotions Predicting Demo

In [20]:
# Define data loading and processing functions
from paddlenlp.data import JiebaTokenizer, Pad, Stack, Tuple, Vocab
def convert_example(example, tokenizer, max_seq_length=64, is_test=False):
    qtconcat = example["text"]
    encoded_inputs = tokenizer(text=qtconcat, max_seq_len=max_seq_length)
    input_ids = encoded_inputs["input_ids"]
    token_type_ids = encoded_inputs["token_type_ids"]
    if not is_test:
        label = np.array([example["label"]], dtype="int64")
        return input_ids, token_type_ids, label
    else:
        return input_ids, token_type_ids

# Define the model prediction function
def predict(model, data, tokenizer, label_vocab, batch_size=1, max_seq=64):
    examples = []
    # Process input data (the list form) into a format acceptable to the model
    for text in data:
        input_ids, segment_ids = convert_example(
            text,
            tokenizer,
            max_seq_length=max_seq,
            is_test=True)
        examples.append((input_ids, segment_ids))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input id
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment id
    ): fn(samples)

    # Seperates data into some batches.
    batches = []
    one_batch = []
    for example in examples:
        one_batch.append(example)
        if len(one_batch) == batch_size:
            batches.append(one_batch)
            one_batch = []
    if one_batch:
        # The last batch whose size is less than the config batch_size setting.
        batches.append(one_batch)

    results = []
    model.eval()
    for batch in batches:
        input_ids, segment_ids = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        segment_ids = paddle.to_tensor(segment_ids)
        logits = model(input_ids, segment_ids)
        probs = F.sigmoid(logits)
        probs = probs.tolist()
        # The results were processed by selecting the sentiment categories with probability greater than 0.5
        for prob in probs:
            result = []
            for c, pred in enumerate(prob):
                if pred > 0.5:
                    result.append(label_vocab[c])
            results.append(','.join(result))
    return results  # Return the predeicted results

## Prediction Results

In [21]:
# Define the text data to be subjected to micro-sentiment analysis
data = [
    # 0 admiration
    {"text": 'You do a great job!'},
    # 1 amusement
    {"text": 'Lets have fun'},
    # 2 anger
    {"text":"You shut your mouth"},
    # 3 annoyance
    {"text": 'You are so annoyed'},
    # 4 approval
    {"text": 'You are allowed to do this'},
    # 5 caring & # 7 Curiosity
    {"text": 'Are you feeling well?'},
    # 6 confusion
    {"text": 'This problem is so hard and I cannot solve this problem'},
    # 7 curiosity
    {"text":'Why would I do that?'},
    # 8 desire
    {"text": 'I want this gift so much'},
    # 9 disappointment
    {"text": 'I am very disappointed by everything you have done to me'},
    # 10 disapproval
    {"text": 'You are not admitted to the college.'},
    # 11 disgust
    {"text": 'Thats absolutely disgusting.'},
    # 12 embarrassment
    {"text": 'Thats so embarrassing.'},
    # 13 excitement
    {"text": 'I am so excited'},
    # 14 fear
    {"text": 'I am so scared of skydiving'},
    # 15 gratitude
    {"text":"Thank you."},
    # 16 grief
    {"text": 'My grandpa passed away'},
    # 17 joy
    {"text": 'Happy Birthday'},
    # 18 love
    {"text": 'I love you so much'},
    # 19 nervousness
    {"text": 'I am so nervous.'},
    # 20 neutral
    {"text": 'It is just so so.'},
    # 21 optimism
    {"text": 'Successful people only focus on giving their best effort.'},
    # 22 pride
    {"text": 'I am so proud of you.'},
    # 23 realization
    {"text": 'Thank you for letting me realizing this rule.'},
    # 24 relief 
    {"text": 'You are doing better than you think you are'},
    # 25 remorse
    {"text": 'I am guilty.'},
    # 26 sadness
    {"text": 'I am so sad.'},
    # 27 surprise
    {"text": 'I am so surprised that you made it.'},
    
]

# Model Prediction
labels =  predict(model, data, tokenizer, label_vocab, batch_size=1)

# Output of predicted results
for idx, text in enumerate(data):
    print('Text: {} \t Labels: {}'.format(text['text'], labels[idx]))

Text: You do a great job! 	 Labels: neutral
Text: Lets have fun 	 Labels: neutral
Text: You shut your mouth 	 Labels: neutral
Text: You are so annoyed 	 Labels: neutral
Text: You are allowed to do this 	 Labels: neutral
Text: Are you feeling well? 	 Labels: neutral
Text: This problem is so hard and I cannot solve this problem 	 Labels: neutral
Text: Why would I do that? 	 Labels: neutral
Text: I want this gift so much 	 Labels: neutral
Text: I am very disappointed by everything you have done to me 	 Labels: neutral
Text: You are not admitted to the college. 	 Labels: neutral
Text: Thats absolutely disgusting. 	 Labels: neutral
Text: Thats so embarrassing. 	 Labels: neutral
Text: I am so excited 	 Labels: neutral
Text: I am so scared of skydiving 	 Labels: neutral
Text: Thank you. 	 Labels: neutral
Text: My grandpa passed away 	 Labels: neutral
Text: Happy Birthday 	 Labels: neutral
Text: I love you so much 	 Labels: neutral
Text: I am so nervous. 	 Labels: neutral
Text: It is just so s