In [4]:
from paddlenlp.datasets import MapDataset
def load_dict(dict_path):
    """Build a vocabulary that maps every line of a file to its line index.

    Args:
        dict_path (str): path of a UTF-8 text file with one entry per line.

    Returns:
        dict: entry text (newline characters stripped) -> 0-based line index.
            When an entry appears several times, the index of its last
            occurrence is kept.
    """
    with open(dict_path, 'r', encoding='utf-8') as fin:
        # Only '\n' is stripped, so other whitespace stays part of the key.
        return {line.strip('\n'): index for index, line in enumerate(fin)}

def load_dataset(datafiles):
    r"""Load one or several tab-separated tagging files into MapDataset objects.

    Each file is UTF-8 text whose first line is a header and is skipped.
    Every following non-empty line holds two tab-separated fields, the words
    and the labels; each field is itself a '\002'-separated sequence.

    Args:
        datafiles (str | list | tuple): a single path, or a sequence of paths.

    Returns:
        MapDataset | list: for a single path, one MapDataset of
            (words, labels) pairs; for a list/tuple, a list with one
            MapDataset per path, in the same order.

    Raises:
        TypeError: if `datafiles` is neither a str nor a list/tuple.
        ValueError: if a non-empty data line does not contain exactly one tab.
    """

    def read(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            # Skip the header. The default keeps an empty file from raising
            # StopIteration inside this generator, which Python would turn
            # into a RuntimeError.
            next(fp, None)
            # Iterate lazily instead of materialising the file with readlines().
            for line in fp:
                line = line.strip('\n')
                if not line:
                    # Blank lines (e.g. a trailing one at EOF) carry no example.
                    continue
                words, labels = line.split('\t')
                yield words.split('\002'), labels.split('\002')

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    if isinstance(datafiles, (list, tuple)):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]
    # Previously this case fell through and silently returned None.
    raise TypeError(
        "datafiles must be a str, list or tuple, got %s" %
        type(datafiles).__name__)
def parse_decodes(sentences, predictions, lengths, label_vocab):
    """Turn batched tag-id predictions into one formatted string per sentence.

    Args:
        sentences (list): the tagged sentences, each a sequence of tokens.
        predictions (list): per-batch lists of predicted tag-id sequences.
        lengths (list): per-batch lists with the valid length of each sentence.
        label_vocab (dict): mapping from tag name to tag id.

    Returns:
        outputs (list): one string per sentence, made of the concatenated
            ``str((chunk, tag))`` pairs found in that sentence.
    """
    # Flatten the per-batch nesting so both lists are indexed by sentence.
    flat_preds = [pred for batch in predictions for pred in batch]
    flat_lens = [length for batch in lengths for length in batch]
    id_to_label = {tag_id: name for name, tag_id in label_vocab.items()}

    outputs = []
    for sent_idx, valid_len in enumerate(flat_lens):
        tokens = sentences[sent_idx][:valid_len]
        tag_names = [
            id_to_label[tag_id] for tag_id in flat_preds[sent_idx][:valid_len]
        ]
        chunks, chunk_tags = [], []
        current = ""
        for token, tag in zip(tokens, tag_names):
            opens_chunk = tag.endswith('-B') or tag == 'O'
            if not opens_chunk:
                # Continuation tag: extend the chunk being built.
                current += token
                continue
            # A '-B' or 'O' tag closes the previous chunk and opens a new one.
            if current:
                chunks.append(current)
            chunk_tags.append(tag.split('-')[0])
            current = token
        # Flush the last open chunk, which has a tag but no text entry yet.
        if len(chunks) < len(chunk_tags):
            chunks.append(current)
        outputs.append(''.join(str(pair) for pair in zip(chunks, chunk_tags)))
    return outputs

In [1]:
from functools import partial
import argparse
import os
import paddle
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import AutoTokenizer, AutoModelForTokenClassification
from paddlenlp.metrics import ChunkEvaluator

# Run configuration, declared as argparse options so the defaults live in one place.
parser = argparse.ArgumentParser()
# yapf: disable
parser.add_argument(
    "--save_dir",
    type=str,
    default='./ernie_ckpt',
    help="The output directory where the model checkpoints will be written.",
)
parser.add_argument(
    "--epochs",
    type=int,
    default=5,
    help="Total number of training epochs to perform.",
)
parser.add_argument(
    "--batch_size",
    type=int,
    default=40,
    help="Batch size per GPU/CPU for training.",
)
parser.add_argument(
    "--device",
    type=str,
    choices=["cpu", "gpu"],
    default="gpu",
    help="The device to select to train the model, is must be cpu/gpu.",
)
parser.add_argument(
    "--data_dir",
    type=str,
    default='./data',
    help="The folder where the dataset is located.",
)
# An explicit empty argv makes argparse ignore the notebook kernel's own
# command line, so every option takes its default value.
args = parser.parse_args(args=[])
print(args)

In [2]:
def convert_to_features(example, tokenizer, label_vocab):
    """Convert one (tokens, labels) example into model input features.

    Args:
        example (tuple): pair of (tokens, labels); labels is a list of tag names.
        tokenizer (callable): called as
            ``tokenizer(tokens, return_length=True, is_split_into_words='token')``
            and expected to return a mapping with the keys 'input_ids',
            'token_type_ids' and 'seq_len'.
        label_vocab (dict): mapping from tag name to tag id.

    Returns:
        tuple: (input_ids, token_type_ids, seq_len, label_ids).
    """
    tokens, tag_names = example
    encoded = tokenizer(tokens,
                        return_length=True,
                        is_split_into_words='token')

    # Token '[CLS]' and '[SEP]' will get label 'O'
    padded_names = ['O'] + tag_names + ['O']
    encoded['labels'] = [label_vocab[name] for name in padded_names]

    return (
        encoded['input_ids'],
        encoded['token_type_ids'],
        encoded['seq_len'],
        encoded['labels'],
    )

@paddle.no_grad()
def evaluate(model, metric, data_loader, outside_id=0):
    """Print and return token-level precision, recall and F1 on `data_loader`.

    For every sample only the first `length` positions (taken from the batch's
    `lens` field) are scored. A position counts as predicted-positive when its
    predicted id differs from `outside_id`, as gold-positive when its label id
    differs from `outside_id`, and as correct when it is gold-positive and the
    predicted id equals the label id.

    Args:
        model: called as ``model(input_ids, seg_ids)`` to obtain per-token
            logits; switched to eval mode for the duration of the call and
            back to train mode afterwards.
        metric: not used by this implementation; kept so existing call sites
            remain valid.
        data_loader: iterable yielding (input_ids, seg_ids, lens, labels).
        outside_id (int): label id treated as "not an entity". Defaults to 0,
            the value that used to be hard-coded.
            NOTE(review): callers should pass the id their label vocabulary
            assigns to the outside tag — confirm that 0 is correct for the
            vocabulary in use.

    Returns:
        tuple: (precision, recall, f1_score) as floats; a score is 0.0 when
            its denominator is zero.
    """
    model.eval()

    total_correct = 0
    total_pred = 0
    total_gold = 0

    try:
        for input_ids, seg_ids, lens, labels in data_loader:
            logits = model(input_ids, seg_ids)
            preds = paddle.argmax(logits, axis=-1).numpy()
            labels = labels.numpy()
            lens = lens.numpy()

            for pred, label, length in zip(preds, labels, lens):
                # Drop padded positions: only the first `length` are scored.
                pred = pred[:length]
                label = label[:length]

                gold_mask = label != outside_id
                total_pred += (pred != outside_id).sum()
                total_gold += gold_mask.sum()
                total_correct += ((pred == label) & gold_mask).sum()
    finally:
        # Put the model back into training mode even if evaluation fails.
        model.train()

    precision = float(total_correct / total_pred) if total_pred > 0 else 0.0
    recall = float(total_correct / total_gold) if total_gold > 0 else 0.0
    if precision + recall > 0:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0.0

    print("[EVAL] Precision: %f - Recall: %f - F1: %f" %
          (precision, recall, f1_score))

    return precision, recall, f1_score

@paddle.no_grad()
def predict(model, data_loader, ds, label_vocab):
    """Run the model over `data_loader` and format the predicted tags.

    The model is switched to eval mode while predicting, so layers that behave
    differently during training do not make the output non-deterministic. If
    it was in training mode on entry, that mode is restored before returning.

    Args:
        model: called as ``model(input_ids, seg_ids)`` to obtain per-token logits.
        data_loader: iterable yielding (input_ids, seg_ids, lens, labels);
            the labels are not used.
        ds: dataset whose ``data`` attribute holds the raw examples; the first
            element of each example is used as the sentence.
        label_vocab (dict): mapping from tag name to tag id.

    Returns:
        list: one formatted string per sentence, as built by parse_decodes.
    """
    was_training = getattr(model, 'training', False)
    model.eval()

    all_preds = []
    all_lens = []
    try:
        for input_ids, seg_ids, lens, _labels in data_loader:
            logits = model(input_ids, seg_ids)
            preds = paddle.argmax(logits, axis=-1)
            # Drop CLS prediction
            all_preds.append([pred[1:] for pred in preds.numpy()])
            # Convert to NumPy so the lengths are plain integer scalars when
            # parse_decodes uses them as slice bounds.
            all_lens.append(lens.numpy())
    finally:
        if was_training:
            model.train()

    sentences = [example[0] for example in ds.data]
    return parse_decodes(sentences, all_preds, all_lens, label_vocab)

def create_dataloader(dataset,
                      mode='train',
                      batch_size=1,
                      batchify_fn=None,
                      trans_fn=None):
    """Wrap `dataset` in a paddle DataLoader.

    Args:
        dataset: dataset to iterate over; must offer ``map`` when `trans_fn`
            is given.
        mode (str): 'train' selects a shuffling DistributedBatchSampler; any
            other value selects a non-shuffling BatchSampler.
        batch_size (int): number of samples per batch.
        batchify_fn (callable, optional): passed to the DataLoader as collate_fn.
        trans_fn (callable, optional): if truthy, applied via ``dataset.map``
            before the loader is built.

    Returns:
        paddle.io.DataLoader: loader created with ``return_list=True``.
    """
    if trans_fn:
        dataset = dataset.map(trans_fn)

    is_train = mode == 'train'
    # Training uses the distributed sampler; every other mode uses the plain one.
    if is_train:
        sampler_cls = paddle.io.DistributedBatchSampler
    else:
        sampler_cls = paddle.io.BatchSampler
    batch_sampler = sampler_cls(dataset,
                                batch_size=batch_size,
                                shuffle=is_train)

    loader = paddle.io.DataLoader(dataset=dataset,
                                  batch_sampler=batch_sampler,
                                  collate_fn=batchify_fn,
                                  return_list=True)
    return loader

In [2]:
# Feature width shared by the model pieces below: it is passed as the output
# size (num_classes) of the ERNIE token-classification head and as the
# input/hidden size of the recurrent layer in ErnieWithLSTM.
hidden_size =256
from paddle.nn import GRU
from paddle.nn import RNN
from paddle.nn import LSTM

class ErnieWithLSTM(paddle.nn.Layer):
    """ERNIE model followed by a recurrent layer and a linear classifier.

    Despite the class name and the ``lstm`` attribute name, the recurrent
    layer is a single-layer, forward-direction GRU.

    Args:
        ernie_model: layer called as ``ernie_model(input_ids, token_type_ids)``;
            its output is fed to the recurrent layer, so its last dimension is
            expected to equal `hidden_size`.
        hidden_size (int): input and hidden size of the recurrent layer.
        num_classes (int): number of output classes per token.
    """

    def __init__(self, ernie_model, hidden_size, num_classes):
        super().__init__()
        self.ernie = ernie_model
        self.lstm = GRU(input_size=hidden_size,
                        hidden_size=hidden_size,
                        num_layers=1,
                        direction='forward')
        # A bidirectional recurrent layer would need hidden_size * 2 here.
        self.classifier = paddle.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, token_type_ids):
        """Return per-token class logits for the given inputs."""
        features = self.ernie(input_ids, token_type_ids)
        # The recurrent layer returns a pair; only its first element
        # (the per-step outputs) is classified.
        recurrent_out, _ = self.lstm(features)
        return self.classifier(recurrent_out)

# Name of the pretrained checkpoint; used for both the tokenizer and the model.
modelname = 'ernie-3.0-medium-zh'

paddle.set_device(args.device)
# Process rank; the prediction cells further down only run when rank == 0.
rank = paddle.distributed.get_rank()

# load_dataset returns one dataset per path, in the order given here.
train_ds, dev_ds, test_ds = load_dataset(
    datafiles=(os.path.join(args.data_dir, 'train.txt'),
                os.path.join(args.data_dir, 'dev.txt'),
                os.path.join(args.data_dir, 'test.txt')))

# tag.dic holds one tag per line; load_dict maps each tag to its line index.
label_vocab = load_dict(os.path.join(args.data_dir, 'tag.dic'))
tokenizer = AutoTokenizer.from_pretrained(modelname)

# Bind the tokenizer and label vocabulary so the transform takes one example.
trans_func = partial(convert_to_features,
                        tokenizer=tokenizer,
                        label_vocab=label_vocab)

# The return values of map() are discarded.
# NOTE(review): this relies on map() registering the transform on the dataset
# object itself rather than only returning a new dataset — confirm.
train_ds.map(trans_func)
dev_ds.map(trans_func)
test_ds.map(trans_func)

# Padding value for label sequences; also given to the loss as ignore_index.
ignore_label = -1

# Collate function: one Pad/Stack per field, in the order the fields are
# returned by convert_to_features.
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),  # input_ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'
        ),  # token_type_ids
    Stack(dtype='int64'),  # seq_len
    Pad(axis=0, pad_val=ignore_label, dtype='int64')  # labels
): fn(samples)

# trans_fn is not passed to create_dataloader because the transform was
# already attached to the datasets above.
train_loader = create_dataloader(dataset=train_ds,
                                    mode='train',
                                    batch_size=args.batch_size,
                                    batchify_fn=batchify_fn)

dev_loader = create_dataloader(dataset=dev_ds,
                                mode='dev',
                                batch_size=args.batch_size,
                                batchify_fn=batchify_fn)

test_loader = create_dataloader(dataset=test_ds,
                                mode='test',
                                batch_size=args.batch_size,
                                batchify_fn=batchify_fn)

# Define the model network and its loss
# Method 1: use the ERNIE model on its own
# model = AutoModelForTokenClassification.from_pretrained(
#     modelname, num_classes=len(label_vocab))

# Method 2: ERNIE model followed by the recurrent layer in ErnieWithLSTM.
# The ERNIE head is created with num_classes=hidden_size, so it emits
# hidden_size values per token for the recurrent layer to consume.
ernie_model = AutoModelForTokenClassification.from_pretrained(modelname, num_classes=hidden_size)
model = ErnieWithLSTM(ernie_model, hidden_size=hidden_size, num_classes=len(label_vocab))

# The metric object is handed to evaluate(), which accepts it but computes its
# own token-level scores.
metric = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True)
loss_fn = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
optimizer = paddle.optimizer.AdamW(learning_rate=2e-5,
                                    parameters=model.parameters())

# Training: one optimizer update per batch, one dev-set evaluation per epoch.
# `step` counts batches across all epochs.
step = 0
for epoch in range(args.epochs):
    for input_ids, token_type_ids, length, labels in train_loader:
        logits = model(input_ids, token_type_ids)
        loss = paddle.mean(loss_fn(logits, labels))
        loss.backward()
        optimizer.step()
        # Reset gradients so they do not carry over into the next batch.
        optimizer.clear_grad()
        step += 1
        print("[TRAIN] Epoch:%d - Step:%d - Loss: %f" % (epoch, step, loss))
    evaluate(model, metric, dev_loader)


In [3]:
import time
print('Initiating predictive data analysis：')

# Time repeated full passes over the test set (rank 0 only) and report the
# per-run and average durations.
if rank == 0:
    n_runs = 10  # number of timed prediction passes
    prediction_times = []
    for i in range(n_runs):
        print(f"The {i+1}/{n_runs} prediction...")
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring intervals; time.time() can jump if the system clock changes.
        start_time = time.perf_counter()
        preds = predict(model, test_loader, test_ds, label_vocab)
        elapsed_time = time.perf_counter() - start_time
        prediction_times.append(elapsed_time)
        print(f"The {i+1} prediction: {elapsed_time:.4f} s")

    average_time = sum(prediction_times) / len(prediction_times)

    print("\n" + "="*50)
    print("Predictive Time Statistics:")
    for i, t in enumerate(prediction_times):
        print(f"The {i+1} prediction: {t:.4f} s")
    print(f"\nAverage Predictive Time: {average_time:.4f} s")
    print("="*50)

Prediction time per model:

- ernie: 0.24 s
- bert: 0.21 s
- ernie+gru: 0.38 s

In [4]:
import time
# Only rank 0 runs prediction and writes the result file.
if rank == 0:
    preds = predict(model, test_loader, test_ds, label_vocab)
    file_path = "ernie_results.txt"
    # One formatted sentence per line, without a trailing newline.
    with open(file_path, "w", encoding="utf8") as fout:
        print("\n".join(preds), end="", file=fout)
    # Print some examples
    print("The results have been saved in the file: %s, some examples are shown below: "
          % (file_path,))
    print(*preds[:10], sep="\n")