In [1]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import tensorflow as tf
import numpy as np
from datasets import load_dataset
from collections import Counter
from conlleval import evaluate

# Enable on-demand GPU memory growth for every visible GPU.
gpus = tf.config.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, enable=True)

D:\anaconda\envs\tf-gpu-2.10.0-py-3.10\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
D:\anaconda\envs\tf-gpu-2.10.0-py-3.10\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll
  from .autonotebook import tqdm as notebook_tqdm


In [20]:
class TransformerBlock(tf.keras.layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Width of the block's output; also used as `key_dim` of the
            attention layer. The residual additions require the input's last
            axis to have this size.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden layer of the feed-forward network.
        rate: Dropout rate applied after the attention output and after the
            feed-forward output. Defaults to 0.1.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation="relu"),
            tf.keras.layers.Dense(embed_dim)
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        """Apply the block to `inputs`; `training` toggles the two dropout layers."""
        # Self-attention: the sequence is used as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    """Adds a learned position embedding to a learned token embedding.

    Args:
        maxlen: Number of rows of the position-embedding table.
        vocab_size: Number of rows of the token-embedding table.
        embed_dim: Width of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = tf.keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, inputs):
        """Return token embeddings of `inputs` plus embeddings of the positions 0..len-1."""
        # The length is read from the last axis of the actual input.
        seq_len = tf.shape(inputs)[-1]
        embedded_positions = self.pos_emb(tf.range(start=0, limit=seq_len, delta=1))
        embedded_tokens = self.token_emb(inputs)
        return embedded_tokens + embedded_positions

class NERModel(tf.keras.Model):
    """Token-level tagger: embeddings -> one Transformer block -> dense softmax head.

    Args:
        num_tags: Number of output classes per token.
        vocab_size: Size of the token-embedding table.
        maxlen: Size of the position-embedding table. Defaults to 128.
        embed_dim: Embedding width. Defaults to 32.
        num_heads: Number of attention heads. Defaults to 2.
        ff_dim: Width of the feed-forward layers. Defaults to 32.
    """

    def __init__(self, num_tags, vocab_size, maxlen=128, embed_dim=32, num_heads=2, ff_dim=32):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.dropout1 = tf.keras.layers.Dropout(0.1)
        self.ff = tf.keras.layers.Dense(ff_dim, activation="relu")
        self.dropout2 = tf.keras.layers.Dropout(0.1)
        self.ff_final = tf.keras.layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        """Return per-token tag probabilities (softmax over `num_tags`) for token ids `inputs`."""
        x = self.embedding_layer(inputs)
        # Forward `training` explicitly so every dropout layer, including the
        # ones inside the transformer block, sees the same mode.
        x = self.transformer_block(x, training=training)
        x = self.dropout1(x, training=training)
        x = self.ff(x)
        x = self.dropout2(x, training=training)
        return self.ff_final(x)


In [90]:
# Preprocess the data and save it to local files (./data/conll_train.txt and ./data/conll_val.txt)
# conll_data = load_dataset("conll2003")
# def export_to_file(export_file_path, data):
#     with open(export_file_path, "w") as f:
#         for record in data:
#             ner_tags = record["ner_tags"]
#             tokens = record["tokens"]
#             if len(tokens) > 0:
#                 f.write(
#                     str(len(tokens))
#                     + "\t"
#                     + "\t".join(tokens)
#                     + "\t"
#                     + "\t".join(map(str, ner_tags))
#                     + "\n"
#                 )


# os.mkdir("data")
# export_to_file("./data/conll_train.txt", conll_data["train"])
# export_to_file("./data/conll_val.txt", conll_data["validation"])

In [24]:
def make_tag_lookup_table():
    """Build the tag-id -> tag-name mapping.

    Returns:
        dict[int, str]: id 0 is "[PAD]", id 1 is "O", followed by the B-/I-
        label pair of each entity type in the order PER, ORG, LOC, MISC.
    """
    iob_labels = ["B", "I"]
    ner_labels = ["PER", "ORG", "LOC", "MISC"]
    entity_labels = [f"{iob}-{ner}" for ner in ner_labels for iob in iob_labels]
    all_labels = ["[PAD]", "O"] + entity_labels
    # enumerate yields exactly one id per label (the previous range ran one past the end).
    return dict(enumerate(all_labels))
    
conll_data = load_dataset("conll2003")
mapping = make_tag_lookup_table()
# Flatten the per-sentence token lists in linear time. sum(lists, []) copies
# the growing accumulator once per sentence, which is quadratic.
all_tokens = [token for sentence in conll_data["train"]["tokens"] for token in sentence]
all_tokens_array = np.array(list(map(str.lower, all_tokens)))
counter = Counter(all_tokens_array)   # 21009 distinct tokens (original author's note)
num_tags = len(mapping)
vocab_size = 20000
# Keep only the (vocab_size - 2) most frequent lowercase tokens.
vocabulary = [token for token, count in counter.most_common(vocab_size - 2)]
lookup_layer = tf.keras.layers.StringLookup(vocabulary=vocabulary)
# Each line of these files is one tab-separated record.
train_data = tf.data.TextLineDataset("./data/conll_train.txt")
val_data = tf.data.TextLineDataset("./data/conll_val.txt")
print()
print(list(train_data.take(1).as_numpy_iterator()))

def map_record_to_training_data(record):
    """Parse one tab-separated text record into (token ids, tag ids).

    The first field is the token count N, followed by N tokens and then the
    tag fields. Tokens are lower-cased and mapped through `lookup_layer`;
    tags are parsed as int64 and shifted up by one.
    """
    fields = tf.strings.split(record, sep="\t")
    num_tokens = tf.strings.to_number(fields[0], out_type=tf.int32)
    first_tag = num_tokens + 1
    # Shift tags by one so that id 0 is never produced by a real tag.
    tag_ids = tf.strings.to_number(fields[first_tag:], out_type=tf.int64) + 1
    token_ids = lookup_layer(tf.strings.lower(fields[1:first_tag]))
    return token_ids, tag_ids

batch_size = 32
# Parse each text line into (token ids, tag ids), then group into padded batches.
train_ds = (
    train_data
    .map(map_record_to_training_data)
    .padded_batch(batch_size)
)
val_ds = (
    val_data
    .map(map_record_to_training_data)
    .padded_batch(batch_size)
)
# Preview the first two examples of the first training batch.
for tokens, tags in train_ds.take(1):
    print(tokens[:2])
    print(tags[:2])

Found cached dataset conll2003 (C:/Users/13900K/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)
100%|██████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 1000.31it/s]



[b'9\tEU\trejects\tGerman\tcall\tto\tboycott\tBritish\tlamb\t.\t3\t0\t7\t0\t0\t0\t7\t0\t0']
tf.Tensor(
[[  988 10950   204   628     6  3938   215  5773     2     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0]
 [  773  1871     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0]], shape=(2, 47), dtype=int64)
tf.Tensor(
[[4 1 8 1 1 1 8 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0]
 [2 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0]], shape=(2, 47), dtype=int64)


In [26]:
class CustomNonPaddingTokenLoss(tf.keras.losses.Loss):
    """Sparse categorical cross-entropy averaged over non-padding positions.

    Positions whose true tag id is 0 are treated as padding and are excluded
    from both the summed loss and the normalizing count.
    """

    def __init__(self, name="custom_ner_loss"):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        """Return the mean per-token loss over positions where `y_true != 0`.

        `y_pred` is expected to hold probabilities (from_logits=False).
        Returns 0 when there are no non-padding positions.
        """
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast(tf.not_equal(y_true, 0), dtype=tf.float32)
        loss *= mask
        # divide_no_nan avoids a NaN loss if a batch contains only padding.
        return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))

class CustomNonPaddingTokenAcc(tf.keras.metrics.Metric):
    """Token accuracy computed over non-padding positions only.

    Positions whose true tag id is 0 are treated as padding and are excluded
    from both the number of correct predictions and the number of tokens.
    """

    def __init__(self, name='custom_accuracy', **kwargs):
        super().__init__(name=name, **kwargs)
        # total: correctly predicted non-padding tokens; count: non-padding tokens.
        self.total = self.add_weight(name='total', initializer='zeros')
        self.count = self.add_weight(name='count', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate matches for one batch; `sample_weight` is accepted but ignored."""
        y_pred = tf.cast(tf.argmax(y_pred, axis=-1), dtype=y_true.dtype)
        mask = tf.cast(tf.not_equal(y_true, 0), dtype=tf.float32)
        # Mask the matches too: otherwise a padding position predicted as 0
        # is counted as correct while being absent from the denominator.
        match = tf.cast(tf.equal(y_pred, y_true), dtype=tf.float32) * mask
        self.total.assign_add(tf.reduce_sum(match))
        self.count.assign_add(tf.reduce_sum(mask))

    def result(self):
        """Return correct / counted tokens, or 0 when nothing has been counted."""
        return tf.math.divide_no_nan(self.total, self.count)

    def reset_state(self):
        """Reset both accumulators to zero."""
        self.total.assign(0.0)
        self.count.assign(0.0)

    def reset_states(self):
        """Older spelling of `reset_state`, kept for callers that use it."""
        self.reset_state()

ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)
ner_model.compile(optimizer="adam", loss=CustomNonPaddingTokenLoss(), metrics=[CustomNonPaddingTokenAcc()])
# `patience` must be given by keyword: the first positional parameter of
# EarlyStopping is `monitor`, so EarlyStopping(5) does not set the patience.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=5)
ner_model.fit(train_ds, validation_data=val_ds, epochs=100, callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1fda5c69090>

In [88]:
# Take the first validation batch.
for tokens, tags in val_ds.take(1):
    break
n = 5
# Number of non-zero tag ids per row.
non_padding = tf.cast(tf.not_equal(tags, 0), tf.int32)
lens = tf.reduce_sum(non_padding, axis=-1).numpy()
probabilities = ner_model.predict(tokens[:n])
pred = tf.argmax(probabilities, axis=-1).numpy()
true = tags[:n].numpy()
# Print predicted vs. actual tag names for the first n examples, cut to each row's length.
for row, (pred_ids, true_ids) in enumerate(zip(pred, true)):
    pred_labels = [mapping[tag_id] for tag_id in pred_ids][:lens[row]]
    true_labels = [mapping[tag_id] for tag_id in true_ids][:lens[row]]
    print(f"预测 tokens：{pred_labels}")
    print(f"实际 tokens：{true_labels}")
    print()

预测 tokens：['O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
实际 tokens：['O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

预测 tokens：['B-LOC', 'O']
实际 tokens：['B-LOC', 'O']

预测 tokens：['B-MISC', 'B-MISC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
实际 tokens：['B-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

预测 tokens：['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O']
实际 tokens：['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O',