<a href="https://colab.research.google.com/github/zhousanfu/machine-learning-demo/blob/master/%E7%A7%91%E5%A4%A7%E8%AE%AF%E9%A3%9E_%E5%9F%BA%E4%BA%8E%E8%AE%BA%E6%96%87%E6%91%98%E8%A6%81%E7%9A%84%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E4%B8%8E%E5%85%B3%E9%94%AE%E8%AF%8D%E6%8A%BD%E5%8F%96%E6%8C%91%E6%88%98%E8%B5%9B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 科大讯飞 基于论文摘要的文本分类与关键词抽取挑战赛[🔗](https://challenge.xfyun.cn/topic/info?type=abstract-of-the-paper&option=ssgy&ch=F5ZbQiB)

一、赛事背景

医学领域的文献库中蕴含了丰富的疾病诊断和治疗信息，如何高效地从海量文献中提取关键信息，进行疾病诊断和治疗推荐，对于临床医生和研究人员具有重要意义。

二、赛事任务

本任务分为两个子任务：

1. 文本分类：机器通过对论文摘要等信息的理解，判断该论文是否属于医学领域的文献。
2. 关键词抽取：根据论文的标题、作者、摘要等信息，提取出该论文的关键词。

**加载网盘文件**

In [2]:
# Mount Google Drive into the Colab VM at /content/drive (Colab-only module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!mkdir data
!cp /content/drive/MyDrive/Data/科大讯飞比赛/* ./data

In [None]:
!pip install transformers[tensorflow]
!pip install sklearn

In [6]:
import tensorflow as tf
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFAutoModel, TFBertForSequenceClassification, TFAutoModelForTokenClassification

## 任务一: 分类

### 超参数

In [None]:
vocab_size = 10000  # vocabulary size
embedding_dim = 100  # word-embedding dimension
hidden_size = 128  # LSTM hidden layer size
max_sequence_length = 128  # token length every abstract is padded / truncated to
batch_size = 16

num_classes = 2  # number of classes: medical vs. non-medical literature
num_extraction = 10  # number of keywords to extract in the extraction task
epochs_classes = 10
epochs_keywords = 10
# Fine-tuning every BERT weight with Adam needs a small step size; the
# previous value of 1e-3 is far above the commonly used 2e-5..5e-5 range
# and tends to wreck the pretrained weights within a few updates.
classification_learning_rate = 2e-5
extraction_learning_rate = 0.001
weight_decay = 0.01

### 数据预处理

In [None]:
# Load the abstracts and their labels (only the first 100 rows are used).
data = pd.read_csv('data/train.csv')[:100].to_dict(orient='list')
sentences = data['abstract']
labels = data['label']

# Split into training and validation sets (80 / 20).
train_data, eval_data, train_labels, eval_labels = train_test_split(sentences, labels, test_size=0.2, random_state=42)

# Initialise the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode the training and validation texts to fixed-length id / mask tensors.
train_encodeds = tokenizer.batch_encode_plus(train_data, truncation=True, padding='max_length', max_length=max_sequence_length, return_tensors='tf')
eval_encodeds = tokenizer.batch_encode_plus(eval_data, truncation=True, padding='max_length', max_length=max_sequence_length, return_tensors='tf')

# Convert to tf.data datasets.
# Only the training set is shuffled, with a buffer covering the whole set so
# the shuffle is uniform. Because of the shuffle, batches are NOT yielded in
# the order of `train_labels`; per-epoch metrics must use the labels yielded
# by the dataset itself.
train_dataset = tf.data.Dataset.from_tensor_slices(
    ({'input_ids': train_encodeds['input_ids'], 'attention_mask': train_encodeds['attention_mask']}, train_labels)
).shuffle(len(train_data)).batch(batch_size)
# Evaluation data keeps its original order so predictions line up with `eval_labels`.
eval_dataset = tf.data.Dataset.from_tensor_slices(
    ({'input_ids': eval_encodeds['input_ids'], 'attention_mask': eval_encodeds['attention_mask']}, eval_labels)
).batch(batch_size)

# Test set (first 100 rows, no labels).
data = pd.read_csv('data/test.csv')[:100].to_dict(orient='list')
sentences = data['abstract']

test_encodeds = tokenizer.batch_encode_plus(sentences, truncation=True, padding='max_length', max_length=max_sequence_length, return_tensors='tf')
# Build the test dataset from the *test* encodings and keep row order, so the
# i-th prediction corresponds to the i-th row of test.csv.
test_dataset = tf.data.Dataset.from_tensor_slices(
    ({'input_ids': test_encodeds['input_ids'], 'attention_mask': test_encodeds['attention_mask']})
).batch(batch_size)

test_dataset

### 构建模型

In [None]:
# Pretrained BERT with a sequence-classification head sized for `num_classes`.
classes_model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_classes,
)

# Optimiser and loss; the loss is configured to receive raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=classification_learning_rate)

# Running metrics, accumulated over the batches of an epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss', dtype=tf.float32)
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
eval_loss = tf.keras.metrics.Mean(name='eval_loss', dtype=tf.float32)
eval_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='eval_accuracy')

@tf.function
def train_step(inputs, labels):
    """Run one optimisation step on a batch and return the predicted class ids.

    Args:
        inputs: Batch of model inputs, forwarded unchanged to `classes_model`.
        labels: Integer class labels for the batch.

    Returns:
        Tensor holding the argmax over axis 1 of the model's first output.

    Side effects:
        Updates the weights of `classes_model` through `optimizer` and
        accumulates into the `train_accuracy` / `train_loss` metrics.
    """
    with tf.GradientTape() as tape:
        logits = classes_model(inputs, training=True)[0]
        batch_loss = loss(labels, logits)

    trainable = classes_model.trainable_variables
    grads = tape.gradient(batch_loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    train_accuracy(labels, logits)
    train_loss(batch_loss)

    # The class ids need no gradient, so they are computed outside the tape.
    return tf.argmax(logits, axis=1)

@tf.function
def eval_step(inputs, labels):
    """Evaluate one batch without updating weights and return predicted class ids.

    Args:
        inputs: Batch of model inputs, forwarded unchanged to `classes_model`.
        labels: Integer class labels for the batch.

    Returns:
        Tensor holding the argmax over axis 1 of the model's first output.

    Side effects:
        Accumulates into the `eval_accuracy` / `eval_loss` metrics.
    """
    logits = classes_model(inputs, training=False)[0]
    batch_loss = loss(labels, logits)

    eval_accuracy(labels, logits)
    eval_loss(batch_loss)

    return tf.argmax(logits, axis=1)



### 训练模型

In [None]:
# Train the classifier and report loss / accuracy / F1 after every epoch.
for epoch in range(3):
    # The metric objects accumulate across batches; clear them for each epoch.
    train_accuracy.reset_states()
    train_loss.reset_states()
    eval_accuracy.reset_states()
    eval_loss.reset_states()

    # Predictions are stored together with the labels of the batch they were
    # computed on. The datasets may yield rows in a different order than the
    # original label lists (e.g. after shuffling), so scoring against those
    # lists would pair predictions with the wrong labels.
    train_predictions, train_targets = [], []
    eval_predictions, eval_targets = [], []

    for batch_inputs, batch_labels in train_dataset:
        predictions = train_step(batch_inputs, batch_labels)
        train_predictions.extend(predictions.numpy())
        train_targets.extend(batch_labels.numpy())

    for batch_inputs, batch_labels in eval_dataset:
        predictions = eval_step(batch_inputs, batch_labels)
        eval_predictions.extend(predictions.numpy())
        eval_targets.extend(batch_labels.numpy())

    train_f1 = f1_score(train_targets, train_predictions)
    eval_f1 = f1_score(eval_targets, eval_predictions)

    print('Epoch {}: \n训练: Loss: {:.4f}, Accuracy: {:.4f}, F1: {:.4f}, \n验证: Loss: {:.4f}, Accuracy: {:.4f}, F1: {:.4f},'.format(
        epoch + 1, train_loss.result(), train_accuracy.result(), train_f1, eval_loss.result(), eval_accuracy.result(), eval_f1
    ))

### 模型预测

In [None]:
# Run the classifier over the test batches and turn the logits into probabilities.
outputs = classes_model.predict(test_dataset)
probabilities = tf.nn.softmax(outputs.logits, axis=1)

# Print only the first prediction as a quick sanity check.
for row_probs in probabilities[:1]:
    predicted_label = tf.argmax(row_probs, axis=0).numpy()
    print("预测标签:", predicted_label, row_probs.numpy())

## 任务二: 命名实体抽取

In [None]:
# Extra packages for the keyword-extraction task.
# tensorflow_addons and transformers are imported in the next cell.
!pip install tensorflow_addons transformers
# NOTE(review): keras-bert is only referenced from commented-out code in this notebook — confirm it is still needed.
!pip install keras-bert
# Presumably provides the `bert` module imported in the next cell — TODO confirm.
!pip install bert-for-tf2
# Provides the CRF layer and ModelWithCRFLoss wrapper imported in the next cell.
!pip install tf2crf

In [None]:
import tensorflow as tf
import tensorflow_addons as tfa
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import TFBertModel, BertTokenizer, TFAutoModel, TFBertForSequenceClassification, TFAutoModelForTokenClassification

from tensorflow.keras.layers import BatchNormalization

import tensorflow as tf
import tensorflow_hub as hub
import keras
# from tf.keras.models import Model, Input, Sequential
# from tf.keras.layers import GRU, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda, BatchNormalization, Add
# from tf.keras.utils import to_categorical
# from tf.keras.callbacks import CSVLogger
# from tf.keras.optimizers import Adam
# from tf.keras_bert import load_trained_model_from_checkpoint
from tf2crf import CRF, ModelWithCRFLoss
import bert

import nltk
# Download the 'punkt' tokenizer data; word_tokenize is used later in this notebook.
nltk.download('punkt')

# Load the pretrained BERT encoder.
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Load the matching BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [40]:
# Hyperparameters for the keyword-extraction (sequence tagging) task.
max_length = 512  # token length every text is padded / truncated to
hidden_size = 100  # passed to build_model, which uses it as the LSTM width
# NOTE(review): the BIO tagging cell below only emits 'B-Key' and 'O'; confirm that 4 tags is intended.
num_tags = 4
lstm_units = 64  # number of BiLSTM units (build_model accepts this but sizes the LSTM from hidden_size)
num_words = 10000  # vocabulary size
# NOTE(review): num_words and embedding_dim are not referenced by build_model in the cells visible here.
embedding_dim = 100  # BERT embedding dimension

### 数据预处理

1. 首先，需要准备一个含有实体标注的文本语料库。
2. 对于每个文本，需要将其分成单独的句子，并将每个句子分词。
3. 对于每个词，需要将其转化为对应的BERT词表中的ID。
4. 对于标注序列，需要将其转化为BIO格式（即Begin、Inside、Outside）。

In [10]:
# Data preprocessing
def preprocess_data(texts, labels, tokenizer, label2id, max_length, pad_label='O'):
    """Encode texts and their tag sequences into fixed-length id lists.

    Args:
        texts: Iterable of raw text strings.
        labels: Iterable of tag sequences, one per text.
        tokenizer: Object exposing `encode_plus(...)` that returns a mapping
            with 'input_ids' and 'attention_mask'.
        label2id: Mapping from tag string to integer id.
        max_length: Length every output sequence is padded / truncated to.
        pad_label: Tag used to fill positions beyond the end of a tag
            sequence. If it is missing from `label2id`, id 1 is used, which
            was the previous hard-coded padding value.

    Returns:
        Tuple `(input_ids, attention_masks, encoded_labels)` of lists with one
        entry per text; every `encoded_labels` entry has exactly `max_length`
        items.

    Raises:
        KeyError: If a tag is not present in `label2id`.
    """
    pad_id = label2id.get(pad_label, 1)

    input_ids = []
    attention_masks = []
    encoded_labels = []

    for text, label in zip(texts, labels):
        # Tokenise, add the special tokens, and pad / truncate to max_length.
        input_encode = tokenizer.encode_plus(text, add_special_tokens=True, max_length=max_length,
                                             padding='max_length', truncation=True, return_token_type_ids=False)
        input_ids.append(input_encode['input_ids'])
        attention_masks.append(input_encode['attention_mask'])

        # Encode the tags, then truncate and pad so the tag sequence always
        # has the same length as the (truncated / padded) input ids.
        encoded_label = [label2id[l] for l in label][:max_length]
        encoded_label.extend([pad_id] * (max_length - len(encoded_label)))
        encoded_labels.append(encoded_label)

    return input_ids, attention_masks, encoded_labels

# Build the tag vocabulary
def build_label_dict(labels):
    """Create tag <-> id lookup tables from nested tag sequences.

    Ids are assigned in sorted tag order so that the mapping is identical on
    every run; iterating a set of strings directly can yield a different
    order from one interpreter session to the next.

    Args:
        labels: Iterable of tag sequences (each an iterable of mutually
            comparable tags, e.g. strings).

    Returns:
        Tuple `(label2id, id2label)` of dictionaries that are inverse to each
        other. Both are empty when `labels` contains no tags.
    """
    unique_labels = sorted({label for sublist in labels for label in sublist})
    label2id = {label: i for i, label in enumerate(unique_labels)}
    id2label = {i: label for label, i in label2id.items()}

    return label2id, id2label


In [23]:
df = pd.read_csv('data/train.csv')[:100]
# Model input: title, author and abstract joined into one string per paper.
texts = df['title'] + ' ' + df['author'] + ' ' + df['abstract']
label = df['Keywords']
labels = []
labels_bio = []

# Collect every token that occurs in any keyword string, skipping the
# ';' and '.' separators.
for i in label:
    tokens = word_tokenize(i)
    for t in tokens:
        if t != ';' and t != '.':
            labels.append(t)

labels = list(set(labels))
# Set copy for constant-time membership tests in the tagging loop below.
keyword_tokens = set(labels)

# Tag every token of every text: 'B-Key' if it is a keyword token, else 'O'.
# NOTE(review): these tags follow nltk word tokens, while input_ids come from
# the BERT tokenizer (with special tokens added), so positions are not
# guaranteed to line up one-to-one — confirm the intended alignment.
for i in range(len(texts)):
    tokens = word_tokenize(texts[i])
    # At most max_length tags per text, so the sequence never exceeds the
    # padded input length.
    tmp_l = ['B-Key' if t in keyword_tokens else 'O' for t in tokens[:max_length]]
    # Exactly one tag sequence per text (appended once, after tagging).
    labels_bio.append(tmp_l)

# Build the tag dictionaries.
label2id, id2label = build_label_dict(labels_bio)

# Encode texts and tags to fixed-length sequences.
input_ids, attention_masks, encoded_labels = preprocess_data(texts, labels_bio, tokenizer, label2id, max_length=max_length)

# Sanity check: report any sample whose ids and tags differ in length.
for i in range(len(input_ids)):
    if len(input_ids[i]) != len(encoded_labels[i]):
        print(len(input_ids[i]), len(encoded_labels[i]))

# Split into training and held-out sets (80 / 20).
train_input_ids, test_input_ids,\
    train_attention_masks, test_attention_masks,\
    train_labels, test_labels = train_test_split(input_ids, attention_masks, encoded_labels, test_size=0.2, random_state=42)

train_input_ids = tf.convert_to_tensor(train_input_ids, dtype=tf.int32)
train_attention_masks = tf.convert_to_tensor(train_attention_masks, dtype=tf.int32)
train_labels = np.array(train_labels)
train_labels = tf.convert_to_tensor(train_labels, dtype=tf.int32)

test_input_ids = tf.convert_to_tensor(test_input_ids, dtype=tf.int32)
test_attention_masks = tf.convert_to_tensor(test_attention_masks, dtype=tf.int32)
test_labels = np.array(test_labels)
test_labels = tf.convert_to_tensor(test_labels, dtype=tf.int32)

print(train_input_ids.shape)
print(train_labels.shape)

(80, 512)
(80, 512)


In [24]:
# Display the encoded training input ids as a quick visual check.
train_input_ids

<tf.Tensor: shape=(80, 512), dtype=int32, numpy=
array([[  101,  3808,  9312, ..., 13004,  3938,   102],
       [  101,  8208,  1997, ...,     0,     0,     0],
       [  101, 11538,  1996, ...,     0,     0,     0],
       ...,
       [  101,  9203,  3496, ...,     0,     0,     0],
       [  101, 19002,  1011, ...,     0,     0,     0],
       [  101,  2659, 11619, ...,     0,     0,     0]], dtype=int32)>

### 构建模型

In [None]:
# NOTE(review): `dtype` is not used anywhere in the cells visible here — confirm it can be dropped.
from keras.backend import dtype
# Reset Keras' global state before building the tagging model.
tf.keras.backend.clear_session()

# BERT + BiLSTM + CRF model
def build_model(bert_model, num_tags, max_length, hidden_size, lstm_units):
    """Build and compile a BERT -> BiLSTM -> Dense -> CRF sequence tagger.

    Args:
        bert_model: Encoder called as `bert_model(input_ids, attention_mask=...)`;
            its first output is used as the per-token representation.
        num_tags: Output size of the per-token dense layer feeding the CRF.
        max_length: Fixed sequence length of both input tensors.
        hidden_size: Number of units of the LSTM inside the bidirectional layer.
        lstm_units: Accepted for interface compatibility; currently unused
            (the LSTM width is taken from `hidden_size`).

    Returns:
        A compiled `ModelWithCRFLoss` wrapping the full tagging network, with
        inputs `[input_ids, attention_mask]`.
    """
    input_ids = tf.keras.layers.Input(shape=(max_length,), name='input_ids', dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(shape=(max_length,), name='attention_mask', dtype=tf.int32)

    # BERT layer: first output of the encoder.
    bert_output = bert_model(input_ids, attention_mask=attention_mask)[0]

    # BiLSTM layer; forward and backward outputs are summed.
    bilstm_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hidden_size, return_sequences=True), merge_mode="sum")
    bilstm_layer_out = bilstm_layer(tf.keras.layers.Dropout(0.5)(bert_output))

    # Per-token projection to tag scores.
    time_distributed_layer = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(num_tags))
    time_distributed_layer_out = time_distributed_layer(bilstm_layer_out)

    # CRF layer
    crf = CRF(dtype='float32')
    crf_out = crf(tf.keras.layers.BatchNormalization()(time_distributed_layer_out))

    base_model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=crf_out)
    # Wrap the complete tagging network. Wrapping `bert_model` here instead
    # would throw away the BiLSTM / CRF layers built above.
    model = ModelWithCRFLoss(base_model)

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5))

    return model

# Build the model
model = build_model(bert_model, num_tags, max_length, hidden_size, lstm_units)

# Train the model
# Per-epoch logs are written to bert-ner-training.log (overwritten on each run).
csv_logger = tf.keras.callbacks.CSVLogger('bert-ner-training.log',  append=False)
# NOTE(review): this checkpoint monitors "val_accuracy", but the fit() call
# below passes no validation data, so that metric is presumably never logged
# and no "best" checkpoint gets written — confirm and either add validation
# data or monitor a training metric.
checkpointer = tf.keras.callbacks.ModelCheckpoint(
    filepath="./bert-gru-bilstm-crf.tf",
    monitor="val_accuracy",
    verbose=0,
    save_best_only=True)

model.fit([train_input_ids, train_attention_masks], train_labels,
          batch_size=32,
          epochs=10,
          callbacks=[csv_logger, checkpointer])

# Evaluate the model (currently disabled)
# evaluation = model.evaluate([test_input_ids, test_attention_masks], test_labels)
# loss = evaluation[0]
# accuracy = evaluation[1]
# print("Loss: {:.4f}".format(loss))
# print("Accuracy: {:.4f}".format(accuracy))

# Predict on the held-out split
predictions = model.predict([test_input_ids, test_attention_masks])