# 新闻文本分类使用textcnn

In [1]:
# !pip install tensorflow-gpu==1.14.0 bert4keras scikit-learn pandas numpy gensim

In [None]:
import pandas as pd
import gensim
import numpy as np
from sklearn.model_selection import train_test_split
from bert4keras.snippets import DataGenerator, sequence_padding
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from sklearn.metrics import f1_score
import numpy as np

# Alternative (ceph) locations of the same two file names, kept for reference.
# wordkeyvector = "/ceph/11122/txkeyvector/70000-small.txt"
# news = "/ceph/11122/txkeyvector/article_features_train_raw.csv"
# Word-vector file in text word2vec format, read by w2v_model_preprocessing().
wordkeyvector = r"D:\ANewStart\dataset\腾讯词向量\70000-small.txt"
# CSV holding the news corpus, read with pd.read_csv() further down.
news = r"D:\ANewStart\dataset\复旦新闻语料\article_features_train_raw.csv"

# # 导入数据(已经被转化成index)
# df_train = pd.read_csv(r"D:\ANewStart\dataset\天池新闻文本分类数据集\train_set.csv", sep="\t")
# df_test = pd.read_csv(r"D:\ANewStart\dataset\天池新闻文本分类数据集\test_a.csv", sep="\t")
# df_train["text"]=df_train["text"].apply(lambda x:list(map(lambda y:int(y), x.split())))
# df_test["text"]=df_test["text"].apply(lambda x:list(map(lambda y:int(y), x.split())))

# Load the pre-trained word vectors and build the lookup structures.
def w2v_model_preprocessing():
    """Load the word vectors and derive a vocabulary index and embedding matrix.

    Returns:
        A ``(w2v_model, word2idx, embeddings_matrix)`` tuple where

        * ``w2v_model`` is the object returned by
          ``KeyedVectors.load_word2vec_format`` for ``wordkeyvector``,
        * ``word2idx`` maps every word to a 1-based row index, with
          ``"_PAD"`` mapped to 0,
        * ``embeddings_matrix`` has shape ``(vocab_size + 1, vector_size)``;
          row 0 stays all zeros and is reserved for padding.
    """
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(wordkeyvector, binary=False)

    # gensim >= 4 exposes the vocabulary as ``key_to_index``; older releases
    # only provide ``vocab``.  Both are dicts keyed by word, so either one can
    # drive the loop below.
    vocab = getattr(w2v_model, "key_to_index", None)
    if vocab is None:
        vocab = w2v_model.vocab

    word2idx = {"_PAD": 0}  # word -> row index
    embeddings_matrix = np.zeros((len(vocab) + 1, w2v_model.vector_size))
    # Fill the dictionary and the matrix in a single pass, so no temporary
    # copy of every vector is held alongside the matrix.
    for idx, word in enumerate(vocab, start=1):
        word2idx[word] = idx
        embeddings_matrix[idx] = w2v_model[word]
    return w2v_model, word2idx, embeddings_matrix


w2v_model, word2idx, embeddings_matrix = w2v_model_preprocessing()

# Map words to vocabulary indices.
def get_words_index(data, word_index):
    """Convert an iterable of words into a list of vocabulary indices.

    Args:
        data: Iterable of words (hashable tokens).
        word_index: Mapping from word to integer index.

    Returns:
        A list with one index per word, in input order.  Words that are not
        in ``word_index`` are mapped to 0.
    """
    # dict.get keeps the unknown-word fallback explicit; a blanket exception
    # handler here would also hide unrelated errors behind a silent 0.
    return [word_index.get(word, 0) for word in data]

# Read the pre-processed Chinese news corpus and discard every row that has a
# missing value in any column (how="any"; "all" would require every column
# to be missing).
df = pd.read_csv(news).dropna(axis=0, how="any")

# # Row counts
# df.count()

# Category name -> integer class id; ids follow the order of the list.
lable_index = {
    name: class_id
    for class_id, name in enumerate([
        '艺术', '文学', '哲学', '通信', '能源', '历史',
        '矿藏', '空间', '教育', '交通', '计算机', '环境',
        '电子', '农业', '体育', '时政', '医疗', '经济', '法律',
    ])
}
df["label"] = df["label"].apply(lable_index.__getitem__)

# Whitespace-separated words -> list of vocabulary indices.
df["text"] = df["words"].apply(
    lambda sentence: get_words_index(sentence.split(), word2idx)
)

# # Reverse lookup: index -> word
# dict(zip(word2idx.values(), word2idx.keys()))[9156]

# df["text"][:1]

# Hyperparameters
SEED=2020  # random_state passed to train_test_split
num_classes = 19  # one per entry of lable_index; width of the softmax output
vocabulary_size = 7000  # NOTE(review): not referenced by the code in view; the Embedding layer is sized from embeddings_matrix
maxlen=1024  # every sample is truncated or zero-padded to this many token ids
batch_size = 1024  # samples per batch for both generators
# embedding_dim = 256
embedding_dim = 200 # set to 200 to match the pre-trained word vectors; matching is unnecessary if the embedding is initialised and trained from scratch
num_filters = 512  # output channels of each Conv2D branch
filter_sizes = [3,4,5]  # convolution window heights, one Conv2D branch per entry
drop = 0.5  # Dropout rate applied before the output layer
lr = 1e-4  # learning rate given to Adam
epochs = 2  # number of training epochs

# Split the corpus: 20% is held out for validation, the rest is used for
# training; SEED fixes the random_state of the split.
df_train, df_valid = train_test_split(df, test_size=0.2, random_state=SEED)
def load_data(df):
    """Turn a corpus DataFrame into a list of ``(text, label)`` samples.

    Args:
        df: DataFrame with a ``"text"`` column and a ``"label"`` column whose
            values are convertible with ``int``.

    Returns:
        A list with one ``(text, int(label))`` tuple per row, in row order.
    """
    # Zipping the two columns avoids building a Series for every row, which
    # is what DataFrame.iterrows() does.
    return [(text, int(label)) for text, label in zip(df["text"], df["label"])]
# Materialise each split as a list of (text, label) samples for the generators.
train_data = load_data(df_train)
valid_data = load_data(df_valid)

# Batch generator
class data_generator(DataGenerator):
    """Yield batches of fixed-length token-id rows together with their labels.

    Every sample is cut, or right-padded with 0, to the module-level
    ``maxlen``.
    """

    def __init__(self, data, batch_size=32, buffer_size=None, random=False):
        super().__init__(data, batch_size, buffer_size)
        # Remembered so that forfit() knows which sampling mode to request.
        self.random = random

    def __iter__(self, random=False):
        """Yield ``([token_ids], labels)`` batches built from ``self.sample(random)``."""
        token_rows, label_rows = [], []
        for is_end, (text, label) in self.sample(random):
            if len(text) > maxlen:
                token_ids = text[:maxlen]
            else:
                token_ids = text + [0] * (maxlen - len(text))
            token_rows.append(token_ids)
            label_rows.append([label])

            batch_is_full = len(token_rows) == self.batch_size
            if not (batch_is_full or is_end):
                continue

            # Emit a full batch, or whatever is left once the data runs out.
            token_array = sequence_padding(token_rows)
            label_array = sequence_padding(label_rows)
            yield [token_array], label_array
            token_rows, label_rows = [], []

    def forfit(self):
        """Loop over the data forever, handing out one batch at a time."""
        while True:
            yield from self.__iter__(self.random)


train_generator = data_generator(train_data, batch_size, random=True)
valid_generator = data_generator(valid_data, batch_size)

# Build the TextCNN model.
# Input: one row of maxlen token ids per sample.
inputs = Input(shape=(maxlen,), dtype="int32")

# Embedding layer
embedding = Embedding(
#     input_dim=vocabulary_size, # vocabulary size (superseded by the line below)
    input_dim=len(embeddings_matrix),
    output_dim=embedding_dim, # word-vector size
    input_length=maxlen, # length of each input sequence
    weights=[embeddings_matrix], # initialise from the external word vectors
    trainable=False # keep the external word vectors frozen
)(inputs) # apply to the input tensor
reshape = Reshape((maxlen, embedding_dim, 1))(embedding) # add a trailing axis of size 1 so Conv2D can be applied

# Convolution branches, one per entry of filter_sizes. Each kernel covers
# filter_sizes[i] consecutive positions and the full embedding_dim width.
conv_0 = Conv2D(
    num_filters, # number of output filters
    kernel_size=(filter_sizes[0], embedding_dim), # (window of positions, full word-vector width)
    padding="valid",
    kernel_initializer="normal",
    activation="relu"
)(reshape)
conv_1 = Conv2D(
    num_filters, # number of output filters
    kernel_size=(filter_sizes[1], embedding_dim), # (window of positions, full word-vector width)
    padding="valid",
    kernel_initializer="normal",
    activation="relu"
)(reshape)
conv_2 = Conv2D(
    num_filters, # number of output filters
    kernel_size=(filter_sizes[2], embedding_dim), # (window of positions, full word-vector width)
    padding="valid",
    kernel_initializer="normal",
    activation="relu"
)(reshape)
# Max pooling. A "valid" convolution with window k over maxlen positions
# leaves maxlen-k+1 positions, so each pool window spans all of them.
maxpool_0 = MaxPool2D(
    pool_size = (maxlen-filter_sizes[0]+1, 1),
    strides = (1,1),
    padding="valid"
)(conv_0)
maxpool_1 = MaxPool2D(
    pool_size = (maxlen-filter_sizes[1]+1, 1),
    strides = (1,1),
    padding="valid"
)(conv_1)
maxpool_2 = MaxPool2D(
    pool_size = (maxlen-filter_sizes[2]+1, 1),
    strides = (1,1),
    padding="valid"
)(conv_2)

# Output head: join the three pooled branches, flatten, apply dropout and
# classify with a softmax over num_classes.
concatenated_tensor = Concatenate(axis=1)([maxpool_0,maxpool_1,maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=num_classes, activation="softmax")(dropout)

model = Model(inputs=inputs, outputs=output)

# The sparse loss takes integer class ids as targets rather than one-hot rows.
model.compile(
    optimizer=Adam(lr=lr),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

# Callback
class Evaluator(Callback):
    """Compute macro-F1 on the validation set after every epoch.

    The score is written to ``logs["val_f1"]`` and the best score seen so
    far is kept in ``best_val_f1``.
    """

    def __init__(self):
        super().__init__()
        self.best_val_f1 = 0  # best macro-F1 observed so far

    def evaluate(self):
        """Return the macro-averaged F1 of ``self.model`` over ``valid_generator``."""
        y_true, y_pred = [], []
        for x, y in valid_generator:
            # The generator wraps each label in its own list, so y is expected
            # to be a column; flatten it to line up with the 1-D argmax output.
            y_true.append(np.ravel(y))
            # Class with the highest predicted probability for each sample.
            y_pred.append(self.model.predict(x).argmax(axis=1))
        y_true = np.concatenate(y_true)  # stitch the batches together
        y_pred = np.concatenate(y_pred)
        return f1_score(y_true, y_pred, average="macro")

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate, track the best score and publish ``val_f1`` into ``logs``.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metrics dict for the epoch, or None.
        """
        val_f1 = self.evaluate()
        self.best_val_f1 = max(self.best_val_f1, val_f1)
        # ``logs`` defaults to None, so only publish the metric when a dict
        # was actually passed in; indexing None would raise TypeError.
        if logs is not None:
            logs["val_f1"] = val_f1
        print(f"val_f1:{val_f1:.5f}, best_val_f1:{self.best_val_f1:.5f}")

# Callbacks. The F1 evaluator is listed first: it writes "val_f1" into the
# epoch logs, which is the value the checkpoint callback monitors (this relies
# on Keras running callbacks in list order).
f1_evaluator = Evaluator()
early_stopping = EarlyStopping(
    monitor="val_loss",  # metric watched for early stopping
    patience=1,  # epochs without improvement that are tolerated
    verbose=1,
)
best_checkpoint = ModelCheckpoint(
    "best_model.weights",  # file the weights are written to
    monitor="val_f1",
    mode="max",  # a higher F1 counts as an improvement
    save_best_only=True,  # keep only the best-scoring weights
    save_weights_only=True,  # store weights rather than the full model
    verbose=1,
)
callbacks = [f1_evaluator, early_stopping, best_checkpoint]

# Train the model
model.fit_generator(
    train_generator.forfit(),
    steps_per_epoch=len(train_generator),
    validation_data=valid_generator.forfit(),
    validation_steps=len(valid_generator),
    epochs=epochs,
    callbacks=callbacks,
)