# 使用卷积神经网络进行公司经营范围分类

    Python 3
    TensorFlow 1.3以上
    numpy
    scikit-learn
    scipy


本次训练使用了其中的8个分类。

类别如下：

'服装', '工业', '农业', '运输', '信息', '食品', '建筑', '金融'

数据集划分如下：



    cnews.train.txt: 训练集(38988条)
    cnews.val.txt: 验证集(4155条)
    cnews.test.txt: 测试集(8835条)


In [None]:
from __future__ import print_function

import os
import sys
import time
from datetime import timedelta

import numpy as np
import tensorflow as tf
from sklearn import metrics

from cnews_loader import read_vocab, read_category, batch_iter, process_file, build_vocab

data/cnews_loader.py为数据的预处理文件。

    read_file(): 读取文件数据;
    build_vocab(): 构建词汇表，使用字符级的表示，这一函数会将词汇表存储下来，避免每一次重复处理;
    read_vocab(): 读取上一步存储的词汇表，转换为{词：id}表示;
    read_category(): 将分类目录固定，转换为{类别: id}表示;
    to_words(): 将一条由id表示的数据重新转换为文字;
    process_file(): 将数据集从文字转换为固定长度的id序列表示;
    batch_iter(): 为神经网络的训练准备经过shuffle的批次的数据。


In [10]:
class TCNNConfig(object):
    """CNN配置参数"""

    embedding_dim = 64  # 词向量维度
    seq_length = 600  # 序列长度
    num_classes = 8  # 类别数
    num_filters = 256  # 卷积核数目
    kernel_size = 5  # 卷积核尺寸
    vocab_size = 5000  # 词汇表达小

    hidden_dim = 128  # 全连接层神经元

    dropout_keep_prob = 0.5  # dropout保留比例
    learning_rate = 1e-3  # 学习率

    batch_size = 64  # 每批训练大小
    num_epochs = 10  # 总迭代轮次

    print_per_batch = 100  # 每多少轮输出一次结果
    save_per_batch = 10  # 每多少轮存入tensorboard

In [11]:
class TextCNN(object):
    """文本分类，CNN模型"""

    def __init__(self, config):
        self.config = config

        # 三个待输入的数据
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        self.cnn()

    def cnn(self):
        """CNN模型"""
        # 词向量映射
        with tf.device('/cpu:0'):
            embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope("cnn"):
            # CNN layer
            conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size, name='conv')
            # global max pooling layer
            gmp = tf.reduce_max(conv, reduction_indices=[1], name='gmp')

        with tf.name_scope("score"):
            # 全连接层，后面接dropout以及relu激活
            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)

            # 分类器
            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # 预测类别

        with tf.name_scope("optimize"):
            # 损失函数，交叉熵
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            # 优化器
            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)

        with tf.name_scope("accuracy"):
            # 准确率
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))


In [24]:
base_dir = '/home/wu/文档/modeling/Scope_data/CNN_RNN/data/cnews/'
train_dir = os.path.join(base_dir, 'cnews.train.txt')
test_dir = os.path.join(base_dir, 'cnews.test.txt')
val_dir = os.path.join(base_dir, 'cnews.val.txt')
vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')

save_dir = '/home/wu/文档/modeling/Scope_data/CNN_RNN/checkpoints/textcnn/'
save_path = os.path.join(save_dir, 'best_validation')  # 最佳验证结果保存路径

In [13]:
def get_time_dif(start_time):
    """获取已使用时间"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

In [14]:
def feed_data(x_batch, y_batch, keep_prob):
    feed_dict = {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.keep_prob: keep_prob
    }
    return feed_dict

In [15]:
def evaluate(sess, x_, y_):
    """评估在某一数据上的准确率和损失"""
    data_len = len(x_)
    batch_eval = batch_iter(x_, y_, 128)
    total_loss = 0.0
    total_acc = 0.0
    for x_batch, y_batch in batch_eval:
        batch_len = len(x_batch)
        feed_dict = feed_data(x_batch, y_batch, 1.0)
        loss, acc = sess.run([model.loss, model.acc], feed_dict=feed_dict)
        total_loss += loss * batch_len
        total_acc += acc * batch_len

    return total_loss / data_len, total_acc / data_len

In [29]:
def train():
    print("Configuring TensorBoard and Saver...")
    # 配置 Tensorboard，重新训练时，请将tensorboard文件夹删除，不然图会覆盖
    tensorboard_dir = '/home/wu/文档/modeling/Scope_data/CNN_RNN/tensorboard/textcnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # 配置 Saver
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")
    # 载入训练集与验证集
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # 创建session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0  # 总批次
    best_acc_val = 0.0  # 最佳验证集准确率
    last_improved = 0  # 记录上一次提升批次
    require_improvement = 1000  # 如果超过1000轮未提升，提前结束训练
    train_scores = []
    test_scores = []

    flag = False
    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                # 每多少轮次将训练结果写入tensorboard scalar
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                # 每多少轮次输出在训练集和验证集上的性能
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)  # todo

                if acc_val > best_acc_val:
                    # 保存最好结果
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            session.run(model.optim, feed_dict=feed_dict)  # 运行优化
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # 验证集正确率长期不提升，提前结束训练
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break  # 跳出循环
        if flag:  # 同上
            break
    return train_scores,test_scores


In [26]:
def test():
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # 读取保存的模型

    print('Testing...')
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))

    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1

    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)  # 保存预测结果
    for i in range(num_batch):  # 逐批次处理
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    # 评估
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # 混淆矩阵
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


In [None]:
config = TCNNConfig()
if not os.path.exists(vocab_dir):  # 如果不存在词汇表，重建
    build_vocab(train_dir, vocab_dir, config.vocab_size)
categories, cat_to_id = read_category()
words, word_to_id = read_vocab(vocab_dir)
config.vocab_size = len(words)
model = TextCNN(config)

In [41]:
train_scores,test_scores = train()

Configuring TensorBoard and Saver...
Loading training and validation data...
Time usage: 0:00:03
Training and evaluating...
Epoch: 1
Iter:      0, Train Loss:    2.1, Train Acc:   4.69%, Val Loss:    2.1, Val Acc:   2.46%, Time: 0:00:13 *
Iter:    100, Train Loss:   0.61, Train Acc:  79.69%, Val Loss:   0.64, Val Acc:  80.30%, Time: 0:02:29 *
Iter:    200, Train Loss:   0.71, Train Acc:  75.00%, Val Loss:   0.49, Val Acc:  84.39%, Time: 0:04:44 *
Iter:    300, Train Loss:   0.41, Train Acc:  85.94%, Val Loss:   0.44, Val Acc:  86.78%, Time: 0:07:00 *
Iter:    400, Train Loss:   0.25, Train Acc:  93.75%, Val Loss:   0.34, Val Acc:  90.13%, Time: 0:09:16 *
Iter:    500, Train Loss:   0.31, Train Acc:  87.50%, Val Loss:   0.31, Val Acc:  90.73%, Time: 0:11:32 *
Iter:    600, Train Loss:   0.23, Train Acc:  93.75%, Val Loss:   0.29, Val Acc:  91.35%, Time: 0:13:47 *
Epoch: 2
Iter:    700, Train Loss:   0.36, Train Acc:  90.62%, Val Loss:   0.28, Val Acc:  91.59%, Time: 0:16:03 *
Iter:    8

在验证集上的最佳效果为93.79%，经过了9轮迭代停止。

In [47]:
import matplotlib.pyplot as plt
print(test_scores)
fig=plt.figure()
ax = fig.add_subplot(1, 2, 1)
ax.plot(range(7 * config.num_epochs), train_scores, label="accuracy ", marker='+')
ax.set_title(" accuracy ")
ax.set_xlabel(r"r")
ax.set_ylabel("score")
ax.set_ylim(0, 1.05)
ax.legend(loc="best", framealpha=0.5)
ax = fig.add_subplot(1, 2, 2)
ax.plot(range(7 * config.num_epochs), test_scores, label=" loss ", marker='o')
ax.set_title(" loss ")
ax.set_xlabel(r"r")
ax.set_ylabel("score")
ax.set_ylim(0, 1.05)
ax.legend(loc="best", framealpha=0.5)
plt.show()

[]


In [49]:
test()

Loading test data...
INFO:tensorflow:Restoring parameters from /home/wu/文档/modeling/Scope_data/CNN_RNN/checkpoints/textcnn/best_validation
Testing...
Test Loss:   0.26, Test Acc:  93.42%
Precision, Recall and F1-Score...
             precision    recall  f1-score   support

         服装       0.97      0.98      0.98      5265
         工业       0.81      0.89      0.85      1209
         农业       0.83      0.70      0.76       211
         运输       0.94      0.86      0.90       131
         信息       0.92      0.88      0.90      1312
         食品       0.90      0.74      0.81       231
         建筑       0.90      0.82      0.86       235
         金融       0.99      0.96      0.98       236

avg / total       0.93      0.93      0.93      8830

Confusion Matrix...
[[5173   61   10    3   16    1    1    0]
 [  75 1074    3    1   47    6    3    0]
 [  15   26  148    1   12    3    6    0]
 [   5    6    0  113    3    1    2    1]
 [  47   94    4    1 1150    9    7    0]
 [   3   41

在测试集上的准确率为93.42%

从混淆矩阵也可以看出分类效果相对较好。

分类效果较弱的两类为农业和食品，其他分类效果较好。