## CNN中文文本分类（TensorFlow）

THUCNews是根据新浪新闻RSS订阅频道2005~2011年间的历史数据筛选过滤生成，包含74万篇新闻文档。本文采用了清华NLP组提供的THUCNews新闻文本分类数据集的一个子集。

In [2]:
# coding: utf-8

import sys
from collections import Counter

import numpy as np
import tensorflow.keras as kr

# Record once whether we run on Python 3; the helpers below branch on it.
is_py3 = sys.version_info[0] > 2
if not is_py3:
    # Python 2 only: switch the interpreter-wide default encoding to UTF-8.
    reload(sys)
    sys.setdefaultencoding("utf-8")


def native_word(word, encoding='utf-8'):
    """Return ``word`` in the running interpreter's native string form.

    Intended for using a model trained under Python 3 from Python 2:
    under Python 2 the word is encoded with ``encoding``; under Python 3
    it is returned unchanged.
    """
    if is_py3:
        return word
    return word.encode(encoding)


def native_content(content):
    """Return ``content`` as text.

    Under Python 2 the value is decoded from UTF-8; under Python 3 it is
    returned unchanged.
    """
    return content if is_py3 else content.decode('utf-8')


def open_file(filename, mode='r'):
    """Open ``filename`` in a way that works on both Python 2 and Python 3.

    mode: 'r' or 'w' for read or write.

    Under Python 3 the file is opened as UTF-8 text with
    ``errors='ignore'``; under Python 2 the built-in ``open`` is called
    with ``filename`` and ``mode`` only.
    """
    if not is_py3:
        return open(filename, mode)
    return open(filename, mode, encoding='utf-8', errors='ignore')


def read_file(filename):
    """Read a data file whose lines look like ``label<TAB>content``.

    Returns:
        (contents, labels): ``contents`` is a list with one list of
        characters per accepted line; ``labels`` holds the matching label
        strings, in the same order.

    Lines are skipped (best effort, no error raised) when they do not
    split into exactly two tab-separated fields after stripping, when the
    content field is empty, or when a field cannot be decoded.
    """
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) != 2:
                continue
            label, content = fields
            if not content:
                continue
            try:
                # Convert both fields before appending either one, so a
                # failure can never leave the two lists out of step.
                chars = list(native_content(content))
                label = native_content(label)
            except ValueError:
                # UnicodeDecodeError (a ValueError subclass) under Python 2.
                continue
            contents.append(chars)
            labels.append(label)
    return contents, labels


def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build a character vocabulary from the training set and save it.

    Args:
        train_dir: path of the training file, read with ``read_file``.
        vocab_dir: path of the vocabulary file to write, one token per line.
        vocab_size: total number of tokens to keep, including ``<PAD>``.

    The ``vocab_size - 1`` most frequent characters are kept. ``<PAD>`` is
    written first; it is the token used to pad all texts to one length.
    """
    data_train, _ = read_file(train_dir)

    # Count document by document rather than materialising one giant list.
    counter = Counter()
    for content in data_train:
        counter.update(content)

    count_pairs = counter.most_common(vocab_size - 1)
    # A comprehension (unlike unpacking zip(*pairs)) also handles an empty
    # corpus, in which case only <PAD> is written.
    words = ['<PAD>'] + [word for word, _ in count_pairs]

    # Use a context manager so the file is flushed and closed deterministically.
    with open_file(vocab_dir, mode='w') as f:
        f.write('\n'.join(words) + '\n')


def read_vocab(vocab_dir):
    """Load the vocabulary file, one token per line.

    Returns:
        (words, word_to_id): the stripped tokens in file order, and a dict
        mapping each token to its position in that list.
    """
    with open_file(vocab_dir) as fp:
        # Under Python 2 native_content turns every token into unicode.
        words = [native_content(line.strip()) for line in fp]
    word_to_id = {word: index for index, word in enumerate(words)}
    return words, word_to_id


def read_category():
    """Return the fixed category names and a name -> index mapping.

    Returns:
        (categories, cat_to_id): the ten category names in a fixed order,
        and a dict mapping each name to its position in that list.
    """
    categories = list(map(
        native_content,
        ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐'],
    ))
    cat_to_id = {name: index for index, name in enumerate(categories)}
    return categories, cat_to_id


def to_words(content, words):
    """Turn a sequence of ids back into text.

    Each id in ``content`` is looked up in ``words`` and the resulting
    tokens are concatenated without a separator.
    """
    tokens = [words[token_id] for token_id in content]
    return ''.join(tokens)


def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """Convert a data file into padded id sequences and one-hot labels.

    Args:
        filename: path of the data file, read with ``read_file``.
        word_to_id: mapping from character to integer id.
        cat_to_id: mapping from category label to integer id.
        max_length: length passed to ``pad_sequences``.

    Returns:
        (x_pad, y_pad): the output of ``pad_sequences`` for the id
        sequences, and the one-hot labels from ``to_categorical``.

    Characters missing from ``word_to_id`` are dropped.
    """
    # Load the whole file up front.
    contents, labels = read_file(filename)

    # One id sequence per document, keeping only in-vocabulary characters.
    data_id = [
        [word_to_id[char] for char in content if char in word_to_id]
        for content in contents
    ]
    # Category index of every document, in the same order.
    label_id = [cat_to_id[label] for label in labels]

    # Bring every sequence to a fixed length with Keras' pad_sequences.
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, maxlen=max_length)
    # One-hot encode the labels.
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))

    return x_pad, y_pad


def batch_iter(x, y, batch_size=64):
    """Yield shuffled mini-batches of ``(x, y)``.

    Args:
        x: array of samples; must support indexing with an index array.
        y: array of labels aligned with ``x``.
        batch_size: maximum number of samples per batch.

    Yields:
        ``(x_batch, y_batch)`` pairs. Both arrays are shuffled with the
        same random permutation, so samples stay aligned with their
        labels. The last batch may be smaller than ``batch_size``.
        Nothing is yielded for empty input.
    """
    data_len = len(x)

    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    # Stepping over start offsets avoids the float-division batch count,
    # which produced one spurious empty batch for empty input on Python 3.
    for start_id in range(0, data_len, batch_size):
        end_id = min(start_id + batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]

In [3]:
!head cnews/cnews.train.txt

head: cnews/cnews.train.txt: No such file or directory


In [4]:
import pandas as pd

In [5]:
# The data files are tab-separated "category<TAB>content" lines without a
# header row. The separator is written as the "\t" escape rather than a
# literal tab character, which is invisible and easily mangled by editors.

# Training set
train_data = pd.read_csv('../cnews/cnews.train.txt', sep="\t", names=['category', 'content'])
# Test set
test_data = pd.read_csv('../cnews/cnews.test.txt', sep="\t", names=['category', 'content'])
# Validation set
val_data = pd.read_csv('../cnews/cnews.val.txt', sep="\t", names=['category', 'content'])

In [58]:
# Distinct category labels found in each split

train_category, test_category, val_category = (
    frame['category'].unique()
    for frame in (train_data, test_data, val_data)
)
display(train_category, test_category, val_category)

array(['体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经'],
      dtype=object)

array(['体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经'],
      dtype=object)

array(['体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经'],
      dtype=object)

In [59]:
# Dataset sizes: one "<name>_shape: <shape>" line per split
print('\n'.join(
    '{} {}'.format(tag, frame.shape)
    for tag, frame in (
        ('train_data_shape:', train_data),
        ('test_data_shape:', test_data),
        ('val_data_shape:', val_data),
    )
))


train_data_shape: (50000, 2)
test_data_shape: (10000, 2)
val_data_shape: (5000, 2)


### 数据预处理

In [61]:
# Map each news category name to an integer id.

categories, cat_to_id = read_category()
# Bare expression: the notebook displays the list and the mapping.
categories, cat_to_id

(['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐'],
 {'体育': 0,
  '财经': 1,
  '房产': 2,
  '家居': 3,
  '教育': 4,
  '科技': 5,
  '时尚': 6,
  '时政': 7,
  '游戏': 8,
  '娱乐': 9})

In [1]:
# Load the vocabulary; each token's id is its position in the vocabulary file.
words, word_to_id = read_vocab(vocab_dir='../cnews/cnews.vocab.txt')


NameError: name 'read_vocab' is not defined

In [68]:
# Training set (and validation set): convert the raw files to padded id arrays.

config.vocab_size = len(words)
# NOTE(review): the 4th positional argument of process_file is max_length (the
# padded sequence length), but config.vocab_size is passed here, so every
# document is padded/truncated to the vocabulary size. Confirm this is intended
# rather than a sequence-length setting.
x_train, y_train = process_file('../cnews/cnews.train.txt', word_to_id, cat_to_id, config.vocab_size)
x_val, y_val = process_file('../cnews/cnews.val.txt', word_to_id, cat_to_id,config.vocab_size)

In [78]:
# Run with TensorFlow
import time

import tensorflow as tf

# Create the session and run the global-variables initializer in it.
session = tf.Session()
session.run(tf.global_variables_initializer())

# Wall-clock reference taken before the bookkeeping counters are set up.
start_time = time.time()

# total_batch: total number of batches
# best_acc_val: best validation accuracy
# last_improved: batch at which the last improvement was recorded
total_batch, best_acc_val, last_improved = 0, 0.0, 0

# End training early if there is no improvement for more than 1000 rounds.
require_improvement = 1000


In [89]:
#run_model