# THUCNews 分类预测

In [8]:
import sys
from collections import Counter
import numpy as np
from keras.preprocessing import sequence
from keras.utils import to_categorical

if sys.version_info[0] > 2:
    is_py3 = True
else:
    reload(sys)
    sys.setdefaultencoding("utf-8")
    is_py3 = False

In [9]:
def native_word(word, encoding='utf-8'):
    """如果在python2下面使用python3训练的模型，可考虑调用此函数转化一下字符编码"""
    if not is_py3:
        return word.encode(encoding)
    else:
        return word
    
def native_content(content):
    if not is_py3:
        return content.decode('utf-8')
    else:
        return content
    
def open_file(filename, mode='r'):
    """
    常用文件操作，可在python2和python3间切换.
    mode: 'r' or 'w' for read or write
    """
    if is_py3:
        return open(filename, mode, encoding='utf-8', errors='ignore')
    else:
        return open(filename, mode)
    
def read_file(filename):
    """读取文件数据"""
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            try:
                label, content = line.strip().split('\t')
                if content:
                    contents.append(list(native_content(content)))
                    labels.append(native_content(label))
            except:
                pass
    return contents, labels

def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """根据训练集构建词汇表，存储"""
    data_train, _ = read_file(train_dir)

    all_data = []
    for content in data_train:
        all_data.extend(content)

    counter = Counter(all_data)
    count_pairs = counter.most_common(vocab_size - 1)
    words, _ = list(zip(*count_pairs))
    # 添加一个 <PAD> 来将所有文本pad为同一长度
    words = ['<PAD>'] + list(words)
    open_file(vocab_dir, mode='w').write('\n'.join(words) + '\n')

In [12]:
def read_vocab(vocab_dir):
    """读取词汇表"""
    # words = open_file(vocab_dir).read().strip().split('\n')
    with open_file(vocab_dir) as fp:
        # 如果是py2 则每个值都转化为unicode
        words = [native_content(_.strip()) for _ in fp.readlines()]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id

def read_category():
    """读取分类目录，固定"""
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
    categories = [native_content(x) for x in categories]
    cat_to_id = dict(zip(categories, range(len(categories))))

    return categories, cat_to_id

def to_words(content, words):
    """将id表示的内容转换为文字"""
    return ''.join(words[x] for x in content)

def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """将文件转换为id表示"""
    contents, labels = read_file(filename)

    data_id, label_id = [], []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])

    # 使用keras提供的pad_sequences来将文本pad为固定长度
    x_pad = sequence.pad_sequences(data_id, max_length)
    y_pad = to_categorical(label_id, num_classes=len(cat_to_id))  # 将标签转换为one-hot表示

    return x_pad, y_pad

In [13]:
train_file = './data/THUCNews/cnews.train.txt'
valid_file = './data/THUCNews/cnews.val.txt'
test_file = './data/THUCNews/cnews.test.txt'
vocab_file = './data/THUCNews/cnews.vocab.txt'

words, word_to_id = read_vocab(vocab_file)
categories, cat_to_id = read_category()

x_train, y_train = process_file(train_file, word_to_id, cat_to_id)
x_val, y_val = process_file(valid_file, word_to_id, cat_to_id)
x_test, y_test = process_file(test_file, word_to_id, cat_to_id)

In [18]:
x_train.shape

(50000, 600)

In [23]:
y_train[0]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

## 1. TextCNN 模型

In [24]:
from keras import Input, Model
from keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout
from keras.callbacks import EarlyStopping
from keras.datasets import imdb
from keras.preprocessing import sequence
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

class TextCNN(object):
    def __init__(self, maxlen, max_features, embedding_dims,
                 class_num=1,
                 last_activation='sigmoid'):
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation

    def get_model(self):
        input = Input((self.maxlen,))

        # Embedding part can try multichannel as same as origin paper
        embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)(input)
        convs = []
        for kernel_size in [3, 4, 5]:
            c = Conv1D(128, kernel_size, activation='relu')(embedding)
            c = GlobalMaxPooling1D()(c)
            convs.append(c)
        x = Concatenate()(convs)

        output = Dense(self.class_num, activation=self.last_activation)(x)
        model = Model(inputs=input, outputs=output)
        return model

In [27]:
max_features = 5000
maxlen = 600
batch_size = 32
embedding_dims = 100
epochs=10

model = TextCNN(maxlen, max_features, embedding_dims, class_num=10, last_activation='softmax').get_model()
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_acc', patience=3, mode='max')

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_val, y_val))

result = model.predict(x_test)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 50000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


In [53]:
acc = np.mean([result[i].argmax() == y_test[i].argmax() for i in range(0 , len(result))])
print('测试集准确率为', acc)

测试集准确率为 0.9615


In [54]:
epochs=4

model = TextCNN(maxlen, max_features, embedding_dims, class_num=10, last_activation='softmax').get_model()
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_acc', patience=3, mode='max')

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_val, y_val))

result = model.predict(x_test)
acc = np.mean([result[i].argmax() == y_test[i].argmax() for i in range(0 , len(result))])
print('测试集准确率为', acc)

Train on 50000 samples, validate on 5000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
测试集准确率为 0.9664


## 2. TextRNN

In [55]:
from keras import Input, Model
from keras.layers import Embedding, Dense, Dropout, LSTM
from keras.callbacks import EarlyStopping
from keras.datasets import imdb
from keras.preprocessing import sequence

class TextRNN(object):
    def __init__(self, maxlen, max_features, embedding_dims,
                 class_num=1,
                 last_activation='sigmoid'):
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation

    def get_model(self):
        input = Input((self.maxlen,))

        embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)(input)
        x = LSTM(128)(embedding)  # LSTM or GRU

        output = Dense(self.class_num, activation=self.last_activation)(x)
        model = Model(inputs=input, outputs=output)
        return model

In [56]:
max_features = 5000
maxlen = 600
batch_size = 32
embedding_dims = 100
epochs=10

model = TextRNN(maxlen, max_features, embedding_dims, class_num=10, last_activation='softmax').get_model()
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_acc', patience=3, mode='max')

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_val, y_val))

result = model.predict(x_test)
acc = np.mean([result[i].argmax() == y_test[i].argmax() for i in range(0 , len(result))])
print('测试集准确率为', acc)

Train on 50000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
测试集准确率为 0.9399


##  3. RCNN (BiLstm)

In [159]:
from keras import Input, Model
from keras import backend as K
from keras.layers import Embedding, Dense, Lambda, Concatenate, Conv1D, GlobalMaxPooling1D, LSTM, Bidirectional
import numpy as np
from keras.callbacks import EarlyStopping
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras import initializers, regularizers, constraints
from keras.engine.topology import Layer

class RCNN_Att_BiLstm(object):
    def __init__(self, maxlen, max_features, embedding_dims,
                 class_num=1,
                 last_activation='sigmoid'):
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation

    def get_model(self):
        input_current = Input((self.maxlen,))
        input_left = Input((self.maxlen,))
        input_right = Input((self.maxlen,))

        embedder = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)
        embedding_current = embedder(input_current)
        embedding_left = embedder(input_left)
        embedding_right = embedder(input_right)
    
        x_left = Bidirectional(LSTM(128, return_sequences=True))(embedding_left)
        x_right = Bidirectional(LSTM(128, return_sequences=True, go_backwards=True))(embedding_right)
        x_right = Lambda(lambda x: K.reverse(x, axes=1))(x_right)
        x = Concatenate(axis=2)([x_left, embedding_current, x_right])

        x = Conv1D(64, kernel_size=1, activation='tanh')(x)
        x = GlobalMaxPooling1D()(x)

        output = Dense(self.class_num, activation=self.last_activation)(x)
        model = Model(inputs=[input_current, input_left, input_right], outputs=output)
        return model

In [None]:
max_features = 5000
maxlen = 600
batch_size = 32
embedding_dims = 100
epochs=5

x_train_current = x_train
x_train_left = np.hstack([np.expand_dims(x_train[:, 0], axis=1), x_train[:, 0:-1]])
x_train_right = np.hstack([x_train[:, 1:], np.expand_dims(x_train[:, -1], axis=1)])
x_val_current = x_val
x_val_left = np.hstack([np.expand_dims(x_val[:, 0], axis=1), x_val[:, 0:-1]])
x_val_right = np.hstack([x_val[:, 1:], np.expand_dims(x_val[:, -1], axis=1)])
x_test_current = x_test
x_test_left = np.hstack([np.expand_dims(x_test[:, 0], axis=1), x_test[:, 0:-1]])
x_test_right = np.hstack([x_test[:, 1:], np.expand_dims(x_test[:, -1], axis=1)])

model = RCNN_Att_BiLstm(maxlen, max_features, embedding_dims, class_num=10, last_activation='softmax').get_model()
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_acc', patience=3, mode='max')
model.fit([x_train_left, x_train_current, x_train_right], y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=([x_val_current, x_val_left, x_val_right], y_val))

result = model.predict([x_test_current, x_test_left, x_test_right])
acc = np.mean([result[i].argmax() == y_test[i].argmax() for i in range(0 , len(result))])
print('测试集准确率为', acc)

Train on 50000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
