### 使用深度学习库keras做文本分类
- 数据是sogou的[语料库](http://www.sogou.com/labs/dl/c.html)
- 方法是卷积神经网络，可以参考kim的那篇文献
- 工具是keras库，它是基于theano构建的深度学习框架
- 问题是对sogou的新闻进行自动分类

In [10]:
from os import path
import os
import re
import codecs
import pandas as pd
import numpy as np

In [9]:
rootdir = 'SogouC.reduced/Reduced'
dirs = os.listdir(rootdir)
dirs = [path.join(rootdir,f) for f in dirs if f.startswith('C')]
dirs

['SogouC.reduced/Reduced/C000022',
 'SogouC.reduced/Reduced/C000023',
 'SogouC.reduced/Reduced/C000016',
 'SogouC.reduced/Reduced/C000008',
 'SogouC.reduced/Reduced/C000024',
 'SogouC.reduced/Reduced/C000010',
 'SogouC.reduced/Reduced/C000013',
 'SogouC.reduced/Reduced/C000020',
 'SogouC.reduced/Reduced/C000014']

In [19]:
def load_txt(x):
    with open(x) as f:
        res = [t.decode('gbk','ignore') for t in f]
        return ''.join(res)

In [20]:
print load_txt('SogouC.reduced/Reduced/C000024/30.txt')

俄制“Ansat-LL”轻型试验用直升机
　　俄罗斯lenta网站2006年5月2日报道 
梁赞直升机厂向俄罗斯海军交付了轻型试验用直升机“Ansat-LL”。海军将使用这种直升机进行各种武器装备的试验。俄海军总司令玛索林称，直升机将用于发展海军的武器装备。直升机对于继续发展俄海军武器装备具有十分重要的意义。轻型多用途直升机“Ansat”有几种型号，分为进攻型，运输型、客机型、医用型和训练型。于1994年开始设计。 
由梁赞直升机厂和“雷达”科研生产联合体共同研制。直升机最大飞行重量3.3吨，可在520千米和距离上运载1.3吨的有效负载，乘员为9人。


In [21]:
text_t = {}
for i, d in enumerate(dirs):
    files = os.listdir(d)
    files = [path.join(d, x) for x in files if x.endswith('txt') and not x.startswith('.')]
    text_t[i] = [load_txt(f) for f in files]

In [23]:
flen = [len(t) for t in text_t.values()]

In [24]:
labels = np.repeat(text_t.keys(),flen)

In [25]:
# flatter nested list
import itertools
merged = list(itertools.chain.from_iterable(text_t.values()))

In [26]:
df = pd.DataFrame({'label': labels, 'txt': merged})
df.head()

Unnamed: 0,label,txt
0,0,记者： 刚刚结束的“office lady榜样”评选中，你被《瑞丽》评为“office ...
1,0,本报讯(记者 王佳琳 通讯员 唐松寒) 从昨天开始，北京市4689家非连续生产型工业企业...
2,0,第1页:如果你是透明人你会想做些什么事第2页:A你的野心很大第3页:B你自觉能力不错第4页:...
3,0,面对应聘者迫切的求职心理和对高薪的渴望，一些企业打出了过激的招聘启事。\r\n 专家指...
4,0,第1页:顶着压力办网站第2页:网上收废是发展方向\r\n 废品网站为居民解难\r\n 沸...


In [29]:
df['ready_seg'] =df['txt'].str.replace(ur'\W+', ' ',flags=re.U)  # 非正常字符转空格
df['ready_seg'] =df['ready_seg'].str.replace(r'[A-Za-z]+', ' ENG ')   # 英文转ENG
df['ready_seg'] =df['ready_seg'].str.replace(r'\d+', ' NUM ')   # 数字转NUM

In [30]:
# cut word
import jieba
def cutword_1(x):
    words = jieba.cut(x)
    return ' '.join(words)

In [169]:
df['seg_word'] = df.ready_seg.map(cutword_1)

In [172]:
df.head()

Unnamed: 0,label,txt,ready_seg,seg_word
0,0,记者： 刚刚结束的“office lady榜样”评选中，你被《瑞丽》评为“office ...,记者 刚刚结束的 ENG ENG 榜样 评选中 你被 瑞丽 评为 ENG EN...,记者 刚刚 结束 的 ENG ENG 榜样 评选 中 ...
1,0,本报讯(记者 王佳琳 通讯员 唐松寒) 从昨天开始，北京市4689家非连续生产型工业企业...,本报讯 记者 王佳琳 通讯员 唐松寒 从昨天开始 北京市 NUM 家非连续生产型工业企业 ...,本报讯 记者 王佳琳 通讯员 唐松寒 从 昨天 开始 北京市 ...
2,0,第1页:如果你是透明人你会想做些什么事第2页:A你的野心很大第3页:B你自觉能力不错第4页:...,第 NUM 页 如果你是透明人你会想做些什么事第 NUM 页 ENG 你的野心很大第 NU...,第 NUM 页 如果 你 是 透明人 你 会 想 做些 什么 事 第 NUM...
3,0,面对应聘者迫切的求职心理和对高薪的渴望，一些企业打出了过激的招聘启事。\r\n 专家指...,面对应聘者迫切的求职心理和对高薪的渴望 一些企业打出了过激的招聘启事 专家指出 民间统计 ...,面对 应聘者 迫切 的 求职 心理 和 对 高薪 的 渴望 一些 企业 打出 了 过...
4,0,第1页:顶着压力办网站第2页:网上收废是发展方向\r\n 废品网站为居民解难\r\n 沸...,第 NUM 页 顶着压力办网站第 NUM 页 网上收废是发展方向 废品网站为居民解难 沸沸洋...,第 NUM 页 顶 着 压力 办 网站 第 NUM 页 网上 收废 ...


In [None]:
#  文本整理完毕，后面建模需要将词汇转成数字编号，可以人工转，也可以让keras转

In [173]:
textraw = df.seg_word.values.tolist()
textraw = [line.encode('utf-8') for line in textraw] # 需要存为str才能被keras使用

In [273]:
# keras处理token
maxfeatures = 50000 # 只选择最重要的词
from keras.preprocessing.text import Tokenizer
token = Tokenizer(nb_words=maxfeatures)
token.fit_on_texts(textraw) #如果文本较大可以使用文本流
text_seq = token.texts_to_sequences(textraw)

In [175]:
#maxfeatures = len(token.word_counts)
#print maxfeatures  # 语料库的词汇个数

In [264]:
np.median([len(x) for x in text_seq]) #  每条新闻平均400个词汇

400.0

In [177]:
y = df.label.values # 定义好标签
nb_classes = len(np.unique(y))
print(nb_classes)

9


In [316]:
from __future__ import absolute_import
from keras.optimizers import RMSprop
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.recurrent  import SimpleRNN, GRU, LSTM
from keras.callbacks import EarlyStopping

In [374]:
maxlen = 600 # 定义文本最大长度
batch_size = 32 # 批次
word_dim = 100 # 词向量维度
nb_filter = 200  # 卷积核个数
filter_length = 10 # 卷积窗口大小
hidden_dims = 50  # 隐藏层神经元个数
nb_epoch = 10      # 训练迭代次数
pool_length = 50   # 池化窗口大小

In [275]:
from sklearn.cross_validation import train_test_split
train_X, test_X, train_y, test_y = train_test_split(text_seq, y , train_size=0.8, random_state=1)

In [276]:
# 转为等长矩阵，长度为maxlen
print("Pad sequences (samples x time)")
X_train = sequence.pad_sequences(train_X, maxlen=maxlen,padding='post', truncating='post')
X_test = sequence.pad_sequences(test_X, maxlen=maxlen,padding='post', truncating='post')
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

Pad sequences (samples x time)
('X_train shape:', (14328, 600))
('X_test shape:', (3582, 600))


In [277]:
# 将y的格式展开成one-hot
Y_train = np_utils.to_categorical(train_y, nb_classes)
Y_test = np_utils.to_categorical(test_y, nb_classes)

In [375]:
# CNN 模型
print('Build model...')
model = Sequential()

# 词向量嵌入层，输入：词典大小，词向量大小，文本长度
model.add(Embedding(maxfeatures, word_dim,input_length=maxlen)) 
model.add(Dropout(0.25))
model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode="valid",
                        activation="relu"))
# 池化层
model.add(MaxPooling1D(pool_length=pool_length))
model.add(Flatten())
# 全连接层
model.add(Dense(hidden_dims))
model.add(Dropout(0.25))
model.add(Activation('relu'))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

Build model...


In [376]:
earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
result = model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch, 
            validation_split=0.1, show_accuracy=True,callbacks=[earlystop])

Train on 12895 samples, validate on 1433 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 00006: early stopping


In [377]:
score = earlystop.model.evaluate(X_test, Y_test, batch_size=batch_size)
print('Test score:', score)
classes = earlystop.model.predict_classes(X_test, batch_size=batch_size)
acc = np_utils.accuracy(classes, test_y) # 要用没有转换前的y
print('Test accuracy:', acc)

('Test score:', 0.44877443717618948)
('Test accuracy:', 0.88972640982691231)


In [378]:
# LSTM
print('Build model...')
model = Sequential()

# 词向量嵌入层，输入：词典大小，词向量大小，文本长度
model.add(Embedding(maxfeatures, word_dim,input_length=maxlen)) 
#model.add(Dropout(0.25))
model.add(LSTM(100)) 
model.add(Flatten())
# 全连接层
model.add(Dense(hidden_dims))
model.add(Dropout(0.25))
model.add(Activation('relu'))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

INFO (theano.gof.compilelock): Refreshing lock /home/openmind/.theano/compiledir_Linux-3.19--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lock_dir/lock
INFO:theano.gof.compilelock:Refreshing lock /home/openmind/.theano/compiledir_Linux-3.19--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lock_dir/lock


Build model...


In [None]:
result = model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=1, 
            validation_split=0.1, show_accuracy=True)

In [338]:
# CNN + LSTM
print('Build model...')
model = Sequential()

# 词向量嵌入层，输入：词典大小，词向量大小，文本长度
model.add(Embedding(maxfeatures, word_dim,input_length=maxlen)) 
model.add(Dropout(0.25))
model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode="valid",
                        activation="relu"))
# 池化层
model.add(MaxPooling1D(pool_length=pool_length))
# lstm
model.add(LSTM(100))
# 全连接层
#model.add(Flatten())
model.add(Dropout(0.25))
model.add(Activation('relu'))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

INFO (theano.gof.compilelock): Refreshing lock /home/openmind/.theano/compiledir_Linux-3.19--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lock_dir/lock
INFO:theano.gof.compilelock:Refreshing lock /home/openmind/.theano/compiledir_Linux-3.19--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lock_dir/lock


Build model...


In [371]:
# 整合3个cnn
from keras.models import Graph
fw = [2,10, 5]
pool_length = [2,50, 10]
print('Build model...')
graph = Graph()
graph.add_input(name='input', input_shape=(maxlen,), dtype=int)
graph.add_node(Embedding(maxfeatures, word_dim, input_length=maxlen),
               name='embedding', input='input')

# 卷积2个字
graph.add_node(Convolution1D(nb_filter=nb_filter,filter_length=fw[0],
                        activation="relu"),
               name='conv1', input='embedding') 
graph.add_node(MaxPooling1D(pool_length =pool_length[0], ignore_border = False), name='pool1', input = 'conv1')
graph.add_node(Flatten(), name='flat1', input='conv1')


# 卷积10个字
graph.add_node(Convolution1D(nb_filter=nb_filter,filter_length=fw[1],
                        activation="relu"),
               name='conv2', input='embedding') 
graph.add_node(MaxPooling1D(pool_length =pool_length[1], ignore_border = False), name='pool2', input = 'conv2')
graph.add_node(Flatten(), name='flat2', input='conv2')

#卷积5个字
graph.add_node(Convolution1D(nb_filter=nb_filter,filter_length=fw[2],
                        activation="relu"),
               name='conv3', input='embedding') 
graph.add_node(MaxPooling1D(pool_length =pool_length[2], ignore_border = False), name='pool3', input = 'conv3')
graph.add_node(Flatten(), name='flat3', input='conv3')


# 整合
graph.add_node(Dense(hidden_dims,activation='relu'), name='dense1', 
               inputs=['flat1', 'flat2', 'flat3'], merge_mode='concat')
graph.add_node(Dropout(0.5), name='drop1', input='dense1')
graph.add_node(Dense(nb_classes, activation='softmax'), name='softmax', input='drop1')
graph.add_output(name='output', input='softmax')
graph.compile('Adam', loss = {'output': 'categorical_crossentropy'})

Build model...


In [372]:
result = graph.fit({'input':X_train, 'output':Y_train}, 
                   nb_epoch=3,batch_size=batch_size,
                   validation_split=0.1)

Train on 12895 samples, validate on 1433 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [370]:
predict = graph.predict({'input':X_test}, batch_size=batch_size)
predict = predict['output']
classes = predict.argmax(axis=1)
acc = np_utils.accuracy(classes, test_y) # 要用没有转换前的y
print('Test accuracy:', acc)

('Test accuracy:', 0.89335566722501392)
