In [1]:
import os  
import numpy as np  
np.random.seed(1337)  
import jieba  #处理中文
import nltk  #处理英文
#from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split
from collections import Counter
from keras.preprocessing.text import Tokenizer  
from keras.preprocessing.sequence import pad_sequences  
from keras.utils.np_utils import to_categorical  
from keras.layers import Dense, Input, Flatten  
from keras.layers import Conv1D, MaxPooling1D, Embedding  
from keras.models import Model  
from keras.optimizers import *  
from keras.models import Sequential  
from keras.layers import Merge  
import sys
def make_word_set(words_file):
    """Read a one-word-per-line file and return its distinct words as a set.

    The file is read as bytes. Each line has its leading/trailing ASCII
    whitespace stripped, is decoded as UTF-8, and is kept only if the result
    is non-empty. Duplicates collapse naturally because the result is a set.

    Args:
        words_file: path of the word-list file (e.g. a stop-word list).

    Returns:
        set of str: the unique non-empty words found in the file.
    """
    with open(words_file, 'rb') as handle:
        # Strip on the raw bytes first, then decode, line by line.
        decoded_entries = (raw.strip().decode('utf-8') for raw in handle)
        return {entry for entry in decoded_entries if entry}


# 文本处理，也就是样本生成过程
def text_processing(folder_path, test_size=0.2, max_files_per_class=100):
    """Tokenize a folder-per-class corpus and split it into train/test sets.

    Every sub-directory of ``folder_path`` is treated as one class. Class ids
    are assigned 0, 1, 2, ... following the sorted sub-directory names, and the
    files inside each class are also visited in sorted order, so the labels and
    the selected files do not depend on the arbitrary order of ``os.listdir``.
    Entries of ``folder_path`` that are not directories are skipped.

    Each file is read as UTF-8 and segmented with ``jieba.cut`` in precise mode
    (``cut_all=False``).

    Args:
        folder_path: root directory containing one sub-directory per class.
        test_size: forwarded to ``train_test_split``.
        max_files_per_class: maximum number of files read from each class
            directory, to bound memory use. Defaults to 100 (the limit that
            used to be hard-coded). Pass ``None`` to read every file.

    Returns:
        A 7-tuple ``(data_list, class_list, all_words_list, train_data_list,
        test_data_list, train_class_list, test_class_list)`` where

        * ``data_list`` is a list of token lists, one per document,
        * ``class_list`` holds the integer class id of each document,
        * ``all_words_list`` is a tuple of all distinct tokens ordered by
          descending corpus frequency (empty tuple if no token was found),
        * the remaining four items are the result of ``train_test_split``.
    """
    data_list = []
    class_list = []

    # Only directories are classes; sorting keeps the class ids reproducible.
    class_dirs = sorted(
        name for name in os.listdir(folder_path)
        if os.path.isdir(os.path.join(folder_path, name))
    )

    for folder_id, folder in enumerate(class_dirs):
        class_path = os.path.join(folder_path, folder)
        file_names = sorted(os.listdir(class_path))
        if max_files_per_class is not None:
            # Cap the number of samples per class to keep memory bounded.
            file_names = file_names[:max_files_per_class]

        for file_name in file_names:
            with open(os.path.join(class_path, file_name), 'rb') as fp:
                raw = fp.read().decode('utf-8')
            # jieba precise mode returns a generator; materialise it as a list.
            word_list = list(jieba.cut(raw, cut_all=False))
            data_list.append(word_list)
            class_list.append(folder_id)

    train_data_list, test_data_list, train_class_list, test_class_list = train_test_split(
        data_list, class_list, test_size=test_size)

    # Count token frequencies over the whole corpus in one pass. Updating a
    # single Counter avoids re-copying the entire vocabulary per document.
    word_counts = Counter()
    for word_list in data_list:
        word_counts.update(word_list)

    # Stable sort by descending frequency; ties keep first-seen order.
    ranked = sorted(word_counts.items(), key=lambda pair: pair[1], reverse=True)
    all_words_list = tuple(word for word, _ in ranked)

    return data_list, class_list, all_words_list, train_data_list, test_data_list, train_class_list, test_class_list


def words_dict(all_words_list, deleteN, stopwords_set=frozenset(), max_features=1000):
    """Select feature words from a frequency-ordered vocabulary.

    Walks ``all_words_list`` starting at index ``deleteN`` (i.e. skipping the
    first ``deleteN`` entries) and keeps a word when it

    * is not made up solely of digits,
    * is not contained in ``stopwords_set``, and
    * has a length of 2, 3 or 4 characters.

    Selection stops as soon as ``max_features`` words have been collected.

    Args:
        all_words_list: indexable sequence of words, expected to be ordered by
            descending frequency by the caller.
        deleteN: index at which scanning starts.
        stopwords_set: collection of words to exclude. Defaults to an empty,
            immutable set so the default can never be mutated between calls.
        max_features: maximum number of feature words returned. Defaults to
            1000 (the limit that used to be hard-coded).

    Returns:
        list of str: the selected feature words, in vocabulary order.
    """
    feature_words = []
    for index in range(deleteN, len(all_words_list)):
        if len(feature_words) >= max_features:
            break
        word = all_words_list[index]
        if word.isdigit() or word in stopwords_set:
            continue
        if 1 < len(word) < 5:
            feature_words.append(word)
    return feature_words

# 文本特征
def text_features(data_list, train_data_list, test_data_list,  feature_words, flag='nltk'):
    """Encode tokenized documents against a fixed list of feature words.

    The encoding of a single document depends on ``flag``:

    * ``'nltk'``    -> dict mapping every feature word to 1 if the document
      contains it, else 0;
    * ``'sklearn'`` -> list of the feature words present in the document, in
      ``feature_words`` order;
    * anything else -> an empty list.

    Args:
        data_list: all documents, each an iterable of tokens.
        train_data_list: the training documents.
        test_data_list: the test documents.
        feature_words: the feature vocabulary.
        flag: encoding selector, see above.

    Returns:
        tuple ``(data, train_feature_list, test_feature_list)`` holding the
        encoded form of the three input collections.
    """
    def _encode_document(tokens):
        # Membership tests against a set are cheap compared to a token list.
        present = set(tokens)
        if flag == 'nltk':
            return {candidate: int(candidate in present) for candidate in feature_words}
        if flag == 'sklearn':
            return [candidate for candidate in feature_words if candidate in present]
        return []

    def _encode_corpus(documents):
        return [_encode_document(tokens) for tokens in documents]

    return (
        _encode_corpus(data_list),
        _encode_corpus(train_data_list),
        _encode_corpus(test_data_list),
    )


# Driver for the preprocessing stage: tokenize the corpus, pick feature words,
# and turn every document into a space-joined string of its feature words.
print ("start")


## Text preprocessing: tokenize the corpus and split it into train/test parts
folder_path = 'Database/SogouC/Sample'
data_list, class_list, all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = text_processing(folder_path, test_size=0.2)

# Build stopwords_set from the stop-word file
stopwords_file = 'stopwords_cn.txt'
stopwords_set = make_word_set(stopwords_file)

## Text feature extraction and classification
# flag = 'nltk'
# 'sklearn' makes text_features return, per document, the list of feature words it contains
flag = 'sklearn'
test_accuracy_list = []  # not used by any code visible in this notebook
# Scanning of the frequency-ordered vocabulary starts at index deleteN
deleteN = 20
feature_words = words_dict(all_words_list, deleteN, stopwords_set)
data, train_feature_list, test_feature_list = text_features(data_list, train_data_list, test_data_list, feature_words, flag)


print(len(data))
# NOTE: data_list is rebound here — from a list of token lists to a list of
# strings (the feature words of each document joined by single spaces).
data_list = []
pure_text = ' '
for item in data:    
    data_list.append(pure_text.join(item))

print ("finished")




Using TensorFlow backend.
Building prefix dict from the default dictionary ...


start


Dumping model to file cache C:\Windows\TEMP\jieba.cache
Loading model cost 1.065 seconds.
Prefix dict has been built succesfully.


90
finished


In [2]:
# Load the pre-trained GloVe vectors into a {word: vector} lookup table.
# The file is opened in UTF-8 text mode so that the keys are `str`. Reading it
# in binary mode produces `bytes` keys, which can never match `str` words
# looked up later, leaving every lookup a miss.
embeddings_index = {}
with open('glove.6B.50d.txt', encoding='utf-8') as f:  # 50-d vectors; swap the file for the 100-d (or another) variant if desired
    for line in f:
        # Each line is "<word> <v1> <v2> ... <vN>", separated by single spaces.
        values = line.rstrip().split(' ')
        if len(values) < 2:
            continue  # blank or malformed line: no vector to store
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [3]:
import keras.preprocessing.text as T
from keras.preprocessing.text import Tokenizer

# Turn the space-joined documents in data_list into padded integer sequences.
tokenizer = Tokenizer(num_words=20000) # num_words: None or an int, the maximum number of words to handle (presumably the most frequent ones are kept — confirm against the Keras docs)
tokenizer.fit_on_texts(data_list)
sequences = tokenizer.texts_to_sequences(data_list)
# word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index
print(len(word_index))
# NOTE: `data` is rebound here to the padded sequences, each of length maxlen=1000.
data = pad_sequences(sequences, maxlen=1000)

1000


In [4]:
# Convert the integer class ids in class_list into a categorical label matrix.
labels = to_categorical(np.asarray(class_list))
print('Shape of data tensor:', data.shape)  
print('Shape of label tensor:', labels.shape)  

# Hold out 40% of the samples as the test set; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.4, random_state=42)
# # split the data into a training set and a validation set. The code below mainly splits the dataset into a training set and a test set (the original English says "validation set", but the code was slightly modified)  
# indices = np.arange(data.shape[0])  
# np.random.shuffle(indices)  
# data = data[indices]  
# labels = labels[indices]  
# nb_validation_samples = int(0.4 * data.shape[0])  
  
# x_train = data[:-nb_validation_samples] # training set  
# y_train = labels[:-nb_validation_samples]# training-set labels  
# x_val = data[-nb_validation_samples:] # test set (called the validation set in the original English)  
# y_val = labels[-nb_validation_samples:] # test-set labels 
# print(x_train.shape)
# print(y_train.shape)


Shape of data tensor: (90, 1000)
Shape of label tensor: (90, 9)


In [6]:
# Build the initial weight matrix for the embedding layer: row i holds the
# pre-trained vector of the word whose tokenizer index is i.
MAX_SEQUENCE_LENGTH = 1000 # maximum number of tokens kept per text; can be lowered for shorter texts  
MAX_NB_WORDS = 20000 # size of the overall vocabulary; can be tuned up or down slightly  
EMBEDDING_DIM = 50 # dimensionality of the word vectors; must match the vectors stored in embeddings_index 
nb_words = min(MAX_NB_WORDS, len(word_index))
# One extra row because the loop below writes row i for indices up to nb_words.
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))  
for word, i in word_index.items():  
    if i > MAX_NB_WORDS:  
        continue  
    # The lookup only hits when embeddings_index is keyed by the same type as the words in word_index.
    embedding_vector = embeddings_index.get(word)  
    if embedding_vector is not None:  
        # words not found in embedding index will be all-zeros.  
        embedding_matrix[i] = embedding_vector # word vector for index i
print(embedding_matrix.shape)

(1001, 50)


In [11]:
# First layer of the network: the word-embedding layer, initialised from
# embedding_matrix. Set trainable=False to keep the pre-trained vectors frozen.
embedding_layer = Embedding(nb_words + 1,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            weights=[embedding_matrix],
                            trainable=True)

print('Training model.')


def build_conv_branch(shared_embedding, kernel_size, pool_sizes, filters=128):
    """Build one convolutional branch on top of the shared embedding layer.

    The branch is: embedding -> (Conv1D -> MaxPooling1D) once per entry of
    ``pool_sizes`` -> Flatten.

    Args:
        shared_embedding: the Embedding layer instance used by every branch.
        kernel_size: width of the convolution window, in tokens.
        pool_sizes: iterable with the pool size following each convolution.
        filters: number of filters of every Conv1D layer.

    Returns:
        The assembled ``Sequential`` branch.
    """
    branch = Sequential()
    branch.add(shared_embedding)
    for pool_size in pool_sizes:
        branch.add(Conv1D(filters, kernel_size, activation='tanh'))
        branch.add(MaxPooling1D(pool_size))
    branch.add(Flatten())
    return branch


# Three 1D-convnet branches that differ only in their convolution window
# (window x embedding dimension) and pooling sizes.
# left model: window of 5 tokens
model_left = build_conv_branch(embedding_layer, 5, (5, 5, 35))
# right model: window of 4 tokens
model_right = build_conv_branch(embedding_layer, 4, (4, 4, 28))
# third model: window of 6 tokens
model_3 = build_conv_branch(embedding_layer, 6, (3, 3, 30))

# Concatenate the outputs of the three branches. Any single branch could also
# be used on its own; the three-branch layout follows the paper this notebook
# is based on.
# NOTE(review): `Merge` is a legacy Keras layer — confirm the pinned Keras
# version still provides it before upgrading.
merged = Merge([model_left, model_right, model_3], mode='concat')
model = Sequential()
model.add(merged)  # add merge
model.add(Dense(128, activation='tanh'))  # fully connected layer
# Softmax over the classes; the width comes from the label matrix so it always
# matches the number of classes present in the data.
num_classes = labels.shape[1]
model.add(Dense(num_classes, activation='softmax'))

# Adadelta is used as the optimizer here; other optimizers work as well.
model.compile(loss='categorical_crossentropy',
              optimizer='Adadelta',
              metrics=['accuracy'])

Training model.




In [12]:
# Train on the training split. `epochs` is the Keras 2 name of the argument
# formerly called `nb_epoch`.
model.fit(X_train, y_train, epochs=100)
# Optional: evaluate on the training set (the original author reported about 99% accuracy).
# train_score, train_acc = model.evaluate(X_train, y_train, verbose=0)
# print('train score:', train_score)
# print('train accuracy:', train_acc)

# Evaluate on the held-out split and report the result.
score, acc = model.evaluate(X_test, y_test, batch_size=32, verbose=1)
print('test score:', score)
print('test accuracy:', acc)



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
