# 文件导入与处理

```
author: yudake
date: 2/18/2018
```

本项目是利用 **文本卷积神经网络（TextCNN）** 对文章进行二分类，以区别文章是否为色情小说。

## 一、导入将要使用的库

- numpy：Anaconda环境下自带
- sklearn.utils.shuffle：Anaconda环境下自带
- datetime：Anaconda环境下自带
- sklearn.model_selection.train_test_split：Anaconda环境下，高版本sklearn自带
- tensorflow：用来搭建并训练神经网络，需自行安装
- pickle：用来读取预处理后保存的数据，Python标准库自带，无需安装

In [1]:
import numpy as np
from sklearn.utils import shuffle
import datetime
from sklearn.model_selection import train_test_split
import tensorflow as tf
import pickle
import time

## 二、分割训练集与测试集

数据打乱顺序后，最后2000条作为测试数据，其余作为训练数据。

In [2]:
# Load the processed dataset and the vocabulary from disk. The files are opened
# with context managers so the handles are closed as soon as loading finishes.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load files that were produced by this project.
with open('data_processed/data_processed.p', mode='rb') as data_file:
    data = pickle.load(data_file)
with open('data_processed/word_set.p', mode='rb') as word_set_file:
    word_set = pickle.load(word_set_file)

# Drop the row whose index label is 0.
# NOTE(review): the reason this row is discarded is not visible here - confirm
# against the preprocessing step.
data.drop(0, axis=0, inplace=True)
data['label'] = data['label'].astype('float32')
data = shuffle(data)  # randomise the row order before splitting
data.shape

(23747, 2)

In [3]:
# Hold out the last rows of the shuffled data as the test set and keep the
# rest for training/validation. Slicing relative to the end avoids hard-coding
# the total row count (23747 rows -> 21747 train / 2000 test).
num_test_samples = 2000
train = data[:-num_test_samples]
test = data[-num_test_samples:]

## 三、神经网络模型

#### 3.1参数设置

In [4]:
# Size of the vocabulary; the embedding table is sized from this value.
word_set_size = len(word_set)
# Number of word ids per input text (second dimension of every input batch).
sentences_size = 1500

# Optimisation settings.
num_epochs = 3
batch_size = 16
learningrate = 0.005

# Convolution settings. window_sizes is an ordered tuple rather than a set so
# that the order in which the convolution branches are built and concatenated
# is explicit instead of depending on set iteration order.
window_sizes = (2, 3, 4, 5)
filter_num = 2   # number of filters per window size
embed_dim = 32   # width of each word embedding vector

# Checkpoint path prefix handed to the Saver.
save_dir = './save/save'

In [5]:
def variable_summaries(var):
    """Register TensorBoard summaries describing the values of a tensor.

    Records the mean, standard deviation, maximum, minimum and a histogram of
    ``var``. Everything is created under the 'summaries' name scope; all
    summaries except the mean are additionally nested under 'stddev'.

    Args:
        var: the tensor (e.g. a weight or bias variable) to summarise.
    """
    with tf.name_scope('summaries'):
        # Mean of all elements, logged as a scalar.
        avg = tf.reduce_mean(var)
        tf.summary.scalar('mean', avg)

        with tf.name_scope('stddev'):
            # Standard deviation: square root of the mean squared deviation.
            squared_deviation = tf.square(var - avg)
            spread = tf.sqrt(tf.reduce_mean(squared_deviation))

            # Scalar summaries for the spread and the extreme values.
            tf.summary.scalar('stddev', spread)
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))

            # Distribution of the values as a histogram.
            tf.summary.histogram('histogram', var)

#### 3.2TextCNN

In [6]:
def text_cnn(fiction, dropout_keep_prob):
    """Build the TextCNN inference graph and return the class scores.

    Each word id is embedded, then one convolution + max-pooling branch is
    built per entry of ``window_sizes``. The pooled features of all branches
    are concatenated, passed through dropout and fed to a linear layer with
    two outputs.

    Reads the module-level settings ``word_set_size``, ``embed_dim``,
    ``window_sizes``, ``filter_num`` and ``sentences_size``.

    Args:
        fiction: integer tensor of word ids. The pooling window is sized from
            ``sentences_size``, so the expected shape is
            (batch, sentences_size).
        dropout_keep_prob: keep probability handed to ``tf.nn.dropout``.

    Returns:
        Tensor of shape (batch, 2) holding the unnormalised class scores
        (no softmax is applied inside this function).
    """
    # Embedding layer
    with tf.name_scope('word_embedding'):
        # The table has word_set_size + 1 rows.
        # NOTE(review): the extra row is presumably for a padding/unknown id - confirm.
        fiction_embed_matrix = tf.Variable(tf.random_uniform([word_set_size+1, embed_dim], -1, 1), name = "fiction_embed_matrix")
        fiction_embed_layer = tf.nn.embedding_lookup(fiction_embed_matrix, fiction, name = "fiction_embed_layer")
        # Add a trailing channel dimension so conv2d can consume the embeddings.
        fiction_embed_layer_expand = tf.expand_dims(fiction_embed_layer, -1)

    # Convolution + pooling layers, one branch per window size
    pool_layer_lst = []
    for window_size in window_sizes:
        with tf.name_scope("fiction_conv_maxpool_{}".format(window_size)):
            filter_weights = tf.Variable(tf.truncated_normal([window_size, embed_dim, 1, filter_num],stddev=0.1),name = "filter_weights")
            variable_summaries(filter_weights)
            filter_bias = tf.Variable(tf.constant(0.1, shape=[filter_num]), name="filter_bias")
            variable_summaries(filter_bias)

            # The filter spans the full embedding width, so a VALID convolution
            # leaves sentences_size - window_size + 1 positions per filter.
            conv_layer = tf.nn.conv2d(fiction_embed_layer_expand, filter_weights, [1,1,1,1], padding="VALID", name="conv_layer")
            relu_layer = tf.nn.relu(tf.nn.bias_add(conv_layer,filter_bias), name ="relu_layer")

            # Pool over every remaining position, keeping one value per filter.
            maxpool_layer = tf.nn.max_pool(relu_layer, [1,sentences_size - window_size + 1 ,1,1], [1,1,1,1], padding="VALID", name="maxpool_layer")
            pool_layer_lst.append(maxpool_layer)

    # Dropout layer
    with tf.name_scope('pool_dropout'):
        pool_layer = tf.concat(pool_layer_lst, 3, name ="pool_layer")
        # Total number of pooled features across all branches.
        max_num = len(window_sizes) * filter_num
        pool_layer_flat = tf.reshape(pool_layer , [-1, 1, max_num], name = "pool_layer_flat")

        dropout_layer = tf.nn.dropout(pool_layer_flat, dropout_keep_prob, name = "dropout_layer")

    # Output (linear) layer
    with tf.name_scope('softmax'):
        # Flatten to (batch, max_num). The batch dimension is left dynamic so
        # the graph is not tied to the training batch size.
        features = tf.reshape(dropout_layer, shape=[-1, max_num])
        # The weight matrix is sized from the feature count instead of a
        # literal 8, so it stays correct when window_sizes or filter_num change.
        weights = tf.get_variable('softmax_linear',
                                  shape = [max_num, 2],
                                  dtype = tf.float32,
                                  initializer = tf.truncated_normal_initializer(stddev=0.005, dtype=tf.float32))
        biases = tf.get_variable('biases',
                                  shape = [2],
                                  dtype = tf.float32,
                                  initializer = tf.constant_initializer(0.1))
        softmax_linear = tf.add(tf.matmul(features, weights), biases, name='softmax_linear')
        tf.summary.histogram('softmax_linear', softmax_linear)
    return softmax_linear

#### 3.3计算图

In [7]:
# Build the training graph: placeholders, inference, loss, optimiser, accuracy.
tf.reset_default_graph()
train_graph = tf.Graph()
with train_graph.as_default():
    # Inputs. The text width comes from sentences_size so the placeholder
    # always matches the pooling windows built in text_cnn.
    fiction = tf.placeholder(dtype=tf.int32, shape=[None, sentences_size], name='fiction')
    targets = tf.placeholder(tf.int32, [None, 1], name="targets")
    LearningRate = tf.placeholder(tf.float32, name="LearningRate")
    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
    # Flatten the labels to rank 1 for the sparse cross-entropy and in_top_k
    # ops; -1 keeps the batch dimension dynamic instead of fixing it to
    # batch_size.
    targets_reshape = tf.reshape(targets, shape=[-1])

    # Inference
    with tf.variable_scope("inference"):
        softmax_linear = text_cnn(fiction, dropout_keep_prob)

    # Loss: mean sparse softmax cross-entropy over the batch
    with tf.name_scope("loss") as scope:
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=softmax_linear, labels=targets_reshape, name='xentropy_per_example')
        loss = tf.reduce_mean(cross_entropy, name='loss')

    # Training op; global_step is incremented on every optimiser update
    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=LearningRate)
        global_step = tf.Variable(0, name='global_step', trainable=False)
        train_op = optimizer.minimize(loss, global_step=global_step)

    # Accuracy: fraction of examples whose highest-scoring class is the label
    with tf.name_scope('accuracy'):
        correct = tf.nn.in_top_k(softmax_linear, targets_reshape, 1)
        # Average in float32; float16 loses precision once more than a few
        # thousand examples are averaged at once.
        correct = tf.cast(correct, tf.float32)
        accuracy = tf.reduce_mean(correct)

#### 3.4取得batch

In [8]:
def get_batches(Xs, ys, batch_size):
    """Yield consecutive (features, labels) slices of at most batch_size items.

    Batch boundaries are computed from the length of ``Xs``; the same
    boundaries are used to slice ``ys``. The last batch is shorter when the
    length of ``Xs`` is not a multiple of ``batch_size``.
    """
    for lower in range(0, len(Xs), batch_size):
        upper = lower + batch_size
        # Clamp the upper bound to the end of Xs for the final batch.
        if upper > len(Xs):
            upper = len(Xs)
        yield Xs[lower:upper], ys[lower:upper]

#### 3.5训练与预测

In [9]:
def _build_fiction_batch(x):
    """Copy one batch of texts into a dense (batch_size, sentences_size) array.

    ``x`` is a batch slice of ``[['fiction']].values``, i.e. a 2-D array with
    a single column.
    NOTE(review): each cell is assumed to hold a sequence of sentences_size
    word ids - confirm against the preprocessing step.
    """
    column = x.take(0, axis=1)  # extract the column once, not once per row
    fictions = np.zeros([batch_size, sentences_size])
    for i in range(batch_size):
        fictions[i] = column[i]
    return fictions


# Per-batch loss history for the training and validation passes.
losses = {'train': [], 'cv': []}

# The split uses a fixed random_state, so it returns the same partition on
# every call; compute it once instead of once per epoch.
train_X, cv_X, train_y, cv_y = train_test_split(train[['fiction']].values,
                                                train['label'].values,
                                                test_size=0.1,
                                                random_state=0)
# Only full batches are consumed, because every feed is reshaped to exactly
# batch_size rows; a trailing partial batch is skipped.
num_train_batches = len(train_X) // batch_size
num_cv_batches = len(cv_X) // batch_size

with tf.Session(graph=train_graph) as sess:

    # Summaries for loss and accuracy
    loss_summary = tf.summary.scalar("loss", loss)
    accuracy_summary = tf.summary.scalar("accuracy", accuracy)

    # Train Summaries
    train_summary_op = tf.summary.merge([loss_summary, accuracy_summary])
    train_summary_writer = tf.summary.FileWriter("runs/train", sess.graph)

    try:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        for epoch_i in range(num_epochs):
            train_batches = get_batches(train_X, train_y, batch_size)
            cv_batches = get_batches(cv_X, cv_y, batch_size)

            # Training pass: run the optimiser and record the training loss
            for batch_i in range(num_train_batches):
                x, y = next(train_batches)

                feed = {fiction: _build_fiction_batch(x),
                        targets: np.reshape(y, [batch_size, 1]),
                        LearningRate: learningrate,
                        dropout_keep_prob: 0.5}

                step, train_loss, summaries, _, accurate = sess.run(
                    [global_step, loss, train_summary_op, train_op, accuracy], feed)
                losses['train'].append(train_loss)
                train_summary_writer.add_summary(summaries, step)

                # Progress line every 200 training steps (counted across epochs)
                if (epoch_i * num_train_batches + batch_i) % 200 == 0:
                    time_str = datetime.datetime.now().isoformat()
                    print('{}: Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}    accuracy = {:.3f}'.format(
                        time_str,
                        epoch_i,
                        batch_i,
                        num_train_batches,
                        train_loss,
                        accurate))

            # Validation pass: no optimiser step, dropout disabled
            accurates = []
            for batch_i in range(num_cv_batches):
                x, y = next(cv_batches)

                feed = {fiction: _build_fiction_batch(x),
                        targets: np.reshape(y, [batch_size, 1]),
                        LearningRate: learningrate,
                        dropout_keep_prob: 1.0}

                step, cv_loss, summaries, accurate = sess.run(
                    [global_step, loss, train_summary_op, accuracy], feed)
                losses['cv'].append(cv_loss)
                # NOTE(review): validation summaries go to the training writer,
                # and global_step does not advance here, so every validation
                # batch of an epoch is logged at the same step. Consider a
                # separate writer for validation.
                train_summary_writer.add_summary(summaries, step)

                accurates.append(accurate)
                if (epoch_i * num_cv_batches + batch_i) % 50 == 0:
                    time_str = datetime.datetime.now().isoformat()
                    print('{}: Epoch {:>3} Batch {:>4}/{}   cv_loss = {:.3f}    accuracy = {:.3f}'.format(
                        time_str,
                        epoch_i,
                        batch_i,
                        num_cv_batches,
                        cv_loss,
                        accurate))
            print('accurate is: ' + str(np.mean(np.array(accurates),axis=0)))

        # Final evaluation on the held-out test set
        accurates = []

        test_X = test[['fiction']].values
        test_y = test['label'].values
        num_test_batches = len(test_X) // batch_size
        test_batches = get_batches(test_X, test_y, batch_size)
        for batch_i in range(num_test_batches):
            x, y = next(test_batches)

            feed = {fiction: _build_fiction_batch(x),
                    targets: np.reshape(y, [batch_size, 1]),
                    LearningRate: learningrate,
                    dropout_keep_prob: 1.0}

            step, accurate = sess.run([global_step, accuracy], feed)

            accurates.append(accurate)
            # Report every 50 test batches. This no longer reads epoch_i, which
            # was a leftover from the training loop and is undefined when
            # num_epochs is 0.
            if batch_i % 50 == 0:
                time_str = datetime.datetime.now().isoformat()
                print('{}: Batch {:>4}/{} accuracy = {:.3f}'.format(time_str,
                                                                    batch_i,
                                                                    num_test_batches,
                                                                    accurate))
        print('accurate is: ' + str(np.mean(np.array(accurates),axis=0)))

        ## save model
        saver.save(sess, save_dir)
        print('Model Trained and Saved')
    finally:
        # Flush pending events and release the event file even if training
        # is interrupted.
        train_summary_writer.close()

2018-02-19T11:05:54.953488: Epoch   0 Batch    0/1223   train_loss = 0.690    accuracy = 0.562
2018-02-19T11:07:03.556690: Epoch   0 Batch  200/1223   train_loss = 0.381    accuracy = 0.812
2018-02-19T11:08:19.117430: Epoch   0 Batch  400/1223   train_loss = 0.375    accuracy = 0.875
2018-02-19T11:09:30.177000: Epoch   0 Batch  600/1223   train_loss = 0.387    accuracy = 0.812
2018-02-19T11:10:36.180178: Epoch   0 Batch  800/1223   train_loss = 0.258    accuracy = 0.875
2018-02-19T11:11:42.207720: Epoch   0 Batch 1000/1223   train_loss = 0.241    accuracy = 0.812
2018-02-19T11:12:46.132675: Epoch   0 Batch 1200/1223   train_loss = 0.056    accuracy = 1.000
2018-02-19T11:12:53.265707: Epoch   0 Batch    0/135   cv_loss = 0.029    accuracy = 1.000
2018-02-19T11:12:55.838315: Epoch   0 Batch   50/135   cv_loss = 0.028    accuracy = 1.000
2018-02-19T11:12:58.392323: Epoch   0 Batch  100/135   cv_loss = 0.011    accuracy = 1.000
accurate is: 0.99072
2018-02-19T11:13:57.951073: Epoch   1 Bat

## 四、训练效果

TensorBoard中的准确率（accuracy）折线图

![accurate折线图](https://github.com/yudake/sundries/blob/master/porn_classify_arrurate.png?raw=true)

从图中可以得出，训练后准确率稳定在90%以上，平均准确率在98%。

在交叉验证集和测试集上的准确率都在99%。

经过多次验证，准确率可以在98%以上。