# 文本向量处理

In [38]:
import numpy as np
import random 
import datetime
import collections
# Read the raw training and test splits; each becomes a list of text lines
# (newline characters are still attached at this point).
with open("cnews/cnews.train.txt", 'r', encoding="utf-8") as file:
    train = file.readlines()
with open("cnews/cnews.test.txt", 'r', encoding="utf-8") as file:
    test = file.readlines()

In [2]:
def exactua(data):
    """Shuffle raw dataset lines and split them into texts and labels.

    Every non-empty line is split on tab characters; field 0 is taken as the
    label and field 1 as the text.

    Args:
        data: list of str lines. It is shuffled IN PLACE, so the caller's
            list is reordered as a side effect.

    Returns:
        A tuple ``(x_data, y_data)`` of two equally long lists: the texts and
        their labels, in the shuffled order.

    Raises:
        IndexError: if a non-empty line contains no tab character.
    """
    random.shuffle(data)  # shuffle the dataset in place
    x_data = []
    y_data = []
    for line in data:
        record = line.replace('\n', '')
        if not record:
            # Blank line (e.g. an empty last line in the file): there is no
            # label/text pair to extract, so skip it instead of crashing.
            continue
        fields = record.split('\t')  # split once, reuse for both columns
        x_data.append(fields[1])
        y_data.append(fields[0])
    return x_data, y_data

In [3]:
# Shuffle each split and separate it into texts (x_*) and labels (y_*).
x_train_data, y_train_data = exactua(train)
x_test_data, y_test_data = exactua(test)

In [4]:
# Keep only the first 1000 (already shuffled) samples for training
# and the first 100 for testing.
x_train_data, y_train_data = x_train_data[:1000], y_train_data[:1000]
x_test_data, y_test_data= x_test_data[:100], y_test_data[:100]

In [3]:
def stopwordslist():
    """Load the stop-word list from ``stopwords.txt``.

    The file is read as UTF-8 with one entry per line; surrounding whitespace
    is stripped from every entry. A single space ``' '`` is appended as an
    extra stop word.

    Returns:
        list of str: the stop words, in file order, followed by ``' '``.

    Raises:
        OSError: if ``stopwords.txt`` cannot be opened.
    """
    # The context manager guarantees the file handle is closed after reading.
    with open('stopwords.txt', encoding='UTF-8') as stopword_file:
        stopwords = [line.strip() for line in stopword_file]
    stopwords.append(' ')
    return stopwords

In [4]:
# Module-level stop-word list; splitsentence() reads this global.
stopwords = stopwordslist()

In [5]:
def splitsentence(x_train_data, x_test_data):
    """Segment texts with jieba and drop stop words.

    Each text is cut into words with ``jieba.cut``; words found in the
    module-level ``stopwords`` collection are removed and the remaining words
    are joined with single spaces.

    Args:
        x_train_data: iterable of training texts (str).
        x_test_data: iterable of test texts (str).

    Returns:
        A tuple ``(trainlists, testlists)`` of lists of space-joined,
        stop-word-free sentences, in input order.
    """
    import jieba

    # Build the lookup set once: membership tests against the original list
    # would be linear in the number of stop words for every single token.
    stopword_set = set(stopwords)

    def _segment(texts):
        # Cut each text into words and keep only the non-stop-words.
        return [
            ' '.join(word for word in jieba.cut(text) if word not in stopword_set)
            for text in texts
        ]

    return _segment(x_train_data), _segment(x_test_data)

In [8]:
# Segment both splits into space-joined words with stop words removed.
x_train, x_test = splitsentence(x_train_data, x_test_data)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\wttree\AppData\Local\Temp\jieba.cache
Loading model cost 0.872 seconds.
Prefix dict has been built succesfully.


In [6]:
def sort_by_count(d):
    """Return the items of ``d`` as an OrderedDict sorted by descending value.

    Items with equal values keep the relative order they had in ``d``.
    """
    def negated_value(item):
        # Sorting on the negated value yields descending order.
        return -item[1]

    ranked_items = sorted(d.items(), key=negated_value)
    return collections.OrderedDict(ranked_items)

def create_dict(data, min_count=5):
    """Build a word -> integer id vocabulary from space-separated sentences.

    Words are ranked by descending frequency (ties keep first-seen order) and
    numbered from 1 upwards. Id 0 is reserved for the ``"PAD"`` entry.

    Args:
        data: iterable of str, each a sentence whose words are separated by
            single spaces. Empty tokens (from consecutive spaces) are ignored.
        min_count: a word is kept only if it occurs MORE than ``min_count``
            times. Defaults to 5.

    Returns:
        dict mapping ``"PAD"`` to 0 and every kept word to a unique id
        ``1..N`` in frequency order.
    """
    # A literal "PAD" token in the text is skipped so that it can never
    # overwrite the reserved padding id 0.
    counts = collections.Counter(
        word
        for sentence in data
        for word in sentence.split(' ')
        if word and word != 'PAD'
    )
    word2vecdic = {"PAD": 0}
    for word, count in counts.most_common():
        if count <= min_count:
            # most_common() is sorted by descending count, so every
            # remaining word is too rare as well.
            break
        word2vecdic[word] = len(word2vecdic)
    return word2vecdic

In [10]:
# Build the vocabulary from the training sentences only.
word2vecdic = create_dict(x_train)

In [7]:
def token(data, word2vecdic):
    """Convert space-separated sentences into lists of vocabulary ids.

    Each sentence is split on single spaces; every token found in
    ``word2vecdic`` is replaced by its id and every other token (including
    empty tokens) by 0.

    Returns:
        list of lists of ids, one inner list per sentence in ``data``.
    """
    return [
        [word2vecdic[word] if word in word2vecdic else 0
         for word in sentence.split(' ')]
        for sentence in data
    ]

In [29]:
# Replace every word by its vocabulary id (0 for words not in the vocabulary).
x_train = token(x_train, word2vecdic)
x_test = token(x_test, word2vecdic)

In [34]:
# Sentence size: the mean number of tokens per training sentence is computed
# here, but the result is neither assigned nor the last expression of the
# cell, so it is discarded. The fixed length used for padding is set by hand.
np.array([len(i) for i in x_train]).mean()
sentence_size = 300

In [8]:
def padding(data, sentence_size):
    """Force every sequence in ``data`` to exactly ``sentence_size`` items.

    Longer sequences are replaced by their first ``sentence_size`` items;
    shorter ones are extended in place with trailing zeros. ``data`` itself
    is modified and also returned.
    """
    for index, row in enumerate(data):
        missing = sentence_size - len(row)
        if missing < 0:
            # Too long: store a truncated copy in the same slot.
            data[index] = row[:sentence_size]
        elif missing > 0:
            # Too short: right-pad the existing list with zeros.
            row.extend([0] * missing)
    return data

In [40]:
# Truncate or zero-pad every id sequence to exactly sentence_size entries.
x_train = padding(x_train, sentence_size)
x_test = padding(x_test, sentence_size)

In [15]:
def y2label(y_train, y_test):
    """Map class labels to integer ids based on the training labels.

    Ids are assigned to the distinct training labels in sorted order, so the
    mapping is the same on every run (iterating a plain ``set`` of strings
    gives an order that can change between interpreter runs).

    Args:
        y_train: iterable of training labels (must be mutually sortable).
        y_test: iterable of test labels.

    Returns:
        A tuple ``(y_train_ids, y_test_ids, dic)`` where ``dic`` maps each
        distinct training label to its id. A test label that never occurs in
        ``y_train`` is mapped to ``None``.
    """
    dic = {label: index for index, label in enumerate(sorted(set(y_train)))}
    y_train, y_test = [dic.get(i) for i in y_train], [dic.get(i) for i in y_test]
    return y_train, y_test, dic

In [17]:
import numpy as np
import random 
import collections
from sklearn.preprocessing import OneHotEncoder

def load_data(train_size=1000, test_size=100, sentence_size=300):
    """Run the whole preprocessing pipeline and return model-ready data.

    Steps: read the cnews train/test files, shuffle and split them into
    texts and labels, keep a subset, segment the texts and drop stop words,
    build the vocabulary from the training texts, convert texts to id
    sequences of fixed length, and one-hot encode the labels.

    Args:
        train_size: number of (shuffled) training samples to keep.
        test_size: number of (shuffled) test samples to keep.
        sentence_size: length every id sequence is truncated/padded to.

    Returns:
        ``(x_train, y_train, x_test, y_test, word2vecdic, labeldic)`` where
        the x values are lists of id lists, the y values are one-hot arrays,
        ``word2vecdic`` is the word -> id vocabulary and ``labeldic`` the
        label -> id mapping.
    """
    # splitsentence() reads the module-level `stopwords`; assigning a local
    # of the same name would have no effect on it, so publish the freshly
    # loaded list as the global instead.
    global stopwords

    with open("cnews/cnews.train.txt", 'r', encoding="utf-8") as file:
        train = file.readlines()
    with open("cnews/cnews.test.txt", 'r', encoding="utf-8") as file:
        test = file.readlines()
    x_train_data, y_train_data = exactua(train)
    x_test_data, y_test_data = exactua(test)
    x_train_data, y_train_data = x_train_data[:train_size], y_train_data[:train_size]
    x_test_data, y_test_data = x_test_data[:test_size], y_test_data[:test_size]
    stopwords = stopwordslist()
    x_train, x_test = splitsentence(x_train_data, x_test_data)
    word2vecdic = create_dict(x_train)
    x_train = token(x_train, word2vecdic)
    x_test = token(x_test, word2vecdic)
    x_train = padding(x_train, sentence_size)
    x_test = padding(x_test, sentence_size)
    y_train, y_test, labeldic = y2label(y_train_data, y_test_data)

    enc = OneHotEncoder()
    # Fit the encoder on the training labels only ...
    y_train = enc.fit_transform(np.array(y_train)[:, np.newaxis]).toarray()
    # ... and reuse it for the test labels. Re-fitting on the test labels
    # would give a narrower / differently ordered one-hot matrix whenever
    # the test subset happens to miss a class.
    y_test = enc.transform(np.array(y_test)[:, np.newaxis]).toarray()
    return x_train, y_train, x_test, y_test, word2vecdic, labeldic

In [18]:
# Run the full preprocessing pipeline in one call.
x_train, y_train, x_test, y_test, word2vecdic, labeldic = load_data()

## 文本卷积网络

In [19]:
import numpy as np
# Convert the padded id sequences to arrays; x_train.shape[1] is read later
# as the sequence length.
x_train = np.array(x_train)
x_test = np.array(x_test)

In [30]:
# Display the training labels.
# NOTE(review): the captured output below lists plain integer ids, whereas
# load_data() returns one-hot arrays -- the output looks like it comes from
# an earlier run; confirm before relying on it.
y_train

[3,
 8,
 5,
 0,
 6,
 1,
 7,
 1,
 3,
 8,
 4,
 7,
 0,
 2,
 9,
 1,
 6,
 8,
 1,
 6,
 2,
 8,
 5,
 0,
 9,
 3,
 1,
 2,
 5,
 5,
 5,
 4,
 6,
 7,
 7,
 8,
 0,
 8,
 2,
 0,
 9,
 7,
 3,
 6,
 2,
 7,
 8,
 6,
 3,
 3,
 8,
 0,
 8,
 1,
 3,
 0,
 6,
 6,
 2,
 2,
 5,
 4,
 1,
 5,
 8,
 2,
 4,
 4,
 2,
 9,
 2,
 2,
 9,
 1,
 0,
 9,
 2,
 9,
 7,
 2,
 2,
 6,
 7,
 3,
 7,
 1,
 2,
 6,
 4,
 0,
 8,
 6,
 0,
 4,
 8,
 7,
 3,
 7,
 3,
 4,
 9,
 6,
 5,
 4,
 1,
 6,
 8,
 3,
 9,
 7,
 8,
 3,
 9,
 1,
 7,
 2,
 6,
 8,
 8,
 2,
 1,
 1,
 4,
 2,
 9,
 0,
 1,
 5,
 3,
 2,
 3,
 0,
 4,
 1,
 1,
 0,
 5,
 4,
 2,
 2,
 1,
 8,
 8,
 4,
 5,
 5,
 1,
 2,
 2,
 0,
 8,
 9,
 4,
 9,
 3,
 3,
 8,
 1,
 2,
 2,
 3,
 0,
 4,
 0,
 1,
 1,
 7,
 2,
 8,
 9,
 3,
 8,
 7,
 8,
 6,
 4,
 1,
 6,
 3,
 2,
 5,
 0,
 9,
 0,
 6,
 6,
 6,
 4,
 6,
 4,
 6,
 8,
 9,
 8,
 0,
 7,
 1,
 5,
 3,
 2,
 7,
 5,
 8,
 8,
 9,
 5,
 3,
 6,
 1,
 6,
 5,
 7,
 5,
 0,
 7,
 6,
 8,
 8,
 1,
 6,
 1,
 9,
 4,
 4,
 2,
 3,
 3,
 8,
 7,
 7,
 4,
 5,
 2,
 0,
 7,
 1,
 1,
 8,
 8,
 5,
 9,
 8,
 6,
 7,
 7,
 9,
 3,
 4,
 6,
 8,


In [36]:
# Model and training hyper-parameters.
sequence_length = x_train.shape[1]  # tokens per (padded) sentence
classes_num = len(labeldic)  # number of distinct labels
vocabulary_size = len(word2vecdic)  # rows of the embedding matrix (includes "PAD")
embedding_size = 300  # width of each word embedding
filters_height = [2, 3, 4]  # convolution filter heights, i.e. tokens covered per filter
filter_num_per_height = [100, 100, 100]  # number of filters for each height above
l2_lambda = 0.01  # weight of the L2 penalty added to the loss
batch_size = 3  # samples per training step
epochs = 2  # passes over the training data
# Fed into the keep_prob placeholder of tf.nn.dropout: this is the probability
# of KEEPING a unit, so 0.2 drops 80% of the pooled features.
drop_keep_prob = 0.2

In [28]:
# Build the text-CNN computation graph (TensorFlow 1.x graph-mode API).
import tensorflow as tf
graph = tf.Graph()
with graph.as_default():
    # Inputs: a batch of token-id sequences, their one-hot labels, and the
    # dropout keep probability.
    train_inputs = tf.placeholder(tf.int32, [None, sequence_length])
    train_labels = tf.placeholder(tf.float32, [None, classes_num])
    keep_prob = tf.placeholder(tf.float32)
    # Accumulator for the L2 penalty of the output layer.
    l2_loss = tf.constant(0.0)
    
    with tf.device('/cpu:0'):
        # embedding layer
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        # Append a trailing channel dimension so conv2d can consume the
        # [batch, sequence_length, embedding_size] embeddings.
        conv_inputs = tf.expand_dims(embed, -1)
        
    # One convolution + max-pool branch per filter height.
    features_pooled = []
    for filter_height, filter_num in zip(filters_height, filter_num_per_height):
        # Each filter spans filter_height tokens and the full embedding width.
        conv_filter = tf.Variable(tf.truncated_normal([filter_height, embedding_size, 1, filter_num], stddev=0.1))
        
        conv = tf.nn.conv2d(conv_inputs, conv_filter, strides=[1, 1, 1, 1], padding="VALID")
        bias = tf.Variable(tf.constant(0.1, shape=[filter_num]))
        feature_map = tf.nn.relu(tf.nn.bias_add(conv, bias))
        # The pooling window covers the whole convolution output along the
        # sequence axis, leaving one value per filter.
        feature_pooled = tf.nn.max_pool(feature_map, ksize=[1, sequence_length - filter_height + 1, 1, 1],
                                            strides=[1, 1, 1, 1],
                                            padding='VALID')
        features_pooled.append(feature_pooled)
        
    filter_num_total = sum(filter_num_per_height)
    # fully connected layer
    # Concatenate all branches along the filter axis and flatten to
    # [batch, filter_num_total] before applying dropout.
    features_pooled_flat = tf.reshape(tf.concat(features_pooled, 3), [-1, filter_num_total])
    features_pooled_flat_drop = tf.nn.dropout(features_pooled_flat, keep_prob) 

    W = tf.get_variable("W", shape=[filter_num_total, classes_num], initializer=tf.contrib.layers.xavier_initializer())
    b = tf.Variable(tf.constant(0.1, shape=[classes_num]))
    # Only the output layer's weights and bias contribute to the L2 penalty.
    l2_loss += tf.nn.l2_loss(W)
    l2_loss += tf.nn.l2_loss(b)
    # Unnormalised class scores (logits).
    scores = tf.nn.xw_plus_b(features_pooled_flat_drop, W, b)
    
    # Mean softmax cross-entropy over the batch plus the weighted L2 penalty.
    losses = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=train_labels)
    loss = tf.reduce_mean(losses) + l2_lambda * l2_loss

    # Accuracy: fraction of samples whose highest-scoring class matches the
    # position of the largest entry in the label vector.
    predictions = tf.argmax(scores, 1)
    correct_predictions = tf.equal(predictions, tf.argmax(train_labels, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))   
    

In [34]:
def get_batch(data, batch_size, num_epochs):
    """Yield shuffled mini-batches of ``data`` for ``num_epochs`` epochs.

    The data is reshuffled at the start of every epoch (using NumPy's global
    random state) and then cut into consecutive batches. The final batch of
    an epoch may be shorter than ``batch_size`` so that no sample is dropped.

    Args:
        data: iterable of samples (e.g. ``zip(x, y)``); it is materialised
            into a list once.
        batch_size: maximum number of samples per batch, at least 1.
        num_epochs: number of passes over the data.

    Yields:
        numpy.ndarray: a slice of the shuffled samples; iterating over it
        gives the individual samples.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    data = list(data)
    data_size = len(data)
    # Convert once, outside the epoch loop.
    try:
        data_array = np.array(data)
    except ValueError:
        # Samples whose parts differ in length (e.g. a 300-id sequence paired
        # with a 10-way one-hot label) cannot form a regular array on recent
        # NumPy versions; store each sample as one object element instead.
        data_array = np.empty(data_size, dtype=object)
        for position, sample in enumerate(data):
            data_array[position] = sample
    # Ceiling division keeps the final, possibly shorter, batch.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for _ in range(num_epochs):
        shuffle_indices = np.random.permutation(np.arange(data_size))
        data_shuffled = data_array[shuffle_indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            # Slicing past the end is clamped automatically.
            yield data_shuffled[start_index:start_index + batch_size]

In [None]:
# Train the text-CNN: one optimizer step per mini-batch, logging loss and
# accuracy after every step.
with tf.Session(graph=graph) as sess:
    # Step counter; incremented by apply_gradients, excluded from training.
    global_step = tf.Variable(0, trainable=False)
    optimizer = tf.train.AdamOptimizer(1e-4)
    # NOTE(review): aggregation_method=2 is a raw integer; presumably one of
    # the tf.AggregationMethod constants -- confirm which one is intended.
    grads_and_vars = optimizer.compute_gradients(loss, aggregation_method=2)
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    # These summary ops are created but never evaluated or written in the
    # code shown here.
    loss_summary = tf.summary.scalar("loss", loss)
    acc_summary = tf.summary.scalar("accuracy", accuracy)

    # NOTE(review): tf.initialize_all_variables is the older name of
    # tf.global_variables_initializer -- confirm against the TF version used.
    sess.run(tf.initialize_all_variables())
    batches = get_batch(zip(x_train, y_train), batch_size, epochs)
    for batch in batches:
        # Split the batch of (x, y) pairs back into inputs and labels.
        x_batch, y_batch = zip(*batch)
        feed_dict = {train_inputs: x_batch, train_labels: y_batch, keep_prob: drop_keep_prob}
        # The reported accuracy comes from the same run as the training step,
        # i.e. with dropout active.
        _, step, _loss, _accuracy = sess.run([train_op, global_step, loss, accuracy], feed_dict)
        time_str = datetime.datetime.now().strftime("%d, %b %Y %H:%M:%S")
        print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, _loss, _accuracy))

25, Apr 2019 21:35:57: step 1, loss 24.8607, acc 0
25, Apr 2019 21:35:57: step 2, loss 9.07715, acc 0.333333
25, Apr 2019 21:35:57: step 3, loss 5.84646, acc 0.333333
25, Apr 2019 21:35:57: step 4, loss 17.2226, acc 0
25, Apr 2019 21:35:57: step 5, loss 18.5765, acc 0.333333
25, Apr 2019 21:35:57: step 6, loss 17.0964, acc 0
25, Apr 2019 21:35:58: step 7, loss 7.30108, acc 0.333333
25, Apr 2019 21:35:58: step 8, loss 28.9857, acc 0
25, Apr 2019 21:35:58: step 9, loss 27.0813, acc 0
25, Apr 2019 21:35:58: step 10, loss 19.3843, acc 0
25, Apr 2019 21:35:58: step 11, loss 17.5641, acc 0
25, Apr 2019 21:35:58: step 12, loss 9.97443, acc 0.333333
25, Apr 2019 21:35:59: step 13, loss 27.7803, acc 0
25, Apr 2019 21:35:59: step 14, loss 20.5349, acc 0.333333
25, Apr 2019 21:35:59: step 15, loss 9.93464, acc 0
25, Apr 2019 21:35:59: step 16, loss 0.105621, acc 1
25, Apr 2019 21:35:59: step 17, loss 16.0312, acc 0
25, Apr 2019 21:35:59: step 18, loss 23.9938, acc 0
25, Apr 2019 21:36:00: step 19

25, Apr 2019 21:36:24: step 153, loss 16.6318, acc 0.333333
25, Apr 2019 21:36:24: step 154, loss 14.3632, acc 0
25, Apr 2019 21:36:24: step 155, loss 14.7603, acc 0
25, Apr 2019 21:36:24: step 156, loss 8.99372, acc 0.666667
25, Apr 2019 21:36:25: step 157, loss 26.745, acc 0
25, Apr 2019 21:36:25: step 158, loss 4.44551, acc 0.333333
25, Apr 2019 21:36:25: step 159, loss 16.4985, acc 0
25, Apr 2019 21:36:25: step 160, loss 18.9258, acc 0
25, Apr 2019 21:36:25: step 161, loss 23.3052, acc 0
25, Apr 2019 21:36:26: step 162, loss 21.0449, acc 0.333333
25, Apr 2019 21:36:26: step 163, loss 28.6778, acc 0
25, Apr 2019 21:36:26: step 164, loss 18.5459, acc 0
25, Apr 2019 21:36:26: step 165, loss 14.1849, acc 0
25, Apr 2019 21:36:26: step 166, loss 11.9146, acc 0
25, Apr 2019 21:36:27: step 167, loss 42.212, acc 0
25, Apr 2019 21:36:27: step 168, loss 33.8663, acc 0
25, Apr 2019 21:36:27: step 169, loss 15.5751, acc 0
25, Apr 2019 21:36:27: step 170, loss 17.2227, acc 0.333333
25, Apr 2019 