In [3]:
import pandas as pd
import numpy as np
from collections import Counter
import tensorflow as tf
import math

In [4]:
train = pd.read_csv('../senta_data/new_train.tsv', sep='\t')
val = pd.read_csv('../senta_data/dev.tsv')
test = pd.read_csv('../senta_data/test.tsv')

ParserError: Error tokenizing data. C error: Expected 1 fields in line 5, saw 2


# 1、构造词典

In [3]:
sentences = ''.join(train.review.values)

In [5]:
words_counter = Counter(words_list)
words_counter['UNK'] = 10000000
words_counter['PAD'] = 10000001

In [6]:
words_counter_list = sorted(words_counter.items(), key=lambda x:x[1], reverse=True)

# 2、构建id2word和word2id

In [7]:
id2word = {i: j[0] for i, j in enumerate(words_counter_list)}
word2id = {j[0]: i for i, j in enumerate(words_counter_list)}

# 3、语料转换成id向量

In [8]:
def generate_ids_features(sentence):
    words = jieba.lcut(sentence)
    ids_feature = [word2id.get(word,word2id['UNK']) for word in words]
    return ids_feature

# 4、添加位置特征

In [9]:
punctuation = [',', '.', '，', '。', '!', '?', '？', '！', ':', '：', ';', '；']

In [10]:
def generate_position_features(sentence):
    words = jieba.lcut(sentence)
    position_feature = []
    i = 0
    for word in words:
        i += 1
        if word not in punctuation:
            position_feature.append(i)
        if word in punctuation:
            position_feature.append(i)
            i = 0
    return position_feature

In [11]:
def get_data(df):
    data = []
    for index, sentence in enumerate(df.review.values):
        ids_feature = generate_ids_features(sentence)
        position_feature = generate_position_features(sentence)
        label = df.label.values[index]
        data.append([ids_feature, position_feature, label])
    return data

In [12]:
train_data = get_data(train)
val_data = get_data(val)
test_data = get_data(test)

# 5、数据集封装

In [13]:
class BatchManager(object):

    def __init__(self, data,  batch_size):
        self.batch_data = self.sort_and_pad(data, batch_size)
        self.len_data = len(self.batch_data)
        self.batch_size = batch_size
        self._indicator = 0

    def sort_and_pad(self, data, batch_size):
        num_batch = int(math.ceil(len(data) / batch_size))
        sorted_data = sorted(data, key=lambda x: len(x[0]))
        batch_data = list()
        for i in range(num_batch):
            batch_data.append(self.pad_data(sorted_data[i*int(batch_size) : (i+1)*int(batch_size)]))
        return batch_data

    @staticmethod
    def pad_data(data):
        ids_features = []
        position_features = []
        targets = []
        max_length = 1277
#         max_length = max([len(sentence[0]) for sentence in data])
        for line in data:
            ids_feature, position_feature, target = line
            padding = [0] * (max_length - len(ids_feature))
            ids_features.append(ids_feature + padding)
            position_features.append(position_feature + padding)
            targets.append(target)
        return [ids_features, position_features, targets]
    
    def next_batch(self):
        end_indicator = self._indicator + 1
        if end_indicator > self.len_data:
            self._indicator = 0
            end_indicator = 1
        if end_indicator > self.len_data:
            raise Execption("batch_size: %d is too large" % batch_size)
        
        batch_data = self.batch_data[self._indicator: end_indicator]
        self._indicator = end_indicator
        return batch_data

In [14]:
train_manager = BatchManager(train_data, 100)

In [15]:
num_words = len(word2id)

In [16]:
with tf.variable_scope('word_embedding', reuse=tf.AUTO_REUSE):
    word_embedding_vec = tf.get_variable(name='word_embedding_vec', shape=[num_words, 100], initializer=tf.contrib.layers.xavier_initializer())
with tf.variable_scope('position_embedding', reuse=tf.AUTO_REUSE):
    position_embedding_vec = tf.get_variable(name='seg_embedding_vec', shape=[num_words, 20], initializer=tf.contrib.layers.xavier_initializer())
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    word_weights = sess.run(word_embedding_vec.read_value())
    position_weights = sess.run(position_embedding_vec.read_value())

# 6、构建模型图

In [17]:
# class Model(object):
#     def __init__(self, num_words, lr, length):
#         self.ids_inputs = tf.placeholder(dtype=tf.int64, shape=[None, length], name='ids_inputs')
#         self.positions_inputs = tf.placeholder(dtype=tf.int64, shape=[None, length], name='positions_inputs')
#         self.target_outputs = tf.placeholder(dtype=tf.int64, shape=[None, 1], name='target_outputs')
#         self.keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')
#         self.num_words = num_words
#         self.lr = lr
#         self.layers = [{'dilation':1}, {'dilation':1}, {'dilation':2}]
        
#     def main(self):
#         embedding_matrix = self.embedding_layers(self.ids_inputs, self.positions_inputs)
#         model_inputs = tf.nn.dropout(embedding_matrix, self.keep_prob)
#         model_outputs = self.IDCNN_layer(model_inputs)
#         logits = self.project_layer_idcnn(model_outputs)
#         loss = self.loss_layer(logits)
#         opt = tf.train.AdamOptimizer(self.lr)
#         grads_vars = opt.compute_gradients(loss)
#         clip_gradients = [(tf.clip_by_value(t=grad, clip_value_max=5, clip_value_min=-5),var)
#                       for grad, var in grads_vars if grad is not None]
#         train_op = opt.apply_gradients(clip_gradients)
#         return loss, logits
        
#     def embedding_layers(self, ids_inputs, positions_inputs):
#         embedding_ids_matrix = tf.nn.embedding_lookup(word_weights, ids_inputs)
#         embedding_positions_matrix = tf.nn.embedding_lookup(position_weights, positions_inputs)
#         embedding_matrix = tf.concat([embedding_ids_matrix, embedding_positions_matrix], axis=-1)
#         return embedding_matrix
        
#     def IDCNN_layer(self, model_inputs):
#         model_inputs = tf.expand_dims(model_inputs, 1)
#         with tf.variable_scope("idcnn", reuse=tf.AUTO_REUSE):
#             layerInput = tf.layers.conv2d(inputs=model_inputs,
#                                         filters=100,
#                                         kernel_size=(1, 3),
#                                         padding='same',
#                                         activation=tf.nn.relu,
#                                         name='conv'
#                                        )
#             print('layerInput',layerInput)
#             finalOutFromLayers = []
#             totalWidthForLastDim = 0
#             for j in range(4):
#                 for i in range(len(self.layers)):
#                     dilation = self.layers[i]['dilation']
#                     isLast = True if i == (len(self.layers) - 1) else False
#                     with tf.variable_scope("atrous-conv-layer-%d" % i,reuse=tf.AUTO_REUSE):
#                         conv = tf.layers.conv2d(inputs=layerInput,
#                                                     filters=100,
#                                                     kernel_size=(1, 3),
#                                                     padding='same',
#                                                     activation=tf.nn.relu,
#                                                     dilation_rate=dilation,
#                                                     name='conv_%d_%d'%(j,i))
#                         print('atrous_conv_%d'%i,conv)
#                         if isLast:
#                             finalOutFromLayers.append(conv)
#                             totalWidthForLastDim += 100
#                         layerInput = conv
#             finalOut = tf.concat(axis=3, values=finalOutFromLayers)
#             finalOut = tf.nn.dropout(finalOut, self.keep_prob)
#             print(finalOut)
#             finalOut = tf.squeeze(finalOut, [1])
#             print(finalOut)
#             print(totalWidthForLastDim)
#         return finalOut
        
#     def project_layer_idcnn(self, model_outputs):
#         flatten = tf.layers.flatten(model_outputs)
#         logits = tf.layers.dense(flatten, 1)
#         return logits
        
#     def loss_layer(self, logits):
#         loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=self.target_outputs, logits=logits)
# #             loss = tf.losses.sparse_softmax_cross_entropy(labels=target_outputs, logits=pred)
#         return loss
        
#     def create_feed_dict(self, is_train, batch):
#         ids, positions, targets = batch[0]
#         feed_dict = {
#                 self.ids_inputs: np.asarray(ids),
#                 self.positions_inputs: np.asarray(positions),
#                 self.keep_prob: 1.0,
#             }
#         if is_train:
#             feed_dict[self.target_outputs] = np.asarray(targets).reshape(-1,1)
#             feed_dict[self.keep_prob] = 0.5
#         return feed_dict
    
#     def run_step(self, sess, is_train, batch):
#         feed_dict = self.create_feed_dict(is_train, batch)
#         loss, logits = self.main()
#         if is_train:
#             loss = sess.run(loss, feed_dict)
#             return loss
#         else:
#             logits = sess.run(logits, feed_dict)
#             return logits

In [18]:
# with tf.Session() as sess:
#     for i in range(100):
#         batch_data = train_manager.next_batch()
#         length = len(batch_data[0][0][0])
#         model = Model(num_words, 0.0003, length)
#         print(model.ids_inputs) 
#         sess.run(tf.global_variables_initializer())
#         loss = model.run_step(sess, True, batch_data)
#         print(loss)
# #     for i in range(100):
# #         batch_data = train_manager.next_batch()
# #         feed_dict = model.create_feed_dict(True, batch_data)
# #         loss = model.run_step(sess, True, batch_data, feed_dict)
# #         print(loss)

In [19]:
def creat_model():
    ids_inputs = tf.placeholder(dtype=tf.int32, shape=[None, 1277])
    positions_inputs = tf.placeholder(dtype=tf.int32, shape=[None, 1277])
    target_outputs = tf.placeholder(dtype=tf.int32, shape=[None,])
    keep_prob = tf.placeholder(dtype=tf.float32)
    embedding_ids_matrix = tf.nn.embedding_lookup(word_weights, ids_inputs)
    embedding_positions_matrix = tf.nn.embedding_lookup(position_weights, positions_inputs)
    embedding_matrix = tf.concat([embedding_ids_matrix, embedding_positions_matrix], axis=-1)
    layers = [{'dilation':1}, {'dilation':1}, {'dilation':2}]
    model_inputs = tf.nn.dropout(embedding_matrix, keep_prob=keep_prob)
    model_inputs = tf.expand_dims(model_inputs, 1)
    
    with tf.variable_scope("idcnn", reuse=tf.AUTO_REUSE):
        layerInput = tf.layers.conv2d(inputs=model_inputs,
                                filters=100,
                                kernel_size=(1, 3),
                                padding='same',
                                activation=tf.nn.relu,
                                name='conv'
                               )
        print('layerInput',layerInput)
        finalOutFromLayers = []
        totalWidthForLastDim = 0
        for j in range(4):
            for i in range(len(layers)):
                dilation = layers[i]['dilation']
                isLast = True if i == (len(layers) - 1) else False
                with tf.variable_scope("atrous-conv-layer-%d" % i,reuse=tf.AUTO_REUSE):
                    conv = tf.layers.conv2d(inputs=layerInput,
                                            filters=100,
                                            kernel_size=(1, 3),
                                            padding='same',
                                            activation=tf.nn.relu,
                                            dilation_rate=dilation,
                                            name='conv_%d_%d'%(j,i))
                    print('atrous_conv_%d'%i,conv)
                    if isLast:
                        finalOutFromLayers.append(conv)
                        totalWidthForLastDim += 100
                    layerInput = conv
        finalOut = tf.concat(axis=3, values=finalOutFromLayers)
        finalOut = tf.nn.dropout(finalOut, keep_prob)
        print(finalOut)
            #Removes dimensions of size 1 from the shape of a tensor. 
                #从tensor中删除所有大小是1的维度
            
                #Given a tensor input, this operation returns a tensor of the same type with all dimensions of size 1 removed. If you don’t want to remove all size 1 dimensions, you can remove specific size 1 dimensions by specifying squeeze_dims. 
            
                #给定张量输入，此操作返回相同类型的张量，并删除所有尺寸为1的尺寸。 如果不想删除所有尺寸1尺寸，可以通过指定squeeze_dims来删除特定尺寸1尺寸。
        finalOut = tf.squeeze(finalOut, [1])
        print(finalOut)
        print(totalWidthForLastDim)
        
    with tf.name_scope('project'):
        flatten = tf.layers.flatten(finalOut)
        print(flatten)
        logits = tf.layers.dense(flatten, 2)
        print(logits)
        
    with tf.name_scope('metrics'):
        softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits = logits, labels = target_outputs)
        loss = tf.reduce_mean(softmax_loss)
        # [0, 1, 5, 4, 2] -> argmax: 2
        y_pred = tf.argmax(tf.nn.softmax(logits),
                           1, 
                           output_type = tf.int32)
        correct_pred = tf.equal(target_outputs, y_pred)
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
#     with tf.variable_scope('loss', reuse=tf.AUTO_REUSE):
#         loss = tf.losses.sparse_softmax_cross_entropy(labels=target_outputs, logits=pred)
    with tf.name_scope('train_op'):
        opt = tf.train.AdamOptimizer(0.003)
        grads_vars = opt.compute_gradients(loss)
        # with tf.Session() as sess:
        #     print(sess.run(grads_vars))
        clip_gradients = [(tf.clip_by_value(t=grad, clip_value_max=5, clip_value_min=-5),var)
                      for grad, var in grads_vars if grad is not None]
        train_op = opt.apply_gradients(clip_gradients)
    return ids_inputs,positions_inputs,target_outputs,keep_prob,loss,accuracy,train_op
ids_inputs,positions_inputs,target_outputs,keep_prob,loss,accuracy,train_op = creat_model()

layerInput Tensor("idcnn/conv/Relu:0", shape=(?, 1, 1277, 100), dtype=float32)
atrous_conv_0 Tensor("idcnn/atrous-conv-layer-0/conv_0_0/Relu:0", shape=(?, 1, 1277, 100), dtype=float32)
atrous_conv_1 Tensor("idcnn/atrous-conv-layer-1/conv_0_1/Relu:0", shape=(?, 1, 1277, 100), dtype=float32)
atrous_conv_2 Tensor("idcnn/atrous-conv-layer-2/conv_0_2/Relu:0", shape=(?, 1, 1277, 100), dtype=float32)
atrous_conv_0 Tensor("idcnn/atrous-conv-layer-0_1/conv_1_0/Relu:0", shape=(?, 1, 1277, 100), dtype=float32)
atrous_conv_1 Tensor("idcnn/atrous-conv-layer-1_1/conv_1_1/Relu:0", shape=(?, 1, 1277, 100), dtype=float32)
atrous_conv_2 Tensor("idcnn/atrous-conv-layer-2_1/conv_1_2/Relu:0", shape=(?, 1, 1277, 100), dtype=float32)
atrous_conv_0 Tensor("idcnn/atrous-conv-layer-0_2/conv_2_0/Relu:0", shape=(?, 1, 1277, 100), dtype=float32)
atrous_conv_1 Tensor("idcnn/atrous-conv-layer-1_2/conv_2_1/Relu:0", shape=(?, 1, 1277, 100), dtype=float32)
atrous_conv_2 Tensor("idcnn/atrous-conv-layer-2_2/conv_2_2/Relu

In [20]:
init_op = tf.global_variables_initializer()
train_keep_prob_value = 0.5
test_keep_prob_value = 1.0
with tf.Session() as sess:
    sess.run(init_op)
    for i in range(10000):
        batch_data = train_manager.next_batch()
        loss_val, acc, _ = sess.run([loss, accuracy, train_op], feed_dict={ids_inputs: batch_data[0][0],
                                        positions_inputs: batch_data[0][1],
                                        target_outputs: batch_data[0][2],
                                        keep_prob:train_keep_prob_value})
        if (i + 1) % 100 == 0:
            print(loss_val, acc)

0.6910472 0.61
0.69325656 0.48
0.6911047 0.73
0.6953199 0.39
0.6930731 0.52
0.6905945 0.73
0.6954097 0.39
0.69307417 0.52
0.69058967 0.73
0.69542384 0.39
0.69307554 0.52
0.69058985 0.73
0.69543093 0.39
0.6930763 0.52
0.6905899 0.73
0.69543487 0.39
0.6930769 0.52
0.6905899 0.73
0.69543755 0.39
0.69307864 0.52
0.69058937 0.73
0.6954394 0.39
0.6930774 0.52
0.69059014 0.73
0.69544053 0.39
0.69307756 0.52
0.69059014 0.73
0.69544137 0.39
0.69307756 0.52
0.69059014 0.73
0.6954419 0.39
0.6930777 0.52
0.69059014 0.73
0.69544244 0.39
0.6930777 0.52
0.69059014 0.73
0.6954427 0.39
0.6930778 0.52
0.69059014 0.73
0.695443 0.39
0.69307786 0.52
0.69059014 0.73
0.6954431 0.39
0.6930778 0.52
0.69059014 0.73
0.6954432 0.39
0.6930778 0.52
0.69059014 0.73
0.6954433 0.39
0.69307786 0.52
0.69059014 0.73
0.6954434 0.39
0.69307786 0.52
0.69059014 0.73
0.6954434 0.39
0.69307786 0.52
0.69059014 0.73
0.6954435 0.39
0.69307786 0.52
0.69059014 0.73
0.6954434 0.39
0.69307786 0.52
0.69059014 0.73
0.6954435 0.39
0.693

In [21]:
# def IDCNN_layer(model_inputs):
#     model_inputs = tf.nn.dropout(model_inputs, keep_prob=0.5)
#     model_inputs = tf.expand_dims(model_inputs, 1)
#     print(model_inputs)
#     with tf.variable_scope("idcnn", reuse=tf.AUTO_REUSE):
#         layerInput = tf.layers.conv2d(inputs=model_inputs,
#                                 filters=100,
#                                 kernel_size=(1, 3),
#                                 padding='same',
#                                 activation=tf.nn.relu,
#                                 name='conv'
#                                )
#         print('layerInput',layerInput)
#         finalOutFromLayers = []
#         totalWidthForLastDim = 0
#         for j in range(4):
#             for i in range(len(layers)):
#                 dilation = layers[i]['dilation']
#                 isLast = True if i == (len(layers) - 1) else False
#                 with tf.variable_scope("atrous-conv-layer-%d" % i,reuse=tf.AUTO_REUSE):
#                     conv = tf.layers.conv2d(inputs=layerInput,
#                                             filters=100,
#                                             kernel_size=(1, 3),
#                                             padding='same',
#                                             activation=tf.nn.relu,
#                                             dilation_rate=dilation,
#                                             name='conv_%d_%d'%(j,i))
#                     print('atrous_conv_%d'%i,conv)
#                     if isLast:
#                         finalOutFromLayers.append(conv)
#                         totalWidthForLastDim += 100
#                     layerInput = conv
#         finalOut = tf.concat(axis=3, values=finalOutFromLayers)
#         keepProb = 0.5
#         finalOut = tf.nn.dropout(finalOut, keepProb)
#         print(finalOut)
#             #Removes dimensions of size 1 from the shape of a tensor. 
#                 #从tensor中删除所有大小是1的维度
            
#                 #Given a tensor input, this operation returns a tensor of the same type with all dimensions of size 1 removed. If you don’t want to remove all size 1 dimensions, you can remove specific size 1 dimensions by specifying squeeze_dims. 
            
#                 #给定张量输入，此操作返回相同类型的张量，并删除所有尺寸为1的尺寸。 如果不想删除所有尺寸1尺寸，可以通过指定squeeze_dims来删除特定尺寸1尺寸。
#         finalOut = tf.squeeze(finalOut, [1])
#         print(finalOut)
#         print(totalWidthForLastDim)
#         return finalOut

In [22]:
# def project_layer_idcnn(idcnn_outputs, name=None):
#     with tf.variable_scope('project', reuse=tf.AUTO_REUSE):
#         flatten = tf.layers.flatten(idcnn_outputs)
#         print(flatten)
#         pred = tf.layers.dense(flatten, 2)
#         print(pred)
#     return pred

In [23]:
# with tf.variable_scope('loss', reuse=tf.AUTO_REUSE):
#     loss = tf.losses.sparse_softmax_cross_entropy(labels=target_outputs, logits=project_logits)

In [24]:
# opt = tf.train.AdamOptimizer(0.0003)
# grads_vars = opt.compute_gradients(loss)
# # with tf.Session() as sess:
# #     print(sess.run(grads_vars))
# clip_gradients = [(tf.clip_by_value(t=grad, clip_value_max=5, clip_value_min=-5),var)
#                   for grad, var in grads_vars if grad is not None]
# train_op = opt.apply_gradients(clip_gradients)

In [25]:
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     for i in range(100):
#         batch_data = train_manager.next_batch()
#         _, loss = sess.run([train_op, loss], feed_dict={ids_inputs: batch_data[0][0],
#                                                        positions_inputs: batch_data[0][1],
#                                                        target_outputs: batch_data[0][2]})
#         print(loss)