In [1]:
from collections import Counter
import numpy as np
import tensorflow.contrib.keras as kr
import tensorflow as tf
import time
from datetime import timedelta
import os
from sklearn import metrics

In [2]:
trainDataPath = "c1_train.txt"
devDataPath = "c1_dev.txt"
testDataPath = "c1_test.txt"
vocabPath = r'D:\lab_data\之江lab\实验模型\glove.6B\glove.6B.50d.txt'
savePath = "c1_cnn\c1_cnn"

In [3]:
categories = ['Retrieve Value', 'Filter', 'Compute Derived Value', 'Find Extremum', 'Sort', 
              'Determine Range', 'Characterize Distribution', 'Find Anomalies', 'Cluster', 'Correlate']
cat_to_id = dict(zip(categories, range(1,len(categories)+1)))
id_to_cat = dict(zip(range(1,len(categories)+1), categories))

In [4]:
id_to_cat

{1: 'Retrieve Value',
 2: 'Filter',
 3: 'Compute Derived Value',
 4: 'Find Extremum',
 5: 'Sort',
 6: 'Determine Range',
 7: 'Characterize Distribution',
 8: 'Find Anomalies',
 9: 'Cluster',
 10: 'Correlate'}

In [5]:
cat_to_id

{'Retrieve Value': 1,
 'Filter': 2,
 'Compute Derived Value': 3,
 'Find Extremum': 4,
 'Sort': 5,
 'Determine Range': 6,
 'Characterize Distribution': 7,
 'Find Anomalies': 8,
 'Cluster': 9,
 'Correlate': 10}

In [6]:
def readfile(filePath):
    """读取文件内容，返回文本和标签列表"""
    contents, labels = [], []
    with open(filePath, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                word = line.lower().strip().split()
                label = word[0].split(":")[0]
                content = word[1:]
                
                contents.append(content)
                labels.append(label)
            except:
                pass
    return contents, labels


def readCategory():
    """读取分类目录，固定id"""
    '''
    Retrieve Value
    Filter
    Compute Derived Value
    Find Extremum
    Sort
    Determine Range
    Characterize Distribution
    Find Anomalies
    Cluster
    Correlate
    '''
    categories = ['Retrieve Value', 'Filter', 'Compute Derived Value', 'Find Extremum', 'Sort', 
                  'Determine Range', 'Characterize Distribution', 'Find Anomalies', 'Cluster', 'Correlate']
    cat_to_id = dict(zip(categories, range(1,len(categories)+1)))
    id_to_cat = dict(zip(range(1,len(categories)+1), categories))
    return id_to_cat, cat_to_id


def loadGloVe(filename, emb_size=50):
    vocab = []
    embd = []
    print('Loading GloVe!')
    vocab.append('unk') #装载不认识的词
    embd.append([0] * emb_size) #这个emb_size可能需要指定
    file = open(filename,'r',encoding='utf-8')
    for line in file.readlines():
        row = line.strip().split(' ')
        vocab.append(row[0])
        embd.append(row[1:])
    file.close()
    print('Completed!')
    return vocab,embd

In [7]:
contents_train, labels_train = readfile(trainDataPath)
contents_dev, labels_dev = readfile(devDataPath)
contents_test, labels_test = readfile(testDataPath)

In [8]:
id_to_cat, cat_to_id = readCategory()
num_classes = len(id_to_cat)

In [9]:
num_classes, labels_train[0]

(10, '1')

In [10]:
contents_all = contents_train + contents_dev + contents_test
seq_length = 0
for content in contents_all:
    if seq_length < len(content):
        seq_length = len(content)   # seq_length = 35

In [11]:
seq_length

35

In [12]:
vocab, embd = loadGloVe(vocabPath)
vocab_size = len(vocab)
embedding_dim = len(embd[0])
embedding = np.asarray(embd)

Loading GloVe!
Completed!


In [21]:
embedding[2]

array(['0.013441', '0.23682', '-0.16899', '0.40951', '0.63812', '0.47709',
       '-0.42852', '-0.55641', '-0.364', '-0.23938', '0.13001',
       '-0.063734', '-0.39575', '-0.48162', '0.23291', '0.090201',
       '-0.13324', '0.078639', '-0.41634', '-0.15428', '0.10068',
       '0.48891', '0.31226', '-0.1252', '-0.037512', '-1.5179', '0.12612',
       '-0.02442', '-0.042961', '-0.28351', '3.5416', '-0.11956',
       '-0.014533', '-0.1499', '0.21864', '-0.33412', '-0.13872',
       '0.31806', '0.70358', '0.44858', '-0.080262', '0.63003', '0.32111',
       '-0.46765', '0.22786', '0.36034', '-0.37818', '-0.56657',
       '0.044691', '0.30392'], dtype='<U11')

In [27]:
eee = embd[1:5]
eee[0]

['0.418',
 '0.24968',
 '-0.41242',
 '0.1217',
 '0.34527',
 '-0.044457',
 '-0.49688',
 '-0.17862',
 '-0.00066023',
 '-0.6566',
 '0.27843',
 '-0.14767',
 '-0.55677',
 '0.14658',
 '-0.0095095',
 '0.011658',
 '0.10204',
 '-0.12792',
 '-0.8443',
 '-0.12181',
 '-0.016801',
 '-0.33279',
 '-0.1552',
 '-0.23131',
 '-0.19181',
 '-1.8823',
 '-0.76746',
 '0.099051',
 '-0.42125',
 '-0.19526',
 '4.0071',
 '-0.18594',
 '-0.52287',
 '-0.31681',
 '0.00059213',
 '0.0074449',
 '0.17778',
 '-0.15897',
 '0.012041',
 '-0.054223',
 '-0.29871',
 '-0.15749',
 '-0.34758',
 '-0.045637',
 '-0.44251',
 '0.18785',
 '0.0027849',
 '-0.18411',
 '-0.11514',
 '-0.78581']

In [13]:
word_to_id = dict(zip(vocab, range(vocab_size)))

In [14]:
len(embedding),embedding_dim,vocab_size

(400001, 50, 400001)

In [15]:
def process_file(contents, labels, word_to_id, num_classes, pad_max_length):
    """
    将文件转换为id表示,并且将每个单独的样本长度固定为pad_max_lengtn
    """
    # contents, labels = readfile(filePath)
    data_id, label_id = [], []
    # 将文本内容转换为对应的id形式
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(int(labels[i])-1)
    # 使用keras提供的pad_sequences来将文本pad为固定长度
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, pad_max_length)
    ''' https://blog.csdn.net/TH_NUM/article/details/80904900
    pad_sequences(sequences, maxlen=None, dtype=’int32’, padding=’pre’, truncating=’pre’, value=0.) 
        sequences：浮点数或整数构成的两层嵌套列表
        maxlen：None或整数，为序列的最大长度。大于此长度的序列将被截短，小于此长度的序列将在后部填0.
        dtype：返回的numpy array的数据类型
        padding：‘pre’或‘post’，确定当需要补0时，在序列的起始还是结尾补
        truncating：‘pre’或‘post’，确定当需要截断序列时，从起始还是结尾截断
        value：浮点数，此值将在填充时代替默认的填充值0
    '''
    y_pad = kr.utils.to_categorical(label_id, num_classes=num_classes)  # 将标签转换为one-hot表示
    ''' https://blog.csdn.net/nima1994/article/details/82468965
    to_categorical(y, num_classes=None, dtype='float32')
        将整型标签转为onehot。y为int数组，num_classes为标签类别总数，大于max(y)（标签从0开始的）。
        返回：如果num_classes=None，返回len(y) * [max(y)+1]（维度，m*n表示m行n列矩阵，下同），否则为len(y) * num_classes。
    '''
    return x_pad, y_pad


def get_time_dif(start_time):
    """获取已使用时间"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

In [16]:
print("Loading training and validation and testing data...")
start_time = time.time()
x_train, y_train = process_file(contents_train, labels_train, word_to_id, num_classes, seq_length)  # seq_length = 600
x_dev, y_dev = process_file(contents_dev, labels_dev, word_to_id, num_classes, seq_length)
x_test, y_test = process_file(contents_test, labels_test, word_to_id, num_classes, seq_length)
time_dif = get_time_dif(start_time)
print("Loading data Time usage:", time_dif)

Loading training and validation and testing data...
Loading data Time usage: 0:00:00


In [17]:
x_train[0], y_train[0]

(array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,  103,   15,    1,  486,    4, 4792,   14,  376, 2694,
         813,  189]), array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

In [18]:
contents_train[0]

['what',
 'is',
 'the',
 'population',
 'of',
 'arkansas',
 'on',
 'july',
 '1st',
 '2002',
 '?']

In [19]:
word_to_id["arkansas"]

4792

In [29]:
print(embedding_inputs)

Tensor("embedding_lookup_2/Reshape_1:0", shape=(1, 1), dtype=string)


In [50]:
eee=[[  7, -10,  -7],
       [  7,  -1,  -2],
       [  3,  -9,  -2],
       [ -6,  -6,   5],
       [ -6,   0,  -6]]

In [49]:
eee

[[7, -10, -7], [7, -1, -2], [3, -9, -2], [-6, -6, 5], [-6, 0, -6]]

In [43]:
eeenp = np.array(eee)
eeenp

array([[  7, -10,  -7],
       [  7,  -1,  -2],
       [  3,  -9,  -2],
       [ -6,  -6,   5],
       [ -6,   0,  -6]])

In [51]:
eeenp2 = np.asarray(eee)
eeenp2

array([[  7, -10,  -7],
       [  7,  -1,  -2],
       [  3,  -9,  -2],
       [ -6,  -6,   5],
       [ -6,   0,  -6]])

In [47]:
embedding_inputs = tf.nn.embedding_lookup(eee, [[1]])
embedding_inputs_eeenp = tf.nn.embedding_lookup(eeenp, [[1]])
embedding_inputs_eeenp2 = tf.nn.embedding_lookup(eeenp2, [[1]])

In [48]:
print(session.run(embedding_inputs))
print(session.run(embedding_inputs_eeenp))
print(session.run(embedding_inputs_eeenp2))

[[7]]
[[[ 7 -1 -2]]]
[[[ 7 -1 -2]]]


In [20]:
# ======================================================CNN Model Start===============================================
# 输入内容及对应的标签
input_x = tf.placeholder(tf.int32, [None, seq_length], name='input_x')
input_y = tf.placeholder(tf.float32, [None, num_classes], name='input_y')
# dropout的损失率
keep_prob = tf.placeholder(tf.float32, name='keep_prob')

# 词向量映射;实际上此处的词向量并不是用的预训练好的词向量，而是未经任何训练直接生成了一个矩阵，将此矩阵作为词向量矩阵使用，效果也还不错。
# 若使用训练好的词向量，或许训练此次文本分类的模型时会更快，更好。
# embedding = tf.get_variable('embedding', [vocab_size, embedding_dim])
embedding_inputs = tf.nn.embedding_lookup(embedding, input_x)

  if params is None or params in ((), []):


KeyboardInterrupt: 

In [None]:
num_filters = 256
kernel_size = 5
hidden_dim = 128
learning_rate = 1e-3
dropout_keep_prob = 0.5

num_epochs = 20
batch_size = 64
print_per_batch = 100  # 每多少轮输出一次结果
save_per_batch = 10  # 每多少轮存入tensorboard


# CNN layer
conv = tf.layers.conv1d(embedding_inputs, num_filters, kernel_size, name='conv')  # num_filters = 256 这是个啥
''' https://blog.csdn.net/khy19940520/article/details/89934335
tf.layers.conv1d：一维卷积一般用于处理文本数据，常用语自然语言处理中，输入一般是文本经过embedding的二维数据。
    inputs： 输入tensor， 维度(batch_size, seq_length, embedding_dim) 是一个三维的tensor；其中，
        batch_size指每次输入的文本数量；
        seq_length指每个文本的词语数或者单字数；
        embedding_dim指每个词语或者每个字的向量长度；
        例如每次训练输入2篇文本，每篇文本有100个词，每个词的向量长度为20，那input维度即为(2, 100, 20)。
    filters：过滤器（卷积核）的数目
    kernel_size：卷积核的大小，卷积核本身应该是二维的，这里只需要指定一维，因为第二个维度即长度与词向量的长度一致，卷积核只能从上往下走，不能从左往右走，即只能按照文本中词的顺序，也是列的顺序。
'''
# global max pooling layer
gmp = tf.reduce_max(conv, reduction_indices=[1], name='gmp')  # https://blog.csdn.net/lllxxq141592654/article/details/85345864

# 全连接层，后面接dropout以及relu激活
fc = tf.layers.dense(gmp, hidden_dim, name='fc1')  # hidden_dim：128
''' https://blog.csdn.net/yangfengling1023/article/details/81774580
dense ：全连接层  inputs：输入该网络层的数据；units：输出的维度大小，改变inputs的最后一维
'''
fc = tf.nn.dropout(fc, keep_prob)
fc = tf.nn.relu(fc)

# 分类器
logits = tf.layers.dense(fc, num_classes, name='fc2')
y_pred_cls = tf.argmax(tf.nn.softmax(logits), 1)  # 预测类别 tf.argmax：返回每一行或每一列的最大值 1为里面（每一行），0为外面（每一列）

# 损失函数，交叉熵
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=input_y)
loss = tf.reduce_mean(cross_entropy)
# 优化器
optim = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# 准确率
correct_pred = tf.equal(tf.argmax(input_y, 1), y_pred_cls)
acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# ======================================================CNN Model End============================================

In [None]:
# 创建session
session = tf.Session()
saver = tf.train.Saver()
session.run(tf.global_variables_initializer())

In [None]:
# ======================================================Train Start===============================================
# 训练模型的代码，如果要重新训练则打开注释即可。因为后面调用了已训练好的模型，所以此处先注释掉。
print('Training and evaluating...')
start_time = time.time()
total_batch = 0  # 总批次
best_acc_val = 0.0  # 最佳验证集准确率
last_improved = 0  # 记录上一次提升批次
require_improvement = 500  # 如果超过1000轮未提升，提前结束训练
flag = False

for epoch in range(num_epochs):  # 20
    print('Epoch:', epoch + 1)
    batch_train = batch_iter(x_train, y_train, batch_size)
    for x_batch, y_batch in batch_train:
        feed_dict = {input_x: x_batch, input_y: y_batch, keep_prob: dropout_keep_prob}
        session.run(optim, feed_dict=feed_dict)  # 运行优化
        total_batch += 1

        if total_batch % print_per_batch == 0:
            # 每多少轮次输出在训练集和验证集上的性能
            feed_dict[keep_prob] = 1.0
            loss_train, acc_train = session.run([loss, acc], feed_dict=feed_dict)
            loss_val, acc_val = evaluate(session, x_val, y_val, loss, acc)
            if acc_val > best_acc_val:
                # 保存最好结果
                best_acc_val = acc_val
                last_improved = total_batch
                saver.save(sess=session, save_path=savePath)
                improved_str = '*'
            else:
                improved_str = ''

            time_dif = get_time_dif(start_time)
            msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                  + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
            print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

        if total_batch - last_improved > require_improvement:
            # 验证集正确率长期不提升，提前结束训练
            print("No optimization for a long time, auto-stopping...")
            flag = True
            break  # 跳出循环
    if flag:  # 同上
        break
# ======================================================Train End===============================================

In [None]:
def batch_iter(x_pad, y_pad, batch_size):
    """生成批次数据"""
    data_len = len(x_pad)
    num_batch = int((data_len - 1) / batch_size) + 1
    # np.arange()生成0到data_len的等差数列，默认等差为1；np.random.permutation()打乱生成的等差序列的顺序
    # 下面三句语句是为了将训练或测试文本的顺序打乱，因为原文本中每个分类的样本全部挨在一起，这样每个batch训练的都是同一个分类，不太好，打乱后每个batch可包含不同分类
    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x_pad[indices]
    y_shuffle = y_pad[indices]

    # 返回所有batch的数据
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
        
        
def evaluate(sess, x_pad, y_pad, loss1, acc1):
    """评估在某一数据上的准确率和损失"""
    data_len = len(x_pad)
    batch_eval = batch_iter(x_pad, y_pad, batch_size)  # 128
    total_loss = 0.0
    total_acc = 0.0
    # print(dropout_keep_prob)
    for x_batch1, y_batch1 in batch_eval:
        batch_len = len(x_batch1)
        feed_dict1 = {input_x: x_batch1, input_y: y_batch1, keep_prob: 1.0}
        lossTmp, accTmp = sess.run([loss1, acc1], feed_dict=feed_dict1)
        total_loss += lossTmp * batch_len
        total_acc += accTmp * batch_len

    return total_loss / data_len, total_acc / data_len