In [1]:
import numpy as np
import tensorflow.contrib.keras as kr
import tensorflow as tf
import string

vocabPath = r"D:\lab_data\之江lab\AIforVis\model\glove.6B.100d.txt"
savePath = r"D:\lab_data\之江lab\AIforVis\model\c1_cnn_2"


def loadGloVe(filename):
    vocab = []
    embd = []
    print('Loading GloVe!')
    file = open(filename, 'r', encoding='utf-8')
    for line in file.readlines():
        row = line.strip().split(' ')
        vocab.append(row[0])
        embd.append([float(ei) for ei in row[1:]])
    file.close()
    print('Completed!')
    return vocab, embd


categories = ['Retrieve Value', 'Filter', 'Compute Derived Value', 'Find Extremum', 'Sort',
              'Determine Range', 'Characterize Distribution', 'Find Anomalies', 'Cluster', 'Correlate']
num_classes = len(categories)
seq_length = 35

vocab, embd = loadGloVe(vocabPath)
vocab_size = len(vocab)
embedding_dim = len(embd[0])
embedding = np.asarray(embd)
word_to_id = dict(zip(vocab, range(vocab_size)))

# ======================================================CNN Model Start===============================================
# 输入内容及对应的标签
input_x = tf.placeholder(tf.int32, [None, seq_length], name='input_x')
input_y = tf.placeholder(tf.float32, [None, num_classes], name='input_y')
# dropout的损失率
keep_prob = tf.placeholder(tf.float64, name='keep_prob')

# 词向量映射;实际上此处的词向量并不是用的预训练好的词向量，而是未经任何训练直接生成了一个矩阵，将此矩阵作为词向量矩阵使用，效果也还不错。
# 若使用训练好的词向量，或许训练此次文本分类的模型时会更快，更好。
# embedding = tf.get_variable('embedding', [vocab_size, embedding_dim])
embedding_inputs = tf.nn.embedding_lookup(embedding, input_x)

num_filters = 256
kernel_size = 5
hidden_dim = 128
learning_rate = 1e-3
dropout_keep_prob = 0.5

num_epochs = 20
batch_size = 64
print_per_batch = 20  # 每多少轮输出一次结果
# save_per_batch = 5  # 每多少轮存入tensorboard


# CNN layer
conv = tf.layers.conv1d(embedding_inputs, num_filters, kernel_size, name='conv')  # num_filters = 256 这是个啥
''' https://blog.csdn.net/khy19940520/article/details/89934335
tf.layers.conv1d：一维卷积一般用于处理文本数据，常用语自然语言处理中，输入一般是文本经过embedding的二维数据。
    inputs： 输入tensor， 维度(batch_size, seq_length, embedding_dim) 是一个三维的tensor；其中，
        batch_size指每次输入的文本数量；
        seq_length指每个文本的词语数或者单字数；
        embedding_dim指每个词语或者每个字的向量长度；
        例如每次训练输入2篇文本，每篇文本有100个词，每个词的向量长度为20，那input维度即为(2, 100, 20)。
    filters：过滤器（卷积核）的数目
    kernel_size：卷积核的大小，卷积核本身应该是二维的，这里只需要指定一维，因为第二个维度即长度与词向量的长度一致，卷积核只能从上往下走，不能从左往右走，即只能按照文本中词的顺序，也是列的顺序。
'''
# global max pooling layer
gmp = tf.reduce_max(conv, reduction_indices=[1],
                    name='gmp')  # https://blog.csdn.net/lllxxq141592654/article/details/85345864

# 全连接层，后面接dropout以及relu激活
fc = tf.layers.dense(gmp, hidden_dim, name='fc1')  # hidden_dim：128
''' https://blog.csdn.net/yangfengling1023/article/details/81774580
dense ：全连接层  inputs：输入该网络层的数据；units：输出的维度大小，改变inputs的最后一维
'''
fc = tf.nn.dropout(fc, keep_prob)
fc = tf.nn.relu(fc)

# 分类器
logits = tf.layers.dense(fc, num_classes, name='fc2')
y_pred_cls = tf.argmax(tf.nn.softmax(logits), 1)  # 预测类别 tf.argmax：返回每一行或每一列的最大值 1为里面（每一行），0为外面（每一列）

# 损失函数，交叉熵
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=input_y)
loss = tf.reduce_mean(cross_entropy)
# 优化器
optim = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# 准确率
correct_pred = tf.equal(tf.argmax(input_y, 1), y_pred_cls)
acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# ======================================================CNN Model End============================================

# 创建session
session = tf.Session()
saver = tf.train.Saver()
session.run(tf.global_variables_initializer())
saver.restore(sess=session, save_path=savePath)


def preprocess_sentence(sent):
    new_sent = ''
    for i in range(len(sent)):
        if sent[i] in string.punctuation:
            if i > 0 and i < len(sent) - 1:
                if sent[i - 1] != ' ':
                    new_sent += ' ' + sent[i]
                elif sent[i + 1] != ' ':
                    new_sent += sent[i] + ' '
                else:
                    new_sent += sent[i]
            elif i == 0:
                if sent[i + 1] != ' ':
                    new_sent += sent[i] + ' '
                else:
                    new_sent += sent[i]
            else:
                if sent[i - 1] != ' ':
                    new_sent += ' ' + sent[i]
                else:
                    new_sent += sent[i]
        else:
            new_sent += sent[i]
    return new_sent.lower()


Loading GloVe!
Completed!
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.

INFO:tensorflow:Restoring parameters from D:\lab_data\之江lab\AIforVis\model\c1_cnn_2


In [38]:
def predict(predict_sentences, pad_max_length=seq_length):
    """
    将文件转换为id表示,并且将每个单独的样本长度固定为pad_max_lengtn
    """

    data_id = []
    # 将文本内容转换为对应的id形式
    for i in range(len(predict_sentences)):
        data_id.append([word_to_id[x] for x in predict_sentences[i].lower().strip().split() if x in word_to_id])

    # 使用keras提供的pad_sequences来将文本pad为固定长度
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, pad_max_length)
    ''' https://blog.csdn.net/TH_NUM/article/details/80904900
    pad_sequences(sequences, maxlen=None, dtype=’int32’, padding=’pre’, truncating=’pre’, value=0.) 
        sequences：浮点数或整数构成的两层嵌套列表
        maxlen：None或整数，为序列的最大长度。大于此长度的序列将被截短，小于此长度的序列将在后部填0.
        dtype：返回的numpy array的数据类型
        padding：‘pre’或‘post’，确定当需要补0时，在序列的起始还是结尾补
        truncating：‘pre’或‘post’，确定当需要截断序列时，从起始还是结尾截断
        value：浮点数，此值将在填充时代替默认的填充值0
    '''
    feed_dict = {
        input_x: x_pad,
        keep_prob: 1.0
    }
    predict_result = session.run(y_pred_cls, feed_dict=feed_dict)
    predict_result = [i + 1 for i in predict_result]
    return predict_result


def predict11(predict_sentences, probability_threshold=0.3, pad_max_length=seq_length):
    """
    将文件转换为id表示,并且将每个单独的样本长度固定为pad_max_lengtn
    """

    data_id = []
    # 将文本内容转换为对应的id形式
    for i in range(len(predict_sentences)):
        data_id.append([word_to_id[x] for x in predict_sentences[i].lower().strip().split() if x in word_to_id])

    # 使用keras提供的pad_sequences来将文本pad为固定长度
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, pad_max_length)
    ''' https://blog.csdn.net/TH_NUM/article/details/80904900
    pad_sequences(sequences, maxlen=None, dtype=’int32’, padding=’pre’, truncating=’pre’, value=0.) 
        sequences：浮点数或整数构成的两层嵌套列表
        maxlen：None或整数，为序列的最大长度。大于此长度的序列将被截短，小于此长度的序列将在后部填0.
        dtype：返回的numpy array的数据类型
        padding：‘pre’或‘post’，确定当需要补0时，在序列的起始还是结尾补
        truncating：‘pre’或‘post’，确定当需要截断序列时，从起始还是结尾截断
        value：浮点数，此值将在填充时代替默认的填充值0
    '''
    feed_dict = {
        input_x: x_pad,
        keep_prob: 1.0
    }
    predict_result = session.run(tf.nn.softmax(logits), feed_dict=feed_dict)
    print(predict_result)
    result = []
    for i in predict_result:
        if max(i) > probability_threshold:
            result.append(i.argmax()+1)
        else:
            result.append(0)
    return result

In [3]:
predict_sentences = ["In the sixtieth ceremony , where were all of the winners from ?",  # 7
                     "On how many devices has the app \" CF SHPOP ! \" been installed ?",  # 1
                     "List center - backs by what their transfer _ fee was ."]  # 5

# print(predict(predict_sentences))

In [5]:
predict11(predict_sentences)

array([[9.45090617e-02, 1.23237253e-01, 7.14768966e-04, 1.88490579e-02,
        1.20412807e-01, 5.54553983e-02, 5.85205611e-01, 8.03466736e-04,
        3.26596466e-04, 4.85979236e-04],
       [9.99991166e-01, 2.53736569e-07, 7.45170653e-06, 3.93062304e-07,
        5.27571381e-10, 1.18413987e-07, 9.59816961e-08, 5.07930054e-10,
        2.01731920e-07, 3.17914399e-07],
       [3.32020655e-02, 8.11915295e-03, 9.67328274e-03, 6.49981063e-03,
        7.19423535e-01, 1.74614078e-01, 6.27792668e-04, 3.57554671e-04,
        4.74083149e-02, 7.44134907e-05]])

In [6]:
result = np.array([[9.45090617e-02, 1.23237253e-01, 7.14768966e-04, 1.88490579e-02,
        1.20412807e-01, 5.54553983e-02, 5.85205611e-01, 8.03466736e-04,
        3.26596466e-04, 4.85979236e-04],
       [9.99991166e-01, 2.53736569e-07, 7.45170653e-06, 3.93062304e-07,
        5.27571381e-10, 1.18413987e-07, 9.59816961e-08, 5.07930054e-10,
        2.01731920e-07, 3.17914399e-07],
       [3.32020655e-02, 8.11915295e-03, 9.67328274e-03, 6.49981063e-03,
        7.19423535e-01, 1.74614078e-01, 6.27792668e-04, 3.57554671e-04,
        4.74083149e-02, 7.44134907e-05]])

In [8]:
result.shape

(3, 10)

In [17]:
a=np.array([12,17,8])

In [18]:
a.argmax()

1

In [10]:
max(result[0]),result[0].index(max(result[0]))

AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [30]:
for i in result:
    if max(i) > 0.7:
        print(i.argmax()+1)
    else:
        print(11)

11
1
5


In [32]:
predict_sentences

['In the sixtieth ceremony , where were all of the winners from ?',
 'On how many devices has the app " CF SHPOP ! " been installed ?',
 'List center - backs by what their transfer _ fee was .']

In [40]:
predict11(["good"])

[[5.09907288e-04 2.04830967e-01 4.59961172e-04 6.39203623e-01
  8.33358360e-02 2.98951811e-02 3.76134953e-02 9.60751354e-04
  5.64579871e-04 2.62569765e-03]]


[4]

In [43]:
predict11(["good bad R I mm"])

[[9.20806431e-02 2.44805163e-02 3.71637749e-03 6.49520190e-03
  2.03403166e-05 8.72867751e-01 8.55812662e-05 6.47232636e-05
  1.66088055e-04 2.27769668e-05]]


[6]

In [48]:
predict11(["i love china !"])

[[4.08122986e-01 5.46142299e-01 1.09633785e-04 2.12000632e-02
  2.54067842e-04 1.13652993e-02 1.23952700e-02 3.14969231e-05
  5.30361397e-05 3.25848737e-04]]


[2]

In [7]:
h

In [4]:
predict11(predict_sentences)

[7, 1, 5]

In [8]:
predict11(predict_sentences)

[7, 1, 5]

In [12]:
a = [2*x if x else -1 for x in range(5)]

In [13]:
a[-1]

8

In [24]:
word_to_id['UNK'], word_to_id['sandberger']

KeyError: 'UNK'

In [23]:
embd[299999]

[-0.258,
 -0.068239,
 -0.1293,
 0.40934,
 0.24704,
 -0.30138,
 0.64817,
 -0.39733,
 0.098767,
 0.27254,
 -0.22292,
 0.29851,
 0.38641,
 -0.20509,
 0.13445,
 0.1114,
 -0.065329,
 0.60735,
 0.42454,
 -0.16439,
 -0.42364,
 -0.014459,
 -0.49806,
 -0.084324,
 -0.53053,
 -0.14708,
 -0.14706,
 -0.19774,
 -0.065882,
 0.32439,
 0.55762,
 0.3363,
 0.35643,
 -0.16911,
 0.29504,
 -0.41179,
 -0.033898,
 -0.34218,
 0.0972,
 -0.14092,
 -0.063052,
 -0.080234,
 0.059456,
 -0.050595,
 -0.42402,
 0.41918,
 -0.0025027,
 0.35303,
 -0.070322,
 0.43291,
 -0.26104,
 0.04959,
 -0.30767,
 0.19803,
 0.41325,
 1.0292,
 -0.3959,
 -0.014833,
 -0.3658,
 -0.47339,
 0.12888,
 -0.45944,
 0.27612,
 0.11627,
 -0.40329,
 0.21118,
 -0.38505,
 -0.1359,
 -0.36774,
 0.013439,
 0.81402,
 0.23368,
 0.080804,
 0.54235,
 0.39422,
 -0.18939,
 0.34036,
 -0.37009,
 0.71362,
 -0.46628,
 -0.50555,
 -0.16086,
 0.26761,
 -0.49872,
 0.61128,
 -0.029987,
 0.11944,
 -0.23577,
 -0.45469,
 0.032653,
 0.035162,
 0.26197,
 0.42505,
 -0.032472,

In [14]:
vocab[-1]

'sandberger'

In [28]:
a = {"sdf":12, "2":[1,3]}
import json

In [10]:
import json
import numpy as np

In [14]:
aa = np.array([2],dtype=int64)

NameError: name 'int64' is not defined

In [13]:
aa

array(2)

In [8]:
a=1
aa = int(json.dumps(a,cls=NpEncoder))

In [9]:
type(aa)

int

In [4]:
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(NpEncoder, self).default(obj)