In [1]:
import numpy as np
import tensorflow.contrib.keras as kr
import tensorflow as tf
import string

# Location of the GloVe text file that loadGloVe parses below.
# NOTE(review): machine-specific absolute Windows path — adjust before running elsewhere.
vocabPath = r"D:\lab_data\之江lab\AIforVis\model\glove.6B.100d.txt"
# Checkpoint path handed to saver.restore() once the graph has been built.
savePath = r"D:\lab_data\之江lab\AIforVis\model\model"


def loadGloVe(filename):
    '''Read a GloVe text file into parallel token and vector lists.

    Every non-blank line is expected to hold a token followed by its
    space-separated float components.

    # Parameters:
    filename: Path of the UTF-8 encoded GloVe text file.

    # Returns:
    (vocab, embd): `vocab` is the list of tokens in file order and `embd`
        the list of their vectors (lists of floats); `embd[i]` belongs to
        `vocab[i]`.

    # Raises:
    ValueError: If a vector component cannot be parsed as a float.
    '''
    vocab = []
    embd = []
    print('Loading GloVe!')
    # The context manager closes the handle even when float() raises on a
    # malformed line; iterating the handle avoids materialising every raw
    # line in memory the way readlines() does.
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # A blank line would otherwise add an empty token with an
                # empty vector and make the embedding table ragged.
                continue
            row = stripped.split(' ')
            vocab.append(row[0])
            embd.append([float(ei) for ei in row[1:]])
    print('Completed!')
    return vocab, embd

# Number of output categories produced by the classifier head.
num_classes = 10

# Fixed number of token ids fed to the network for each sentence.
seq_length = 41

vocab, embd = loadGloVe(vocabPath)
vocab_size, embedding_dim = len(vocab), len(embd[0])
# The vectors stay a plain nested list (no np.asarray conversion); this is
# the object handed to the model constructor as the initial embedding table.
embedding = embd
# Token -> row index of its vector in the embedding table.
word_to_id = {token: index for index, token in enumerate(vocab)}


class ZhouCLSTMModel:
    '''
    Implementation proposal of: https://arxiv.org/pdf/1511.08630

    Calling an instance on a tensor of token ids chains three stages:
    embedding lookup -> one convolution over word windows (ReLU + dropout)
    -> LSTM over the convolved sequence, and returns a final LSTM state.
    '''

    def __init__(self, embedding,
                 conv_size=3,
                 conv_filters=300,
                 drop_rate=0.5,
                 lstm_units=150):
        '''Constructor.
        # Parameters:
        embedding: Initial value of the embedding table; it is passed to
            tf.Variable when the model is called.
        conv_size: Size of the convolutions. Number of words that takes each
            convolution step.
        conv_filters: Number of convolution filters.
        drop_rate: Forwarded as `keep_prob` to tf.nn.dropout on the
            convolution output, so despite its name it acts as a keep
            probability. The notebook passes a placeholder here.
        lstm_units: Size of the states of the LSTM layer.
        '''
        self._embedding = embedding
        self._conv_size = conv_size
        self._conv_filters = conv_filters
        self._drop_rate = drop_rate
        self._lstm_units = lstm_units

    def __call__(self, x_input):
        '''Build the graph for `x_input` and return the LSTM stage output.

        The intermediate tensors are stored on the instance so that
        summary() can report their shapes afterwards.
        '''
        self._embedding_tf = self._create_embedding_layer(
            self._embedding, x_input)

        self._convolution_tf = self._create_convolutional_layers(
            self._conv_size,
            self._conv_filters,
            self._drop_rate,
            self._embedding_tf)

        self._lstm_tf = self._create_lstm_layer(
            self._lstm_units,
            self._convolution_tf)

        return self._lstm_tf

    def summary(self):
        '''Print the static shapes of the three stage outputs.

        Only valid after the instance has been called, because the
        attributes read here are assigned in __call__.
        '''
        print('embedding:', str(self._embedding_tf.shape))
        print('conv:', str(self._convolution_tf.shape))
        print('lstm:', str(self._lstm_tf.shape))

    def _create_embedding_layer(self, embedding, input_x):
        '''Create the embedding variable and look up the ids in `input_x`.'''
        # The variable is created without an explicit name.
        # NOTE(review): presumably the checkpoint refers to it by its
        # auto-generated name, so variable creation order matters — confirm
        # before reordering anything in this class.
        embedding = tf.Variable(initial_value=embedding)

        embedded_chars = tf.nn.embedding_lookup(
            embedding, tf.cast(input_x, 'int32'))

        return embedded_chars

    def _create_convolutional_layers(self,
                                     conv_size, num_filters, drop_rate, embedding):
        '''Convolve over windows of `conv_size` words, then ReLU and dropout.

        # Parameters:
        conv_size: Number of consecutive words covered by one filter.
        num_filters: Number of filters (size of the last output axis).
        drop_rate: Passed to tf.nn.dropout as `keep_prob`.
        embedding: Output of the embedding lookup; its third axis is used
            as the filter width.

        # Returns:
        A rank-3 tensor; the notebook's summary reports (?, 39, 300) for a
        41-token input with the default settings.
        '''
        filter_height = conv_size
        # Each filter spans the full embedding dimension.
        filter_width = embedding.shape[2].value

        filter_shape = [filter_height, filter_width, 1, num_filters]

        W = tf.Variable(
            initial_value=tf.truncated_normal(
                shape=filter_shape,
                stddev=0.1))
        # The bias is also drawn from truncated_normal, with the library's
        # default stddev (none is given here).
        b = tf.Variable(
            initial_value=tf.truncated_normal(
                shape=[num_filters]))

        # conv2d needs a channel axis, so a trailing axis of size 1 is added.
        embedding_expanded = tf.expand_dims(embedding, -1)
        conv = tf.nn.conv2d(
            input=embedding_expanded,
            filter=W,
            strides=[1, 1, 1, 1],
            padding='VALID')
        # Drop conv's axis 2, keeping (batch, steps, filters); with a
        # full-width filter and VALID padding that axis has size 1.
        conv_reduced = tf.reshape(
            tensor=conv,
            shape=[-1, conv.shape[1], conv.shape[3]])

        bias = tf.nn.bias_add(conv_reduced, b)
        c = tf.nn.relu(bias)

        # drop_rate is consumed as a keep probability here.
        d = tf.nn.dropout(c, keep_prob=drop_rate)
        return d

    def _create_lstm_layer(self, lstm_units, conv_input):
        '''Run an LSTM over the convolved sequence and return a final state.'''
        lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_units)
        # static_rnn takes a Python list with one tensor per time step.
        sequence = tf.unstack(conv_input, axis=1)
        # NOTE(review): the final state of an LSTMCell is an LSTMStateTuple
        # ordered (c, h), so the value bound to `h` here appears to be the
        # cell state rather than the hidden state. Confirm this matches how
        # the checkpoint was trained before changing it.
        (_, (h, _)) = tf.nn.static_rnn(lstm_cell, sequence, dtype=tf.float32)

        return h


learning_rate = 1e-3
# NOTE(review): not referenced anywhere in the visible notebook; the keep
# probability actually used is fed through the `keep_prob` placeholder.
dropout_keep_prob = 0.5

# Input token ids and their corresponding (one-hot style) labels
input_x = tf.placeholder(tf.int32, [None, seq_length], name='input_x')
input_y = tf.placeholder(tf.float32, [None, num_classes], name='input_y')
keep_prob = tf.placeholder(tf.float32, name='keep_prob')

# The placeholder is passed as `drop_rate`, which the model forwards to
# tf.nn.dropout as its keep probability.
model = ZhouCLSTMModel(embedding, drop_rate = keep_prob)
fc = model(input_x)
model.summary()

# Classifier
logits = tf.layers.dense(fc, num_classes, name='fc2')
# Predicted class. tf.argmax returns the index of the largest value along
# the given axis: 1 works within each row, 0 within each column.
y_pred_cls = tf.argmax(tf.nn.softmax(logits), 1)

# Loss function: cross-entropy
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=input_y)
loss = tf.reduce_mean(cross_entropy)
# Optimizer
optim = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# Accuracy
correct_pred = tf.equal(tf.argmax(input_y, 1), y_pred_cls)
acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
saver = tf.train.Saver()

# Load the trained weights from the checkpoint into a fresh session.
# NOTE(review): presumably the graph above must be built in the same order
# as at training time so that variable names match the checkpoint — confirm.
session = tf.Session()
saver.restore(sess=session, save_path=savePath)

Loading GloVe!
Completed!
embedding: (?, 41, 100)
conv: (?, 39, 300)
lstm: (?, 150)
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.

INFO:tensorflow:Restoring parameters from D:\lab_data\之江lab\AIforVis\model\model


In [23]:
def preprocess_sentence(sent):
    """Separate punctuation from neighbouring text and lowercase the result.

    Every character in ``string.punctuation`` is handled as follows:

    * ``,`` or ``.`` between two digits, ``%`` directly after a digit, and
      ``$`` directly next to a digit stay attached (e.g. ``1,000``,
      ``3.5%``, ``$5``).
    * Otherwise a space is inserted before the character when the previous
      character is not a space; only if the previous character IS a space
      (or there is none) is a space inserted after it when the next
      character is not a space. ``"a,b"`` therefore becomes ``"a ,b"``.
      This asymmetry is kept on purpose so the output stays identical to
      what the function produced before.

    A string consisting of a single punctuation character used to raise
    IndexError (the code looked at ``sent[1]``); it is now returned as is.

    Parameters:
        sent: The raw sentence.

    Returns:
        The processed sentence, stripped of surrounding whitespace and
        lowercased.
    """
    pieces = []
    last = len(sent) - 1
    for i, ch in enumerate(sent):
        if ch not in string.punctuation:
            pieces.append(ch)
            continue

        # Neighbours are None at the string boundaries, which makes every
        # rule below fall through safely for the first/last character.
        prev_ch = sent[i - 1] if i > 0 else None
        next_ch = sent[i + 1] if i < last else None
        prev_digit = prev_ch is not None and prev_ch.isdigit()
        next_digit = next_ch is not None and next_ch.isdigit()

        keep_attached = (
            (ch in ",." and prev_digit and next_digit)
            or (ch == "%" and prev_digit)
            or (ch == "$" and (prev_digit or next_digit))
        )
        if keep_attached:
            pieces.append(ch)
        elif prev_ch is not None and prev_ch != ' ':
            pieces.append(' ' + ch)
        elif next_ch is not None and next_ch != ' ':
            pieces.append(ch + ' ')
        else:
            pieces.append(ch)
    return ''.join(pieces).strip().lower()


def predict11(predict_sentences):
    """Classify raw sentences with the restored model.

    Each sentence is normalised with preprocess_sentence, split on
    whitespace and mapped to ids through word_to_id (tokens missing from
    the vocabulary are dropped). The id lists are brought to seq_length
    with Keras' pad_sequences and fed to the network with keep_prob set
    to 1.0.

    Parameters:
        predict_sentences: Iterable of sentence strings.

    Returns:
        A list with one ``[confidence, label]`` pair per sentence, where
        ``confidence`` is the largest softmax probability and ``label`` is
        the 1-based index of that class.
    """
    # Build the softmax op once per logits tensor and reuse it afterwards.
    # Calling tf.nn.softmax(logits) on every prediction, as before, added a
    # new node to the default graph each time, so the graph kept growing.
    cache = getattr(predict11, '_softmax_cache', None)
    if cache is None or cache[0] is not logits:
        cache = (logits, tf.nn.softmax(logits))
        predict11._softmax_cache = cache
    probabilities = cache[1]

    # Convert every sentence into its list of vocabulary ids.
    data_id = [
        [word_to_id[x] for x in preprocess_sentence(psi).split() if x in word_to_id]
        for psi in predict_sentences
    ]

    # Pad the id lists to a fixed length with Keras' pad_sequences.
    # NOTE(review): the padding value is the library default; if that is 0
    # it coincides with the id of the first vocabulary entry — confirm this
    # matches how the model was trained.
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, seq_length)
    feed_dict = {
        input_x: x_pad,
        keep_prob: 1.0
    }
    predict_result = session.run(probabilities, feed_dict=feed_dict)
    # Class indices are 0-based; reported labels are 1-based.
    return [[max(row), row.argmax() + 1] for row in predict_result]

In [14]:
# Sample questions; the trailing number on each line is presumably the
# expected task label for that question.
predict_sentences = [
    "In the sixtieth ceremony , where were all of the winners from ?",  # 7
    "On how many devices has the app \" CF SHPOP ! \" been installed ?",  # 1
    "List center - backs by what their transfer _ fee was .",  # 5
    "can you tell me what is arkansas 's population on the date july 1st of 2002 ?",  # 1
    "show the way the number of likes were distributed .",  # 7
    "is it true that people living on average depends on higher gdp of a country",  # 10
]

# Score only the first sample question.
result = predict11(predict_sentences[:1])

[[1.7058551e-02 4.5138117e-02 9.2223249e-02 6.8682201e-02 1.6280174e-02
  9.2392340e-02 6.6164529e-01 1.5889378e-03 4.8943777e-03 9.6813492e-05]]


In [15]:
type(result[0])

numpy.int64

In [13]:
int(json.dumps(result[0]))

TypeError: Object of type 'int64' is not JSON serializable

In [6]:
import json

class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types.

    Use it as ``json.dumps(value, cls=NpEncoder)``.
    """

    def default(self, obj):
        """Return a JSON-serialisable equivalent of ``obj``.

        NumPy booleans, integers and floats become ``bool``, ``int`` and
        ``float``; arrays become nested lists. Anything else is delegated
        to the base class, which raises TypeError.
        """
        # np.bool_ is not an np.integer subclass, so it needs its own branch;
        # without it boolean results were rejected as not serialisable.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

In [9]:
task = int(json.dumps(result[0],cls=NpEncoder))

In [11]:
type(task)

int

In [54]:
def random_split(path=r"corpus最终版\corpus_5.txt"):
    """Load labelled sentences from a corpus file.

    Despite its name, the function neither shuffles nor splits anything:
    it reads the file once and returns its contents in file order.

    Every non-blank line is split on whitespace. The first field carries
    the label, taken as the integer in front of the first ``:`` (or the
    whole field when there is no colon); the remaining fields are
    re-joined with single spaces to form the sentence. Blank lines are
    skipped.

    Parameters:
        path: Location of the UTF-8 corpus file. Defaults to the path that
            used to be hard-coded; it is written as a raw string because
            ``"\\c"`` is an invalid escape sequence in a normal literal.

    Returns:
        (tasks_sentences, labels): two lists of equal length holding the
        sentences and their integer labels.

    Raises:
        ValueError: If the label part of a line is not an integer.
    """
    tasks_sentences = []
    labels = []
    with open(path, "r", encoding='utf-8') as fp:
        for line in fp:
            lsp = line.split()
            if not lsp:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            tasks_sentences.append(" ".join(lsp[1:]))
            labels.append(int(lsp[0].split(":")[0]))

    return tasks_sentences, labels

In [55]:
tasks_sentences,labels = random_split()

In [56]:
labels[:3]

[1, 1, 1]

In [57]:
result = predict11(tasks_sentences)

In [83]:
result = np.asarray(result)

In [90]:
min(result[:,0])

0.24597807228565216

In [33]:
result[:,0].argmin()

812

In [59]:
result[812]

[0.6002466, 4]

In [47]:
result[813][1],labels[0]

(3.0, 1)

In [69]:
# Collect the confidence of every sample whose predicted label equals the
# gold label; the fraction of such samples is the accuracy shown below.
all_num = len(labels)
right_array = [
    confidence
    for (confidence, predicted), gold in zip(result, labels)
    if predicted == gold
]

len(right_array)/all_num

0.9569647310295689

In [71]:
right_result = np.asarray(right_array)

In [72]:
right_result[:3]

array([0.99642915, 0.9995763 , 0.9992624 ], dtype=float32)

In [76]:
min(right_result), min(right_array)

(0.26189747, 0.26189747)

In [77]:
right_result.argmin()

2616

In [80]:
right_result[2616]

0.26189747

In [88]:
result[:,0]

array([0.99642915, 0.99957627, 0.99926239, ..., 0.99830377, 0.57879895,
       0.99996567])

In [95]:
predict11(["distribution"])

[[0.79089814, 7]]