<a href="https://colab.research.google.com/github/zhousanfu/Tensorflow_Demo/blob/master/tensorflow_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 需要模块

In [None]:
!pip install -q tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import  Tokenizer
from tensorflow.keras.preprocessing.sequence import  pad_sequences
import io, time
import matplotlib.pyplot as plt
import pandas as pd

# 一、上传训练数据

## 1.1 上传文件

In [None]:
from google.colab import files
# Open the Colab upload dialog; the result is a dict keyed by file name.
uploaded = files.upload()
# Report each uploaded file together with the length of its content.
for fn, content in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(content)))

## 1.2 数据预处理

In [None]:
# imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
# train_data, test_data = imdb['train'], imdb['test']
# Read both tab-separated files into plain lists of rows. Every row must
# unpack into exactly two fields: the label first, the sentence second.
train_data = pd.read_csv('train.csv', sep='\t').values.tolist()
test_data = pd.read_csv('test.csv', sep='\t').values.tolist()

# Split the rows into parallel sentence / label lists.
training_sentences = [sentence for _label, sentence in train_data]
training_labels = [label for label, _sentence in train_data]
testing_sentences = [sentence for _label, sentence in test_data]
testing_labels = [label for label, _sentence in test_data]

# Labels are converted to NumPy arrays for model.fit / model.evaluate.
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

# 二、模型训练

## 2.1 超参数设置

In [None]:
# Hyper-parameters of the text pipeline and the embedding layer.
vocab_size = 10000     # passed as num_words: only the most frequent words get their own id
embedding_dim = 16     # length of each learned word vector
max_length = 120       # every sequence is padded / truncated to this many tokens
trunc_type = 'post'    # over-long sequences lose tokens from the end
oov_tok = '<oov>'      # token substituted for words outside the fitted vocabulary

# The vocabulary is fitted on the training sentences only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

# The id sequences get their own name so the raw `testing_sentences` are not
# overwritten (re-running this cell stays safe), and the test set is truncated
# with the same `trunc_type` as the training set so both splits see the same
# part of over-long texts.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

## 2.2 网络-损失函数和优化器

In [None]:
# Binary text classifier: embedding -> flatten -> small dense head with a
# single sigmoid output.
layer_stack = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    # Possible replacement for Flatten (left disabled):
    # tf.keras.layers.GlobalAveragePooling1D()
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(layer_stack)

# Binary cross-entropy matches the single sigmoid output unit.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

## 2.3 模型训练

In [None]:
num_epochs = 10

# The padded test split is passed as validation data, so val_* metrics are
# reported after every epoch.
validation_pair = (testing_padded, testing_labels_final)
history = model.fit(
    x=padded,
    y=training_labels_final,
    validation_data=validation_pair,
    epochs=num_epochs,
    verbose=2,
)
print(history)

# 三、训练结果

## 3.1 loss 与 accuracy 可视化

In [None]:
# def plot_graphs(history, string):
#   # history.history.keys()
#   # plt.plot(history.epoch, history.history.get('acc'))
#   plt.plot(history.history[string])
#   plt.plot(history.history['val_'+string])
#   plt.xlabel(string)
#   plt.ylabel(string)
#   plt.legend([string, 'val_'+string])
#   plt.show()

# plot_graphs(history, "acc")
# plot_graphs(history, "loss")

# Plot training / validation loss (left axis) and accuracy (right axis)
# against the epoch index on a single figure.
fig, ax1 = plt.subplots(figsize=(10, 5))
ax1.set_title('Train and Validation Picture')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss value')
line1, = ax1.plot(history.history['loss'], color=(0.5, 0.5, 1.0), label='Loss train')
line2, = ax1.plot(history.history['val_loss'], color=(0.5, 1.0, 0.5), label='Loss valid')

# Second y-axis sharing the same x-axis, because accuracy and loss live on
# different scales.
ax2 = ax1.twinx()
ax2.set_ylabel('Accuracy value')
line3, = ax2.plot(history.history['accuracy'], color=(0.5, 0.5, 0.5), label='Accuracy train')
line4, = ax2.plot(history.history['val_accuracy'], color=(1, 0, 0), label='Accuracy valid')

# One combined legend for the lines of both axes.
ax2.legend(handles=(line1, line2, line3, line4), loc='best')

# Size the x-range from the number of recorded epochs instead of a hard-coded
# 10, so the plot stays correct when the epoch count changes.
epochs_run = len(history.history['loss'])
ax1.set_xlim(-1, epochs_run)
plt.show()

## 3.2 向量反转

In [None]:
# Weight matrix of the model's first layer (the embedding layer).
embedding_layer = model.layers[0]
weights = embedding_layer.get_weights()[0]
print(weights.shape)

# Invert the tokenizer vocabulary: integer id -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
  """Turn a sequence of token ids back into a space-separated string.

  Ids that are missing from ``reverse_word_index`` are rendered as '?'.
  """
  words = []
  for token_id in text:
    words.append(reverse_word_index.get(token_id, '?'))
  return ' '.join(words)

# Compare a decoded padded sequence with its original sentence.
print(decode_review(padded[1]))
print(training_sentences[1])

## 3.3 向量保存

In [None]:
# Export the learned embeddings as two aligned TSV files: 'vecs.tsv' holds one
# tab-separated vector per line, 'meta.tsv' holds the matching word per line.
# NOTE(review): this file pair looks intended for an embedding visualiser such
# as the TensorFlow Embedding Projector - confirm.
#
# Ids start at 1 (the Keras Tokenizer never assigns id 0 to a word). The upper
# bound is clamped to the real vocabulary size: with a small corpus the
# tokenizer knows fewer than `vocab_size` words and indexing
# `reverse_word_index` beyond that raised KeyError.
last_word_num = min(vocab_size, weights.shape[0], len(reverse_word_index) + 1)

# `with` guarantees both files are closed even if a write fails part-way.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(1, last_word_num):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + '\n')
    out_v.write('\t'.join(str(x) for x in embeddings) + '\n')

In [None]:
# Offer the exported files for download when running inside Google Colab.
# Outside Colab the import fails and this cell deliberately does nothing.
try:
  from google.colab import files
except ImportError:
  pass
else:
  for artifact in ('vecs.tsv', 'meta.tsv'):
    files.download(artifact)

# 四、模型保存

In [None]:
# Persist the trained model in several formats.

# 1) Architecture only, serialised as JSON.
json_config = model.to_json()
with open('model_config.json', 'w') as config_file:
  config_file.write(json_config)

# 2) Weights only, in an HDF5 file. save_weights() stores neither the
#    architecture nor the optimizer configuration, so this file must be loaded
#    with load_weights() on an already-built model, not with load_model().
model.save_weights('path_to_my_weights.h5')

# 3) Full model in SavedModel format, the format needed when deploying with
#    TensorFlow Serving. The directory is named after the current Unix time.
saved_model_path = './saved_models/' + str(int(time.time()))
tf.keras.models.save_model(model, saved_model_path)

# 4) Weights only, in TensorFlow checkpoint format.
model.save_weights("xxx.ckpt", save_format="tf")

# 五、模型预测

In [None]:

# text_to_list = tf.keras.preprocessing.text.text_to_word_sequence(test_data[0][1])
# num_list = [word_index[key] for key in text_to_list]
# print(text_to_list, num_list)
# test_list = tf.keras.preprocessing.sequence.pad_sequences([num_list],
#                              padding='post',
#                              truncating='post',
#                              maxlen=128)

# print(np.array(test_list))
# pred = model.predict(test_list)
# print('predict value:', sentiment_dict[np.argmax(pred)])

In [None]:
print(decode_review(testing_padded[0]))
# The model ends in a single sigmoid unit, so model.predict returns one
# probability per sample (one column), not a class index.
predictions = model.predict(testing_padded)
print(predictions[0])

# np.argmax(..., axis=1) over a single-column output is 0 for every sample.
# Threshold the probability at 0.5 to obtain the 0/1 class labels instead.
predicted_labels = (predictions > 0.5).astype('int32').ravel()
print(predicted_labels)


# results = model.evaluate(testing_padded, testing_padded)
# print(results)

In [None]:
# Re-create the exact same model, including its weights and optimizer.
# The SavedModel directory is loaded because 'path_to_my_weights.h5' is written
# with save_weights() and holds weights only, which load_model() cannot turn
# back into a model.
new_model = tf.keras.models.load_model(saved_model_path)
new_model.summary()

# Evaluate on the padded test set with its labels; the compiled model reports
# [loss, accuracy].
loss, acc = new_model.evaluate(testing_padded, testing_labels_final, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

In [None]:
import tensorflow as tf
import numpy as np
import re
import os
import cv2
import time
 
# Time inference of a frozen TensorFlow graph ('mymodel.pb') over every image
# in a directory.
# NOTE(review): both paths are machine-specific absolute paths - replace them
# with configurable locations before running elsewhere.
model_dir = '/home/xuqiong/code/effcient/'
#path = '/home/xuqiong/data/testimg/nosee/test/'
path = '/mnt/nas/cv_data/imagequality/fromQA/n2/5'

timeall = 0      # accumulated sess.run() wall time in seconds
num_timed = 0    # number of images that were actually run through the graph

# Load the frozen graph into its own tf.Graph. The tf.compat.v1 / tf.io.gfile
# spellings are used because tf.Session, tf.GraphDef, tf.gfile.FastGFile and
# tf.get_default_graph do not exist in the TensorFlow 2 namespace.
graph = tf.Graph()
with graph.as_default():
    with tf.io.gfile.GFile(os.path.join(model_dir, 'mymodel.pb'), 'rb') as f:
        graph_def = tf.compat.v1.GraphDef()
        graph_def.ParseFromString(f.read())
    tf.import_graph_def(graph_def, name='')
    #[print(n.name) for n in graph.as_graph_def().node]

pred = graph.get_tensor_by_name('output_1:0')

# List the directory once, before the loop, so the count exists even when the
# directory is empty.
filelist = os.listdir(path)
total_num_file = len(filelist)

with tf.compat.v1.Session(graph=graph) as sess:
    for item in filelist:
        image_dir = os.path.join(os.path.abspath(path), item)
        image_data = cv2.imread(image_dir)
        if image_data is None:
            # cv2.imread returns None when the file cannot be decoded as an
            # image; skip it instead of crashing in cv2.resize.
            print("skipping unreadable file: ", image_dir)
            continue
        # Resize to 256x256, scale pixel values to [0, 1] and add a leading
        # batch dimension of size 1.
        image_data = cv2.resize(image_data, dsize=(256,256),interpolation=cv2.INTER_LINEAR)
        image_data = image_data.astype(float)
        image_data /= 255
        image_data = np.array([image_data])

        # Only the forward pass is timed, not image loading / preprocessing.
        time0 = time.time()
        res = sess.run(pred, feed_dict={"input_1:0": image_data})
        time1 = time.time()
        timeclass = time1 - time0
        timeall = timeall + timeclass
        num_timed += 1

        #print("res: ", res)

# Average over the images that were actually timed; guard against an empty or
# image-free directory.
if num_timed:
    print("timeall: ", timeall*1000, "ms, avg: ", timeall*1000/num_timed, "ms")
else:
    print("no readable images found in ", path, " (", total_num_file, " directory entries)")