<a href="https://colab.research.google.com/github/zhousanfu/Tensorflow_Demo/blob/master/tensorflow_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 需要模块

In [None]:
!pip install -q tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import  Tokenizer
from tensorflow.keras.preprocessing.sequence import  pad_sequences
import io, time
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import MinMaxScaler

# 一、上传训练数据

## 1.1 上传文件

In [None]:
# Colab-only step: open the upload dialog and report each received file.
from google.colab import files
uploaded = files.upload()
# `uploaded` is used as a mapping keyed by file name; the value's length is reported in bytes.
for fn, payload in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(payload)))

## 1.2 数据预处理

In [None]:
# Alternative data source kept for reference (IMDB reviews via tensorflow-datasets):
# imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
# train_data, test_data = imdb['train'], imdb['test']

# Load the tab-separated train/test files as plain lists of rows.
# Every row is unpacked into exactly two values: the first is used as the
# label and the second as the sentence.
train_data = pd.read_csv('train.csv', sep='\t', encoding='utf-8').values.tolist()
test_data = pd.read_csv('test.csv', sep='\t', encoding='utf-8').values.tolist()

training_sentences = [sentence for _label, sentence in train_data]
training_labels = [label for label, _sentence in train_data]
testing_sentences = [sentence for _label, sentence in test_data]
testing_labels = [label for label, _sentence in test_data]

# Keras consumes the labels as NumPy arrays.
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

# 二、模型训练

## 2.1 超参数设置

In [None]:
# Hyperparameters shared by the tokenizer, the padding step and the model.
vocab_size = 10000     # passed to Tokenizer as num_words and to the Embedding layer as its input size
embedding_dim = 16     # width of each embedding vector
max_length = 120       # every sequence is padded / truncated to this many tokens
trunc_type = 'post'    # drop tokens from the end of over-long sequences
oov_tok = '<oov>'      # placeholder token for words not seen while fitting

# Fit the vocabulary on the training sentences only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

# Keep the raw test sentences untouched (storing the token ids back into
# `testing_sentences` made this cell fail when re-run) and truncate the test
# data the same way as the training data so both go through identical
# preprocessing.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

## 2.2 网络-损失函数和优化器

In [None]:
# Binary text classifier: embedding -> flatten -> small dense layer -> sigmoid output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Flatten())
# Commented-out alternative to Flatten kept from the original notebook:
# tf.keras.layers.GlobalAveragePooling1D()
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Single sigmoid output, so binary cross-entropy is the loss; accuracy is tracked as the metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

## 2.3 模型训练

In [None]:
# Train on the padded training sequences and validate on the padded test set after each epoch.
num_epochs = 10
validation_pair = (testing_padded, testing_labels_final)

history = model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=validation_pair, verbose=2)
# The per-epoch metrics are read from history.history by the plotting cell.
print(history)

# 三、训练结果

## 3.1 loss可视化

In [None]:
# Plot training/validation loss (left axis) and accuracy (right axis) per epoch.
metrics = history.history
# Number of epochs actually recorded; drives the x-axis limit so the plot
# stays correct when num_epochs is changed.
epochs_run = len(metrics['loss'])

fig = plt.gcf()
fig.set_size_inches(10, 5)
ax1 = fig.add_subplot(111)
ax1.set_title('Train and Validation Picture')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss value')
line1, = ax1.plot(metrics['loss'], color=(0.5, 0.5, 1.0), label='Loss train')
line2, = ax1.plot(metrics['val_loss'], color=(0.5, 1.0, 0.5), label='Loss valid')
# Second y-axis sharing the same x-axis, used for the accuracy curves.
ax2 = ax1.twinx()
ax2.set_ylabel('Accuracy value')
line3, = ax2.plot(metrics['accuracy'], color=(0.5, 0.5, 0.5), label='Accuracy train')
line4, = ax2.plot(metrics['val_accuracy'], color=(1, 0, 0), label='Accuracy valid')
# One combined legend for the curves of both axes.
plt.legend(handles=(line1, line2, line3, line4), loc='best')
# Same one-unit margin on the left as before; right edge follows the epoch count.
plt.xlim(-1, epochs_run)
plt.show()

## 3.2 向量反转

In [None]:
# Pull the trained embedding matrix out of the first layer of the model.
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)

# Invert the tokenizer's word -> id mapping so ids can be turned back into words.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
  """Convert an iterable of token ids back into a space-separated string.

  Ids that have no entry in reverse_word_index are rendered as '?'.
  """
  words = []
  for token_id in text:
    words.append(reverse_word_index.get(token_id, '?'))
  return ' '.join(words)

# Compare the decoded padded sequence with the original sentence it came from.
print(decode_review(padded[1]))
print(training_sentences[1])

## 3.3 向量保存

In [None]:
# Export the learned embeddings as two line-aligned TSV files:
#   vecs.tsv - one tab-separated embedding vector per line
#   meta.tsv - the word belonging to the vector on the same line
# `with` guarantees both files are closed even if writing fails part-way.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  # Ids are walked from 1 up to vocab_size - 1, as in the original loop.
  for word_num in range(1, vocab_size):
    word = reverse_word_index.get(word_num)
    if word is None:
      # The fitted vocabulary can be smaller than vocab_size; ids without a
      # word are skipped instead of raising KeyError.
      continue
    embeddings = weights[word_num]
    out_m.write(word + '\n')
    out_v.write('\t'.join([str(x) for x in embeddings]) + '\n')

In [None]:
# Offer the exported TSV files for download when running inside Colab;
# outside Colab the import fails and this step is silently skipped.
try:
  from google.colab import files
except ImportError:
  pass
else:
  for tsv_name in ('vecs.tsv', 'meta.tsv'):
    files.download(tsv_name)

# 四、模型保存

In [None]:
# # Save the model architecture only (as JSON)
# json_config = model.to_json()
# with open('model_config.json', 'w') as json_file:
#   json_file.write(json_config)

# ## HDF5 file: a single binary blob holding the model weights, the architecture and the optimizer config
# ## NOTE(review): the call below is save_weights, which by its name writes weights only - confirm whether a full-model save was intended
# model.save_weights('path_to_my_weights.h5')

# SavedModel format; required when the model is deployed with TensorFlow Serving.
# The trailing '1' directory presumably acts as the model version: the REST
# request later in this notebook targets /versions/1 - confirm before renaming.
tf.keras.models.save_model(model, './saved_models/1')

# # Checkpoint format
# model.save_weights("xxx.ckpt", save_format="tf")

# 五、模型预测
```
predictions = model.predict(my_test_data_padded) # model.predict_classes() 返回的是类别的索引，即该样本所属的类别标签
print(my_test_data_padded)
print(predictions)
print(np.argmax(predictions, axis=1))

#results = model.evaluate(testing_padded, testing_labels_final)
#print(results)
```

In [None]:
# Score every message in the spreadsheet and export the rows flagged as violations.
# Columns read: day, opt_type, message, resource_id, comment_id (see the DataFrame header below).
my_test_sentences = pd.read_excel('imo_v1.xlsx', usecols=[0,1,2,3,4]).values.tolist()
data = []

# Only string messages can be tokenized. Other cell values (for example NaN
# for an empty cell) are skipped explicitly here instead of being hidden by a
# bare `except: pass`, which also swallowed every real error.
scorable_rows = [sen for sen in my_test_sentences if isinstance(sen[2], str)]

if scorable_rows:
  # texts_to_sequences expects a list of texts. Passing a bare string made it
  # treat every character as a separate text, so messages were never scored
  # as whole sentences.
  my_test_sequences = tokenizer.texts_to_sequences([sen[2] for sen in scorable_rows])
  # Pad/truncate exactly like the training data.
  my_test_data_padded = pad_sequences(my_test_sequences, maxlen=max_length, truncating=trunc_type)
  # One batched call instead of one model.predict call per row.
  predictions = model.predict(my_test_data_padded)

  for sen, prediction in zip(scorable_rows, predictions):
    score = float(prediction[0])
    # Thresholding the sigmoid score replaces model.predict_classes, which is
    # no longer available in current TensorFlow releases. The previous test
    # `int(score) >= 1` truncated every score below 1.0 to 0 and therefore
    # dropped practically all rows.
    label = int(score > 0.5)
    if label == 1:
      data.append([sen[0], sen[1], sen[2], sen[3], sen[4], label, score])

df = pd.DataFrame(data, columns=['day', 'opt_type', 'message', 'resource_id', 'comment_id', '标签(1=违规)', '得分'])
# The `encoding` argument is not accepted by current pandas releases and is not needed for .xlsx output.
df.to_excel("预测.xlsx", index=False, sheet_name='Sheet1')

# 六、Tensorflow Server

In [None]:
# Install Docker with the script served at get.daocloud.io and pull the TensorFlow Serving image.
# NOTE(review): the later cells start the apt-installed `tensorflow_model_server` binary directly;
# nothing visible in this notebook uses the Docker image - confirm these two lines are still needed.
!curl -sSL https://get.daocloud.io/docker | sh
!docker pull tensorflow/serving
# Import the TensorFlow Serving apt signing key and refresh the package index.
# NOTE(review): the same key import runs again two commands below, after the repository is registered.
!curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add -
!apt update
# Register the TensorFlow Serving apt repository and, only if that succeeded (&&), import its signing key.
!echo "deb http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | tee /etc/apt/sources.list.d/tensorflow-serving.list && \
curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add -
# Refresh the index again so the newly registered repository is visible, then install the model server.
!apt update
!apt-get install tensorflow-model-server

In [None]:
import os
os.environ["MODEL_DIR"] = "/content/saved_models"
!ls -l /content/saved_models
!saved_model_cli show --dir /content/saved_models/1 --all

In [None]:
%%bash --bg 
# Launch tensorflow_model_server as a background process (the cell returns immediately).
# It serves the model named "helloworld" from the directory in $MODEL_DIR over REST on
# port 8501; stdout and stderr are both redirected to server.log.
nohup tensorflow_model_server \
  --rest_api_port=8501 \
  --model_name=helloworld \
  --model_base_path="${MODEL_DIR}" >server.log 2>&1

In [None]:
import json
import requests

def get_model_predict(list_data, url='http://localhost:8501/v1/models/helloworld/versions/1:predict', timeout=10):
  """Send a batch of padded sequences to the TensorFlow Serving REST predict endpoint.

  Args:
    list_data: batch of token-id sequences; a NumPy array or any nested
      sequence that np.asarray accepts.
    url: full predict URL. The default targets version 1 of the model named
      "helloworld" on localhost:8501. The previous ':predict/classes' suffix
      is not a TensorFlow Serving REST verb and was dropped.
    timeout: seconds to wait for the server before requests raises, so an
      unreachable server cannot hang the notebook forever.

  Returns:
    The requests.Response object as returned by the server. The status code
    is not checked here; callers typically read `.json()` from it.
  """
  headers = {"content-type": "application/json"}
  # np.asarray lets plain Python lists be passed as well as ndarrays.
  req_data = {"signature_name": "serving_default", "instances": np.asarray(list_data).tolist()}
  json_response = requests.post(url, data=json.dumps(req_data), headers=headers, timeout=timeout)

  return json_response



# Message column of the spreadsheet, one single-element row per message.
# It keeps its own name so the sample below no longer overwrites it; the
# commented-out batch loop further down iterates these rows (`sen[0]`).
my_test_sentences = pd.read_excel('imo_v1.xlsx', usecols=[2]).values.tolist()

# Smoke test: send one hard-coded sentence through the REST endpoint and print the raw JSON reply.
sample_sequences = tokenizer.texts_to_sequences(['আপু রা এড দাও মজা পাবে আসো'])
my_test_data_padded = pad_sequences(sample_sequences, maxlen=max_length)
json_response = get_model_predict(my_test_data_padded)
print(json_response.json())
# predictions = np.array(json.loads(json_response.text)['predictions'])
# print(json_response.json(), np.argmax(predictions, axis=1), type(np.argmax(predictions, axis=1)[0]), int(predictions), type(int(predictions)))



# imo_test_data = []
# for sen in my_test_sentences:
#   try:
#     my_test_sentences = tokenizer.texts_to_sequences(sen)
#     my_test_data_padded = pad_sequences(my_test_sentences, maxlen=max_length)
#     json_response = get_model_predict(my_test_data_padded)
#     predictions = np.array(json.loads(json_response.text)['predictions'])
#     print(predictions, np.argmax(predictions, axis=1)[0], type(np.argmax(predictions, axis=1)[0]), int(predictions), type(int(predictions)))
#     imo_test_data.append([sen[0], predictions[0][0]])
#   except:
#     pass

# df = pd.DataFrame(imo_test_data, columns=['原文','标签'])
# # df.to_excel("测试.xlsx", index=False, sheet_name='Sheet1', encoding='utf-8')
# df.to_csv('test_imo_content.txt', sep='\t', index=False)