<a href="https://colab.research.google.com/github/sunyingjian/tf-course-youdao-2021/blob/main/%E8%AF%8D%E5%B5%8C%E5%85%A5embedding%E5%8F%AF%E8%A7%86%E5%8C%96.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 导入数据集

In [1]:
! /opt/bin/nvidia-smi

Sat Feb 27 08:41:27 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import tensorflow as tf 
import tensorflow_datasets as tfds

In [5]:
# Load the IMDB reviews dataset through TensorFlow Datasets.
# with_info=True additionally returns the dataset metadata object (`info`);
# as_supervised=True makes each example a (text, label) pair, which is how the
# extraction loop further down unpacks it.
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

In [6]:
# The dataset comes in two splits: 25,000 reviews for training and
# 25,000 reviews for testing.
import numpy as np
train_data, test_data = imdb['train'], imdb['test']

In [7]:
# Containers for the raw sentences and labels of the training and test splits.
# They start empty and are filled by the extraction loop in the next cell.
training_sentences = []
training_labels = []
testing_sentences = []
testing_labels = []

In [8]:
# Every element of the datasets is a pair of tensors (text, label); .numpy()
# pulls the concrete value out of each tensor.
#
# Fixes relative to the earlier version of this cell:
#   * the training branch used `s.numpy` without calling it, so it stored the
#     repr of a bound method instead of the review text;
#   * the text arrives as bytes, so it is decoded as UTF-8 instead of being
#     wrapped in str(), which would keep the b'...' prefix and escape
#     sequences inside every sentence;
#   * the lists are rebound here so that re-running the cell does not append
#     a second copy of the data.
training_sentences, training_labels = [], []
testing_sentences, testing_labels = [], []

for sentence, label in train_data:
    training_sentences.append(sentence.numpy().decode('utf-8'))
    training_labels.append(label.numpy())

for sentence, label in test_data:
    testing_sentences.append(sentence.numpy().decode('utf-8'))
    testing_labels.append(label.numpy())

In [9]:
# Keras expects the labels as NumPy arrays rather than Python lists.
training_labels_final = np.asarray(training_labels)
testing_labels_final = np.asarray(testing_labels)

## 对数据进行词条化

In [10]:
# Hyper-parameters shared by the tokenizer, the padding step and the models.
vocab_size = 10000      # passed to Tokenizer(num_words=...) and used as the Embedding input size
embedding_dim = 16      # length of the vector learned for each word
max_length = 120        # every review is padded / truncated to this many tokens
trunc_type = 'post'     # 'post' = cut tokens from the end of an over-long review
oov_tok = "<OOV>"       # placeholder token for words outside the vocabulary

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
# Fit the tokenizer on the training sentences only, then turn those sentences
# into integer sequences.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
# `padded` is the model input (X). `truncating` controls what happens to a
# sequence longer than maxlen: the excess is cut off, and because
# trunc_type = 'post' it is cut from the end of the sequence.
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

In [13]:
# Encode the test sentences with the tokenizer fitted on the training data and
# pad them exactly like the training sequences. `truncating=trunc_type` is
# passed explicitly: without it pad_sequences falls back to its default
# ('pre'), so over-long test reviews would lose their beginning while
# training reviews lose their end.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

In [14]:
# Baseline classifier built around an Embedding layer.
# The idea behind the embedding: words that appear in similar contexts end up
# close to each other, so words carrying the same sentiment can be represented
# by similar vectors in a higher-dimensional space. During training those
# vectors drift together because they share labels, which lets the network
# relate vectors to labels.
# For each review the embedding produces a 2-D array of shape
# (max_length, embedding_dim).
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_length,
    ),
    # Flatten the 2-D embedding output the same way one would flatten an
    # image; GlobalAveragePooling1D() is an alternative, shown in model2.
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=6, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [15]:
# Print the layer-by-layer output shapes and parameter counts of `model`.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Sanity check: one row per training review, max_length columns.
padded.shape

(25000, 120)

In [28]:
# Functional-API variant of the pooled classifier
# (Embedding -> GlobalAveragePooling1D -> Dense -> Dense).
#
# It is bound to its own name, `model_functional`, so that it does not replace
# the Flatten-based `model` that the compile / fit / embedding-export cells
# below operate on. The input tensor is called `inputs` to avoid shadowing the
# builtin `input`, and its shape is a real one-element tuple derived from
# max_length.
inputs = tf.keras.Input(shape=(max_length,))
x = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length)(inputs)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Dense(6, activation='relu')(x)
x = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_functional = tf.keras.Model(inputs=inputs, outputs=x)

model_functional.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 120)]             0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d_4 ( (None, 16)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 6)                 102       
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 7         
Total params: 160,109
Trainable params: 160,109
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Second classifier: identical to `model` except that the embedding output is
# reduced with GlobalAveragePooling1D instead of Flatten.
model2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_length,
    ),
    # Average over the sequence axis, leaving one value per embedding dimension.
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units=6, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [17]:
# Compare the output shape of the Flatten layer in `model` with the output
# shape of GlobalAveragePooling1D here: the pooled output is much smaller, so
# the following Dense layer needs far fewer parameters and the whole model
# should be leaner and quicker to run.
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 102       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 7         
Total params: 160,109
Trainable params: 160,109
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Compile the Flatten-based model: binary cross-entropy loss (single sigmoid
# output), Adam optimizer, accuracy reported during training.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [19]:
# Compile the pooling-based model with the same loss, optimizer and metric as
# `model`.
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])

In [20]:
# Train the Flatten-based model and validate on the test split after every
# epoch. The History object returned by fit() is kept in `history` so the
# per-epoch metrics remain available afterwards (and so the cell does not end
# by echoing the object's repr).
num_epochs = 10
history = model.fit(
    padded,
    training_labels_final,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels_final),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f22f0021fd0>

In [21]:
# Train the pooling-based model with the same data and the same number of
# epochs as `model` (num_epochs is defined in the preceding training cell).
# The returned History object is kept in `history2` for later inspection.
history2 = model2.fit(
    padded,
    training_labels_final,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels_final),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f2291c03a50>

## 更加深入地讨论词嵌入

In [25]:
# Fetch the learned embedding matrix from `model`.
# The Embedding layer is looked up by type instead of by position, so this
# still works when the first entry of model.layers is not the Embedding
# (for example an InputLayer, which has no weights).
e = next(
    layer for layer in model.layers
    if isinstance(layer, tf.keras.layers.Embedding)
)
weights = e.get_weights()[0]
# Shape is (vocab_size, embedding_dim): 10000 words, each mapped to one
# 16-dimensional vector.
weights.shape

(10000, 16)

In [None]:
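# To visualise the embeddings we need to look words up by their integer index, so the keys and values of the word_index dictionary are swapped in the next cell.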

In [26]:
# Invert word_index (word -> integer index) into index -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

In [27]:
# Export the embeddings as two line-aligned TSV files:
#   meta.tsv - one word per line
#   vecs.tsv - that word's embedding, components separated by tabs
import io

# Index 0 is skipped (the loop starts at 1, as before). The upper bound is
# clamped so the loop never asks for an index that has no word in
# reverse_word_index or no row in `weights`.
last_index = min(vocab_size, len(reverse_word_index) + 1, len(weights))

# `with` guarantees both files are closed even if a write fails part-way.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word_num in range(1, last_index):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + '\n')  # the word itself
        out_v.write('\t'.join(str(x) for x in embeddings) + '\n')  # its vector

In [28]:
# Finally, open https://projector.tensorflow.org and load vecs.tsv and meta.tsv there to visualise the embeddings.