In [17]:
from tensorflow.keras import layers as L
from tensorflow.keras.models import Model

### Links

1. Difference between Embedding Layer and Dense Layer
    * https://medium.com/logivan/neural-network-embedding-and-dense-layers-whats-the-difference-fa177c6d0304
    
2. Source
    * https://www.tensorflow.org/tutorials/text/word_embeddings

### What Do the Different Layers Do

In [1]:
from tensorflow.keras import layers as L

import numpy as np

In [2]:
# Lookup table with 10 rows (token ids 0-9), each a 2-dimensional vector.
embedding = L.Embedding(10, 2)
# Two sequences of three token ids each -> output of shape (2, 3, 2).
token_ids = np.array([[0, 1, 9], [5, 3, 4]])
embedding(token_ids)

<tf.Tensor: shape=(2, 3, 2), dtype=float32, numpy=
array([[[ 0.02515623, -0.03187831],
        [-0.01517564,  0.00728263],
        [-0.00040847,  0.02596793]],

       [[ 0.03853666, -0.02476275],
        [-0.00047541, -0.04866255],
        [ 0.00736698, -0.03209202]]], dtype=float32)>

In [3]:
# Average the embedded vectors over the sequence axis: (2, 3, 2) -> (2, 2).
pooling = L.GlobalAveragePooling1D()
embedded = embedding(np.array([[0, 1, 9], [5, 3, 4]]))
pooling(embedded)

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[ 0.00319071,  0.00045742],
       [ 0.01514274, -0.03517244]], dtype=float32)>

In [4]:
# Same pooling, but with data_format='channels_first' the last axis is averaged
# instead, so the (2, 3, 2) input collapses to (2, 3).
# Note: this rebinds `pooling` from the previous cell.
pooling = L.GlobalAveragePooling1D(data_format='channels_first')
embedded = embedding(np.array([[0, 1, 9], [5, 3, 4]]))
pooling(embedded)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[-0.00336104, -0.00394651,  0.01277973],
       [ 0.00688695, -0.02456898, -0.01236252]], dtype=float32)>

In [5]:
from tensorflow import keras
from tensorflow.keras import layers as L
import tensorflow as tf

from tensorflow.keras.models import Model

In [6]:
import tensorflow_datasets as tfds

In [7]:
# Load the subword-encoded IMDB reviews as (text, label) pairs, together with
# the dataset metadata that carries the text encoder.
splits, info = tfds.load(
    'imdb_reviews/subwords8k',
    split=(tfds.Split.TRAIN, tfds.Split.TEST),
    with_info=True,
    as_supervised=True,
)
train_data, test_data = splits

# Subword encoder used to turn review text into integer ids (and back).
encoder = info.features['text'].encoder



In [8]:
# Shuffle with a 1000-example buffer, then group into batches of 10 reviews.
# Reviews are padded to the longest one in the batch ([None]); labels are scalars ([]).
padded_shapes = ([None], [])
train_batches = train_data.shuffle(1000).padded_batch(10, padded_shapes=padded_shapes)
test_batches = test_data.shuffle(1000).padded_batch(10, padded_shapes=padded_shapes)

In [9]:
# Pull one batch to inspect the padded token ids (padding shows up as zeros).
batch_iter = iter(train_batches)
train_batch, train_labels = next(batch_iter)
train_batch.numpy()

array([[ 519,  198,   46, ...,    0,    0,    0],
       [7984, 7986, 7976, ...,    0,    0,    0],
       [3187,   89, 6545, ...,    0,    0,    0],
       ...,
       [ 968,   47, 3142, ...,    0,    0,    0],
       [ 601, 6144, 8002, ...,    0,    0,    0],
       [ 156,  151, 7968, ...,    6, 1405, 7975]])

In [84]:
# Show the first 10 token ids and the label of one unbatched training example.
for train_example, train_label in train_data.take(1):
    encoded_prefix = train_example[:10].numpy()
    print('Encoded text:', encoded_prefix)
    print('Label:', train_label.numpy())

Encoded text: [  62   18   41  604  927   65    3  644 7968   21]
Label: 0


In [85]:
# Print the shapes of two batches; the padded sequence length differs per batch.
for example_batch, label_batch in train_batches.take(2):
    batch_shape, label_shape = example_batch.shape, label_batch.shape
    print("Batch shape:", batch_shape)
    print("label shape:", label_shape)

Batch shape: (10, 1352)
label shape: (10,)
Batch shape: (10, 712)
label shape: (10,)


In [10]:

EMBEDDING_DIM = 16
VOCAB_SIZE = encoder.vocab_size

# Bag-of-embeddings classifier: embed each token, average over the sequence,
# pass through one hidden layer, and emit a single raw score (no activation).
model = keras.Sequential()
model.add(L.Embedding(VOCAB_SIZE, EMBEDDING_DIM))
model.add(L.GlobalAveragePooling1D())
model.add(L.Dense(16, activation='relu'))
model.add(L.Dense(1))


In [11]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 16)          130960    
_________________________________________________________________
global_average_pooling1d_2 ( (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 131,249
Trainable params: 131,249
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Parameter count of the hidden Dense(16) layer: 16 inputs x 16 units + 16 biases.
16*16 + 16

272

In [12]:
# Parameter count of the Embedding layer: one EMBEDDING_DIM-sized vector per vocabulary id.
VOCAB_SIZE*EMBEDDING_DIM

130960

In [19]:
# Number of distinct token ids reported by the encoder (encoder.vocab_size).
VOCAB_SIZE

8185

In [14]:

# The final Dense(1) layer has no activation, so the loss must treat its
# output as a logit rather than a probability.
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])

# Train for 5 epochs; each validation pass uses only 20 test batches.
history = model.fit(
    train_batches,
    epochs=5,
    validation_data=test_batches,
    validation_steps=20,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [131]:
#info.description
#info.download_size
#info.supervised_keys
#info.features

### Visualizing Word Embeddings

In [15]:
# List the model's layers in order; the Embedding layer comes first.
model.layers

[<tensorflow.python.keras.layers.embeddings.Embedding at 0x13b3351d0>,
 <tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D at 0x10330fb90>,
 <tensorflow.python.keras.layers.core.Dense at 0x13c40d910>,
 <tensorflow.python.keras.layers.core.Dense at 0x13c34af90>]

In [16]:
# Layer 0 of the Sequential model is the Embedding layer.
e = model.layers[0]

In [92]:
# The Embedding layer's first (index 0) weight array is the embedding matrix.
embedding_weights = e.get_weights()
weights = embedding_weights[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(8185, 16)


In [95]:
import random
# Quick check of random.choice, which the export cell below uses to pick a placeholder class label.
random.choice(['A','B','C','D'])

'C'

In [96]:
import io

encoder = info.features['text'].encoder

# Export the trained embeddings as two tab-separated files: one vector per
# line in the "vecs" file and the matching word/class row in the "meta" file.
# Opening both files in a single `with` guarantees they are flushed and closed
# even if a write fails part-way through.
with io.open('../big-files/word-embeddings-vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('../big-files/word-embeddings-meta.tsv', 'w', encoding='utf-8') as out_m:
    # Header row naming the two metadata columns.
    out_m.write("\t".join(["word","class"]) + "\n")

    for num, word in enumerate(encoder.subwords):
        vec = weights[num+1] # skip 0, it's padding.
        # The "class" column is a random placeholder label, not a learned property.
        out_m.write("\t".join([word,random.choice(['A','B','C','D'])]) + "\n")
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")

### Visualize the embeddings
To visualize our embeddings, upload them to the Embedding Projector.

Open the Embedding Projector (this can also run in a local TensorBoard instance).

* Click on "Load data".

* Upload the two files created above: word-embeddings-vecs.tsv and word-embeddings-meta.tsv.

The embeddings you have trained will now be displayed. You can search for words to find their closest neighbors. For example, try searching for "beautiful". You may see neighbors like "wonderful".

In [97]:
!head -5 word-embeddings-meta.tsv

word	class
the_	C
, 	B
. 	B
a_	A


In [98]:
!head -5 word-embeddings-vecs.tsv

0.09168589	-0.03764703	0.13426588	0.045826133	0.10915152	0.12124654	0.057822347	0.105642885	0.06465526	-0.08695891	0.0044927956	0.060063865	-0.048089758	0.009972526	0.017446194	0.05941992
-0.011089035	-0.02632265	0.08001026	0.026919086	0.020393694	0.11729438	-0.025492614	0.057734393	0.073117696	-0.07000069	-0.010621369	0.0038523786	0.0051375018	0.0068875127	0.017043302	-0.036084823
-0.014711189	-0.0068150936	-0.021027738	0.12178983	-0.010772438	0.077707574	-0.05735729	0.02820964	0.09139164	0.031960886	-0.0079053715	0.047761936	0.030932683	0.0064102286	0.030124169	-0.08493775
-0.03751501	-0.032466944	0.041288	0.09144153	0.064260535	0.095885985	0.0018964682	0.117076196	0.05275915	0.016736258	-0.048899982	0.010198308	-0.0040065846	-0.0042498633	-0.001717417	0.00035502313
0.105565146	-0.040354658	0.13522404	-0.010742345	0.15480578	0.16393423	0.036755394	0.10789068	0.1774977	-0.1026288	0.010162583	0.0900577	-0.09602754	-0.076360814	0.07845271	0.062755905


In [29]:
# Number of learned subwords; smaller than the vocabulary size reported by encoder.vocab_size.
len(encoder.subwords)

7928

In [88]:
# Embedding matrix shape: (vocab_size, embedding_dim).
weights.shape

(8185, 16)

In [52]:
# Position of 'the_' in the subword list (the export cell pairs position num with embedding row num+1).
encoder.subwords.index('the_')

0

In [89]:
# Encode a nonsense string to see which ids the encoder produces for text it has no whole-word entry for.
_a = encoder.encode("theyq1111111opiopjwoqjdoiwj")
_a

[466,
 8042,
 5191,
 5191,
 5191,
 7978,
 5656,
 1096,
 8035,
 1647,
 8042,
 8035,
 7554,
 8048,
 8035]

In [90]:
# Decoding each id on its own and concatenating the pieces reproduces the original string.
''.join([encoder.decode([x]) for x in _a])

'theyq1111111opiopjwoqjdoiwj'

In [61]:
# NOTE(review): the export cell pairs subword position num with embedding row num+1
# (id 0 is padding), so indexing encoder.subwords directly with a token id probably
# returns the *next* subword; use x - 1 to get the subword an id encodes — confirm.
[encoder.subwords[x] for x in [19, 466, 8, 5191]]

['as_', '10_', 'is_', 'yell']

In [46]:
# 7929 is the first id past the 7928 subwords (ids 1-7928); it appears to be a raw-byte token: byte 0x00.
encoder.decode([7929])

'\x00'

In [47]:
# Next id after the subwords' first byte token; decodes to byte 0x01.
encoder.decode([7930])

'\x01'

In [69]:
# 8024 - 7929 = 95, the byte value of '_', consistent with ids from 7929 up mapping to raw bytes.
encoder.decode([8024])

'_'

In [40]:
# 8184 is the last valid id (vocab size is 8185); presumably byte 0xFF, shown as the replacement character — TODO confirm.
encoder.decode([8184])

'�'

In [91]:
# Second-to-last id; also decodes to the replacement character, presumably a lone high byte — TODO confirm.
encoder.decode([8183])

'�'