This notebook's source was copied from https://www.tensorflow.org/text/guide/word_embeddings

In [10]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [11]:
import io
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

In [12]:
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

# Download the IMDB review archive and untar it under the /tmp/.keras cache.
dataset = tf.keras.utils.get_file(
    fname='aclImdb_v1.tar.gz',
    origin=url,
    untar=True,
    cache_dir='/tmp/.keras',
    cache_subdir='./aclImdb_v1',
)

# The extracted data sits in an 'aclImdb' folder next to the returned path.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
print(dataset_dir)
os.listdir(dataset_dir)

/tmp/.keras/./aclImdb_v1/aclImdb


['README', 'imdb.vocab', 'imdbEr.txt', 'train', 'test']

In [13]:
# Locate the 'train' folder of the extracted dataset and list its contents.
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['neg',
 'pos',
 'unsupBow.feat',
 'urls_pos.txt',
 'urls_neg.txt',
 'urls_unsup.txt',
 'labeledBow.feat',
 'unsup']

In [14]:
# Drop the 'unsup' folder from the training directory before loading it
# (presumably so that only the 'neg' and 'pos' class folders remain).
remove_dir = os.path.join(train_dir, 'unsup')

# Guard the delete: once the folder has been removed, re-running this cell
# would otherwise raise FileNotFoundError.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

In [16]:
batch_size = 1024
seed = 123


def process_text_dataset(subset):
    """Load one subset of the reviews stored under the dataset's 'train' folder.

    Args:
        subset: Name of the subset to return; this notebook passes
            'training' and 'validation'.

    Returns:
        The dataset built by `text_dataset_from_directory`, batched with the
        module-level `batch_size` and split with `validation_split=0.2` and
        the module-level `seed`.
    """
    reviews_dir = os.path.join(dataset_dir, 'train')
    return tf.keras.preprocessing.text_dataset_from_directory(
        reviews_dir,
        batch_size=batch_size,
        validation_split=0.2,
        subset=subset,
        seed=seed,
    )


# Both subsets are created with the same seed and validation_split.
train_ds = process_text_dataset(subset='training')
val_ds = process_text_dataset(subset='validation')

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [18]:
# Print the first few (label, review text) pairs from one training batch.
for text_batch, label_batch in train_ds.take(1):
    # Convert each batch tensor to NumPy once rather than on every iteration.
    texts = text_batch.numpy()
    labels = label_batch.numpy()
    # Bound the loop by the batch length so a batch with fewer than five
    # examples does not raise an IndexError.
    for i in range(min(5, len(texts))):
        print(labels[i], texts[i])

0 b"Wow. Some movies just leave me speechless. This was undeniably one of those movies. When I left the theatre, not a single word came to my mouth. All I had was an incredible urge to slam my head against the theatre wall to help me forget about the last hour and a half. Unfortunately, it didn't work. Honestly, this movie has nothing to recommend. The humor was at the first grade level, at best, the acting was overly silly, and the plot was astronomically far-fetched. I hearby pledge never to see an other movie starring Chris Kattan or any other cast-member of SNL."
1 b'If any show in the last ten years deserves a 10, it is this rare gem. It allows us to escape back to a time when things were simpler and more fun. Filled with heart and laughs, this show keeps you laughing through the three decades of difference. The furniture was ugly, the clothes were colorful, and the even the drugs were tolerable. The hair was feathered, the music was accompanied by roller-skates, and in the words 

In [19]:
AUTOTUNE = tf.data.AUTOTUNE

# Add caching and prefetching (with an auto-tuned buffer size) to both
# input pipelines.
train_ds = train_ds.cache().prefetch(AUTOTUNE)
val_ds = val_ds.cache().prefetch(AUTOTUNE)

In [20]:
# Demo layer: embeds integer indices from a 1,000-entry vocabulary into
# 5-dimensional vectors.
embedding_layer = Embedding(input_dim=1000, output_dim=5)

In [21]:
# Look up the embedding vectors for three integer indices.
token_ids = tf.constant([1, 2, 3])
result = embedding_layer(token_ids)
result.numpy()

array([[-0.03438661,  0.00320473,  0.02156866, -0.02502697,  0.01475221],
       [-0.04914664, -0.005404  , -0.03327751,  0.00777481,  0.02983249],
       [-0.02312477, -0.03747673, -0.01817928,  0.02494278,  0.01719629]],
      dtype=float32)

In [22]:
# A 2 x 3 batch of indices: the output gains a trailing axis of size 5,
# the embedding dimension of the demo layer.
token_id_batch = tf.constant([[0, 1, 2], [3, 4, 5]])
result = embedding_layer(token_id_batch)
result.shape

TensorShape([2, 3, 5])

In [24]:
# Custom standardization: lowercase the text, replace HTML break tags
# '<br />' with a space, then strip punctuation.
def custom_standardization(input_data):
  """Normalize raw review text.

  Args:
    input_data: String tensor (or value accepted by `tf.strings.lower`)
      holding the raw text.

  Returns:
    The text lowercased, with every '<br />' replaced by a single space and
    every character in `string.punctuation` removed.
  """
  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, r'<br />', r' ')
  # Character class matching any single punctuation character.
  punctuation_pattern = f'[{re.escape(string.punctuation)}]'
  return tf.strings.regex_replace(text, punctuation_pattern, '')


# Vocabulary size and number of tokens kept per sequence.
vocab_size = 10000
sequence_length = 100

# The text vectorization layer normalizes, splits, and maps strings to
# integers, using the custom standardization function defined above.
# output_sequence_length is fixed because the samples differ in length.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int',
)

# Build the vocabulary from a text-only view of the training data
# (the labels are dropped).
text_ds = train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(text_ds)

In [25]:
embedding_dim = 16

# Pipeline: raw strings -> integer token ids -> embedding vectors ->
# average over the sequence -> small dense head with a single output unit.
model = Sequential([
    vectorize_layer,
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='embedding'),
    GlobalAveragePooling1D(),
    Dense(units=16, activation='relu'),
    Dense(units=1),
])

In [26]:
# Callback that writes training logs to the 'logs' directory for TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='logs')

In [27]:
# The loss is configured with from_logits=True; the model's final Dense(1)
# layer is created without an activation.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [28]:
# Train for 15 epochs, validating on val_ds and logging through the
# TensorBoard callback.
model.fit(
    x=train_ds,
    epochs=15,
    validation_data=val_ds,
    callbacks=[tensorboard_callback],
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f4734e1fcf8>

In [29]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_1 (TextVe (None, 100)               0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Load the TensorBoard notebook extension and open it on the 'logs'
# directory, the same log_dir given to the TensorBoard callback.
%load_ext tensorboard
%tensorboard --logdir logs

In [31]:
# Take the first weight array of the layer named 'embedding' and the
# vocabulary of the vectorization layer.
weights = model.get_layer(name='embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [33]:
# Export the embeddings as two parallel TSV files: one tab-separated vector
# per line in vectors.tsv and the matching word per line in metadata.tsv.
with open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
        open('metadata.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(vocab):
        # Index 0 is padding, so only later entries are written.
        if index > 0:
            vec = weights[index]
            out_v.write('\t'.join(map(str, vec)) + "\n")
            out_m.write(word + "\n")
