In [52]:
import io
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization


In [53]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset ='/home/turning/Desktop/CL_project/CL_PROJECT_CODE/Kerasmodel/aclImdb'

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

['README', 'train']

In [54]:
train_dir = '/home/turning/Desktop/CL_project/CL_PROJECT_CODE/Kerasmodel/aclImdb/train/'
os.listdir(train_dir)

['pos']

In [23]:
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

FileNotFoundError: [Errno 2] No such file or directory: '/home/turning/Desktop/CL_project/CL_PROJECT_CODE/Kerasmodel/aclImdb/train/unsup'

In [55]:
batch_size = 8
seed = 123
train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train', batch_size=batch_size, validation_split=0.2,
    subset='training', seed=seed)
val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train', batch_size=batch_size, validation_split=0.2,
    subset='validation', seed=seed)

Found 100 files belonging to 1 classes.
Using 80 files for training.
Found 100 files belonging to 1 classes.
Using 20 files for validation.


In [56]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [57]:
# Embed a 1,000 word vocabulary into 5 dimensions.
embedding_layer = tf.keras.layers.Embedding(100000, 5)

In [58]:
result = embedding_layer(tf.constant([1, 2, 3]))
result.numpy()

array([[ 0.0324355 ,  0.04094839, -0.00080473, -0.01767687,  0.01185206],
       [ 0.02696994,  0.04724102,  0.03067691, -0.04152416, -0.02323771],
       [ 0.00131531,  0.02184465, -0.01064578,  0.01855883, -0.04097866]],
      dtype=float32)

In [59]:
result = embedding_layer(tf.constant([[0, 1, 2], [3, 4, 5]]))
result.shape

TensorShape([2, 3, 5])

In [60]:
# Create a custom standardization function to strip HTML break tags '<br />'.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation), '')


# Vocabulary size and number of words in a sequence.
vocab_size = 1000000
sequence_length = 100

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Note that the layer uses the custom standardization defined above.
# Set maximum_sequence length as all samples are not of the same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
text_ds = train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)

In [61]:
embedding_dim=10

model = Sequential([
  vectorize_layer,
  Embedding(vocab_size, embedding_dim, name="embedding"),
  GlobalAveragePooling1D(),
  Dense(16, activation='relu'),
  Dense(1)
])

In [62]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [63]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [64]:
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15,
    callbacks=[tensorboard_callback])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f9bd4f1a500>

In [65]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_3 (TextV  (None, 100)              0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 100, 10)           10000000  
                                                                 
 global_average_pooling1d_3   (None, 10)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_6 (Dense)             (None, 16)                176       
                                                                 
 dense_7 (Dense)             (None, 1)                 17        
                                                                 
Total params: 10,000,193
Trainable params: 10,000,193


In [66]:
#docs_infra: no_execute
%load_ext tensorboard
%tensorboard --logdir logs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 33865), started 0:28:52 ago. (Use '!kill 33865' to kill it.)

In [67]:
weights = model.get_layer('embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [68]:
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

In [None]:
try:
  from google.colab import files
  files.download('vectors.tsv')
  files.download('metadata.tsv')
except Exception:
  pass

 [-0.00649636  0.04931908  0.04416443  0.0145332  -0.0104395 ]
[UNK] [-0.02212589  0.0263616  -0.00203462  0.00642748  0.03951794]
के [ 0.01458243 -0.02597545  0.06404933  0.01662998  0.01019375]
में [-0.03535331  0.03821914  0.02990494  0.0402509   0.00597337]
है। [0.03445628 0.0399145  0.02793489 0.04120619 0.04558567]
की [-0.01392644  0.05049116  0.02797532  0.04080502  0.02595929]
से [ 0.0376856   0.03811477 -0.0100725   0.05295699 -0.02420089]
और [-0.02225162  0.05393459  0.04259962 -0.01922766 -0.01962424]
है [ 0.06035469 -0.00850885  0.03377704  0.03438944  0.01555839]
का [ 0.01605194 -0.02025693  0.04111551  0.01542283 -0.02033423]
को [ 0.03321993  0.00808049  0.06103184 -0.03026696  0.02218003]
हैं। [-0.00735364 -0.01693727  0.03644471  0.03153728 -0.03026784]
पर [ 0.06354017 -0.03256369 -0.01883302  0.06398392  0.02532097]
हैं [ 0.04831302  0.03880343 -0.02824742  0.05528825  0.05123637]
एक [ 0.02989139 -0.01358277  0.0132876  -0.0080367   0.02216794]
तथा [ 0.04341233 -0.0346