In [0]:
!pip install -q tensorflow==2.1.0

[K     |████████████████████████████████| 421.8MB 36kB/s 
[K     |████████████████████████████████| 450kB 52.5MB/s 
[K     |████████████████████████████████| 3.9MB 60.0MB/s 
[?25h  Building wheel for gast (setup.py) ... [?25l[?25hdone


#Text Generation

In [0]:
import os
import numpy as np
import re
import shutil
import tensorflow as tf


In [0]:
DATA_DIR  = "./data"
CHECKPOINT_DIR = os.path.join(DATA_DIR, "checkpoints")

In [0]:
def download_and_read(urls):
  texts = []
  for i, url in enumerate(urls):
    p = tf.keras.utils.get_file("ex1-{:d}.txt".format(i), url, cache_dir=".")
    text = open(p, "r").read()
    # byte remove order mark
    text = text.replace("\ufeff", "")
    # remove newlines
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', " ", text)
    #add to its list
    texts.extend(text)
  return texts



In [0]:
texts = download_and_read([
                           "http://www.gutenberg.org/cache/epub/28885/pg28885.txt",
                           "https://www.gutenberg.org/files/12/12-0.txt"
])

Downloading data from http://www.gutenberg.org/cache/epub/28885/pg28885.txt
Downloading data from https://www.gutenberg.org/files/12/12-0.txt


In [0]:
# create the vocabulary
vocab = sorted(set(texts))
print("vocab size: {:d}".format(len(vocab)))

# create mapping from vocab chars to ints
char2idx = {c:i for i, c in enumerate(vocab)}
idx2char = {i:c for c,i in char2idx.items()} 

vocab size: 90
ERROR! Session/line number was not unique in database. History logging moved to new session 59


In [0]:
for i, c in idx2char.items():
  if i < 50:
    print(i, c)

0  
1 !
2 "
3 #
4 $
5 %
6 &
7 '
8 (
9 )
10 *
11 ,
12 -
13 .
14 /
15 0
16 1
17 2
18 3
19 4
20 5
21 6
22 7
23 8
24 9
25 :
26 ;
27 ?
28 @
29 A
30 B
31 C
32 D
33 E
34 F
35 G
36 H
37 I
38 J
39 K
40 L
41 M
42 N
43 O
44 P
45 Q
46 R
47 S
48 T
49 U


In [0]:
vocab

[' ',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '·',
 'ù',
 '‘',
 '’',
 '“',
 '”']

In [0]:
# numericize the text
texts_as_ints = np.array([char2idx[c] for c in texts])
data = tf.data.Dataset.from_tensor_slices(texts_as_ints)

# number of characters to show before asking for prediction
# sequences: [None, 100]
seq_length = 100
sequences = data.batch(seq_length + 1, drop_remainder=True)

def split_train_labels(sequence):
  input_seq = sequence[0:-1]
  output_seq = sequence[1:]
  return input_seq, output_seq


sequences = sequences.map(split_train_labels)

# set up for training
# batches: [None, 64, 100]

batch_size = 64
steps_per_epoch = len(texts) // seq_length // batch_size
dataset = sequences.shuffle(10000).batch(batch_size, drop_remainder=True)

In [0]:
type(sequences)

tensorflow.python.data.ops.dataset_ops.MapDataset

In [0]:
sequencesshape = [i for i in sequences.as_numpy_iterator()]

In [0]:
len(sequencesshape)

3423

In [0]:
sequencesshape[0]

(array([44, 75, 72, 67, 62, 60, 77,  0, 35, 78, 77, 62, 71, 59, 62, 75, 64,
         7, 76,  0, 29, 69, 66, 60, 62,  7, 76,  0, 29, 61, 79, 62, 71, 77,
        78, 75, 62, 76,  0, 66, 71,  0, 51, 72, 71, 61, 62, 75, 69, 58, 71,
        61, 11,  0, 59, 82,  0, 40, 62, 80, 66, 76,  0, 31, 58, 75, 75, 72,
        69, 69,  0, 48, 65, 66, 76,  0, 62, 30, 72, 72, 68,  0, 66, 76,  0,
        63, 72, 75,  0, 77, 65, 62,  0, 78, 76, 62,  0, 72, 63,  0]),
 array([75, 72, 67, 62, 60, 77,  0, 35, 78, 77, 62, 71, 59, 62, 75, 64,  7,
        76,  0, 29, 69, 66, 60, 62,  7, 76,  0, 29, 61, 79, 62, 71, 77, 78,
        75, 62, 76,  0, 66, 71,  0, 51, 72, 71, 61, 62, 75, 69, 58, 71, 61,
        11,  0, 59, 82,  0, 40, 62, 80, 66, 76,  0, 31, 58, 75, 75, 72, 69,
        69,  0, 48, 65, 66, 76,  0, 62, 30, 72, 72, 68,  0, 66, 76,  0, 63,
        72, 75,  0, 77, 65, 62,  0, 78, 76, 62,  0, 72, 63,  0, 58]))

In [0]:
sequencesshape[3421]

(array([78, 77,  0, 44, 75, 72, 67, 62, 60, 77,  0, 35, 78, 77, 62, 71, 59,
        62, 75, 64, 12, 77, 70, 11,  0, 66, 71, 60, 69, 78, 61, 66, 71, 64,
         0, 65, 72, 80,  0, 77, 72,  0, 70, 58, 68, 62,  0, 61, 72, 71, 58,
        77, 66, 72, 71, 76,  0, 77, 72,  0, 77, 65, 62,  0, 44, 75, 72, 67,
        62, 60, 77,  0, 35, 78, 77, 62, 71, 59, 62, 75, 64,  0, 40, 66, 77,
        62, 75, 58, 75, 82,  0, 29, 75, 60, 65, 66, 79, 62,  0, 34]),
 array([77,  0, 44, 75, 72, 67, 62, 60, 77,  0, 35, 78, 77, 62, 71, 59, 62,
        75, 64, 12, 77, 70, 11,  0, 66, 71, 60, 69, 78, 61, 66, 71, 64,  0,
        65, 72, 80,  0, 77, 72,  0, 70, 58, 68, 62,  0, 61, 72, 71, 58, 77,
        66, 72, 71, 76,  0, 77, 72,  0, 77, 65, 62,  0, 44, 75, 72, 67, 62,
        60, 77,  0, 35, 78, 77, 62, 71, 59, 62, 75, 64,  0, 40, 66, 77, 62,
        75, 58, 75, 82,  0, 29, 75, 60, 65, 66, 79, 62,  0, 34, 72]))

In [0]:
steps_per_epoch

54

In [0]:
class CharGenModel(tf.keras.Model):
  def __init__(self, vocab_size, num_timesteps, embedding_dim, rnn_output_dim, **kwargs):
    super(CharGenModel, self).__init__(**kwargs)
    self.embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.rnn_layer = tf.keras.layers.GRU(num_timesteps, recurrent_initializer="glorot_uniform", recurrent_activation="sigmoid", stateful=True, return_sequences=True)
    self.dense_layer = tf.keras.layers.Dense(vocab_size)

  def call(self, x):
    x = self.embedding_layer(x)
    x = self.rnn_layer(x)
    x = self.dense_layer(x)
    return x

  
vocab_size = len(vocab)
embedding_dim = 256
rnn_output_dim = 1024



model = CharGenModel(vocab_size, seq_length, embedding_dim, rnn_output_dim)
model.build(input_shape=(batch_size, seq_length))

  





In [0]:
def loss(labels, predictions):
  return tf.losses.sparse_categorical_crossentropy(labels, predictions, from_logits=True)

model.compile(optimizer=tf.optimizers.Adam(), loss=loss)

In [0]:
def generate_text(model, prefix_string, char2idx, idx2char, num_chars_to_generate=1000, temperature=1.0):
  input = [char2idx[s] for s in prefix_string]
  input = tf.expand_dims(input, 0)
  text_generated = []
  model.reset_states()
  for i in range(num_chars_to_generate):
    preds = model(input)
    preds = tf.squeeze(preds, 0) / temperature
    # predict characters by model
    pred_id = tf.random.categorical(preds, num_samples=1)[-1, 0].numpy()
    text_generated.append(idx2char[pred_id])
    # pass prediction to the next input model
    input = tf.expand_dims([pred_id], 0)

  return prefix_string + "".join(text_generated)

In [0]:
num_epochs = 50

for i in range(num_epochs // 10):
  model.fit(dataset.repeat(), epochs=10, steps_per_epoch=steps_per_epoch)
  checkpoint_file = os.path.join(CHECKPOINT_DIR, "model_epoch_{:d}".format(i+1))
  model.save_weights(checkpoint_file)

  # create generative model using the trained model so far
  gen_model = CharGenModel(vocab_size, seq_length, embedding_dim, rnn_output_dim)
  gen_model.load_weights(checkpoint_file)
  gen_model.build(input_shape=(1, seq_length))
  

  print("after epoch: {:d}".format(i+1) * 10)
  print(generate_text(gen_model, "Alice ", char2idx, idx2char))
  print("-------------")

Train for 54 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
after epoch: 1after epoch: 1after epoch: 1after epoch: 1after epoch: 1after epoch: 1after epoch: 1after epoch: 1after epoch: 1after epoch: 1
Alice her Proce at she himnsty softiong, ‘Dot on ast, see ithut il you that or dithought if dook I sty and of hak the carke wood, and hew!’ All may on one. HA Mack, arker clalupped inse, Lading nereathing dookrow uld gar!" "Why hean dow mint, and plasce the to seag." I's haining one vermerty Quever suy here it itp o‘Rew freace took, she look ararentres, do see rea beathone od the Witre woGeen't wares a cauld, and liestionm!’ And ong gool it you nathed all a glean’m copsed, dim the wastere. Igquatilying al up, homent--gades,_ and a pued swelf mad. I caje, the prims, she preaieg-- he nocrwenversh the do, wo thee bittle, as the, "She thite Quee tith the moor cwas that?" she sadding: wher! as the said id FI’ve of ovood. The

#**Sentiment Analysis**

In [0]:
import numpy as np
import os
import shutil
import tensorflow as tf

from sklearn.metrics import accuracy_score, confusion_matrix

In [0]:
def download_and_read(url):
    local_file = url.split('/')[-1]
    local_file = local_file.replace("%20", " ")
    p = tf.keras.utils.get_file(local_file, url, 
        extract=True, cache_dir=".")
    local_folder = os.path.join("datasets", local_file.split('.')[0])
    labeled_sentences = []
    for labeled_filename in os.listdir(local_folder):
        if labeled_filename.endswith("_labelled.txt"):
            with open(os.path.join(local_folder, labeled_filename), "r") as f:
                for line in f:
                    sentence, label = line.strip().split('\t')
                    labeled_sentences.append((sentence, label))
    return labeled_sentences


   

In [0]:
labeled_sentences = download_and_read(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip")
sentences = [s for (s, l) in labeled_sentences]
labels = [int(l) for (s, l) in labeled_sentences]

Downloading data from https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip


In [0]:
sentences[0:10]

['Wow... Loved this place.',
 'Crust is not good.',
 'Not tasty and the texture was just nasty.',
 'Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.',
 'The selection on the menu was great and so were the prices.',
 'Now I am getting angry and I want my damn pho.',
 "Honeslty it didn't taste THAT fresh.)",
 'The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.',
 'The fries were great too.',
 'A great touch.']

In [0]:
labels[:10]

[1, 0, 0, 1, 1, 0, 0, 0, 1, 1]

In [0]:
labeled_sentences[:5]

[('Wow... Loved this place.', '1'),
 ('Crust is not good.', '0'),
 ('Not tasty and the texture was just nasty.', '0'),
 ('Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.',
  '1'),
 ('The selection on the menu was great and so were the prices.', '1')]

In [0]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)
vocab_size = len(tokenizer.word_counts)
print("Vocabulary size: {:d}".format(vocab_size))

word2idx = tokenizer.word_index
idx2word = {v:k for (k, v) in word2idx.items()}

Vocabulary size: 5271


In [0]:
idx2word[2]

'and'

In [0]:
word2idx['and']

2

In [0]:
type(word2idx)

dict

In [0]:
seq_lengths = np.array([len(s.split()) for s in sentences])
print([(p, np.percentile(seq_lengths, p)) for p in [75, 80, 90, 95, 99, 100]])

[(75, 16.0), (80, 18.0), (90, 22.0), (95, 26.0), (99, 36.0), (100, 71.0)]


In [0]:
max_length = 64

# create a dataset
sentences_as_ints = tokenizer.texts_to_sequences(sentences)
sentences_as_ints[:2]

[[652, 215, 8, 38], [1486, 5, 13, 18]]

In [0]:
sentences_as_ints = tf.keras.preprocessing.sequence.pad_sequences(sentences_as_ints, max_length)
labels_as_ints = np.array(labels)

In [0]:
dataset = tf.data.Dataset.from_tensor_slices((sentences_as_ints, labels_as_ints))

In [0]:
dataset = dataset.shuffle(10000)
test_size = len(sentences) // 3
val_size = (len(sentences) - test_size) // 10
test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size + val_size)

batch_size = 64
train_dataset = train_dataset.batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

In [0]:
class SentimentAnalysisModel(tf.keras.Model):
  def __init__(self, vocab_size, max_seqlen, **kwargs):
    super(SentimentAnalysisModel, self).__init__(**kwargs)
    self.embedding = tf.keras.layers.Embedding(vocab_size, max_length)
    self.bilstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(max_length))
    self.dense = tf.keras.layers.Dense(64, activation="relu")
    self.out =  tf.keras.layers.Dense(1, activation="sigmoid")


  def call(self, x):
    x = self.embedding(x)
    x = self.bilstm(x)
    x = self.dense(x)
    x = self.out(x)
    return x



In [0]:
model = SentimentAnalysisModel(vocab_size + 1, max_length)
model.build(input_shape=(batch_size, max_length))
model.summary()

Model: "sentiment_analysis_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  337408    
_________________________________________________________________
bidirectional (Bidirectional multiple                  66048     
_________________________________________________________________
dense (Dense)                multiple                  8256      
_________________________________________________________________
dense_1 (Dense)              multiple                  65        
Total params: 411,777
Trainable params: 411,777
Non-trainable params: 0
_________________________________________________________________


In [0]:
# compile
model.compile(
    loss= "binary_crossentropy",
    optimizer = "adam",
    metrics = ["accuracy"]
)

In [0]:
def clean_logs(data_dir):
    logs_dir = os.path.join(data_dir, "logs")
    shutil.rmtree(logs_dir, ignore_errors=True)
    return logs_dir

In [0]:
data_dir = "./data"
logs_dir = clean_logs(data_dir)

In [0]:
# train
best_model_file = os.path.join(data_dir, "best_model.h5")
checkpoint = tf.keras.callbacks.ModelCheckpoint(best_model_file,
    save_weights_only=True,
    save_best_only=True)
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=logs_dir)
num_epochs = 10
history = model.fit(train_dataset, epochs=num_epochs, 
    validation_data=val_dataset,
    callbacks=[checkpoint, tensorboard])

Train for 29 steps, validate for 4 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
!tensorboard --logdir  /content/data/logs/train

2020-03-30 20:14:19.745135: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2020-03-30 20:14:19.745249: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2020-03-30 20:14:19.745267: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:30] Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.1.1 at http://localhost:6007/ (Press CTRL+C to q

In [0]:
!pip install tensorboardcolab



In [0]:
from tensorboardcolab import TensorBoardColab, TensorBoardColabCallback

tbc=TensorBoardColab()

Using TensorFlow backend.


Wait for 8 seconds...
TensorBoard link:
https://e42cf188.ngrok.io


In [0]:
test_loss, test_acc = model.evaluate(test_dataset)
print("test loss: {:3f}, test_accuracy: {:3f} ".format(test_loss, test_acc))


test loss: 0.003010, test_accuracy: 0.999000 


In [0]:
labels, predictions = [], []
idx2word[0] = "PAD"
is_first_batch = True
for test_batch in test_dataset:
  inputs_b, labels_b = test_batch
  pred_batch = model.predict(inputs_b)
  predictions.extend([(1 if p > 0.5 else 0) for p in pred_batch])
  labels.extend([l for l in labels_b])
  if is_first_batch:
    # print first batch of label, prediction and sentence
    for rid in range(inputs_b.shape[0]):
      words = [idx2word[idx] for idx in inputs_b[rid].numpy()]
      words = [w for w in words if w != "PAD"]
      sentence = " ".join(words)
      print("{:d}\t{:d}\t{:s}".format(labels[rid], predictions[rid], sentence))

    is_first_batch = False


print("accuracy score: {:3f}".format(accuracy_score(labels, predictions)))
print("confusion matrix")
print(confusion_matrix(labels, predictions))


1	1	the owner used to work at nobu so this place is really similar for half the price
1	1	waitress was good though
1	1	the story is also both funny and poignant at times
0	0	unfortunately it was not good
0	0	this one just fails to create any real suspense
1	1	back to good bbq lighter fare reasonable pricing and tell the public they are back to the old ways
1	1	we were promptly greeted and seated
1	1	the grilled chicken was so tender and yellow from the saffron seasoning
1	1	as for the service i thought it was good
1	1	and those baby owls were adorable
0	0	lobster bisque bussell sprouts risotto filet all needed salt and pepper and of course there is none at the tables
0	0	what happened next was pretty off putting
1	1	if you have not seen this movie i definitely recommend it
0	0	worst service to boot but that is the least of their worries
1	1	it is very comfortable on the ear
0	0	the football scenes at the end were perplexing
0	0	dont go here
0	0	i got home to see the driest damn wings e

#**Part of Speech Tagging**

In [0]:
import nltk

In [0]:
nltk.download("treebank")

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [0]:
import numpy as np
import os
import shutil
import tensorflow as tf

In [0]:
def clean_logs(data_dir):
    logs_dir = os.path.join(data_dir, "logs")
    shutil.rmtree(logs_dir, ignore_errors=True)
    return logs_dir


def download_and_read(dataset_dir, num_pairs=None):
    sent_filename = os.path.join(dataset_dir, "treebank-sents.txt")
    poss_filename = os.path.join(dataset_dir, "treebank-poss.txt")
    if not(os.path.exists(sent_filename) and os.path.exists(poss_filename)):
        import nltk    

        if not os.path.exists(dataset_dir):
            os.makedirs(dataset_dir)
        fsents = open(sent_filename, "w")
        fposs = open(poss_filename, "w")
        sentences = nltk.corpus.treebank.tagged_sents()
        for sent in sentences:
            fsents.write(" ".join([w for w, p in sent]) + "\n")
            fposs.write(" ".join([p for w, p in sent]) + "\n")

        fsents.close()
        fposs.close()
    sents, poss = [], []
    with open(sent_filename, "r") as fsent:
        for idx, line in enumerate(fsent):
            sents.append(line.strip())
            if num_pairs is not None and idx >= num_pairs:
                break
    with open(poss_filename, "r") as fposs:
        for idx, line in enumerate(fposs):
            poss.append(line.strip())
            if num_pairs is not None and idx >= num_pairs:
                break
    return sents, poss


def tokenize_and_build_vocab(texts, vocab_size=None, lower=True):
    if vocab_size is None:
        tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=lower)
    else:
        tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=vocab_size+1, oov_token="UNK", lower=lower)
    tokenizer.fit_on_texts(texts)
    if vocab_size is not None:
        # additional workaround, see issue 8092
        # https://github.com/keras-team/keras/issues/8092
        tokenizer.word_index = {e:i for e, i in tokenizer.word_index.items() 
            if i <= vocab_size+1 }
    word2idx = tokenizer.word_index
    idx2word = {v:k for k, v in word2idx.items()}
    return word2idx, idx2word, tokenizer


class POSTaggingModel(tf.keras.Model):
    def __init__(self, source_vocab_size, target_vocab_size,
            embedding_dim, max_seqlen, rnn_output_dim, **kwargs):
        super(POSTaggingModel, self).__init__(**kwargs)
        self.embed = tf.keras.layers.Embedding(
            source_vocab_size, embedding_dim, input_length=max_seqlen)
        self.dropout = tf.keras.layers.SpatialDropout1D(0.2)
        self.rnn = tf.keras.layers.Bidirectional(
            tf.keras.layers.GRU(rnn_output_dim, return_sequences=True))
        self.dense = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(target_vocab_size))
        self.activation = tf.keras.layers.Activation("softmax")

    def call(self, x):
        x = self.embed(x)
        x = self.dropout(x)
        x = self.rnn(x)
        x = self.dense(x)
        x = self.activation(x)
        return x


def masked_accuracy():
    def masked_accuracy_fn(ytrue, ypred):
        ytrue = tf.keras.backend.argmax(ytrue, axis=-1)
        ypred = tf.keras.backend.argmax(ypred, axis=-1)
 
        mask = tf.keras.backend.cast(
            tf.keras.backend.not_equal(ypred, 0), tf.int32)
        matches = tf.keras.backend.cast(
            tf.keras.backend.equal(ytrue, ypred), tf.int32) * mask
        numer = tf.keras.backend.sum(matches)
        denom = tf.keras.backend.maximum(tf.keras.backend.sum(mask), 1)
        accuracy =  numer / denom
        return accuracy

    return masked_accuracy_fn

In [0]:
!rm -r datasets

In [0]:
NUM_PAIRS = None
EMBEDDING_DIM = 128
RNN_OUTPUT_DIM = 256
BATCH_SIZE = 128
NUM_EPOCHS = 50

# set random seed
tf.random.set_seed(42)

# clean up log area
data_dir = "./data"
logs_dir = clean_logs(data_dir)

# download and read source and target data into data structure
sents, poss = download_and_read("./datasets", num_pairs=NUM_PAIRS)
assert(len(sents) == len(poss))
print("# of records: {:d}".format(len(sents)))

# vocabulary sizes
word2idx_s, idx2word_s, tokenizer_s = tokenize_and_build_vocab(
    sents, vocab_size=9000)
word2idx_t, idx2word_t, tokenizer_t = tokenize_and_build_vocab(
    poss, vocab_size=38, lower=False)
source_vocab_size = len(word2idx_s)
target_vocab_size = len(word2idx_t)
print("vocab sizes (source): {:d}, (target): {:d}".format(
    source_vocab_size, target_vocab_size))

# # max sequence length - these should be identical on source and
# # target so we can just analyze one of them and choose max_seqlen
# sequence_lengths = np.array([len(s.split()) for s in sents])
# print([(p, np.percentile(sequence_lengths, p)) 
#     for p in [75, 80, 90, 95, 99, 100]])
# # [(75, 33.0), (80, 35.0), (90, 41.0), (95, 47.0), (99, 58.0), (100, 271.0)]
max_seqlen = 271

# create dataset
sents_as_ints = tokenizer_s.texts_to_sequences(sents)
sents_as_ints = tf.keras.preprocessing.sequence.pad_sequences(
    sents_as_ints, maxlen=max_seqlen, padding="post")
poss_as_ints = tokenizer_t.texts_to_sequences(poss)
poss_as_ints = tf.keras.preprocessing.sequence.pad_sequences(
    poss_as_ints, maxlen=max_seqlen, padding="post")
dataset = tf.data.Dataset.from_tensor_slices(
    (sents_as_ints, poss_as_ints))
idx2word_s[0], idx2word_t[0] = "PAD", "PAD"
poss_as_catints = []
for p in poss_as_ints:
    poss_as_catints.append(tf.keras.utils.to_categorical(p, 
        num_classes=target_vocab_size, dtype="int32"))
poss_as_catints = tf.keras.preprocessing.sequence.pad_sequences(
    poss_as_catints, maxlen=max_seqlen)
dataset = tf.data.Dataset.from_tensor_slices(
    (sents_as_ints, poss_as_catints))

# split into training, validation, and test datasets
dataset = dataset.shuffle(10000)
test_size = len(sents) // 3
val_size = (len(sents) - test_size) // 10
test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size + val_size)

# create batches
batch_size = BATCH_SIZE
train_dataset = train_dataset.batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

# define model
embedding_dim = EMBEDDING_DIM
rnn_output_dim = RNN_OUTPUT_DIM




# of records: 3914
vocab sizes (source): 9001, (target): 39


In [0]:
model = POSTaggingModel(source_vocab_size, target_vocab_size,
    embedding_dim, max_seqlen, rnn_output_dim)
model.build(input_shape=(batch_size, max_seqlen))
model.summary()

model.compile(
    loss="categorical_crossentropy",
    optimizer="adam", 
    metrics=["accuracy", masked_accuracy()])

# for input_b, output_b in train_dataset.take(1):
#     pred_b = model(input_b)
#     pred_b = tf.argmax(pred_b, axis=-1)
# print("in:", input_b.shape, "label:", output_b.shape, 
#     "prediction:", pred_b.shape)

# train
num_epochs = NUM_EPOCHS

best_model_file = os.path.join(data_dir, "best_model.h5")
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    best_model_file, 
    save_weights_only=True,
    save_best_only=True)
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=logs_dir)
history = model.fit(train_dataset, 
    epochs=num_epochs,
    validation_data=val_dataset,
    callbacks=[checkpoint, tensorboard])


Model: "pos_tagging_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  1152128   
_________________________________________________________________
spatial_dropout1d (SpatialDr multiple                  0         
_________________________________________________________________
bidirectional (Bidirectional multiple                  592896    
_________________________________________________________________
time_distributed (TimeDistri multiple                  20007     
_________________________________________________________________
activation (Activation)      multiple                  0         
Total params: 1,765,031
Trainable params: 1,765,031
Non-trainable params: 0
_________________________________________________________________
Train for 19 steps, validate for 3 steps
Epoch 1/50


KeyboardInterrupt: ignored

In [0]:
# evaluate with test
best_model = POSTaggingModel(source_vocab_size, target_vocab_size, embedding_dim, max_seqlen, rnn_output_dim)
best_model.build(input_shape=(batch_size, max_seqlen))
best_model.load_weights(best_model_file)
best_model.compile(
    loss="categorical_crossentropy",
    optimizer = "adam",
    metrics = ["accuracy", masked_accuracy()]
)

test_loss, test_acc, test_masked_acc = best_model.evaluate(test_dataset)




ValueError: ignored

In [0]:
print("test loss: {:3f}, test_accuracy: {:3f}, masked_test_accuracy: {:3f}".format(test_loss, test_acc, test_masked_acc))

test loss: 0.060080, test_accuracy: 0.981598, masked_test_accuracy: 0.792096


In [0]:
# predict on batches
labels, predictions = [], []
is_first_batch = True
accuracies = []

for test_batch in test_dataset:
    inputs_b, outputs_b = test_batch
    preds_b = best_model.predict(inputs_b)
    # convert from categorical to list of ints
    preds_b = np.argmax(preds_b, axis=-1)
    outputs_b = np.argmax(outputs_b.numpy(), axis=-1)
    for i, (pred_l, output_l) in enumerate(zip(preds_b, outputs_b)):
        assert(len(pred_l) == len(output_l))
        pad_len = np.nonzero(output_l)[0][0]
        acc = np.count_nonzero(
            np.equal(
                output_l[pad_len:], pred_l[pad_len:]
            )
        ) / len(output_l[pad_len:])
        accuracies.append(acc)
        if is_first_batch:
            words = [idx2word_s[x] for x in inputs_b.numpy()[i][pad_len:]]
            postags_l = [idx2word_t[x] for x in output_l[pad_len:] if x > 0]
            postags_p = [idx2word_t[x] for x in pred_l[pad_len:] if x > 0]
            print("labeled  : {:s}".format(" ".join(["{:s}/{:s}".format(w, p) 
                for (w, p) in zip(words, postags_l)])))
            print("predicted: {:s}".format(" ".join(["{:s}/{:s}".format(w, p) 
                for (w, p) in zip(words, postags_p)])))
            print(" ")
    is_first_batch = False

accuracy_score = np.mean(np.array(accuracies))
print("pos tagging accuracy: {:.3f}".format(accuracy_score))


labeled  : for/IN fiscal/JJ 1989/CD the/DT company/NN posted/VBD net/NN of/IN UNK/CD 9/CD million/NONE u/CC or/CD 2/NONE 87/DT u/NN a/IN share/IN down/CD from/CD UNK/NONE 9/CC million/CD u/NONE or/DT 3/NN 04/IN u/JJ a/CD
predicted: for/IN fiscal/JJ 1989/CD the/DT company/NN posted/VBD net/JJ of/IN UNK/CD 9/CD million/CD u/CC or/CD 2/NONE 87/NONE u/DT a/NN share/IN down/IN from/CD UNK/CD 9/CD million/CC u/CC or/CD 3/NN 04/NN u/DT a/NN share/IN
 
labeled  : texaco/NNP rose/VBD 3/CD 4/TO to/CD 53/CD 3/IN 8/CD as/CD 4/NNS 4/VBD million/NNS
predicted: texaco/NNP rose/VBD 3/CD 4/TO to/CD 53/CD 3/CD 8/CD as/CD 4/CD 4/CD million/NNS shares/NNS
 
labeled  : but/CC other/JJ people/NNS do/VBP n't/RB want/VB 1/NONE to/TO lose/VB the/DT bridges/NNS '/POS beautiful/JJ sometimes/RB historic/JJ features/NNS
predicted: but/CC other/JJ people/NNS do/VBP n't/RB want/VB 1/NONE to/TO lose/VB the/DT bridges/NNS '/POS beautiful/JJ sometimes/RB historic/JJ features/NNS
 
labeled  : in/IN addition/NN to/TO the

#**Sequence to Sequence without Attention**

In [0]:
import nltk
import numpy as np
import re
import shutil
import tensorflow as tf
import os
import unicodedata 

In [0]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [0]:
def clean_up_logs(data_dir):
  checkpoint_dir = os.path.join(data_dir, "checkpoints")
  if os.path.exists(checkpoint_dir):
    shutil.rmtree(checkpoint_dir, ignore_errors=True)
    os.makedir(checkpoint_dir)
  return checkpoint_dir

In [0]:
def preprocess_sentences(sent):
  sent = "".join([c for c in unicodedata.normalize("NFD", sent) if unicodedata.category(c) != "Mn"])
  sent = re.sub(r"([!.?])", r" \1", sent)
  sent = re.sub(r"[^a-zA-Z!.?]+", r" ", sent)
  sent = sent.lower()
  return sent

In [0]:
def download_and_read():
  en_sents, fr_sents_in, fr_sents_out = [], [], []
  local_file = os.path.join("datasets", "fra.txt")
  with open(local_file, "r") as fin:
    for i, line in enumerate(fin):
      en_sent, fr_sent = line.strip().split('\t')
      en_sent = [w for w in preprocess_sentence(en_sent).split()]
      fr_sent = preprocess_sentences(fr_sent)
      fr_sent_in = [w for w in ("BOS " + fr_sent).split()]
      fr_sent_out = [w for w in (fr_sent + "EOS").split()]
      en_sents.append(en_sent)
      fr_sents_in.append(fr_sent_in)
      fr_sents_out.append(fr_sent_out)
      if i >= num_sent_pairs - 1:
        break
  return en_sents, fr_sents_in, fr_sents_out

In [0]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, num_timesteps, embedding_dim, encoder_dim, **kwargs):
      super(Encoder, self).__init__(**kwargs)
      self.encoder_dim = encoder_dim
      self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=num_timesteps)
      self.rnn = tf.keras.layers.GRU(encoder_dim, return_sequences=False, return_state=True)
  def call(self, x, state):
    x = self.embedding(x)
    x, state = self.rnn(x, initial_state=state)
    return x, state

  def init_state(self, batch_size):
    return tf.zeros((batch_size, self.encoder_dim))


In [0]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, num_timesteps, decoder_dim, **kwargs):
    super(Decoder, self).__init__(**kwargs)
    self.decoder_dim = decoder_dim
    self.embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, input_length = num_timesteps)
    self.rnn = tf.keras.layers.GRU(
        decoder_dim, return_sequences=True, return_state=True
    )
    self.dense = tf.kera.layers.Dense(vocab_size)

  def call(self, x, state):
    x = self.embedding(x)
    x, state = self.rnn(x, state)
    x = self.dense(x)
    return x, state

In [0]:
def loss_fn(ytrue, ypred):
  scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  mask = tf.math.logical_not(tf.math.equal(ytrue, 0))
  mask = tf.cast(mask, dtype=tf.int64)
  loss = scce(ytrue, ypred, sample_weight=mask)
  return loss

In [0]:
@tf.function()
def train_step(encoder_in, decoder_in, decoder_out, encoder_state):
  with tf.GradientTape() as tape:
    encoder_out, encoder_state = encoder(encoder_in, encoder_state)
    decoder_state = encoder_state
    decoder_pred, decoder_state = decoder(decoder_in, decoder_state)
    loss = loss_fn(decoder_out, decoder_pred)

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))
  return loss 


In [0]:
def predict(encoder, decoder, batch_size, sents_en, data_en, sents_fr_out, word2idx_fr, idx2word_fr):
  random_id = np.random.choice(len(sents_en))
  print("input    : ", " ".join(sents_en[random_id]))
  print("label    : ", " ".join(sents_fr_out[random_id]))

  encoder_in = tf.expand_dims(data_en[random_id], axis=0)
  decoder_out = tf.expand_dims(sents_fr_out[random_id], axis=0)

  encoder_state = encoder.init_state(1)
  encoder_out, encoder_state = encoder(encoder_in, encoder_state)
  decoder_state = encoder_state

  decoder_in = tf.expand_dims(
      tf.constant([word2idx_fr["BOS"]]), axis=0)
  
  pred_sent_fr = []

  while True:
    decoder_pred = decoder_state = decoder(decoder_in, decoder_state)
    decoder_pred = tf.argmax(decoder_pred, axis=-1)
    pred_word = idx2word_fr[decoder_pred.numpy()[0][0]]
    pred_sent_fr.append(pred_word)
    if pred_word == "EOS":
      break
    decoder_in = decoder_pred

  print("predicted: ", " ".join(pred_send_f))

In [0]:
def evaluate_bleu_score(encoder, decoder, test_dataset, word2idx_fr, idx2word_fr):
  bleu_scores = []
  smooth_fn = SmoothingFunction()
  for encoder_in, decoder_in, decoder_out in test_dataset:
    encoder_state = encoder.init_state(batch_size)
    encoder_out, encoder_state = encoder(encoder_in, encoder_state)
    decoder_state = encoder_state
    decoder_pred, decoder_state = decoder(decoder_in, decoder_state)

    # compute argmax
    decoder_out = decoder_out.numpy()
    decoder_pred = tf.argmax(decoder_pred, axis=-1).numpy()

    for i in range(decoder.shape[0]):
      ref_sent = [idx2word_fr[j] for j in decoder_out[i].tolist() if j > 0]
      hyp_sent = [idx2word_fr[j] for j in decoder_pred[i].tolist() if j > 0]
      # remove tailing EOS
      ref_sent = ref_sent[0:-1]
      hyp_sent = hyp_sent[0:-1]
      blue_score = sentence_bleu([ref_sent], hyp_sent, smoothing_function=smooth_fn.method1)
      blue_scores.append(bleu_score)


    return np.mean(np.array(bleu_scores))

ERROR! Session/line number was not unique in database. History logging moved to new session 59
