In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In this example, we will use TensorFlow's `Tokenizer` to generate a vocabulary from a given set of sentences.

In [16]:
# Toy corpus: two sentences that differ in casing, punctuation and their last word
sentences = [
    "I love my cat.",
    "i, love my dog!!",
]

# num_words caps the vocabulary used when texts are later converted to index sequences:
# only the most frequent words are kept (Keras retains the top num_words - 1);
# word_index itself still lists every word seen during fitting
tokenizer = Tokenizer(num_words=100)

# Learn the vocabulary; the tokenizer lowercases the text and strips punctuation while doing so
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

print(word_index)

{'i': 1, 'love': 2, 'my': 3, 'cat': 4, 'dog': 5}


Note that the word indices start from 1. Once we've generated the vocabulary, we can use the tokenizer to transform sentences into sequences of word indices as follows:

In [17]:
# Replace every word of each sentence with its index from the fitted vocabulary
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[1, 2, 3, 4], [1, 2, 3, 5]]


We can also generate sequences for unseen sentences; however, out-of-vocabulary (OOV) words are ignored.

In [18]:
# 'ostrich' and 'loves' are not in the fitted vocabulary, so no index is emitted for them
print(tokenizer.texts_to_sequences(["My ostrich loves my dog"])) # 'ostrich' and 'loves' are OOV

[[3, 3, 5]]


To fix this problem, initialize the Tokenizer with a special OOV token. Every unseen word is then mapped to that token's index instead of being dropped.

In [26]:
# Slightly larger toy corpus
sentences = [
    "I love my cat.",
    "i, love my dog!!",
    "you love my dog.",
    "My dog ate my homework, dammit!",
]

# oov_token adds a vocabulary entry that stands in for any word not seen during fitting
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")

# Build the vocabulary (text is lowercased and punctuation is removed)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

# Index sequences for the sentences the tokenizer was fitted on
sequences = tokenizer.texts_to_sequences(sentences)

# 'ostrich' and 'loves' are OOV, so each one is replaced by the <OOV> index
print(tokenizer.texts_to_sequences(["My ostrich loves my dog"]))

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'cat': 6, 'you': 7, 'ate': 8, 'homework': 9, 'dammit': 10}
[[2, 1, 1, 2, 4]]


Sequences can also be padded to a uniform length with `pad_sequences`.

In [30]:
# maxlen defaults to the length of the longest sequence; when maxlen is smaller than that,
# the longer sequences are truncated.
# padding and truncating both default to 'pre'; 'post' pads / truncates at the end instead.
padded_sequences = pad_sequences(sequences, maxlen=5, padding='post', truncating='post')
print(sequences)
print(padded_sequences)

[[5, 3, 2, 6], [5, 3, 2, 4], [7, 3, 2, 4], [2, 4, 8, 2, 9, 10]]
[[5 3 2 6 0]
 [5 3 2 4 0]
 [7 3 2 4 0]
 [2 4 8 2 9]]


Next, we experiment with the `Sarcasm` dataset. Each item in this dataset is a dictionary containing a news headline, a URL to the news article, and a binary label denoting whether the headline is sarcastic.

In [35]:
import os

import wget

# Remote location of the Sarcasm dataset and the local file it is stored as
link = "https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json"
local_path = "sarcasm.json"

# Download only when the file is missing: wget.download does not overwrite an existing
# file, so re-running this cell would otherwise leave numbered duplicate copies behind
if not os.path.exists(local_path):
    wget.download(link, out=local_path)

# Echo the file name as the cell output
local_path

'sarcasm.json'

In [37]:
import json

# Parse the downloaded dataset into Python objects.
# The encoding is passed explicitly because JSON text is UTF-8, whereas open() otherwise
# falls back to the platform's locale encoding, which is not UTF-8 everywhere.
with open("./sarcasm.json", 'r', encoding="utf-8") as f:
    datastore = json.load(f)

In [48]:
# Report the dataset size, then show every fifth record among the first 25
print(f"Number of news articles: {len(datastore)}. Some examples:")
sample_positions = (0, 5, 10, 15, 20)
for position in sample_positions:
    print(datastore[position])

Number of news articles: 26709. Some examples:
{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}
{'article_link': 'https://www.huffingtonpost.com/entry/advancing-the-worlds-women_b_6810038.html', 'headline': "advancing the world's women", 'is_sarcastic': 0}
{'article_link': 'https://www.huffingtonpost.com/entry/airline-passengers-tackle-man-who-rushes-cockpit-in-bomb-threat_us_59302e57e4b07572bdbf9460', 'headline': 'airline passengers tackle man who rushes cockpit in bomb threat', 'is_sarcastic': 0}
{'article_link': 'https://entertainment.theonion.com/nuclear-bomb-detonates-during-rehearsal-for-spider-man-1819572009', 'headline': "nuclear bomb detonates during rehearsal for 'spider-man' musical", 'is_sarcastic': 1}
{'article_link': 'https://local.theonion.com/courtroom-sketch-artist-has-clear-manga-influences-1820298494',

In [49]:
# Split the records into three parallel lists: article URLs, headline texts and sarcasm labels
urls = [record['article_link'] for record in datastore]
sentences = [record['headline'] for record in datastore]
labels = [record['is_sarcastic'] for record in datastore]

Now let's preprocess these headline sentences using the Tokenizer. First we generate the dictionary of word indices, then we create padded sequences of word indices for the sentences.

In [54]:
# No num_words limit is set here; "<OOV>" is the placeholder token for words outside the vocabulary
tokenizer = Tokenizer(oov_token="<OOV>")

# Build the vocabulary from all headlines and report how many entries it contains
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(f"Number of words in vocabulary: {len(word_index)}\n\n")


Number of words in vocabulary: 29657




In [57]:
# Convert every headline to word indices, then pad at the end ('post');
# no maxlen is given, so every row is padded to the length of the longest headline
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')

# All rows share one length after padding, so the first row is representative
print(f"Length of each padded sequence: {len(padded[0])}")
print("First ten padded sequences: ")
print(padded[:10])

Length of each opoadded sequence: 40
First ten padded sequences: 
[[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [    4  8435  3338  2746    22     2   166  8436   416  3112     6   258
      9  1002     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [  145   838     2   907  1749  2093   582  4719   221   143    39    46
      2 10736     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [ 1485    36   224   400     2  1832    29   319    22    10  2924  1393
   6969   968     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0  