In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
sentences = [
    "Today is a sunny day.",
    "TOday is a rainy day."
]

tokenizer = Tokenizer(num_words=20)
tokenizer.fit_on_texts(sentences)

In [5]:
tokenizer.word_index

{'today': 1, 'is': 2, 'a': 3, 'day': 4, 'sunny': 5, 'rainy': 6}

In [6]:
sentences = [
    "Today is a sunny day.",
    "Today is a rainy day.",
    "Is it sunny today?"
]

tokenizer = Tokenizer(num_words=20)
tokenizer.fit_on_texts(sentences)
seq = tokenizer.texts_to_sequences(sentences)
tokenizer.word_index

{'today': 1, 'is': 2, 'a': 3, 'sunny': 4, 'day': 5, 'rainy': 6, 'it': 7}

In [7]:
new_sentence = ["Will it be rainy today?"]

In [8]:
tokenizer.texts_to_sequences(new_sentence)

[[7, 6, 1]]

In [9]:
tokenizer = Tokenizer(num_words=20, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
seq = tokenizer.texts_to_sequences(sentences)

In [10]:
tokenizer.texts_to_sequences(new_sentence)

[[1, 8, 1, 7, 2]]

In [11]:
tokenizer.word_index

{'<OOV>': 1,
 'today': 2,
 'is': 3,
 'a': 4,
 'sunny': 5,
 'day': 6,
 'rainy': 7,
 'it': 8}

In [12]:
sentences = [
    "Today is a sunny day.",
    "Today is a rainy day.",
    "Is it sunny today?",
    "I really enjoyed walking in the snow today with you."
]

tokenizer = Tokenizer(num_words=50, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
seq = tokenizer.texts_to_sequences(sentences)

In [13]:
seq

[[2, 3, 4, 5, 6],
 [2, 3, 4, 7, 6],
 [3, 8, 5, 2],
 [9, 10, 11, 12, 13, 14, 15, 2, 16, 17]]

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
padded = pad_sequences(seq)
padded

array([[ 0,  0,  0,  0,  0,  2,  3,  4,  5,  6],
       [ 0,  0,  0,  0,  0,  2,  3,  4,  7,  6],
       [ 0,  0,  0,  0,  0,  0,  3,  8,  5,  2],
       [ 9, 10, 11, 12, 13, 14, 15,  2, 16, 17]], dtype=int32)

In [16]:
padded = pad_sequences(seq, padding="post")
padded

array([[ 2,  3,  4,  5,  6,  0,  0,  0,  0,  0],
       [ 2,  3,  4,  7,  6,  0,  0,  0,  0,  0],
       [ 3,  8,  5,  2,  0,  0,  0,  0,  0,  0],
       [ 9, 10, 11, 12, 13, 14, 15,  2, 16, 17]], dtype=int32)

In [17]:
padded = pad_sequences(seq, padding="post", maxlen=6)
padded

array([[ 2,  3,  4,  5,  6,  0],
       [ 2,  3,  4,  7,  6,  0],
       [ 3,  8,  5,  2,  0,  0],
       [13, 14, 15,  2, 16, 17]], dtype=int32)

In [18]:
padded = pad_sequences(seq, padding="post", maxlen=6, truncating="post")
padded

array([[ 2,  3,  4,  5,  6,  0],
       [ 2,  3,  4,  7,  6,  0],
       [ 3,  8,  5,  2,  0,  0],
       [ 9, 10, 11, 12, 13, 14]], dtype=int32)

In [20]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(sentences[0])
sentence = soup.get_text()

In [23]:
stopwords = ["a", "about", "above", ... ,"yours", "yourself", "yourselves"]

words = sentence.split()
filtered_sentence = ""
for word in words:
  if word not in stopwords:
    filtered_sentence = filtered_sentence + word + " "
sentences.append(filtered_sentence)

In [24]:
import string
table = str.maketrans('', '', string.punctuation)
words = sentence.split()
filtered_sentence = ""
for word in words:
  word = word.translate(table)
  if word not in stopwords:
    filtered_sentence = filtered_sentence + word + " "
sentences.append(filtered_sentence)

In [25]:
import tensorflow_datasets as tfds

In [26]:
train_data = tfds.as_numpy(tfds.load("imdb_reviews", split="train"))

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteB7ZAW8/imdb_reviews-train.tfrecord…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteB7ZAW8/imdb_reviews-test.tfrecord*…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteB7ZAW8/imdb_reviews-unsupervised.t…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [27]:
imdb_sentences = []

for item in train_data:
  imdb_sentences.append(str(item['text']))

In [28]:
imdb_sentences[0]

'b"This was an absolutely terrible movie. Don\'t be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie\'s ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor\'s like Christopher Walken\'s good name. I could barely sit through it."'

In [29]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(imdb_sentences)
sequences = tokenizer.texts_to_sequences(imdb_sentences)

In [30]:
from bs4 import BeautifulSoup

import string

table = str.maketrans("", "", string.punctuation)

imdb_sentences = []
train_data = tfds.as_numpy(tfds.load("imdb_reviews", split="train"))

stopwords = ["a", "your", "yourselves"]

for item in train_data:
  sentence = str(item['text'].decode("UTF-8").lower())
  soup = BeautifulSoup(sentence)
  sentence = soup.get_text()
  words = sentence.split(" ")
  
  filtered_sentence = ""
  for word in words:
    word = word.translate(table)

    if word not in stopwords:
      filtered_sentence = filtered_sentence + word + " "

  imdb_sentences.append(filtered_sentence)

  soup = BeautifulSoup(sentence)


In [31]:
tokenizer = Tokenizer(num_words=25000)
tokenizer.fit_on_texts(imdb_sentences)

sequences = tokenizer.texts_to_sequences(imdb_sentences)

In [32]:
reverse_word_index = dict((value, key) for key, value in tokenizer.word_index.items())

In [None]:
reverse_word_index

{1: 'the',
 2: 'and',
 3: 'of',
 4: 'to',
 5: 'is',
 6: 'in',
 7: 'it',
 8: 'this',
 9: 'i',
 10: 'that',
 11: 'was',
 12: 'as',
 13: 'with',
 14: 'for',
 15: 'movie',
 16: 'but',
 17: 'film',
 18: 'on',
 19: 'not',
 20: 'are',
 21: 'you',
 22: 'his',
 23: 'have',
 24: 'be',
 25: 'he',
 26: 'one',
 27: 'its',
 28: 'at',
 29: 'all',
 30: 'by',
 31: 'an',
 32: 'they',
 33: 'who',
 34: 'from',
 35: 'like',
 36: 'so',
 37: 'her',
 38: 'or',
 39: 'just',
 40: 'about',
 41: 'has',
 42: 'out',
 43: 'if',
 44: 'some',
 45: 'what',
 46: 'there',
 47: 'good',
 48: 'more',
 49: 'very',
 50: 'when',
 51: 'she',
 52: 'even',
 53: 'up',
 54: 'no',
 55: 'would',
 56: 'my',
 57: 'which',
 58: 'only',
 59: 'time',
 60: 'really',
 61: 'story',
 62: 'their',
 63: 'were',
 64: 'had',
 65: 'see',
 66: 'can',
 67: 'me',
 68: 'than',
 69: 'we',
 70: 'much',
 71: 'been',
 72: 'get',
 73: 'well',
 74: 'will',
 75: 'into',
 76: 'because',
 77: 'people',
 78: 'other',
 79: 'also',
 80: 'do',
 81: 'bad',
 82: 'gr