In [1]:
import builtins

def print(*args, **kwargs):
    """Print like the built-in, but finish with a blank line by default.

    This shadows the built-in ``print`` for the rest of the notebook so that
    consecutive outputs are visually separated. All positional and keyword
    arguments are forwarded to ``builtins.print`` unchanged.

    Args:
        *args: Objects to print.
        **kwargs: Keyword arguments accepted by ``builtins.print``
            (``sep``, ``end``, ``file``, ``flush``). When ``end`` is not
            supplied it defaults to two newline characters.

    Returns:
        Whatever ``builtins.print`` returns (``None``).
    """
    # Use setdefault rather than passing end= next to **kwargs: the latter
    # raises TypeError as soon as a caller supplies its own ``end``.
    kwargs.setdefault('end', '\n\n')
    return builtins.print(*args, **kwargs)

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Tiny training corpus; note the mixed casing and trailing punctuation.
sentences = [
    "I love my Dog",
    "I love my Cat",
    "You love my Dog!!",
]

In [4]:
# Fit a tokenizer on the corpus, then show the word -> integer mapping it
# learned. In the printed mapping the words appear lower-cased and without
# punctuation ('Dog!!' and 'Dog' both show up as 'dog').
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(word_index)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}



----------------

In [5]:
# The earlier corpus plus one longer sentence.
sentences = [
    "I love my Dog",
    "I love my Cat",
    "You love my Dog!!",
    "Do you think my dog is amazing?",
]

tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# Word -> integer mapping learned from the corpus.
word_index = tokenizer.word_index
print(word_index)

# Encode every sentence as the list of integer tokens of its words.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}

[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]



In [6]:
# Sentences containing words the tokenizer was never fitted on
# ('really', 'loves', 'manatee').
test_data = [
    "i really love my dog",
    "my dog loves my manatee",
]

# This tokenizer has no OOV token, so the unseen words are simply left out
# of the encoded sequences.
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[4, 2, 1, 3], [1, 3, 1]]



______________

In [7]:
# Corpus used to fit the tokenizer.
sentences = [
    "I love my Dog",
    "I love my Cat",
    "You love my Dog!!",
    "Do you think my dog is amazing?",
]

# Sentences with words that are absent from the corpus.
test_data = [
    "i really love my dog",
    "my dog loves my manatee",
]

# "<OOV>" is the placeholder used for any word not seen while fitting.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(word_index)

# Encode the training sentences as lists of integer tokens.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

# Unseen words are now encoded as the OOV index instead of being dropped.
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]



In [8]:
# Padding: make all sequences the same length

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [10]:
sentences = [
    'I love my Dog',
    'I love my Cat',
    'You love my Dog!!',
    'Do you think my dog is amazing?'
]

# "<OOV>" is the placeholder used for any word not seen while fitting.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode every sentence as a list of integer tokens.
sequences = tokenizer.texts_to_sequences(sentences)

# Default padding: zeros are added in front of the shorter sequences so that
# every row is as long as the longest sentence.
padded = pad_sequences(sequences)
print(padded)

# padding='post' puts the zeros after the sentence instead.
padded = pad_sequences(sequences, padding='post')
print(padded)

# maxlen overrides the row length; sequences longer than maxlen are
# truncated, by default losing tokens from the front.
padded = pad_sequences(sequences, padding='post', maxlen=5)
print(padded)

# truncating='post' loses tokens from the end instead of the front.
padded = pad_sequences(sequences, padding='post',
                       truncating='post', maxlen=5)
print(padded)

[[ 0  0  0  5  3  2  4]
 [ 0  0  0  5  3  2  7]
 [ 0  0  0  6  3  2  4]
 [ 8  6  9  2  4 10 11]]

[[ 5  3  2  4  0  0  0]
 [ 5  3  2  7  0  0  0]
 [ 6  3  2  4  0  0  0]
 [ 8  6  9  2  4 10 11]]

[[ 5  3  2  4  0]
 [ 5  3  2  7  0]
 [ 6  3  2  4  0]
 [ 9  2  4 10 11]]

[[5 3 2 4 0]
 [5 3 2 7 0]
 [6 3 2 4 0]
 [8 6 9 2 4]]



_____________

In [11]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
# Corpus for the end-to-end tokenize-and-pad example.
sentences = [
    "I love my Dog",
    "I love my Cat",
    "You love my Dog!!",
    "Do you think my dog is amazing?",
]

In [13]:
sentences = [
    'I love my Dog',
    'I love my Cat',
    'You love my Dog!!',
    'Do you think my dog is amazing?'
]

# Build the tokenizer first: nothing in this cell should rely on a tokenizer
# left over from an earlier cell.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode every sentence as a list of integer tokens.
sequences = tokenizer.texts_to_sequences(sentences)

# Force every row to length 5; shorter sequences get leading zeros and the
# longer one loses its first tokens.
padded = pad_sequences(sequences, maxlen=5)

print(f"Word Index: {word_index}")
print(f"Sequences: {sequences}")
print(f"Padded Sequences: \n{padded}")

Word Index: {'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}

Sequences: [[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

Padded Sequences: 
[[ 0  5  3  2  4]
 [ 0  5  3  2  7]
 [ 0  6  3  2  4]
 [ 9  2  4 10 11]]



In [14]:
# Try the fitted tokenizer on sentences that contain unseen words.
test_data = [
    "i really love my dog",
    "my dog loves my manatee",
]

# Unseen words are encoded with the tokenizer's OOV index.
test_seq = tokenizer.texts_to_sequences(test_data)
print(f"Test Sequences: {test_seq}")

# Pad each encoded test sentence to a fixed length of 10.
padded = pad_sequences(test_seq, maxlen=10)
print(f"Padded test sequence: \n {padded}")

Test Sequences: [[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]

Padded test sequence: 
 [[0 0 0 0 0 5 1 3 2 4]
 [0 0 0 0 0 2 4 1 2 1]]



In [15]:
# English stop words, lower-cased and in alphabetical order
# (one row per initial letter).
stopwords = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
    "could",
    "did", "do", "does", "doing", "down", "during",
    "each",
    "few", "for", "from", "further",
    "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers",
    "herself", "him", "himself", "his", "how", "how's",
    "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself",
    "let's",
    "me", "more", "most", "my", "myself",
    "nor",
    "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own",
    "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such",
    "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
    "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too",
    "under", "until", "up",
    "very",
    "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where",
    "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would",
    "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves",
]