# Tokenizer

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Tiny two-sentence corpus used to build the vocabulary.
sentences = ["I love my dog", "I love my cat"]

# num_words=100 caps the vocabulary at the most frequent words; rare words
# contribute little to accuracy but make training slower.
tokenizer = Tokenizer(num_words=100)

# Fitting lower-cases the text and strips punctuation before counting words.
tokenizer.fit_on_texts(sentences)

# Indices are assigned by descending frequency; ties keep the order in which
# the words first appeared.
word_index = tokenizer.word_index
word_index

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}

In [9]:
# Convert each sentence into a list of word indices.
# Assumes `tokenizer` has already been fitted and `sentences` defined by an
# earlier cell.
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[1, 2, 3, 4], [1, 2, 3, 5]]

In [10]:
# By default the tokenizer silently skips (ignores) words it has never seen
# when converting text to sequences. Passing `oov_token` reserves an index for
# an out-of-vocabulary placeholder, so unseen words are mapped to it instead.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')

# Fit right away: an unfitted tokenizer has an empty vocabulary, which would
# break any earlier cell that is re-run after this one.
# Assumes `sentences` is the corpus defined by an earlier cell.
tokenizer.fit_on_texts(sentences)

# 'really' does not occur in the corpus, so it is encoded with the <OOV> index
# rather than being dropped.
tokenizer.texts_to_sequences(['I really love my dog'])

# padding 和 truncating

让所有句子的 index 数组长度相同（不足的补 0，超长的截断）

In [23]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Corpus whose sentences differ in length (4, 4 and 5 words).
sentences = [
    "I love my dog",
    "I love my cat",
    "my dog realy loves me",
]

# Rebuild the vocabulary on the new corpus, reserving an index for unseen words.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=100)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# With no extra arguments every sequence is padded to the length of the
# longest one; by default the zeros are inserted at the front.
padded = pad_sequences(sequences)
padded

array([[0, 3, 4, 2, 5],
       [0, 3, 4, 2, 6],
       [2, 5, 7, 8, 9]])

In [25]:
# Pad with zeros at the end of each sequence instead of at the front.
padded = pad_sequences(sequences, padding='post')
padded

array([[3, 4, 2, 5, 0],
       [3, 4, 2, 6, 0],
       [2, 5, 7, 8, 9]])

In [31]:
# Corpus with sentences of 3, 4 and 5 words, so that a fixed length of 4
# triggers both padding and truncation.
sentences = [
    "I love my dog",
    "I love my cat",
    "my dog realy loves me",
    "I love you",
]

tokenizer = Tokenizer(oov_token="<OOV>", num_words=100)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Fix the sequence length: shorter sequences are zero-padded, longer ones are
# truncated. The default truncation keeps the last `maxlen` tokens and drops
# the ones at the front.
padded = pad_sequences(sequences, maxlen=4)
padded

array([[ 2,  3,  4,  5],
       [ 2,  3,  4,  6],
       [ 5,  7,  8,  9],
       [ 0,  2,  3, 10]])

In [35]:
# Keep the leading tokens of over-long sequences and cut off the tail.
padded = pad_sequences(sequences, maxlen=4, truncating='post')
# For comparison, the default truncation mode keeps the last tokens and drops
# the ones at the front. Only truncation changes here; padding keeps its default.
padded

array([[ 2,  3,  4,  5],
       [ 2,  3,  4,  6],
       [ 4,  5,  7,  8],
       [ 0,  2,  3, 10]])