# Tokenizer

## 基本用法

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer

sentences = ['I love my dog', 'I love my cat']

# num_words caps the encoding at the most frequent words: rare words add
# little to accuracy but lengthen training.
tokenizer = Tokenizer(num_words=100)
# Fitting lowercases the text and strips punctuation.
tokenizer.fit_on_texts(sentences)
# Indices are ordered by frequency, then by order of first appearance.
word_index = tokenizer.word_index
word_index

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}

In [2]:
# Convert each sentence into a list of word indices
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[1, 2, 3, 4], [1, 2, 3, 5]]

In [3]:
# Convert the index lists back into text.
# The result goes to its own name so the raw input `sentences` is not
# overwritten by the lowercased decoded text and the cell can be re-run safely.
decoded_sentences = tokenizer.sequences_to_texts(sequences)
decoded_sentences

['i love my dog', 'i love my cat']

In [4]:
# Membership test against the word -> index dictionary taken from the tokenizer
'dog' in word_index

True

# num_words

`word_index` 会对全部单词编码，但 `texts_to_sequences` 只转换出现频率最高的前 `num_words - 1` 个单词（即索引小于 `num_words` 的单词）。

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer

sentences = [
    'I love my dog',
    'I love my cat'
]

# num_words limits which words are converted to indices; 8 is larger than the
# five distinct words here, so nothing is dropped in this example.
tokenizer = Tokenizer(num_words=8)
# Fitting lowercases the text and strips punctuation.
tokenizer.fit_on_texts(sentences)
# word_index still covers every word seen, ordered by frequency and then by
# order of first appearance.
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[1, 2, 3, 4], [1, 2, 3, 5]]

In [6]:
import numpy as np

# Vocabulary size bounded by num_words.
# num_words is None when the Tokenizer was created without a cap, and
# min(None, int) would raise TypeError, so the cap is applied only when set.
vocab_size = len(tokenizer.word_index)
if tokenizer.num_words is not None:
    vocab_size = min(tokenizer.num_words, vocab_size)
vocab_size

2

In [8]:
# Look up the index assigned to 'dog'
word_index['dog']

4

## oov

In [3]:
# By default the tokenizer skips (ignores) words it does not know.
# With oov_token set, unknown words are replaced by that token instead.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
# Fit before use; an unfitted tokenizer has an empty vocabulary.
tokenizer.fit_on_texts(sentences)
# 'hamster' does not occur in the training sentences, so it is encoded as the
# OOV token rather than being dropped.
tokenizer.texts_to_sequences(['I love my hamster'])

## 序列化

In [14]:
# Save the tokenizer to a JSON file
import os

json_token = tokenizer.to_json()

# NOTE(review): the backslash separator is Windows-specific; the literal is
# kept unchanged so the cell that reads the file back still finds it.
token_path = 'file\\tokenizer.json'
# Create the target folder on a fresh checkout. dirname() is empty on platforms
# where the backslash is not a separator, in which case nothing is created.
token_dir = os.path.dirname(token_path)
if token_dir:
    os.makedirs(token_dir, exist_ok=True)

with open(token_path, 'w', encoding='utf-8') as f:
    print(json_token)
    f.write(json_token)

{"class_name": "Tokenizer", "config": {"num_words": 100, "filters": "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n", "lower": true, "split": " ", "char_level": false, "oov_token": null, "document_count": 2, "word_counts": "{\"i\": 2, \"love\": 2, \"my\": 2, \"dog\": 1, \"cat\": 1}", "word_docs": "{\"my\": 2, \"dog\": 1, \"love\": 2, \"i\": 2, \"cat\": 1}", "index_docs": "{\"3\": 2, \"4\": 1, \"2\": 2, \"1\": 2, \"5\": 1}", "index_word": "{\"1\": \"i\", \"2\": \"love\", \"3\": \"my\", \"4\": \"dog\", \"5\": \"cat\"}", "word_index": "{\"i\": 1, \"love\": 2, \"my\": 3, \"dog\": 4, \"cat\": 5}"}}


In [15]:
# Restore the tokenizer from its JSON file
from tensorflow.keras.preprocessing.text import tokenizer_from_json

with open('file\\tokenizer.json', 'r') as f:
    json_string = f.read()

# The text is already in memory, so parsing does not need the open file.
tokenizer = tokenizer_from_json(json_string)
tokenizer

<keras_preprocessing.text.Tokenizer at 0x1e38a93b780>

# padding

In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

sequences = [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]]

# Sequences of different lengths are all padded to the longest one.
# By default the zeros are added at the front.
padded = pad_sequences(sequences)
padded

array([[0, 0, 0, 1],
       [0, 0, 1, 2],
       [0, 1, 2, 3],
       [1, 2, 3, 4]])

In [2]:
sequences = [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]]

# padding='post' adds the zeros at the end of each sequence instead.
padded = pad_sequences(sequences, padding='post')
padded

array([[1, 0, 0, 0],
       [1, 2, 0, 0],
       [1, 2, 3, 0],
       [1, 2, 3, 4]])

# truncate

In [4]:
sequences = [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]]

# maxlen fixes the sequence length: shorter sequences are padded, longer ones
# are truncated. By default the last items are kept and the front is cut off.
padded = pad_sequences(sequences, maxlen=2)
padded

array([[0, 1],
       [1, 2],
       [2, 3],
       [3, 4]])

In [5]:
sequences = [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]]

# truncating='post' keeps the leading items and cuts off the tail, the
# opposite of the default truncation.
padded = pad_sequences(sequences, maxlen=2, truncating='post')
padded

array([[0, 1],
       [1, 2],
       [1, 2],
       [1, 2]])

# 图像增强

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# from pyimagesearch.minivggnet import MiniVGGNet
import tensorflow.keras as keras

# Augmentation settings: rotation, zoom, horizontal/vertical shifts, shear and
# horizontal flips; pixels exposed by a transform are filled with 'nearest'.
aug = ImageDataGenerator(
    rotation_range=20,
    zoom_range=0.15,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.15,
    horizontal_flip=True,
    fill_mode='nearest',
)

# NOTE(review): model, trainX, trainY, testX, testY, BS and EPOCHS are not
# defined in the visible part of this notebook; they must exist before this
# cell can run.
# Model.fit accepts the generator directly; Model.fit_generator is deprecated.
history = model.fit(
    aug.flow(trainX, trainY, batch_size=BS),
    validation_data=(testX, testY),
    steps_per_epoch=len(trainX) // BS,
    epochs=EPOCHS,
)