# tensorflow preprocessing layers
> Worked examples of the preprocessing layers in TensorFlow (Keras).

- toc: true 
- badges: true
- comments: true
- categories: [jupyter,preprocessing_layers]
- image: images/chart-preview.png

In [1]:
import numpy as np

In [2]:
import tensorflow as tf

In [3]:
from tensorflow.keras import layers

In [4]:
# Toy input: three samples (rows), each with three features (columns).
sample_rows = [
    [0.1, 0.2, 0.3],
    [0.8, 0.9, 1.0],
    [1.5, 1.6, 1.7],
]
data = np.array(sample_rows)

In [5]:
# Build a Normalization layer with default arguments; it is adapted to
# `data` in the next cell.
layer = layers.Normalization()

In [6]:
# Fit the layer's normalization statistics to the toy array.
layer.adapt(data)

In [7]:
# Apply the adapted layer; the result is a tensor (it is converted with
# .numpy() in the cells below).
normalized_data = layer(data)

In [9]:
# Mean over every normalized value; the recorded output rounds to zero.
feature_mean = normalized_data.numpy().mean()
print(f"Feature mean: {feature_mean:.2f}")

Feature mean: -0.00


In [10]:
# Standard deviation over every normalized value; the recorded output is 1.00.
feature_std = normalized_data.numpy().std()
print(f"Feature std: {feature_std:.2f}")

Feature std: 1.00


In [11]:
from tensorflow import keras
from tensorflow.keras import layers

In [15]:
# Random augmentation pipeline, applied in order: horizontal flip,
# rotation (factor 0.1), zoom (factor 0.1).
augmentation_layers = [
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.1),
    layers.RandomZoom(0.1),
]
data_augmentation = keras.Sequential(augmentation_layers)

## Load some data

In [13]:
# Only the CIFAR-10 training split is used; the second split is discarded.
train_split, _ = keras.datasets.cifar10.load_data()
x_train, y_train = train_split

# Per-image shape: everything after the leading sample axis.
input_shape = x_train.shape[1:]

# Number of classes passed to the classifier built later in the notebook.
classes = 10

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [17]:
def augment_batch(images, labels):
    """Run the augmentation pipeline on a batch of images; labels pass through unchanged."""
    return data_augmentation(images), labels


# Slice the arrays into (image, label) pairs, group them into batches of 16,
# then augment each batch as it is produced.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.batch(16)
train_dataset = train_dataset.map(augment_batch)

In [18]:
# Per-image shape (x_train.shape without its leading sample axis).
input_shape

(32, 32, 3)

In [20]:
# Shape of the full training image array: sample axis first, then the
# per-image shape.
x_train.shape

(50000, 32, 32, 3)

In [21]:
# Labels are stored as a single column, one row per training image.
y_train.shape

(50000, 1)

In [24]:
# Display the dataset's repr.
train_dataset  # a MapDataset, since .map() was the last transformation applied

<MapDataset element_spec=(TensorSpec(shape=(None, 32, 32, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 1), dtype=tf.uint8, name=None))>

In [25]:
# Signature of one dataset element: (augmented image batch, label batch).
train_dataset.element_spec

(TensorSpec(shape=(None, 32, 32, 3), dtype=tf.float32, name=None),
 TensorSpec(shape=(None, 1), dtype=tf.uint8, name=None))

## 创建一个模型，利用数据增强后的数据训练

In [30]:
# Scale raw pixel values by 1/255 inside the model, then feed them to a
# ResNet50 classifier created without pretrained weights (weights=None).
inputs = keras.Input(shape=input_shape)
x = layers.Rescaling(1.0 / 255)(inputs)
resnet = keras.applications.ResNet50(
    weights=None,
    input_shape=input_shape,
    classes=classes,
)
outputs = resnet(x)
model = keras.Model(inputs, outputs)

# The labels are integer class ids, hence the sparse categorical loss.
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy")

# Short demonstration run on the augmented dataset: 5 steps in the epoch.
model.fit(train_dataset, steps_per_epoch=5)



<keras.callbacks.History at 0x7f85fcb370d0>

# 通过0-1编码字符特征

In [45]:
# Strings used to adapt the lookup layer, one per row: "a", "b" and "c"
# occur twice each, "e" occurs once.
training_tokens = ["a", "b", "c", "b", "c", "a", "e"]
data = tf.constant([[token] for token in training_tokens])

In [46]:
# View the tensor as a NumPy array; the strings show up as bytes objects.
data.numpy()

array([[b'a'],
       [b'b'],
       [b'c'],
       [b'b'],
       [b'c'],
       [b'a'],
       [b'e']], dtype=object)

In [47]:
# One-hot string lookup layer; its vocabulary is learned from `data`
# via adapt() rather than supplied up front.
lookup = layers.StringLookup(output_mode="one_hot")
lookup.adapt(data)

In [48]:
# Query strings: "a", "b", "c" and "e" were present when the layer was
# adapted; "d" and the empty string were not.
test_data = tf.constant([["a"],["b"],["c"],["d"],["e"],[""]])

In [49]:
# Encode the query strings with the adapted lookup layer.
encoded_data = lookup(test_data)

In [50]:
# One row per query string. In the output below the two unseen inputs
# ("d" and "") both land in column 0.
print(encoded_data)

tf.Tensor(
[[0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]], shape=(6, 5), dtype=float32)


In [40]:
# Second query batch: only "a", "b" and "c", each appearing twice.
test_data_1 = tf.constant([["a"],["b"],["c"],["b"],["c"],["a"]])

In [41]:
# NOTE(review): the recorded output below has 4 columns, while the other
# recorded outputs of `lookup` in this notebook have 5. This cell's execution
# count (41) also predates the adapt() cell (47), so the output appears to
# come from an earlier kernel state. Re-run to refresh it.
lookup(test_data_1)

<tf.Tensor: shape=(6, 4), dtype=float32, numpy=
array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.]], dtype=float32)>

In [42]:
# Extended query batch: the previous six strings plus "d", "e" and "f".
# Of the added strings, only "e" was present when the layer was adapted.
test_data_1 = tf.constant([["a"],["b"],["c"],["b"],["c"],["a"],["d"],["e"],["f"]])

In [52]:
# In the output below the unseen strings "d" and "f" both map to column 0,
# while "e" has its own column.
lookup(test_data_1)

<tf.Tensor: shape=(9, 5), dtype=float32, numpy=
array([[0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.]], dtype=float32)>

## 1.1 根据已知词汇表创建查找层

In [64]:
# Lookup layer built from a fixed, known vocabulary (no adapt() call).
# The sample batch contains "z", which is not in the vocabulary.
vocab = ["a", "b", "c", "d"]
data = tf.constant(
    [
        ["a", "c", "d"],
        ["d", "z", "b"],
    ]
)
layer = tf.keras.layers.StringLookup(vocabulary=vocab)

In [69]:
layer.get_vocabulary()  # "a" sits at index 1, "b" at index 2, and so on; inputs are encoded as these indices

['[UNK]', 'a', 'b', 'c', 'd']

In [70]:
# Each string is replaced by its vocabulary index; in the output below "z",
# which is not in the vocabulary, is encoded as 0 (the "[UNK]" slot).
layer(data)

<tf.Tensor: shape=(2, 3), dtype=int64, numpy=
array([[1, 3, 4],
       [4, 0, 2]])>

## 1.2 未知词汇表，根据对输入数据分析建立词汇表

In [72]:
# Sample batch used to learn a vocabulary: "d" occurs twice, every other
# string once.
data = tf.constant([['a','c','d'],['d','z','b']])

In [74]:
# No vocabulary is supplied, so it is learned from the data with adapt().
layer = tf.keras.layers.StringLookup()
layer.adapt(data)
layer.get_vocabulary()  # the unknown token comes first, then the most frequent token, then the rest in reverse order



['[UNK]', 'd', 'z', 'c', 'b', 'a']

In [53]:
# Same sample batch as above: "d" occurs twice, every other string once.
data = tf.constant([["a","c","d"],["d","z","b"]])

In [54]:
layer = tf.keras.layers.StringLookup()  # default output mode is int: each input string is converted to its index in the vocabulary

In [55]:
# Learn the vocabulary from the sample batch.
layer.adapt(data)

In [56]:
# Encode the same batch the layer was adapted on. Every string is in the
# learned vocabulary, so index 0 does not appear in the output below.
layer(data)

<tf.Tensor: shape=(2, 3), dtype=int64, numpy=
array([[5, 3, 1],
       [1, 2, 4]])>

## 1.3 设置多个未知字符位置

In [75]:
# Fixed vocabulary with two out-of-vocabulary (OOV) slots instead of one.
# The sample batch contains two strings outside the vocabulary: "m" and "z".
vocab = ["a", "b", "c", "d"]
data = tf.constant(
    [
        ["a", "c", "d"],
        ["m", "z", "b"],
    ]
)
layer = tf.keras.layers.StringLookup(
    vocabulary=vocab,
    num_oov_indices=2,
)

In [76]:
# With num_oov_indices=2 the vocabulary starts with two "[UNK]" entries,
# so the known tokens begin at index 2.
layer.get_vocabulary()

['[UNK]', '[UNK]', 'a', 'b', 'c', 'd']

In [77]:
# In the output below the out-of-vocabulary strings "m" and "z" fall into
# the two OOV slots (indices 0 and 1).
layer(data)

<tf.Tensor: shape=(2, 3), dtype=int64, numpy=
array([[2, 4, 5],
       [0, 1, 3]])>

## 1.4 one-hot output

In [79]:
# Fixed vocabulary with one-hot output. The input is a flat batch of five
# strings; the last one, "z", is not in the vocabulary.
vocab = ["a", "b", "c", "d"]
data = tf.constant(["a", "b", "c", "d", "z"])
layer = tf.keras.layers.StringLookup(vocabulary=vocab, output_mode="one_hot")

In [82]:
layer.get_vocabulary()  # the first position (index 0) represents the unknown token "[UNK]"

['[UNK]', 'a', 'b', 'c', 'd']

In [83]:
# One row per input string, one column per vocabulary entry; in the output
# below the out-of-vocabulary "z" sets column 0.
layer(data)

<tf.Tensor: shape=(5, 5), dtype=float32, numpy=
array([[0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.]], dtype=float32)>

## 1.5 Multi-hot output

In [85]:
# Fixed vocabulary with multi-hot output. Each row of the batch holds four
# strings and some are repeated ("d" in the first row, "z" in the second).
vocab = ["a", "b", "c", "d"]
data = tf.constant(
    [
        ["a", "c", "d", "d"],
        ["d", "z", "b", "z"],
    ]
)
layer = tf.keras.layers.StringLookup(vocabulary=vocab, output_mode="multi_hot")

In [86]:
# Vocabulary with the "[UNK]" entry at index 0, followed by the supplied tokens.
layer.get_vocabulary()

['[UNK]', 'a', 'b', 'c', 'd']

In [87]:
# One output row per input row. In the output below a column is 1 when that
# vocabulary entry occurs in the row; repeated strings still give 1.
layer(data)

<tf.Tensor: shape=(2, 5), dtype=float32, numpy=
array([[0., 1., 0., 1., 1.],
       [1., 0., 1., 0., 1.]], dtype=float32)>

## 1.6 计数 Token count output

In [88]:
# Fixed vocabulary with count output. Each row of the batch holds four
# strings and some are repeated ("d" in the first row, "z" in the second).
vocab = ["a", "b", "c", "d"]
data = tf.constant(
    [
        ["a", "c", "d", "d"],
        ["d", "z", "b", "z"],
    ]
)
layer = tf.keras.layers.StringLookup(vocabulary=vocab, output_mode="count")

In [89]:
# Vocabulary with the "[UNK]" entry at index 0, followed by the supplied tokens.
layer.get_vocabulary()

['[UNK]', 'a', 'b', 'c', 'd']

In [90]:
# One output row per input row. In the output below each column holds how
# many times that vocabulary entry occurs in the row: "d" twice in the first
# row, and the out-of-vocabulary "z" twice in the second (column 0).
layer(data)

<tf.Tensor: shape=(2, 5), dtype=float32, numpy=
array([[0., 1., 0., 1., 2.],
       [2., 0., 1., 0., 1.]], dtype=float32)>