## 드롭아웃 규제
- 일부 뉴런이 과도하게 전문화되어 특정 입력에 지나치게 민감해지는 것(과대적합)을 막기 위해, 훈련 중 일부 뉴런을 무작위로 무시하도록 하는 규제 기법

In [3]:
import tensorflow as tf

# Fashion-MNIST as bundled with Keras.
data = tf.keras.datasets.fashion_mnist

(training_images, training_labels), (test_images, test_labels) = data.load_data()

# Add an explicit channel axis and divide the pixel values by 255.0.
# -1 lets reshape infer the number of samples instead of hard-coding 60000 / 10000.
training_images = training_images.reshape(-1, 28, 28, 1)
training_images = training_images / 255.0

test_images = test_images.reshape(-1, 28, 28, 1)
test_images = test_images / 255.0

# Three hidden Dense layers, each followed by Dropout with rate 0.2
# (the regularization this section demonstrates).
model = tf.keras.models.Sequential([
    # input_shape now matches the (28, 28, 1) images prepared above; the
    # original declared (28, 28), which disagrees with the data passed to fit().
    tf.keras.layers.Flatten(input_shape = (28, 28, 1)),
    tf.keras.layers.Dense(256, activation = tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation = tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation = tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation = tf.nn.softmax),
])

model.compile(loss = 'sparse_categorical_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

# The test split doubles as validation data here.
model.fit(training_images, training_labels,
          epochs=5,
          validation_data = (test_images, test_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f36ad7451f0>

## 텐서플로 데이터셋 사용하기

In [None]:
!pip install tensorflow_datasets

In [5]:
import tensorflow_datasets as tfds

# Load Fashion-MNIST without naming a split, then print every entry obtained
# by iterating over the returned object.
mnist_data = tfds.load('fashion_mnist')
for split_name in mnist_data:
    print(split_name)

Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/fashion_mnist/3.0.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/fashion_mnist/3.0.1.incompleteO33HBS/fashion_mnist-train.tfrecord*...:   0…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/fashion_mnist/3.0.1.incompleteO33HBS/fashion_mnist-test.tfrecord*...:   0%…

Dataset fashion_mnist downloaded and prepared to /root/tensorflow_datasets/fashion_mnist/3.0.1. Subsequent calls will reuse this data.
train
test


In [6]:
# Request only the 'train' split, check that the result is a tf.data.Dataset,
# and show its concrete type.
mnist_train = tfds.load('fashion_mnist', split='train')
assert isinstance(mnist_train, tf.data.Dataset)
print(type(mnist_train))

<class 'tensorflow.python.data.ops.dataset_ops.PrefetchDataset'>


In [7]:
# Take a single element from the training split and show its Python type and keys.
for sample in mnist_train.take(1):
    print(type(sample))
    print(sample.keys())

<class 'dict'>
dict_keys(['image', 'label'])


In [None]:
# Inspect one training example in full: container type, keys, then the
# 'image' and 'label' entries themselves.
first_examples = mnist_train.take(1)
for example in first_examples:
    print(type(example))
    print(example.keys())
    print(example['image'])
    print(example['label'])

In [9]:
# Load Fashion-MNIST together with its DatasetInfo metadata and print the metadata.
# with_info expects a boolean; the original passed the string 'true', which only
# behaved like True because a non-empty string is truthy.
# NOTE(review): no `split` is given, so `mnist_test` receives whatever tfds.load
# returns for all splits, not only the test split. The name is kept unchanged.
mnist_test, info = tfds.load(name = 'fashion_mnist', with_info = True)
print(info)

tfds.core.DatasetInfo(
    name='fashion_mnist',
    full_name='fashion_mnist/3.0.1',
    description="""
    Fashion-MNIST is a dataset of Zalando's article images consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes.
    """,
    homepage='https://github.com/zalandoresearch/fashion-mnist',
    data_path='/root/tensorflow_datasets/fashion_mnist/3.0.1',
    file_format=tfrecord,
    download_size=29.45 MiB,
    dataset_size=36.42 MiB,
    features=FeaturesDict({
        'image': Image(shape=(28, 28, 1), dtype=uint8),
        'label': ClassLabel(shape=(), dtype=int64, num_classes=10),
    }),
    supervised_keys=('image', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=10000, num_shards=1>,
        'train': <SplitInfo num_examples=60000, num_shards=1>,
    },
    citation="""@article{DBLP:journals/corr/abs-1708-07747,
      author    

In [None]:
# Load both splits as full-size batches so that each split unpacks directly
# into an (images, labels) pair.
# batch_size must be -1 for that: the original used 1, which yields datasets of
# single-example batches that cannot be unpacked into two values.
(training_images, training_labels), (test_images, test_labels) = tfds.load(
    'fashion_mnist',
    split = ['train', 'test'],
    batch_size = -1,        # -1 = the entire split as one batch
    as_supervised = True)   # True returns (input, label) tuples

# Convert the images to float32 and divide by 255.0.
training_images = tf.cast(training_images, tf.float32) / 255.0
test_images = tf.cast(test_images, tf.float32) / 255.0

# Single hidden layer with Dropout(0.2) before the 10-way softmax output.
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape = (28, 28, 1)),
    tf.keras.layers.Dense(128, activation = tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation = tf.nn.softmax),
])

model.compile(loss = 'sparse_categorical_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

# The test split doubles as validation data here.
model.fit(training_images, training_labels,
          epochs=5,
          validation_data = (test_images, test_labels))

In [None]:
# Load the train and test splits of horses_or_humans as (image, label) pairs.
data = tfds.load('horses_or_humans', split = 'train', as_supervised = True)
val_data = tfds.load('horses_or_humans', split = 'test', as_supervised = True)

# Shuffle with a buffer of 100 and batch the training data by 10;
# validation data is only batched (by 32).
# NOTE(review): the images are fed to the model without any rescaling in this cell.
train_batches = data.shuffle(100).batch(10)
validation_batches = val_data.batch(32)

# Five Conv2D + MaxPooling2D stages followed by a dense classifier head.
# input_shape is declared only on the first layer; the original repeated it on
# every Conv2D, which is redundant and inconsistent with the same model defined
# later in the ETL section.
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(16, (3, 3), activation = 'relu',
                           input_shape = (300, 300, 3)),
    tf.keras.layers.MaxPooling2D((2, 2)),

    tf.keras.layers.Conv2D(32, (3, 3), activation = 'relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),

    tf.keras.layers.Conv2D(64, (3, 3), activation = 'relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),

    tf.keras.layers.Conv2D(64, (3, 3), activation = 'relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),

    tf.keras.layers.Conv2D(64, (3, 3), activation = 'relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),

    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation = 'relu'),
    # Single sigmoid unit, paired with binary_crossentropy below.
    tf.keras.layers.Dense(1, activation = 'sigmoid')

])

model.compile(loss = 'binary_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

model.fit(train_batches,
          epochs=5,
          validation_data = validation_batches)

## 특정 버전에서의 데이터셋 로드

In [None]:
# Pin the dataset version with the 'name:version' form.
# The dataset is called 'horses_or_humans' (as in the other cells); the original
# 'horses_or_human' was a typo.
data, info = tfds.load('horses_or_humans:3.0.0', with_info = True)

## 데이터 증식을 위해 매핑 함수 사용

In [None]:
# Training split as (image, label) pairs. The dataset name was misspelled as
# 'horsed_or_humans' in the original.
data = tfds.load('horses_or_humans', split = 'train', as_supervised = True)

def augmentimages(image, label):
    """Cast an image to float32, divide it by 255 and randomly flip it left-right.

    Returns the transformed image together with the unchanged label.
    """
    scaled = tf.cast(image, tf.float32) / 255
    # See the tf.image module for the other image-augmentation helpers.
    flipped = tf.image.random_flip_left_right(scaled)
    return flipped, label

# Apply the augmentation to every example, then shuffle (buffer of 100) and batch by 32.
train = data.map(augmentimages)
train_batches = train.shuffle(buffer_size=100).batch(batch_size=32)

In [13]:
!pip install tensorflow_addons

## 애드온 라이브러리 사용
#### ImageDataGenerator에 있는 일부 증식 기능(예: rotate)은 tf.image에는 없고 애드온 라이브러리에서만 제공

In [14]:
import math

import tensorflow_addons as tfa

def augmentimages(image, label):
    """Scale an image, randomly flip it left-right, and rotate it.

    The image is cast to float32 and divided by 255 before augmentation.
    Returns the transformed image together with the unchanged label.
    """
    image = tf.cast(image, tf.float32)
    image = image / 255
    # See the tf.image module for the other image-augmentation helpers.
    image = tf.image.random_flip_left_right(image)
    # tfa.image.rotate takes its angle in radians, so the 40 is converted first.
    # NOTE(review): assumes the original 40 was meant as degrees (as with
    # ImageDataGenerator); the same fixed angle is applied to every image.
    image = tfa.image.rotate(image, math.radians(40), interpolation = 'NEAREST')
    return image, label

## 사용자 정의 분할 사용

In [None]:
# Examples of the split-slicing syntax accepted by tfds.load.
# Each call below rebinds `data`, so only the last assignment survives.
data = tfds.load('cats_vs_dogs', split = 'train', as_supervised = True)

# Use only the first 10000 examples
data = tfds.load('cats_vs_dogs', split = 'train[:10000]', as_supervised = True)

# Use only the first 20%
data = tfds.load('cats_vs_dogs', split = 'train[:20%]', as_supervised = True)

# Use only the last 1000 and the first 1000 examples
data = tfds.load('cats_vs_dogs', split = 'train[-1000:]+train[:1000]', as_supervised = True)

# Split into training, validation and test subsets (80% / 10% / 10%)
data = tfds.load('cats_vs_dogs', split = 'train[:80%]', as_supervised = True)

validation_data = tfds.load('cats_vs_dogs', split = 'train[80%:90%]', as_supervised = True)
# Backward-compatible alias for the original, misspelled variable name.
validatoin_data = validation_data

test_data = tfds.load('cats_vs_dogs', split = 'train[-10%:]', as_supervised = True)

## TFRecord 이해하기
- 텐서플로 데이터셋은 데이터를 다운로드해 디스크에 캐싱
- 캐싱을 위해 TFRecord 포맷을 사용

#### 캐싱이란?
- 자주 접근될 데이터를 더 빠른 속도의 메모리상에 가지고 와서 연산 수행 성능을 높이는 것


In [16]:
# Fetch MNIST along with its DatasetInfo metadata, then display the metadata.
data, info = tfds.load(name='mnist', with_info=True)
print(info)

Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/mnist/3.0.1...


Dl Completed...:   0%|          | 0/5 [00:00<?, ? file/s]

Dataset mnist downloaded and prepared to /root/tensorflow_datasets/mnist/3.0.1. Subsequent calls will reuse this data.
tfds.core.DatasetInfo(
    name='mnist',
    full_name='mnist/3.0.1',
    description="""
    The MNIST database of handwritten digits.
    """,
    homepage='http://yann.lecun.com/exdb/mnist/',
    data_path='/root/tensorflow_datasets/mnist/3.0.1.incompleteAJAE8P',
    file_format=tfrecord,
    download_size=11.06 MiB,
    dataset_size=21.00 MiB,
    features=FeaturesDict({
        'image': Image(shape=(28, 28, 1), dtype=uint8),
        'label': ClassLabel(shape=(), dtype=int64, num_classes=10),
    }),
    supervised_keys=('image', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=10000, num_shards=1>,
        'train': <SplitInfo num_examples=60000, num_shards=1>,
    },
    citation="""@article{lecun2010mnist,
      title={MNIST handwritten digit database},
      author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
     

In [18]:
# Load the raw records as a TFRecordDataset

import os 
import sys

# Build the path one component at a time; the original concatenated a single
# string with '+' and passed it to os.path.join, which then joined nothing.
filename = os.path.join(
    os.path.expanduser('~'),
    'tensorflow_datasets', 'mnist', '3.0.1',
    'mnist-test.tfrecord-00000-of-00001')
raw_dataset = tf.data.TFRecordDataset(filename)

# Show the repr of the first (still serialized) record.
for raw_record in raw_dataset.take(1):
    print(repr(raw_record))

<tf.Tensor: shape=(), dtype=string, numpy=b"\n\x85\x03\n\xf2\x02\n\x05image\x12\xe8\x02\n\xe5\x02\n\xe2\x02\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x1c\x00\x00\x00\x1c\x08\x00\x00\x00\x00Wf\x80H\x00\x00\x01)IDAT(\x91\xc5\xd2\xbdK\xc3P\x14\x05\xf0S(v\x13)\x04,.\x82\xc5Aq\xac\xedb\x1d\xdc\n.\x12\x87n\x0e\x82\x93\x7f@Q\xb2\x08\xba\tbQ0.\xe2\xe2\xd4\xb1\xa2h\x9c\x82\xba\x8a(\nq\xf0\x83Fh\x95\n6\x88\xe7R\x87\x88\xf9\xa8Y\xf5\x0e\x8f\xc7\xfd\xdd\x0b\x87\xc7\x03\xfe\xbeb\x9d\xadT\x927Q\xe3\xe9\x07:\xab\xbf\xf4\xf3\xcf\xf6\x8a\xd9\x14\xd29\xea\xb0\x1eKH\xde\xab\xea%\xaba\x1b=\xa4P/\xf5\x02\xd7\\\x07\x00\xc4=,L\xc0,>\x01@2\xf6\x12\xde\x9c\xde[t/\xb3\x0e\x87\xa2\xe2\xc2\xe0A<\xca\xb26\xd5(\x1b\xa9\xd3\xe8\x0e\xf5\x86\x17\xceE\xdarV\xae\xb7_\xf3AR\r!I\xf7(\x06m\xaaE\xbb\xb6\xac\r*\x9b$e<\xb8\xd7\xa2\x0e\x00\xd0l\x92\xb2\xd5\x15\xcc\xae'\x00\xf4m\x08O'+\xc2y\x9f\x8d\xc9\x15\x80\xfe\x99[q\x962@CN|i\xf7\xa9!=\xd7 \xab\x19\x00\xc8\xd6\xb8\xeb\xa1\xf0\xd8l\xca\xfb]\xee\xfb]*\x9fV\xe1\x07\xb7\xc

In [19]:
# Describe the features stored in each serialized example.
feature_description = dict(
    image=tf.io.FixedLenFeature([], dtype=tf.string),
    label=tf.io.FixedLenFeature([], dtype=tf.int64),
)

def _parse_function(example_proto):
    """Parse one serialized example using the ``feature_description`` dictionary."""
    parsed = tf.io.parse_single_example(example_proto, feature_description)
    return parsed

# Parse every raw record and print the first parsed result.
parsed_dataset = raw_dataset.map(_parse_function)
for parsed_record in parsed_dataset.take(1):
    print(parsed_record)

{'image': <tf.Tensor: shape=(), dtype=string, numpy=b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x1c\x00\x00\x00\x1c\x08\x00\x00\x00\x00Wf\x80H\x00\x00\x01)IDAT(\x91\xc5\xd2\xbdK\xc3P\x14\x05\xf0S(v\x13)\x04,.\x82\xc5Aq\xac\xedb\x1d\xdc\n.\x12\x87n\x0e\x82\x93\x7f@Q\xb2\x08\xba\tbQ0.\xe2\xe2\xd4\xb1\xa2h\x9c\x82\xba\x8a(\nq\xf0\x83Fh\x95\n6\x88\xe7R\x87\x88\xf9\xa8Y\xf5\x0e\x8f\xc7\xfd\xdd\x0b\x87\xc7\x03\xfe\xbeb\x9d\xadT\x927Q\xe3\xe9\x07:\xab\xbf\xf4\xf3\xcf\xf6\x8a\xd9\x14\xd29\xea\xb0\x1eKH\xde\xab\xea%\xaba\x1b=\xa4P/\xf5\x02\xd7\\\x07\x00\xc4=,L\xc0,>\x01@2\xf6\x12\xde\x9c\xde[t/\xb3\x0e\x87\xa2\xe2\xc2\xe0A<\xca\xb26\xd5(\x1b\xa9\xd3\xe8\x0e\xf5\x86\x17\xceE\xdarV\xae\xb7_\xf3AR\r!I\xf7(\x06m\xaaE\xbb\xb6\xac\r*\x9b$e<\xb8\xd7\xa2\x0e\x00\xd0l\x92\xb2\xd5\x15\xcc\xae'\x00\xf4m\x08O'+\xc2y\x9f\x8d\xc9\x15\x80\xfe\x99[q\x962@CN|i\xf7\xa9!=\xd7 \xab\x19\x00\xc8\xd6\xb8\xeb\xa1\xf0\xd8l\xca\xfb]\xee\xfb]*\x9fV\xe1\x07\xb7\xc9\x8b55\xe7M\xef\xb0\x04\xc0\xfd&\x89\x01<\xbe\xf9\x0

## 텐서플로에서 데이터 관리를 위한 ETL 프로세스

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_addons as tfa

# --- model definition: start ---
# Five Conv2D + MaxPooling2D stages, then a dense head with one sigmoid output.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Conv2D(16, (3, 3), activation='relu',
                                 input_shape=(300, 300, 3)))
model.add(tf.keras.layers.MaxPooling2D((2, 2)))
for n_filters in (32, 64, 64, 64):
    model.add(tf.keras.layers.Conv2D(n_filters, (3, 3), activation='relu'))
    model.add(tf.keras.layers.MaxPooling2D((2, 2)))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(512, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)
# --- model definition: end ---

# --- extract phase: start ---
# Fetch the train and test splits in one call, each as (image, label) pairs.
data, val_data = tfds.load('horses_or_humans', split=['train', 'test'],
                           as_supervised=True)
# --- extract phase: end ---

# --- transform phase: start ---
import math


def normalizeimages(image, label):
    """Cast an image to float32 and divide it by 255; the label passes through."""
    image = tf.cast(image, tf.float32)
    image = (image/255)
    return image, label


def augmentimages(image, label):
    """Scale an image, randomly flip it left-right, and rotate it.

    Returns the transformed image together with the unchanged label.
    """
    image, label = normalizeimages(image, label)
    image = tf.image.random_flip_left_right(image)
    # tfa.image.rotate takes its angle in radians, so the 40 is converted first.
    # NOTE(review): assumes the original 40 was meant as degrees; the same
    # fixed angle is applied to every image.
    image = tfa.image.rotate(image, math.radians(40), interpolation='NEAREST')
    return image, label

train = data.map(augmentimages)
train_batches = train.shuffle(100).batch(32)
# Validation images receive the same 1/255 scaling as the training images (but
# no augmentation); the original fed them to the model unscaled.
validation_batches = val_data.map(normalizeimages).batch(32)
# --- transform phase: end ---

# --- load phase: start ---
# Train for 10 epochs on the prepared batches, validating on validation_batches.
history = model.fit(
    x=train_batches,
    validation_data=validation_batches,
    epochs=10,
)
# --- load phase: end ---

- 대규모 데이터셋에서 작업한다고 가정
- 훈련할 때 GPU, TPU가 유리하므로 어느 프로세서에서나 수행가능한 추출과 변환은 CPU에게 배정
- 훈련은 GPU, TPU에 배정하고 데이터 준비와 훈련을 병렬로 수행
- CPU와 GPU, TPU의 유휴 시간을 줄이면 비용과 시간을 절감할 수 있으므로 중요하다

## 훈련 속도 향상을 위해 ETL 병렬화 하기

In [22]:
# Parallelizing the extract phase
# NOTE(review): with_info=True makes tfds.load return a (dataset, info) pair, so
# `train_data` holds that pair; it is not used further in this cell.
train_data = tfds.load('cats_vs_dogs', split='train', with_info=True)

import os  # this cell uses os.path; the original relied on an earlier cell's import
import sys

# Build the glob pattern one path component at a time; the original concatenated
# a single string with '+' and passed it to os.path.join, which then joined nothing.
file_pattern = os.path.join(
    os.path.expanduser('~'),
    'tensorflow_datasets', 'cats_vs_dogs', '4.0.0',
    'cats_vs_dogs-train.tfrecord*'
)
files = tf.data.Dataset.list_files(file_pattern)

# Read the matched TFRecord files through interleave.
train_dataset = files.interleave(
    tf.data.TFRecordDataset, 
    cycle_length=4, # number of input elements processed concurrently; per the original note, the available CPU core count is used when omitted
    num_parallel_calls=tf.data.experimental.AUTOTUNE # level of parallelism; per the original note, AUTOTUNE sets it according to the CPU count
)

In [None]:
# Parallelizing the transform phase
def read_tfrecord(serialized_example):
    """Turn one serialized example into an ``(image, label)`` pair.

    The 'image' feature is decoded as a 3-channel JPEG, cast to float32,
    divided by 255 and resized to 300x300. The 'label' feature is returned
    as parsed.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature((), tf.string, ""),
        "label": tf.io.FixedLenFeature((), tf.int64, -1),
    }
    parsed = tf.io.parse_single_example(serialized_example, feature_spec)

    pixels = tf.io.decode_jpeg(parsed['image'], channels=3)
    pixels = tf.cast(pixels, tf.float32) / 255
    pixels = tf.image.resize(pixels, (300, 300))
    return pixels, parsed['label']

In [None]:
import multiprocessing

# Use the CPU count reported by multiprocessing as the number of parallel map calls.
cores = multiprocessing.cpu_count()
print(cores)
train_dataset = train_dataset.map(
    map_func=read_tfrecord,
    num_parallel_calls=cores,
)
# On Colab, caching the dataset can exhaust memory and crash the runtime,
# so the cache step is left commented out.
# train_dataset = train_dataset.cache()

In [None]:
# Shuffle with a 1024-element buffer, batch by 32, then prefetch with AUTOTUNE.
train_dataset = (
    train_dataset
    .shuffle(1024)
    .batch(32)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [None]:
# Train the model on the prepared train_dataset pipeline for 10 epochs (verbose=1).
model.fit(train_dataset, epochs=10, verbose=1)

- colab 기준 실행 시 ETL 프로세스 병렬화를 위해 추가된 코드 덕분에 훈련 시간이 대략 절반으로 감소

## 추가로 공부할 내용

- 드롭아웃과 적절한 수치, 사용 사례
- 모델 학습을 위한 batch 함수 사용법
- 애드온 라이브러리에서의 이미지 증식 방법
- TFRecord
- 딥러닝에서 ETL 프로세스 중요성과 병렬화 사례 찾아보기