ImageDataGenerator에 validation dir 추가하기, **Augmentation**

In [1]:
import os
from os import getcwd
import zipfile
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import random
from shutil import copyfile

In [2]:
local_zip = f"{getcwd()}/../data/cats-and-dogs.zip"

In [3]:
zip_ref = zipfile.ZipFile(local_zip, 'r')
zip_ref.extractall('/tmp')
zip_ref.close()

In [4]:
print(os.listdir('/tmp/PetImages'))

['Cat', '.DS_Store', 'Dog']


In [5]:
print(len(os.listdir('/tmp/PetImages/Cat')))
print(len(os.listdir('/tmp/PetImages/Dog')))

1500
1500


### Split train and validate directories
`SPLIT_SIZE` 비율만큼 train / test data를 나누는 함수 만들기

In [6]:
try:
    os.mkdir('/tmp/cats-v-dogs')
    os.mkdir('/tmp/cats-v-dogs/training')
    os.mkdir('/tmp/cats-v-dogs/testing')
    os.mkdir('/tmp/cats-v-dogs/training/cats')
    os.mkdir('/tmp/cats-v-dogs/training/dogs')
    os.mkdir('/tmp/cats-v-dogs/testing/cats')
    os.mkdir('/tmp/cats-v-dogs/testing/dogs')
except OSError:
    print('Some error occured')
    pass

Some error occured


In [7]:
def split_data(SOURCE, TRAINING, TESTING, SPLIT_SIZE):
    files=[]
    print(SOURCE + ' length: {}'.format(len(os.listdir(SOURCE))))
    
    # All images should be checked
    for filename in os.listdir(SOURCE):
        full_path = SOURCE + filename
        #print(full_path)
        
        if os.path.getsize(full_path)>0:
            files.append(filename)
        else:
            print(filename + "is zero length!!")
    print('valid(not zero length) file length', len(files))
            
    # 비율만큼 데이터 크기 정하기
    training_list_len = int(len(files) * SPLIT_SIZE)
    testing_list_len = int(len(files) - training_list_len)
    print("split size:", training_list_len, testing_list_len, "\tportion:", training_list_len/testing_list_len)
    
    # Shuffle
    shuffled_list = random.sample(files, len(files))
    
    # Shuffle된 데이터셋으로부터 train and test image 나누기(slicing)
    training_list = shuffled_list[:training_list_len]
    testing_list = shuffled_list[-testing_list_len:]
    print('check shuffled length :', len(training_list), len(testing_list))
    
    # 이제 이거를 COPY
    for filename in training_list:
        src = SOURCE + filename
        dst = TRAINING + filename
        copyfile(src, dst)
    
    for filename in testing_list:
        src = SOURCE + filename
        dst = TESTING + filename
        copyfile(src, dst)
    

In [8]:
CAT_SOURCE_DIR = "/tmp/PetImages/Cat/"
TRAINING_CATS_DIR = "/tmp/cats-v-dogs/training/cats/"
TESTING_CATS_DIR = "/tmp/cats-v-dogs/testing/cats/"
DOG_SOURCE_DIR = "/tmp/PetImages/Dog/"
TRAINING_DOGS_DIR = "/tmp/cats-v-dogs/training/dogs/"
TESTING_DOGS_DIR = "/tmp/cats-v-dogs/testing/dogs/"

split_size = .9
split_data(CAT_SOURCE_DIR, TRAINING_CATS_DIR, TESTING_CATS_DIR, split_size)
split_data(DOG_SOURCE_DIR, TRAINING_DOGS_DIR, TESTING_DOGS_DIR, split_size)

/tmp/PetImages/Cat/ length: 1500
valid(not zero length) file length 1500
split size: 1350 150 	portion: 9.0
check shuffled length : 1350 150
/tmp/PetImages/Dog/ length: 1500
valid(not zero length) file length 1500
split size: 1350 150 	portion: 9.0
check shuffled length : 1350 150


In [9]:
print(len(os.listdir('/tmp/cats-v-dogs/training/cats/')))
print(len(os.listdir('/tmp/cats-v-dogs/training/dogs/')))
print(len(os.listdir('/tmp/cats-v-dogs/testing/cats/')))
print(len(os.listdir('/tmp/cats-v-dogs/testing/dogs/')))

1500
1500
519
511


왜 split한 크기는 1350 150인데 copy된 크기는 1498/410 & 1499/403일까... 커널 셧다운 하고 다시 돌리면 맞아질까...?
* 뭔가 껐다가 다시 돌릴 때 마다 점점 늘어난다!!!

### Preprocessing
Augmentaion on ImageDataGenerator
* 주의 : ImageDataGenerator를 쓸 경우 모델 fitting도 model.fit이 아니라 `model.fit_generator`임

In [13]:
training_dir = '/tmp/cats-v-dogs/training/'
validation_dir = '/tmp/cats-v-dogs/testing/'

train_datagen = ImageDataGenerator(rescale=1./255,
                                  rotation_range=40,
                                  width_shift_range=0.2,
                                  height_shift_range=0.2,
                                  shear_range=0.2,
                                  zoom_range=0.2,
                                  horizontal_flip=True,
                                  fill_mode='nearest')
train_generator = train_datagen.flow_from_directory(
    training_dir,
    target_size=(150,150),
    batch_size=10,
    class_mode='binary'
)

valid_datagen = ImageDataGenerator(rescale=1./255)
valid_generator = valid_datagen.flow_from_directory(
    validation_dir,
    target_size=(150,150),
    batch_size=10,
    class_mode='binary'
)

Found 2997 images belonging to 2 classes.
Found 813 images belonging to 2 classes.


### Modeling

In [16]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=(150,150,3)),
    tf.keras.layers.MaxPooling2D(2,2),
    tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 148, 148, 32)      896       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 74, 74, 32)        0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 72, 72, 64)        18496     
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 36, 36, 64)        0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 34, 34, 64)        36928     
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 17, 17, 64)        0         
_________________________________________________________________
flatten (Flatten)            (None, 18496)             0

In [18]:
from tensorflow.keras.optimizers import RMSprop
model.compile(optimizer=RMSprop(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
history = model.fit_generator(train_generator,
                             validation_data=valid_generator,
                             epochs=2,
                             verbose=1)