# __Data Augmentation__
- Dog & Cat

In [25]:
my_path = 'image_folder/images/'

In [26]:
from os import listdir
from os.path import isfile, join
from keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img


def augmentation(dir_path, initial_letter_of_file='w', augment_num=3):
    
    
    """
    note : 指定ディレクトリ内の,指定頭文字で始まるファイルを指定枚数オーグメントする
    ----------
    dir_path : フォルダパス
    initial_letter : augmentしたいファイル名の頭文字
    aument_num : augmentしたい枚数
    ----------
    """
    
    
    files_name = [f for f in listdir(my_path) if isfile(join(my_path, f))]
    files_name.remove('.DS_Store')
    
    
    datagen = ImageDataGenerator(rotation_range=40,
                             width_shift_range=0.2,
                             height_shift_range=0.2,
                             shear_range=0.2,
                             zoom_range=0.2,
                             horizontal_flip=True,
                             fill_mode='nearest'
    )
    
    
    for i, file in enumerate(files_name):
        img = load_img(dir_path + file)
        x = img_to_array(img)
        x = x.reshape((1,) + x.shape) 

        if file[0] == initial_letter_of_file:
            i = 0
            for batch in datagen.flow(x, save_to_dir=dir_path, save_prefix=initial_letter_of_file, save_format="jpg"):
                i += 1
                if i > augment_num:
                    break

        else:
            pass

In [27]:
augmentation(my_path, 'c')
augmentation(my_path, 'w')

updated_files_name = [f for f in listdir(my_path) if isfile(join(my_path, f))]

OSError: cannot identify image file 'image_folder/images/coffee-20.jpg'

In [31]:
from scipy.misc import imread
print(imread("image_folder/images/coffee-20.jpg").shape)

`imread` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imread`` instead.
  


OSError: cannot identify image file 'image_folder/images/coffee-20.jpg'

In [None]:
import random
random.seed(0)
random.shuffle(updated_files_name)

#### __前処理__
- ラベルデータの保存
- resize
- test_data, val_dataへの分割
- Dog - 1, Cat - 0
- ディレクトリの作成
    - catsvsdogs / images / train / dogs
    - catsvsdogs / images / train / cats
    - catsvsdogs / images / val / dogs
    - catsvsdogs / images / val / cats

#### __保存用ディレクトリの作成__

In [None]:
import os
import shutil

class1_dir_train = 'image_folder/train/class1/'
class1_dir_val = 'image_folder/test/class1/'
class2_dir_train = 'image_folder/train/class2/'
class2_dir_val = 'image_folder/test/class2/'

def make_dir(directory):
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)
    
make_dir(class1_dir_train)
make_dir(class1_dir_val)
make_dir(class2_dir_train)
make_dir(class2_dir_val)

#### __データのふるい分け、分割__

In [None]:
import cv2
import numpy as np
import sys
import shutil


def train_test_split(files_name, class_1='w', class_2='c', train_size=0.8):
    
    """
    note : 画像フォルダから、指定クラスを、指定割合でtrain_sprit
    ----------
    class_1 : クラス名（今回はdog）
    class_2 : クラス名（今回はcat）
    train_size : 分割したい割合
    ----------
    """
    
    class_1_count = 0
    class_2_count = 0
    
    each_class_size = len(files_name) // 2
    
    train_size = each_class_size * train_size
    test_size = each_class_size - train_size
    
    training_images = []
    training_labels = []
    test_images = []
    test_labels = []
    
    size=200
    
    for i, file in enumerate(files_name):
        
        if files_name[i][0] == class_1:
            class_1_count += 1
            image = cv2.imread(my_path + file)
            image = cv2.resize(image, (size, size), interpolation = cv2.INTER_AREA)
            if class_1_count <= train_size:
                training_images.append(image)
                training_labels.append(1)
                cv2.imwrite(class1_dir_train + class_1 + str(class_1_count) + '.jpg', image)
            if class_1_count > train_size and class_1_count <= train_size + test_size:
                test_images.append(image)
                test_labels.append(1)
                cv2.imwrite(class1_dir_val + class_1 + str(class_1_count) + '_' + '.jpg', image)

        if files_name[i][0] == class_2:
            class_2_count += 1
            image = cv2.imread(my_path + file)
            image = cv2.resize(image, (size, size), interpolation = cv2.INTER_AREA)
            if class_2_count <= train_size:
                training_images.append(image)
                training_labels.append(0)
                cv2.imwrite(class2_dir_train + class_2 + str(class_2_count) + '.jpg', image)
            if class_2_count > train_size and class_2_count <= train_size + test_size:
                test_images.append(image)
                test_labels.append(0)
                cv2.imwrite(class2_dir_val + class_2 + str(class_2_count) + '_' + '.jpg', image)
                
    return training_images, training_labels, test_images, test_labels

In [None]:
training_images, training_labels, test_images, test_labels = train_test_split(updated_files_name)

#### __Kerasが対応するファイル型に変換__

In [None]:
np.savez('wil_vs_cof_training_data.npz', np.array(training_images))
np.savez('wil_vs_cof_training_labels.npz', np.array(training_labels))
np.savez('wil_vs_cof_test_data.npz', np.array(test_images))
np.savez('wil_vs_cof_test_labels.npz', np.array(test_labels))

In [None]:
import numpy as np

def load_data_training_and_test(datasetname):
    npzfile = np.load(datasetname + "_training_data.npz")
    train = npzfile["arr_0"]
    
    npzfile = np.load(datasetname + "_training_labels.npz")
    train_labels = npzfile["arr_0"]
    
    npzfile = np.load(datasetname + "_test_data.npz")
    test = npzfile["arr_0"]
    
    npzfile = np.load(datasetname + "_test_labels.npz")
    test_labels = npzfile["arr_0"]
    
    return (train, train_labels), (test, test_labels)

for i in range(1, 11):
    random = np.random.randint(0, len(training_images))
    cv2.imshow("image_" + str(i), training_images[random])
    if training_labels[random] == 0:
        print(str(i) + "- Cat")
    else:
        print(str(i) + "- Dog")
        
cv2.destroyAllWindows()

#### __データの前処理__

In [None]:
(X_train, y_train), (X_test, y_test) = load_data_training_and_test("wil_vs_cof")

y_train = y_train.reshape(y_train.shape[0], 1)
y_test = y_test.reshape(y_test.shape[0], 1)

X_train = X_train.astype("float32")
X_test = X_test.astype("float32")

X_train /= 255
X_test /= 255

#### __学習__

In [None]:
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
import os

batch_size = 16
epochs = 40

img_rows = X_train[0].shape[0]
img_cols = X_train[1].shape[0]
input_size = (img_rows, img_cols, 3)

model = Sequential()
model.add(Conv2D(32, (3, 3), input_shape = input_size), kernel_initializer=)
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss = 'binary_crossentropy',
             optimizer = 'rmsprop',
             metrics = ['accuracy'])

print(model.summary())

In [None]:
history = model.fit(X_train, y_train,
                   batch_size = batch_size,
                   epochs = epochs,
                   validation_data = (X_test, y_test),
                   shuffle = True)

model.save("cats_vs_dogs.h5")

scores = model.evaluate(X_test, y_test, verbose=1)
print('Test loss', scores[0], 'Test accuracy', scores[1])

#### __Augmentation なし__

Train on 10 samples, validate on 10 samples
Epoch 1/25
10/10 [==============================] - 0s 30ms/step - loss: 1.8070e-05 - acc: 1.0000 - val_loss: 3.0441 - val_acc: 0.6000
Epoch 2/25
10/10 [==============================] - 0s 26ms/step - loss: 2.1992e-06 - acc: 1.0000 - val_loss: 3.0435 - val_acc: 0.6000
Epoch 3/25
10/10 [==============================] - 0s 25ms/step - loss: 1.4173e-04 - acc: 1.0000 - val_loss: 3.0218 - val_acc: 0.7000
Epoch 4/25
10/10 [==============================] - 0s 26ms/step - loss: 9.7983e-07 - acc: 1.0000 - val_loss: 3.0216 - val_acc: 0.7000
Epoch 5/25
10/10 [==============================] - 0s 26ms/step - loss: 2.4776e-06 - acc: 1.0000 - val_loss: 3.0214 - val_acc: 0.7000
Epoch 6/25
10/10 [==============================] - 0s 25ms/step - loss: 4.4536e-06 - acc: 1.0000 - val_loss: 3.0208 - val_acc: 0.7000
Epoch 7/25
10/10 [==============================] - 0s 26ms/step - loss: 5.0711e-06 - acc: 1.0000 - val_loss: 3.0207 - val_acc: 0.7000
Epoch 8/25
10/10 [==============================] - 0s 26ms/step - loss: 4.0394e-06 - acc: 1.0000 - val_loss: 3.0207 - val_acc: 0.7000
Epoch 9/25
10/10 [==============================] - 0s 26ms/step - loss: 5.2048e-05 - acc: 1.0000 - val_loss: 3.0292 - val_acc: 0.7000
Epoch 10/25
10/10 [==============================] - 0s 25ms/step - loss: 3.3786e-06 - acc: 1.0000 - val_loss: 3.0292 - val_acc: 0.7000
Epoch 11/25
10/10 [==============================] - 0s 25ms/step - loss: 1.9690e-06 - acc: 1.0000 - val_loss: 3.0294 - val_acc: 0.7000
Epoch 12/25
10/10 [==============================] - 0s 25ms/step - loss: 1.0861e-04 - acc: 1.0000 - val_loss: 3.1017 - val_acc: 0.6000
Epoch 13/25
10/10 [==============================] - 0s 27ms/step - loss: 5.6698e-06 - acc: 1.0000 - val_loss: 3.1057 - val_acc: 0.6000
Epoch 14/25
10/10 [==============================] - 0s 26ms/step - loss: 0.0213 - acc: 1.0000 - val_loss: 2.2884 - val_acc: 0.6000
Epoch 15/25
10/10 [==============================] - 0s 25ms/step - loss: 0.0016 - acc: 1.0000 - val_loss: 2.7232 - val_acc: 0.5000
Epoch 16/25
10/10 [==============================] - 0s 25ms/step - loss: 0.0067 - acc: 1.0000 - val_loss: 2.9423 - val_acc: 0.6000
Epoch 17/25
10/10 [==============================] - 0s 26ms/step - loss: 4.8264e-04 - acc: 1.0000 - val_loss: 2.8328 - val_acc: 0.6000
Epoch 18/25
10/10 [==============================] - 0s 25ms/step - loss: 2.9219e-05 - acc: 1.0000 - val_loss: 2.8082 - val_acc: 0.6000
Epoch 19/25
10/10 [==============================] - 0s 25ms/step - loss: 5.6744e-04 - acc: 1.0000 - val_loss: 2.5338 - val_acc: 0.7000
Epoch 20/25
10/10 [==============================] - 0s 24ms/step - loss: 1.3732e-05 - acc: 1.0000 - val_loss: 2.5418 - val_acc: 0.7000
Epoch 21/25
10/10 [==============================] - 0s 24ms/step - loss: 1.1395e-04 - acc: 1.0000 - val_loss: 2.6436 - val_acc: 0.7000
Epoch 22/25
10/10 [==============================] - 0s 25ms/step - loss: 6.0817e-05 - acc: 1.0000 - val_loss: 2.7054 - val_acc: 0.7000
Epoch 23/25
10/10 [==============================] - 0s 24ms/step - loss: 0.0070 - acc: 1.0000 - val_loss: 3.6960 - val_acc: 0.6000
Epoch 24/25
10/10 [==============================] - 0s 23ms/step - loss: 0.0010 - acc: 1.0000 - val_loss: 4.0447 - val_acc: 0.6000
Epoch 25/25
10/10 [==============================] - 0s 24ms/step - loss: 1.5118e-04 - acc: 1.0000 - val_loss: 4.0725 - val_acc: 0.6000
10/10 [==============================] - 0s 5ms/step
Test loss 4.072466850280762 Test accuracy 0.6000000238418579

#### __Augmentation 4 * 4__

Train on 400 samples, validate on 80 samples
Epoch 1/25
400/400 [==============================] - 9s 21ms/step - loss: 0.7688 - acc: 0.4825 - val_loss: 0.6883 - val_acc: 0.5000
Epoch 2/25
400/400 [==============================] - 8s 20ms/step - loss: 0.7008 - acc: 0.5950 - val_loss: 0.6643 - val_acc: 0.6125
Epoch 3/25
400/400 [==============================] - 8s 19ms/step - loss: 0.6364 - acc: 0.6500 - val_loss: 0.5930 - val_acc: 0.7375
Epoch 4/25
400/400 [==============================] - 8s 19ms/step - loss: 0.6539 - acc: 0.6875 - val_loss: 0.6051 - val_acc: 0.7375
Epoch 5/25
400/400 [==============================] - 8s 19ms/step - loss: 0.5354 - acc: 0.7600 - val_loss: 0.5143 - val_acc: 0.7875
Epoch 6/25
400/400 [==============================] - 8s 20ms/step - loss: 0.5291 - acc: 0.7625 - val_loss: 0.5527 - val_acc: 0.6250
Epoch 7/25
400/400 [==============================] - 8s 20ms/step - loss: 0.4127 - acc: 0.8275 - val_loss: 0.3708 - val_acc: 0.8375
Epoch 8/25
400/400 [==============================] - 8s 19ms/step - loss: 0.3846 - acc: 0.8400 - val_loss: 0.4236 - val_acc: 0.8250
Epoch 9/25
400/400 [==============================] - 8s 19ms/step - loss: 0.3645 - acc: 0.8625 - val_loss: 0.3718 - val_acc: 0.8375
Epoch 10/25
400/400 [==============================] - 8s 19ms/step - loss: 0.2291 - acc: 0.9150 - val_loss: 0.3645 - val_acc: 0.8500
Epoch 11/25
400/400 [==============================] - 8s 19ms/step - loss: 0.2896 - acc: 0.8850 - val_loss: 0.3536 - val_acc: 0.8500
Epoch 12/25
400/400 [==============================] - 8s 20ms/step - loss: 0.2940 - acc: 0.8800 - val_loss: 0.3106 - val_acc: 0.8875
Epoch 13/25
400/400 [==============================] - 8s 20ms/step - loss: 0.1852 - acc: 0.9325 - val_loss: 0.3798 - val_acc: 0.8625
Epoch 14/25
400/400 [==============================] - 8s 20ms/step - loss: 0.1617 - acc: 0.9425 - val_loss: 0.4412 - val_acc: 0.8500
Epoch 15/25
400/400 [==============================] - 8s 19ms/step - loss: 0.1717 - acc: 0.9375 - val_loss: 0.3906 - val_acc: 0.8625
Epoch 16/25
400/400 [==============================] - 8s 20ms/step - loss: 0.1362 - acc: 0.9500 - val_loss: 0.3388 - val_acc: 0.9000
Epoch 17/25
400/400 [==============================] - 8s 20ms/step - loss: 0.0957 - acc: 0.9625 - val_loss: 0.3369 - val_acc: 0.8750
Epoch 18/25
400/400 [==============================] - 8s 20ms/step - loss: 0.1039 - acc: 0.9575 - val_loss: 0.6380 - val_acc: 0.8750
Epoch 19/25
400/400 [==============================] - 8s 20ms/step - loss: 0.1599 - acc: 0.9500 - val_loss: 1.1282 - val_acc: 0.8000
Epoch 20/25
400/400 [==============================] - 8s 20ms/step - loss: 0.0578 - acc: 0.9800 - val_loss: 0.5977 - val_acc: 0.8750
Epoch 21/25
400/400 [==============================] - 8s 19ms/step - loss: 0.1028 - acc: 0.9625 - val_loss: 0.5286 - val_acc: 0.8875
Epoch 22/25
400/400 [==============================] - 8s 19ms/step - loss: 0.0438 - acc: 0.9825 - val_loss: 0.8115 - val_acc: 0.8625
Epoch 23/25
400/400 [==============================] - 8s 20ms/step - loss: 0.0887 - acc: 0.9775 - val_loss: 0.6342 - val_acc: 0.8875
Epoch 24/25
400/400 [==============================] - 8s 19ms/step - loss: 0.0491 - acc: 0.9850 - val_loss: 0.5752 - val_acc: 0.8625
Epoch 25/25
400/400 [==============================] - 8s 19ms/step - loss: 0.1410 - acc: 0.9600 - val_loss: 0.6641 - val_acc: 0.8875
80/80 [==============================] - 1s 7ms/step
Test loss 0.6640982627868652 Test accuracy 0.8875

In [None]:
import matplotlib.pyplot as plt
% matplotlib inline
plt.plot(history.history['val_loss'])
plt.title('val_loss')

In [None]:
plt.plot(history.history['val_acc'])
plt.title('val_acc')

#### __誤差と精度が上がり続ける__
- 汎化誤差
    - 学習時に使用しなかったデータに対する予測値と正解の差
- クラス不均衡
    - ではない
- 正則化
    - dropout
    - 参考になりそう
        - https://stackoverflow.com/questions/40910857/how-to-interpret-increase-in-both-loss-and-accuracy
        - https://kharshit.github.io/blog/2018/12/07/loss-vs-accuracy

- データオーグメンテーション
    - lossを発生させて、勾配を発生させている?
    - k

In [None]:
p = [1, 2, 3]
type(p)