# Big Data Content Analytics - AUEB

## Data Preprocessing - Image Data Generators and Image Augmentation

* Lab Assistant: George Perakis
* Email: gperakis[at]aueb.gr

### Imports

In [None]:
import shutil
from tqdm import tqdm
import os
import matplotlib.pyplot as plt

from tensorflow.python.keras.preprocessing.image import ImageDataGenerator

In [None]:
# The datasets can be found here
# https://www.kaggle.com/c/dogs-vs-cats/data?select=train.zip
# https://www.kaggle.com/c/dogs-vs-cats/data?select=test1.zip

### Steps to follow when handling Images:

* Read the picture files.
* Decode the JPEG content to RGB grids of pixels.
* Convert these into floating-point tensors.
* Rescale the pixel values (between 0 and 255) to the [0, 1] interval (as you know,
neural networks prefer to deal with small input values).

Using **ImageDataGenerator** to read images from directories

In [None]:
# Root folder of the extracted "dogs-vs-cats" dataset, expected in the
# current working directory.
dog_cats_dir = os.path.join(os.getcwd(), 'dogs-vs-cats')

# Split-level folders: 'train' for training and 'test1' for validation.
train_dir, val_dir = (
    os.path.join(dog_cats_dir, split) for split in ('train', 'test1')
)

# One sub-folder per class label inside each split.
train_dog_dir, train_cat_dir = (
    os.path.join(train_dir, label) for label in ('dog', 'cat')
)
val_dog_dir, val_cat_dir = (
    os.path.join(val_dir, label) for label in ('dog', 'cat')
)

In [None]:
# Make sure every class sub-folder exists before images are sorted into them.
# Looping over all four folders avoids the copy-paste slip where the check for
# one folder guarded the creation of another (the 'cat' validation folder was
# never created).
# `os.mkdir` (rather than `os.makedirs`) is kept on purpose: it fails loudly if
# the parent dataset folder is missing instead of silently creating an empty tree.
for class_dir in (train_dog_dir, train_cat_dir, val_dog_dir, val_cat_dir):
    if not os.path.exists(class_dir):
        os.mkdir(class_dir)

In [None]:
# Route every loose entry of the training folder into its class sub-folder,
# based on the 'dog.' / 'cat.' filename prefix. Entries matching neither
# prefix (including the class sub-folders themselves) are left where they are.
class_dir_by_prefix = {'dog.': train_dog_dir, 'cat.': train_cat_dir}

for entry in tqdm(os.listdir(train_dir)):
    for prefix, class_dir in class_dir_by_prefix.items():
        if entry.startswith(prefix):
            shutil.move(os.path.join(train_dir, entry),
                        os.path.join(class_dir, entry))
            break

In [None]:
# Generator that multiplies every pixel value by 1/255, mapping the
# [0, 255] range onto [0, 1].
pixel_scale = 1. / 255
train_datagen = ImageDataGenerator(rescale=pixel_scale)

# test_datagen = ImageDataGenerator(rescale=1./255)

In [None]:
# Size every image is resized to when it is read from disk.
img_size = (150, 150)

# Stream batches of 20 images from the training folder; the class
# sub-folders ('cat', 'dog') provide the binary labels.
flow_options = dict(target_size=img_size, batch_size=20, class_mode='binary')
train_generator = train_datagen.flow_from_directory(train_dir, **flow_options)

In [None]:
# validation_generator = test_datagen.flow_from_directory(
#     val_dir,
#     target_size=img_size,
#     batch_size=20,
#     class_mode='binary')

In [None]:
# Pull a single batch from the generator and report the shapes of the
# image tensor and of the label vector.
data_batch, labels_batch = next(iter(train_generator))

print('data batch shape:', data_batch.shape)
print('labels batch shape:', labels_batch.shape)

In [None]:
# Show the labels of the batch fetched in the previous cell
# (the generator was created with class_mode='binary').
labels_batch

In [None]:
# history = model.fit_generator(
# train_generator,
# steps_per_epoch=100,
# epochs=30,
# validation_data=validation_generator,
# validation_steps=50)

# It’s good practice to always save your models after training.

### Using data augmentation

##### Francois Chollet (Deep Learning)

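Overfitting is caused by having too few samples to learn from, rendering you unable
to train a model that can generalize to new data. Given infinite data, your model
would be exposed to every possible aspect of the data distribution at hand: you would
never overfit. Data augmentation generates more training data from the existing
training samples by applying random transformations that yield believable-looking
images. The `ImageDataGenerator` options used below are: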

* **rotation_range** is a value in degrees (0–180), a range within which to randomly rotate pictures. 
* **width_shift_range** and **height_shift_range** are ranges (as a fraction of total width or height) within which to randomly translate pictures horizontally or vertically.
* **shear_range** is for randomly applying shearing transformations.
* **zoom_range** is for randomly zooming inside pictures.
* **horizontal_flip** is for randomly flipping half the images horizontally—relevant when there are no assumptions of horizontal asymmetry (for example, real-world pictures).
* **fill_mode** is the strategy used for filling in newly created pixels, which can appear after a rotation or a width/height shift.

In [None]:
# Random transformations applied to each image drawn from the generator.
augmentation_options = {
    'rotation_range': 40,        # random rotation, in degrees
    'width_shift_range': 0.2,    # horizontal shift, as a fraction of the width
    'height_shift_range': 0.2,   # vertical shift, as a fraction of the height
    'shear_range': 0.2,          # random shearing
    'zoom_range': 0.2,           # random zoom
    'horizontal_flip': True,     # randomly mirror images horizontally
    'fill_mode': 'nearest',      # how newly created pixels are filled in
}
datagen = ImageDataGenerator(**augmentation_options)

In [None]:
from tensorflow.python.keras.preprocessing import image

In [None]:
# Full paths of all training cat images. `os.listdir` returns entries in
# arbitrary order, so sort them to make index-based selection reproducible
# across runs and machines.
fnames = sorted(
    os.path.join(train_cat_dir, fname) for fname in os.listdir(train_cat_dir)
)

In [None]:
# Choose one cat image to augment.
img_path = fnames[1]

# Read the image and resize it to 300x300 pixels.
img = image.load_img(img_path, target_size=(300, 300))

# Convert it to a NumPy array with shape (300, 300, 3).
x = image.img_to_array(img)

# Add a leading batch axis, giving shape (1, 300, 300, 3).
x = x.reshape((1,) + x.shape)

# `datagen.flow` keeps yielding randomly transformed batches, so the number of
# samples drawn is capped explicitly (one figure per sample).
n_augmented = 3

for fig_num, batch in zip(range(1, n_augmented + 1),
                          datagen.flow(x, batch_size=1)):
    plt.figure(fig_num)
    plt.imshow(image.array_to_img(batch[0]))
    plt.title(f'Augmented sample {fig_num}')

plt.show()