## Balancing, OpenCV preprocessing and augmentation

In [1]:
import os
import shutil
import numpy as np

import cv2
from PIL import Image
from matplotlib import pyplot
from keras.preprocessing import image
from sklearn.cross_validation import train_test_split
from sklearn.utils.class_weight import compute_class_weight

Using TensorFlow backend.


In [14]:
# Root folder of the seedlings dataset. Set the SEEDLINGS_DATA_DIR environment
# variable to point the notebook at another location without editing this cell;
# the default is the original location.
path = os.environ.get("SEEDLINGS_DATA_DIR",
                      "/home/ubuntu/nbs/data/balanced_preprocessed_seedlings")
# Source images, one sub-folder per category.
train_path = os.path.join(path, "train")
valid_path = os.path.join(path, "valid")
# Destination folders for the background-removed, augmented images.
train_preproc_path = os.path.join(path, "train_preproc")
valid_preproc_path = os.path.join(path, "valid_preproc")



### Splitting the dataset into train and validation sets (split ratio 0.8)

In [None]:
# Move a random (1 - split_koef) share of every category from train_path into
# valid_path/<category>.
split_koef = 0.8
# A fixed seed together with a sorted file list gives the same split on every
# run (os.listdir returns entries in arbitrary order).
split_seed = 42
cat_size = {}
files_to_remove = []
for cat in sorted(os.listdir(train_path)):
    train_cat_dir = os.path.join(train_path, cat)
    valid_cat_dir = os.path.join(valid_path, cat)
    if not os.path.isdir(train_cat_dir):
        # Stray file next to the category folders: not a category.
        continue
    # Only regular files can be copied; nested folders (or links to folders)
    # would make shutil.copy fail.
    files = sorted(f for f in os.listdir(train_cat_dir)
                   if os.path.isfile(os.path.join(train_cat_dir, f)))
    cat_size[cat] = len(files)

    already_in_valid = set(os.listdir(valid_cat_dir)) if os.path.isdir(valid_cat_dir) else set()
    if already_in_valid and not already_in_valid.intersection(files):
        # The category was split by an earlier run (validation files exist and
        # none of them is still in train); splitting again would keep shrinking
        # the training set. If some validation files ARE still in train, the
        # earlier run was interrupted before the clean-up and is redone below.
        continue
    if len(files) < 2:
        # train_test_split needs at least two samples.
        continue

    if not os.path.exists(valid_cat_dir):
        # makedirs also creates valid_path itself when it does not exist yet.
        os.makedirs(valid_cat_dir)
    x_files, y_files = train_test_split(files, test_size=1-split_koef,
                                        random_state=split_seed)
    for f in y_files:
        from_file = os.path.join(train_cat_dir, f)
        to_file = os.path.join(valid_cat_dir, f)
        shutil.copy(from_file, to_file)
        files_to_remove.append(from_file)

# Originals are deleted only after every copy has succeeded, so an error while
# copying never loses an image.
for f in files_to_remove:
    os.remove(f)
    


### Take a look at Train and Valid data size

In [15]:
def _count_entries(root):
    '''Map every entry of `root` to the number of items listed inside it.'''
    counts = {}
    for name in os.listdir(root):
        counts[name] = len(os.listdir(os.path.join(root, name)))
    return counts


# Per-category number of items in the train and validation folders.
train_cat_size = _count_entries(train_path)
valid_cat_size = _count_entries(valid_path)

print("Train set")
for cat, size in sorted(train_cat_size.items()):
    print("{:26}: {}".format(cat, size))

print("")
print("Valid set")
for cat, size in sorted(valid_cat_size.items()):
    print("{:26}: {}".format(cat, size))



Train set
Black-grass               : 211
Charlock                  : 313
Cleavers                  : 230
Common Chickweed          : 489
Common wheat              : 177
Fat Hen                   : 381
Loose Silky-bent          : 524
Maize                     : 177
Scentless Mayweed         : 413
Shepherds Purse           : 185
Small-flowered Cranesbill : 397
Sugar beet                : 309

Valid set
Black-grass               : 54
Charlock                  : 79
Cleavers                  : 59
Common Chickweed          : 124
Common wheat              : 46
Fat Hen                   : 96
Loose Silky-bent          : 132
Maize                     : 46
Scentless Mayweed         : 105
Shepherds Purse           : 48
Small-flowered Cranesbill : 101
Sugar beet                : 129


In [16]:
# Total number of items over all categories, for each split.
train_total = sum(train_cat_size.values())
print("Train data size: {} samples".format(train_total))
valid_total = sum(valid_cat_size.values())
print("Validation data size: {} samples".format(valid_total))

Train data size: 3806 samples
Validation data size: 1019 samples


### CV2 manipulation


In [17]:
def cv2_preprocess_function(img, lower_hsv=(150, 0, 0), upper_hsv=(179, 255, 255),
                            kernel_size=(5, 5)):
    '''Remove background and leave only seedling.

    The image is converted to HSV, pixels whose HSV value lies inside
    [lower_hsv, upper_hsv] form a mask, small gaps in the mask are closed
    morphologically, and the mask is applied to the input image.

    Inputs
    ------
        img : numpy array
            3-channel image; the colour conversion treats it as BGR.
            NOTE(review): get_batches hands this function to Keras as
            `preprocessing_function`. Keras documents that hook as receiving
            the resized/augmented image as a rank-3 array, which is presumably
            RGB float data rather than 8-bit BGR -- confirm. OpenCV also scales
            H differently for 8-bit (0..179) and float (0..360) input, so the
            default bounds were presumably tuned for the input actually seen
            at run time; re-check them before changing the conversion.
        lower_hsv : sequence of 3 numbers
            Inclusive lower HSV bound of the pixels to keep.
        upper_hsv : sequence of 3 numbers
            Inclusive upper HSV bound of the pixels to keep.
        kernel_size : tuple of 2 ints
            Size of the elliptical structuring element used to close the mask.
    Returns
    -------
        output : numpy array
            Result image: `img` with everything outside the mask removed.
    '''
    # convert to hsv format
    image_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # mask of the pixels whose HSV value falls inside the requested range
    # NOTE(review): the original comment called this a mask of green colours;
    # whether the default range selects green depends on the input dtype and
    # channel order (see docstring) -- confirm.
    mask_range = cv2.inRange(image_hsv, np.array(lower_hsv), np.array(upper_hsv))
    # closing (dilate, then erode) fills small holes inside the selected area
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, tuple(kernel_size))
    mask = cv2.morphologyEx(mask_range, cv2.MORPH_CLOSE, kernel)
    # remove background
    output = cv2.bitwise_and(img, img, mask=mask)
    return output

### Balance dataset by oversampling with augmentation

In [18]:
# Number of augmented images generated per category for the train and the
# validation set respectively.
NUM_TRAIN_SAMPLES, NUM_VALID_SAMPLES = 500, 200

In [19]:
def get_batches(dirname, save_to_dir=None, shuffle=False, batch_size=4, class_mode='input',
               target_size=(299, 299)):
    '''Build a Keras directory iterator that yields augmented, background-removed images.

    Augmentation consists of random rotation (up to 180 degrees) and random
    horizontal/vertical flips; shift and zoom augmentation are not enabled.
    Every image is additionally passed through cv2_preprocess_function.

    Inputs
    ------
        dirname : str
            Directory to read images from.
            NOTE(review): Keras' flow_from_directory is documented to look for
            images inside sub-directories of `dirname` (one per class) --
            confirm the folder layout matches.
        save_to_dir : str or None
            Passed to flow_from_directory; when set, generated images are
            written to this directory.
        shuffle : bool
            Whether the images are visited in random order. This argument used
            to be ignored (the order was never shuffled); it is honoured now
            and defaults to False so that existing calls behave as before.
        batch_size : int
            Number of images per yielded batch.
        class_mode : str
            Passed to flow_from_directory unchanged.
        target_size : tuple of 2 ints
            Size every image is resized to.
    Returns
    -------
        flow : iterator
            The object returned by ImageDataGenerator.flow_from_directory.
    '''
    gen = image.ImageDataGenerator(
                        rotation_range=180,
                        horizontal_flip=True, vertical_flip=True,
                        preprocessing_function=cv2_preprocess_function
                        )
    flow = gen.flow_from_directory(
                        dirname,
                        save_to_dir=save_to_dir,
                        shuffle=shuffle,
                        batch_size=batch_size,
                        class_mode=class_mode,
                        target_size=target_size
                        )

    return flow

In [20]:
# Fill train_preproc_path/<category> with NUM_TRAIN_SAMPLES augmented images
# per category. Each next(batch) yields one image (batch_size=1), which the
# generator saves into save_to_dir.
for cat in sorted(os.listdir(train_path)):
    train_cat_dir = os.path.join(train_path, cat)
    if not os.path.isdir(train_cat_dir):
        # Stray file next to the category folders: not a category.
        continue
    save_to_dir = os.path.join(train_preproc_path, cat)
    if not os.path.exists(save_to_dir):
        # makedirs also creates train_preproc_path itself on the first run.
        os.makedirs(save_to_dir)
    # Only generate what is still missing, so re-running the cell tops the
    # folder up to NUM_TRAIN_SAMPLES instead of adding another full set and
    # unbalancing the categories.
    missing = NUM_TRAIN_SAMPLES - len(os.listdir(save_to_dir))
    if missing <= 0:
        continue
    batch = get_batches(train_cat_dir, save_to_dir=save_to_dir, batch_size=1, class_mode="input")
    for _ in range(missing):
        next(batch)


Found 308 images belonging to 1 classes.
Found 184 images belonging to 1 classes.
Found 523 images belonging to 1 classes.
Found 396 images belonging to 1 classes.
Found 412 images belonging to 1 classes.
Found 380 images belonging to 1 classes.
Found 312 images belonging to 1 classes.
Found 176 images belonging to 1 classes.
Found 210 images belonging to 1 classes.
Found 176 images belonging to 1 classes.
Found 488 images belonging to 1 classes.
Found 229 images belonging to 1 classes.


In [21]:
# Fill valid_preproc_path/<category> with NUM_VALID_SAMPLES augmented images
# per category. Each next(batch) yields one image (batch_size=1), which the
# generator saves into save_to_dir.
for cat in sorted(os.listdir(valid_path)):
    valid_cat_dir = os.path.join(valid_path, cat)
    if not os.path.isdir(valid_cat_dir):
        # Stray file next to the category folders: not a category.
        continue
    save_to_dir = os.path.join(valid_preproc_path, cat)
    if not os.path.exists(save_to_dir):
        # makedirs also creates valid_preproc_path itself on the first run.
        os.makedirs(save_to_dir)
    # Only generate what is still missing, so re-running the cell tops the
    # folder up to NUM_VALID_SAMPLES instead of adding another full set and
    # unbalancing the categories.
    missing = NUM_VALID_SAMPLES - len(os.listdir(save_to_dir))
    if missing <= 0:
        continue
    batch = get_batches(valid_cat_dir, save_to_dir=save_to_dir, batch_size=1, class_mode="input")
    for _ in range(missing):
        next(batch)


Found 77 images belonging to 1 classes.
Found 47 images belonging to 1 classes.
Found 131 images belonging to 1 classes.
Found 100 images belonging to 1 classes.
Found 104 images belonging to 1 classes.
Found 95 images belonging to 1 classes.
Found 78 images belonging to 1 classes.
Found 45 images belonging to 1 classes.
Found 53 images belonging to 1 classes.
Found 45 images belonging to 1 classes.
Found 123 images belonging to 1 classes.
Found 58 images belonging to 1 classes.


In [24]:
# Per-category number of generated images in the preprocessed train and
# validation folders.
train_cat_size = {
    cat: len(os.listdir(os.path.join(train_preproc_path, cat)))
    for cat in os.listdir(train_preproc_path)
}
valid_cat_size = {
    cat: len(os.listdir(os.path.join(valid_preproc_path, cat)))
    for cat in os.listdir(valid_preproc_path)
}

# Build the whole report first, then print it in one go.
report = ["Train set"]
report.extend("{:26}: {}".format(cat, train_cat_size[cat]) for cat in sorted(train_cat_size))
report.append("")
report.append("Valid set")
report.extend("{:26}: {}".format(cat, valid_cat_size[cat]) for cat in sorted(valid_cat_size))
print("\n".join(report))

Train set
Black-grass               : 500
Charlock                  : 500
Cleavers                  : 500
Common Chickweed          : 500
Common wheat              : 500
Fat Hen                   : 500
Loose Silky-bent          : 500
Maize                     : 500
Scentless Mayweed         : 500
Shepherds Purse           : 500
Small-flowered Cranesbill : 500
Sugar beet                : 500

Valid set
Black-grass               : 200
Charlock                  : 200
Cleavers                  : 200
Common Chickweed          : 200
Common wheat              : 200
Fat Hen                   : 200
Loose Silky-bent          : 200
Maize                     : 200
Scentless Mayweed         : 200
Shepherds Purse           : 200
Small-flowered Cranesbill : 200
Sugar beet                : 200
