# Do experiments with cats/dogs dataset

In [1]:
from __future__ import print_function
#
import os
import glob
import re
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
import argparse
parser = argparse.ArgumentParser(description="Cats/Dogs playground")
parser.add_argument(
    "--config-file",
    default="",
    metavar="FILE",
    help="path to config file",
    type=str,
    )
parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
args = parser.parse_args([])


"""
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
"""

import time
from tqdm import tqdm


import sys

import cv2
import imgaug as ia
from imgaug import augmenters as iaa
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
args.config_file = "configs/R50v1_BN_BZ1_Pretrain.yaml"

In [3]:
class Augmentation_Setup(object):  
    sometimes = lambda aug: iaa.Sometimes(0.5, aug)
    lesstimes = lambda aug: iaa.Sometimes(0.2, aug)
    
    augmentation = iaa.Sequential([
        iaa.Fliplr(0.5, name="FlipLR"),
        iaa.Flipud(0.5, name="FlipUD"),
        iaa.OneOf([iaa.Affine(rotate = 90),
                   iaa.Affine(rotate = 180),
                   iaa.Affine(rotate = 270)]),
        sometimes(iaa.Affine(
                    scale = (0.8,1.2),
                    translate_percent = (-0.2, 0.2),
                    rotate = (-15, 15),
                    mode = 'wrap'
                    ))
    ])

def preproc(img):
    #return (img - img.min()) / (img.max() - img.min())
    return img / 255.
    
### DONT CHANGE IF NOT TESTING FOR DATASET LOADER ###
class GetDataset():
    def __init__(self, df_list, class_id, n_classes, f_input_preproc, image_size=(256,256,3), onehot=True, augmentation=None):
        
        self.df_list = df_list
        self.class_id = class_id
        self.n_classes = n_classes
        self.preproc = f_input_preproc
        self.image_size = image_size
        self.onehot = onehot
        self.aug = augmentation
        
        ## Init ##
        self.df_list = self.df_list.sample(frac=1.).reset_index(drop=True)
        self.current_index = 0
    
    def __len__(self):
        return len(self.df_list)
    
    def __getitem__(self, idx):
        
        img = self.load_image(img_path=self.df_list.iloc[self.current_index]['img_path'], image_size=self.image_size)
        
        if self.aug is not None:
            img = self.aug.augment_image(img)
            
        img = img.astype(np.float32)
        
        if self.preproc is not None:
            img = self.preproc(img)
        
        label = self.df_list.iloc[self.current_index]['cate']
        if self.onehot:
             label = tf.keras.utils.to_categorical(label, num_classes=self.n_classes)
        
        self.current_index = (self.current_index + 1) % len(self.df_list)
        return img, label
    
    def __next__(self):
        return self.__getitem__(idx=self.current_index)
    
    @staticmethod
    def load_image(img_path, image_size):
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (image_size[0], image_size[1]))
        return img
    
class Customized_dataloader():
    """
    1. Compose multiple generators together
    2. Make this composed generator into multi-processing function
    """
    def __init__(self, list_dataset, batch_size_per_dataset=16, queue_size=128, num_workers=0):
        """
        Args:
            - list_dataset: put generator object as list [gen1, gen2, ...]
            - batch_size_per_dataset: bz for each generator (total_batch_size/n_class)
            - queue_size: queue size
            - num_workers: start n workers to get data
        
        Action: Call with next
        """
        self.list_dataset = list_dataset
        self.batch_size_per_dataset = batch_size_per_dataset
        self.sample_queue = mp.Queue(maxsize = queue_size)
        
        self.jobs = num_workers
        self.events = list()
        self.workers = list()
        for i in range(num_workers):
            event = Event()
            work = mp.Process(target = enqueue, args = (self.sample_queue, event, self.compose_data))
            work.daemon = True
            work.start()
            self.events.append(event)
            self.workers.append(work)
        print("workers ready")
        
    def __next__(self):
        return self.sample_queue.get()
    
    def compose_data(self):
        while True:
            imgs, labels = [], []
            for z in range(self.batch_size_per_dataset):
                data = [next(i) for i in self.list_dataset]
                img, label = zip(*data)
                imgs.append(np.array(img))
                labels.append(np.array(label))
            yield np.concatenate(imgs), np.concatenate(labels)
    
    def stop_worker(self):
        for t in self.events:
            t.set()
        for i, t in enumerate(self.workers):
            t.join(timeout = 1)
        print("all_worker_stop")

# ----- #
def enqueue(queue, stop, gen_func):
    gen = gen_func()
    while True:
        if stop.is_set():
            return
        queue.put(next(gen))

In [4]:
from default import get_cfg_defaults
cfg = get_cfg_defaults()
cfg.merge_from_file(args.config_file)
cfg.merge_from_list(args.opts)
cfg.freeze()
sys.path.append(cfg.SYSTEM.BACKBONE_PATH)
from model import build_model, parse_model_fn, make_optimizer, preproc
from data_generator import GetDataset, Customized_dataloader
print(cfg)

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = str(cfg.SYSTEM.GPU_ID)

"""  Get data """
image_train_list = glob.glob(cfg.DATASET.TRAIN + '*.jpg')
image_test_list = glob.glob(cfg.DATASET.TEST + '*.jpg')

df_train = pd.DataFrame({'img_path': image_train_list})
df_test = pd.DataFrame({'img_path': image_test_list})

df_train['cate'] = df_train.img_path.apply(os.path.basename)
df_train['cate'] = [i.split(".")[0] for i in list(df_train.cate)]
df_train.cate = df_train.cate.replace({'dog': 0, 'cat': 1})

df_train_0, df_val_0 = train_test_split(df_train[df_train['cate'] == 0], test_size = 1-cfg.TRAIN.TRAIN_RATIO)
df_train_1, df_val_1 = train_test_split(df_train[df_train['cate'] == 1], test_size = 1-cfg.TRAIN.TRAIN_RATIO)
df_val = pd.concat((df_val_0, df_val_1)).reset_index(drop = True)

del df_val_0, df_val_1

USE_RESNET_PREPROC = cfg.TRAIN.USE_RESNET_PREPROC
dtrain = GetDataset(df_list=df_train,
                    class_id=0, n_classes=2,
                    f_input_preproc=preproc if not USE_RESNET_PREPROC else tf.keras.applications.resnet50.preprocess_input,
                    augmentation=Augmentation_Setup.augmentation, 
                    onehot= True, 
                    image_size=cfg.TRAIN.IMAGE_SIZE)

dvalid = GetDataset(df_list=df_val, 
                    class_id=0, n_classes=2,
                    f_input_preproc=preproc if not USE_RESNET_PREPROC else tf.keras.applications.resnet50.preprocess_input,
                    augmentation=None, 
                    onehot= True, 
                    image_size=cfg.TRAIN.IMAGE_SIZE)

valid_gen = Customized_dataloader([dvalid], batch_size_per_dataset=16, num_workers=1)
x_val, y_val = [], []
for _ in tqdm(range(100)):
    a,b = next(valid_gen)
    x_val.append(a)
    y_val.append(b)
x_val = np.concatenate(x_val)
y_val = np.concatenate(y_val)
valid_gen.stop_worker()

print(x_val.shape)
print(y_val.shape)
print(y_val.sum(axis=0))

DATASET:
  TEST: /data/seanyu/cat_dog/dataset/test1/
  TRAIN: /data/seanyu/cat_dog/dataset/train/
MODEL:
  BACKBONE: R-50-v1
  NORM_USE: bn
  OPTIMIZER: Adam
  USE_PRETRAIN: True
SYSTEM:
  BACKBONE_PATH: /home/seanyu/research/deep-learning-experiments/cats_and_dogs_playground/experimental_stuff
  GPU_ID: 1
  NAME_FLAG: R50v1_BN_BZ1_Pretrain
  NUM_WORKERS: 4
  QUEUE_SIZE: 50
  RESULT_DIR: ./results/
TRAIN:
  BATCH_SIZE: 1
  EPOCHS: 50
  IMAGE_SIZE: (256, 256, 3)
  LR: 0.0001
  NUM_CLASSES: 2
  NUM_UPDATES: 2000
  TRAIN_RATIO: 0.9
  USE_RESNET_PREPROC: True


  0%|          | 0/100 [00:00<?, ?it/s]

workers ready


100%|██████████| 100/100 [00:20<00:00,  4.79it/s]


all_worker_stop
(1600, 256, 256, 3)
(1600, 2)
[812. 788.]


In [5]:
train_gen = Customized_dataloader([dtrain], 
                                  batch_size_per_dataset=cfg.TRAIN.BATCH_SIZE // cfg.TRAIN.NUM_CLASSES, 
                                  num_workers=cfg.SYSTEM.NUM_WORKERS, 
                                  queue_size=cfg.SYSTEM.QUEUE_SIZE)

Process Process-2:
Traceback (most recent call last):
Process Process-3:
  File "/home/seanyu/.conda/envs/tf18_keras/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()


workers ready


Process Process-4:
Traceback (most recent call last):
  File "/home/seanyu/.conda/envs/tf18_keras/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/seanyu/research/deep-learning-experiments/cats_and_dogs_playground/data_generator.py", line 116, in enqueue
    queue.put(next(gen))
  File "/home/seanyu/.conda/envs/tf18_keras/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/seanyu/research/deep-learning-experiments/cats_and_dogs_playground/data_generator.py", line 101, in compose_data
    yield np.concatenate(imgs), np.concatenate(labels)
  File "/home/seanyu/.conda/envs/tf18_keras/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
ValueError: need at least one array to concatenate
  File "/home/seanyu/research/deep-learning-experiments/cats_and_dogs_playground/data_generator.py", line 1

In [4]:



model = build_model(model_fn=parse_model_fn(cfg.MODEL.BACKBONE), norm_use=cfg.MODEL.NORM_USE)
optim = make_optimizer(cfg)

model.compile(loss='categorical_crossentropy', 
              metrics=["accuracy"], 
              optimizer=optim)
model.summary()
cb_list = [tf.keras.callbacks.ReduceLROnPlateau(factor=0.5,
                                            patience=4,
                                            min_lr=1e-12),
      ]
model.fit_generator(train_gen,
                    epochs=cfg.TRAIN.EPOCHS,
                    steps_per_epoch=cfg.TRAIN.NUM_UPDATES, 
                    validation_data=(x_val, y_val),
                    callbacks=cb_list
                    )

train_loss = model.history.history['loss']
valid_loss = model.history.history['val_loss']
train_acc = model.history.history['acc']
valid_acc = model.history.history['val_acc']

plt.figure(figsize=(8,6))
plt.plot(range(len(train_loss)), train_loss, label='train_loss')
plt.plot(range(len(valid_loss)), valid_loss, label='valid_loss')
plt.legend()
plt.savefig(os.path.join("results", "exp_" + cfg.SYSTEM.NAME_FLAG + "_loss.png"))

plt.figure(figsize=(8,6))
plt.plot(range(len(train_acc)), train_acc, label='train_accuracy')
plt.plot(range(len(valid_acc)), valid_acc, label='valid_accuracy')
plt.legend()
plt.savefig(os.path.join("results", "exp_" + cfg.SYSTEM.NAME_FLAG + "_acc.png"))

result_df = pd.DataFrame({"train_loss":train_loss,
                          "valid_loss":valid_loss,
                          "train_acc":train_acc,
                          "valid_acc":valid_acc
                         })
result_df.to_csv(os.path.join("results", "exp_" + cfg.SYSTEM.NAME_FLAG + "_result.csv"), index=False)
print("All Done")

In [None]:
###############

In [1]:
from default import get_cfg_defaults
import yaml

def quoted_presenter(dumper, data):
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='"')
yaml.add_representer(str, quoted_presenter)

cfg = get_cfg_defaults()
cfg.TRAIN.EPOCHS = 100

In [2]:
tmp = dict(cfg)
tmp['SYSTEM'] = dict(tmp['SYSTEM'])
tmp

#isinstance(tmp['SYSTEM'], dict)

{'SYSTEM': {'NUM_WORKERS': 2,
  'QUEUE_SIZE': 50,
  'GPU_ID': 0,
  'RESULT_DIR': './results/',
  'NAME_FLAG': 'default',
  'BACKBONE_PATH': ''},
 'DATASET': CfgNode({'TRAIN': '/data/seanyu/cat_dog/dataset/train/', 'TEST': '/data/seanyu/cat_dog/dataset/test1/'}),
 'TRAIN': CfgNode({'TRAIN_RATIO': 0.9, 'IMAGE_SIZE': (256, 256, 3), 'NUM_CLASSES': 2, 'BATCH_SIZE': 32, 'NUM_UPDATES': 2000, 'EPOCHS': 50, 'LR': 0.0001, 'USE_RESNET_PREPROC': True}),
 'MODEL': CfgNode({'BACKBONE': 'R-50-v1', 'USE_PRETRAIN': True, 'NORM_USE': 'bn', 'OPTIMIZER': 'SGD', 'EPOCHS': 100}),
 'EXPERIMENT': CfgNode({'ABC': 1, 'DEF': CfgNode({'RGB': 'test'})})}

In [2]:
with open("test.yaml", "w") as f:
    f.write(cfg.dump())

In [None]:
def cfg_to_yaml(cfg, output_name):
    """Write yacs config to file
    
    Args:
      cfg (object): yacs configuration
      output_name (str): full path to save the yaml, ex: ~/path_to_result/train_conidtion.yaml
    Returns:
      no return
    """
    with open(output_name, "w") as f:
        f.write(cfg.dump())    

In [3]:
from default import get_cfg_defaults
cfg = get_cfg_defaults()
cfg.merge_from_file("test.yaml")
print(cfg)

DATASET:
  TEST: /data/seanyu/cat_dog/dataset/test1/
  TRAIN: /data/seanyu/cat_dog/dataset/train/
EXPERIMENT:
  ABC: 1
  DEF:
    RGB: test
MODEL:
  BACKBONE: R-50-v1
  NORM_USE: bn
  OPTIMIZER: SGD
  USE_PRETRAIN: True
SYSTEM:
  BACKBONE_PATH: 
  GPU_ID: 0
  NAME_FLAG: default
  NUM_WORKERS: 2
  QUEUE_SIZE: 50
  RESULT_DIR: ./results/
TRAIN:
  BATCH_SIZE: 32
  EPOCHS: 100
  IMAGE_SIZE: (256, 256, 3)
  LR: 0.0001
  NUM_CLASSES: 2
  NUM_UPDATES: 2000
  TRAIN_RATIO: 0.9
  USE_RESNET_PREPROC: True
