In [None]:
import pandas as pd
import numpy as np
import os
import shutil
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import cv2
from random import shuffle

from keras.preprocessing.image import ImageDataGenerator

In [None]:
# Dataset layout. Every path is derived from DATASET_DIR with os.path.join so
# the constants use the separator of the current platform and agree with the
# paths built elsewhere in the notebook.
DATASET_DIR = " "#path of the dataset
TRAIN_DIR = os.path.join(DATASET_DIR, "train")                            # raw training images
TRAIN_LABEL = os.path.join(DATASET_DIR, "labels.csv")                     # id -> breed table
DATASET_MULTICLASS_DIR = os.path.join(DATASET_DIR, "dataset_multiclass")  # one folder per breed
TRAIN_SET_DIR = os.path.join(DATASET_DIR, "train_set")
VAL_SET_DIR = os.path.join(DATASET_DIR, "val_set")
TEST_SET_DIR = os.path.join(DATASET_DIR, "test_set")

In [None]:
def apply_transformation(img_path, dirname, name):
    """Write one augmented copy of an image into ``dirname``.

    The image is rotated by a fixed 10 degrees around its centre, enlarged by
    two independent random factors in [1, 2) (one per axis), mirrored
    horizontally and saved as ``over_<name>``.

    Parameters
    ----------
    img_path : str
        Path of the image to read.
    dirname : str
        Directory that receives the augmented file.
    name : str
        File name (including extension) appended to the ``over_`` prefix.

    Raises
    ------
    IOError
        If the source image cannot be decoded or the result cannot be written.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError("Could not read image: " + img_path)
    rows, cols = img.shape[:2]

    #rotation
    ang = 10
    M = cv2.getRotationMatrix2D((cols/2, rows/2), ang, 1)
    dst = cv2.warpAffine(img, M, (cols, rows))

    #zoom
    # Plain Python floats: cv2.resize expects scalar scale factors, not
    # 1-element arrays.
    fx_ = float(np.random.rand()) + 1.0
    fy_ = float(np.random.rand()) + 1.0
    res = cv2.resize(dst, None, fx=fx_, fy=fy_, interpolation=cv2.INTER_CUBIC)

    #flip
    horizontal_img = cv2.flip(res, 1)
    out_path = os.path.join(dirname, "over_" + name)
    if not cv2.imwrite(out_path, horizontal_img):
        # imwrite returns False when the file could not be saved.
        raise IOError("Could not write image: " + out_path)
    
    

In [None]:
def oversample_dataset(DIR, largest_class):
    """Add augmented images to every class folder until it holds ``largest_class`` files.

    Each immediate sub-directory of ``DIR`` is treated as one class. Randomly
    chosen images of an under-represented class are passed to
    ``apply_transformation``, which writes the augmented copy back into the
    same folder. Classes that already contain ``largest_class`` files or more,
    and empty folders, are left untouched.

    When a class is missing more images than it contains, the images are
    reused in additional passes. From the second pass on, the pass number is
    prefixed to the file name handed to ``apply_transformation`` so that the
    copies of different passes do not overwrite each other.

    Parameters
    ----------
    DIR : str
        Directory whose sub-directories are the class folders.
    largest_class : int
        Target number of files per class folder.

    Raises
    ------
    OSError
        If ``DIR`` cannot be listed.
    """
    # Only first-level folders are classes; walking the tree recursively would
    # pair nested folder names with the wrong parent path.
    class_dirs = sorted(
        entry for entry in os.listdir(DIR)
        if os.path.isdir(os.path.join(DIR, entry))
    )

    for dirname in tqdm(class_dirs):
        class_dir = os.path.join(DIR, dirname)
        img_names = [
            entry for entry in os.listdir(class_dir)
            if os.path.isfile(os.path.join(class_dir, entry))
        ]
        if not img_names:
            continue

        # A non-positive value means the class is already large enough; it must
        # not reach the slice below, where a negative bound would select
        # almost every image.
        missing = largest_class - len(img_names)
        pass_idx = 0
        while missing > 0:
            shuffle(img_names)
            batch = img_names[:missing]
            for filename in batch:
                if pass_idx == 0:
                    out_name = filename
                else:
                    out_name = "%d_%s" % (pass_idx, filename)
                apply_transformation(os.path.join(class_dir, filename), class_dir, out_name)
            missing -= len(batch)
            pass_idx += 1
                

In [None]:
# Load the id -> breed table and count the images per breed.
df_train = pd.read_csv(TRAIN_LABEL)
freq = df_train['breed'].value_counts()
# value_counts() sorts by count in descending order, so the first entry is the
# most populated breed. The index holds breed names, hence positional access
# has to go through .iloc (freq[0] relies on a deprecated fallback).
largest_class = freq.iloc[0]

# Alphabetically sorted list of the distinct breeds.
breeds = np.sort(df_train.breed.unique())
print(largest_class)
print(freq)

In [None]:
# Create the multiclass dataset folder and one sub-folder per breed.
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(DATASET_MULTICLASS_DIR, exist_ok=True)

for name in tqdm(breeds):
    os.makedirs(os.path.join(DATASET_MULTICLASS_DIR, name), exist_ok=True)

In [None]:
# Copy every training image into the folder of its breed.
# The two columns are iterated directly (cheaper than iterrows) and the total
# is passed explicitly so tqdm can show a real progress bar.
for img_id, breed in tqdm(zip(df_train['id'], df_train['breed']), total=len(df_train)):
    src = os.path.join(TRAIN_DIR, img_id + ".jpg")
    dst = os.path.join(DATASET_MULTICLASS_DIR, breed)
    shutil.copy(src, dst)

### Oversampling the images

In [None]:
# Bring every breed folder up to the size of the largest class, using the same
# path constant as the cells that copy and split the images.
oversample_dataset(DATASET_MULTICLASS_DIR, largest_class)

### Creating Train/Validation/Test set

In [None]:
# Root folders of the three splits; exist_ok=True keeps the cell re-runnable.
for split_dir in (TRAIN_SET_DIR, VAL_SET_DIR, TEST_SET_DIR):
    os.makedirs(split_dir, exist_ok=True)

In [None]:
def _move_files(names, src_dir, dst_dir):
    """Move every file listed in ``names`` from ``src_dir`` into ``dst_dir``."""
    for img_name in names:
        shutil.move(os.path.join(src_dir, img_name), dst_dir)


# Running totals over all classes, reported at the end of the cell.
train_len = 0
test_len = 0
val_len = 0

print("Creating Training/Validation/Test Set")
for class_name in tqdm(os.listdir(DATASET_MULTICLASS_DIR)):
    class_dir = os.path.join(DATASET_MULTICLASS_DIR, class_name)
    if not os.path.isdir(class_dir):
        # Stray files next to the class folders are not classes.
        continue

    # NOTE(review): the files are split in os.listdir order, which is
    # arbitrary, and the "over_" copies written by the oversampling step live
    # in the same folder, so an augmented copy can land in a different split
    # than its source image - confirm this is acceptable.
    files = np.asarray(os.listdir(class_dir))

    train_dst = os.path.join(TRAIN_SET_DIR, class_name)
    val_dst = os.path.join(VAL_SET_DIR, class_name)
    test_dst = os.path.join(TEST_SET_DIR, class_name)
    for split_dst in (train_dst, val_dst, test_dst):
        os.makedirs(split_dst, exist_ok=True)

    # 80% / 20% between (train + validation) and test, then 20% of the
    # training portion is set aside for validation.
    n_files = files.shape[0]
    train_sz = int(0.8 * n_files)
    test_sz = n_files - train_sz
    val_sz = int(0.2 * train_sz)
    train_sz = train_sz - val_sz

    # The three sizes add up to n_files, so the last slice runs to the end.
    train_files = files[:train_sz]
    val_files = files[train_sz:train_sz + val_sz]
    test_files = files[train_sz + val_sz:]

    _move_files(train_files, class_dir, train_dst)
    _move_files(val_files, class_dir, val_dst)
    _move_files(test_files, class_dir, test_dst)

    train_len = train_len + train_sz
    val_len = val_len + val_sz
    test_len = test_len + test_sz

print ('Total classes:',breeds.shape[0])
print ('Total data:',df_train.shape[0])
print()
# 0.8 * 0.8 = 64% of the files end up in the training set, not 60%.
print ('Training 64% = ', train_len)
print ('Testing 20% = ', test_len)
print('Validation 20% from Train', val_len)