Modified from the following project:
https://github.com/medp1030/Image-Recognition-with-Deep-Learning

This project uses the ImageDataGenerator and flow_from_directory() functionality of Keras, so we need a directory structure in which the images of each class sit in their own sub-directory inside the training and validation directories.

The purpose of this notebook is to split the raw images in the raw_data folder into train, val, test and test_orig subfolders under the data folder.

### The whole project should look like the following:

```
xx-oo.ipynb
raw_data/
└── <dataset>/
    ├── subclass01/
    │   ├── xxxxx.jpg
    │   ├── xxxxx.jpg
    │   ├── ..
    │   └── xxxxx.jpg
    ├── subclass02/
    │   ├── xxxxx.jpg
    │   ├── xxxxx.jpg
    │   ├── ..
    │   └── xxxxx.jpg
    └── subclass03/
        ├── xxxxx.jpg
        ├── xxxxx.jpg
        ├── ..
        └── xxxxx.jpg
data/
├── train/
│   ├── subclass01/
│   │   ├── xxxxx.jpg
│   │   ├── ..
│   │   └── xxxxx.jpg
│   └── ..
├── val/
│   ├── subclass01/
│   │   ├── xxxxx.jpg
│   │   ├── ..
│   │   └── xxxxx.jpg
│   └── ..
├── test/
│   ├── subclass01/
│   │   ├── xxxxx.jpg
│   │   ├── ..
│   │   └── xxxxx.jpg
│   └── ..
└── test_orig/
    ├── subclass01_xxxxx.jpg
    ├── subclass02_xxxxx.jpg
    ├── ..
    └── subclass03_xxxxx.jpg
    
```

In [1]:
# Report the platform and the installed TensorFlow / Keras versions,
# so the environment that produced the outputs is recorded with them.

import platform
import tensorflow as tf
from tensorflow import keras

environment_report = [
    "Platform: {}".format(platform.platform()),
    "Tensorflow version: {}".format(tf.__version__),
    "Keras version: {}".format(keras.__version__),
]
print("\n".join(environment_report))

Platform: Darwin-19.6.0-x86_64-i386-64bit
Tensorflow version: 2.3.0-rc2
Keras version: 2.4.0


In [2]:
import glob, os, shutil
from shutil import copyfile 
from sklearn.model_selection import train_test_split
from tqdm import tqdm

https://blog.csdn.net/haoni123321/article/details/13624537
https://blog.csdn.net/silviakafka/article/details/46744961

In [3]:
# Raw data: collect every entry directly under the raw-data folder
# (one sub-directory per class).
raw_data_pattern = 'raw_data/strabismus_crop_raw_data/*'
files = glob.glob(raw_data_pattern)

print(files)

['raw_data/strabismus_crop_raw_data/strabismus', 'raw_data/strabismus_crop_raw_data/normal']


In [4]:
# Replace each class directory with the list of .jpg paths inside it,
# then display the number of images found per class.
files = [glob.glob(class_dir + '/*.jpg') for class_dir in files]
list(map(len, files))

[4805, 4802]

In [5]:
# Class name = the second-to-last path component of each class's first image.
[class_images[0].rsplit('/')[-2] for class_images in files]

['strabismus', 'normal']

In [6]:
# Create the 'data', 'logs' and 'csv' folders (paths are relative to the
# current working directory).
destination = "data"
train_logs = "logs"
# The csv folder gets its own name so that `train_logs` keeps pointing at 'logs'.
csv_folder = "csv"

# exist_ok=True makes this cell safe to re-run and avoids the gap between
# checking for a folder and creating it.
for folder in (destination, train_logs, csv_folder):
    os.makedirs(folder, exist_ok=True)

In [7]:
# Trial run of the 80/20 split. Each pass overwrites the previous one, so after
# the loop `x`, `train` and `validate` describe the last class only.
for x in files:
    train, validate = train_test_split(x, test_size=0.2, random_state=42)

len(x)

4802

## train example
raw_data/leaves_raw_data/Leaf_01/image_0009.jpg
* Splitting this path on "/" puts the class subfolder at index [2].
* The image file name is at index [3].

In [8]:
# Split every class 80/20 and copy the images into
#   <destination>/train/<class>/     (80 %)
#   <destination>/temp_val/<class>/  (20 %, split again into val/test later)
for class_images in files:
    train, validate = train_test_split(class_images, test_size=0.2, random_state=42)

    for split_name, split_images in (("train", train), ("temp_val", validate)):
        for image_path in split_images:
            # Take the class from the folder that directly contains the image
            # instead of a fixed position in the split path, so the copy does
            # not depend on how deep the raw-data folder is nested or on the
            # path separator used by the platform.
            class_name = os.path.basename(os.path.dirname(image_path))
            class_dir = os.path.join(destination, split_name, class_name)
            os.makedirs(class_dir, exist_ok=True)
            copyfile(image_path, os.path.join(class_dir, os.path.basename(image_path)))
    

In [9]:
# Count the images that were copied into each class folder of data/train.
train_folders = glob.glob("data/train/*")
train_images = []
for class_dir in train_folders:
    train_images.append(glob.glob(class_dir + "/*.jpg"))

list(map(len, train_images))

[3844, 3841]

In [10]:
# Count the images that were copied into each class folder of data/temp_val.
temp_folders = glob.glob('data/temp_val/*')
temp_images = []
for class_dir in temp_folders:
    temp_images.append(glob.glob(class_dir + '/*.jpg'))

list(map(len, temp_images))

[961, 961]

data/temp_train/Leaf_01/image_0016.jpg

In [11]:
# Split every class of temp_val 50/50 and copy the images into
#   <destination>/val/<class>/
#   <destination>/test/<class>/
for class_images in temp_images:
    val, test = train_test_split(class_images, test_size=0.5, random_state=42)
    print(val[0])

    for split_name, split_images in (("val", val), ("test", test)):
        for image_path in split_images:
            # Take the class from the folder that directly contains the image
            # instead of a fixed position in the split path, so the copy does
            # not depend on the nesting depth or the platform's path separator.
            class_name = os.path.basename(os.path.dirname(image_path))
            class_dir = os.path.join(destination, split_name, class_name)
            os.makedirs(class_dir, exist_ok=True)
            copyfile(image_path, os.path.join(class_dir, os.path.basename(image_path)))

data/temp_val/strabismus/DSC03940_0.jpg
data/temp_val/normal/V_0226_10_0.jpg


In [12]:
# Total number of .jpg images under data/train, over all class folders.
files = [glob.glob(class_dir + "/*.jpg") for class_dir in glob.glob("data/train/*")]
sum(len(class_images) for class_images in files)

7685

In [13]:
# Total number of .jpg images under data/test, over all class folders.
files = [glob.glob(class_dir + "/*.jpg") for class_dir in glob.glob("data/test/*")]
sum(len(class_images) for class_images in files)

962

In [14]:
# Total number of .jpg images under data/val, over all class folders.
files = [glob.glob(class_dir + "/*.jpg") for class_dir in glob.glob("data/val/*")]
sum(len(class_images) for class_images in files)

960

In [15]:
# Class folders of the test split; the next cell expands them into image lists.
test_class_pattern = 'data/test/*'
test_orig_files = glob.glob(test_class_pattern)

In [16]:
# Expand each test class folder into the list of .jpg files it contains.
images_by_class = []
for class_dir in test_orig_files:
    images_by_class.append(glob.glob(class_dir + '/*.jpg'))
test_orig_files = images_by_class

In [17]:
# Folder that receives a flat copy of the test images.
# exist_ok=True makes this cell safe to re-run and avoids the gap between
# checking for the folder and creating it.
destination = "data/test_orig"
os.makedirs(destination, exist_ok=True)

In [18]:
# Flatten the test split: copy <class>/<name>.jpg to <destination>/<class>_<name>.jpg.
# Loop over test_orig_files itself rather than over the length of `files`,
# so every test class is copied exactly once whatever `files` currently holds.
for class_images in test_orig_files:
    for image_path in class_images:
        # Class prefix and file name come from the path's last two components,
        # independent of nesting depth and of the platform's path separator.
        class_name = os.path.basename(os.path.dirname(image_path))
        file_name = os.path.basename(image_path)
        copyfile(image_path, os.path.join(destination, class_name + "_" + file_name))

In [19]:
# Delete the intermediate data/temp_val folder. Errors are not ignored, so
# this raises if the folder cannot be removed.
shutil.rmtree("data/temp_val")

In [None]:
# shutil.rmtree("data", ignore_errors=False, onerror=None)

In [None]:
# shutil.rmtree("data_train", ignore_errors=False, onerror=None)