<a href="https://colab.research.google.com/github/ykitaguchi77/PreProcess_Image_colab/blob/master/Divide_dataset_into_train_val_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**データセットをランダムに振り分け**
train/val/test

In [None]:
"""
dataset-------1.jpg, 2.jpg. .....
↓

dataset---train--
        |
        |--val--
        |
        |--test--  


"""

In [3]:
from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.utils.data as data
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
import math
import shutil

#Advanced Pytorchから
import glob
import os.path as osp
import random
import json
from PIL import Image
from tqdm import tqdm
%matplotlib inline

#サポートパッチのインポート
from google.colab.patches import cv2_imshow
import cv2

plt.ion()   # interactive mode
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Set random seem for reproducibility
manualSeed = 20200815
#manualSeed = random.randint(1, 10000) # use if you want new results
print("Random Seed: ", manualSeed)
random.seed(manualSeed)
torch.manual_seed(manualSeed)
torch.cuda.manual_seed(manualSeed)

torch.torch.backends.cudnn.benchmark = True
torch.torch.backends.cudnn.enabled = True


'''
grav: 甲状腺眼症
cont: コントロール
黒の空白を挿入することにより225px*225pxの画像を生成、EfficientNetを用いて転移学習
－－－－－－－－－－－－－－
データの構造
gravcont.zip ------grav
               |---cont
'''                                     

#google driveをcolabolatoryにマウント
from google.colab import drive
drive.mount('/content/drive')

Random Seed:  20200815
Mounted at /content/drive


#**シンプルに分割する場合**

In [None]:
"""
----dataset

↓

----dataset----train
            |--val
            |--

"""

#**set parameters**

In [18]:
orig_path = '/content/drive/MyDrive/Deep_learning/gravcont_all_500px'
dst_path = '/content/drive/MyDrive/Deep_learning/gravcont_all_500px_divided'

#trainとvalの割合を指定。残りがtestになる
rate_train = 0.74
rate_val = 0.185

#**process**

In [19]:
#振り分け枚数の確認
total = os.listdir(orig_path)
rate_train = rate_train
rate_val = rate_val
rate_test = 1-rate_train-rate_val

test = random.sample(total, int(len(total)*rate_test))
trainval = set(total)-set(test)
train = random.sample(trainval, int(len(trainval)*(rate_train/(rate_train+rate_val))))
val = set(trainval)-set(train)

print('total: '+str(len(total)))
print('train: '+str(len(train)))
print('val: '+str(len(val)))
print('test: '+str(len(test)))

total: 2
train: 1
val: 1
test: 0


In [None]:
start = time.time()

if os.path.exists(dst_path):
    shutil.rmtree(dst_path)
os.makedirs(dst_path, exist_ok=True)


os.makedirs(dst_path +'/train', exist_ok=True)
os.makedirs(dst_path +'/val', exist_ok=True)
os.makedirs(dst_path +'/test', exist_ok=True)

l=0
for j in train:
    shutil.copy(orig_path+'/'+j, dst_path +'/train')
    print('train: '+str(l)+' images copied')
    l+=1

l=0
for j in val:
    shutil.copy(orig_path+'/'+j, dst_path +'/val')
    print('val: '+str(l)+' images copied')
    l+=1

l=0
for j in test:
    shutil.copy(orig_path+'/'+j, dst_path +'/test')
    print('test: '+str(l)+' images copied')
    l+=1


print('Process done!!')
elapsed_time = time.time() - start
print ("elapsed_time:{0}".format(elapsed_time) + "[sec]")

#**親フォルダがある場合**

In [None]:
"""
dataset-----grav
         |
         |--cont

↓

dataset---train---grav
        |       |-cont
        |       
        |-val-----grav
        |       |-cont
        |
        |-test----grav
                |-cont

"""

In [29]:

orig_path = '/content/drive/MyDrive/Deep_learning/gravcont_all_500px'
dst_path = '/content/drive/MyDrive/Deep_learning/gravcont_all_500px_divided'

#trainとvalの割合を指定。残りがtestになる
rate_train = 0.74
rate_val = 0.185

In [30]:
#親フォルダ直下のディレクトリを探索
files = os.listdir(orig_path)
files_dir = [f for f in files if os.path.isdir(os.path.join(orig_path, f))]
print(files_dir)    # ['grav', 'cont']

['grav', 'cont']


In [31]:
for i in files_dir:
    in_path = orig_path + "/" + i
    print(in_path)

    #振り分け枚数の確認
    total = os.listdir(in_path)
    rate_train = rate_train
    rate_val = rate_val
    rate_test = 1-rate_train-rate_val

    test = random.sample(total, int(len(total)*rate_test))
    trainval = set(total)-set(test)
    train = random.sample(trainval, int(len(trainval)*(rate_train/(rate_train+rate_val))))
    val = set(trainval)-set(train)

    print(i + ' total: '+str(len(total)))
    print(i + ' train: '+str(len(train)))
    print(i + ' val: '+str(len(val)))
    print(i + ' test: '+str(len(test)))

/content/drive/MyDrive/Deep_learning/gravcont_all_500px/grav
grav total: 894
grav train: 661
grav val: 166
grav test: 67
/content/drive/MyDrive/Deep_learning/gravcont_all_500px/cont
cont total: 1618
cont train: 1197
cont val: 300
cont test: 121


In [34]:
start = time.time()

if os.path.exists(dst_path):
    shutil.rmtree(dst_path)
os.makedirs(dst_path, exist_ok=True)


for i in files_dir:

    os.makedirs(dst_path +'/train/' + i, exist_ok=True)
    os.makedirs(dst_path +'/val/' + i, exist_ok=True)
    os.makedirs(dst_path +'/test/' + i, exist_ok=True)

    #移動させる画像のパスを抜き出し
    in_path = orig_path + "/" + i
    print(in_path)

    total = os.listdir(in_path)
    rate_train = rate_train
    rate_val = rate_val
    rate_test = 1-rate_train-rate_val

    test = random.sample(total, int(len(total)*rate_test))
    trainval = set(total)-set(test)
    train = random.sample(trainval, int(len(trainval)*(rate_train/(rate_train+rate_val))))
    val = set(trainval)-set(train)

    #移動
    l=0
    for j in train:
        shutil.copy(orig_path+'/'+i+'/'+j, dst_path +'/train/' + i)
        print('train '+i+': '+str(l)+' images copied')
        l+=1

    l=0
    for j in val:
        shutil.copy(orig_path+'/'+i+'/'+j, dst_path +'/val/' + i)
        print('val: '+i+': '+str(l)+' images copied')
        l+=1

    l=0
    for j in test:
        shutil.copy(orig_path+'/'+i+'/'+j, dst_path +'/test/' + i)
        print('test: '+i+': '+str(l)+' images copied')
        l+=1


print('Process done!!')
elapsed_time = time.time() - start
print ("elapsed_time:{0}".format(elapsed_time) + "[sec]")

/content/drive/MyDrive/Deep_learning/gravcont_all_500px/grav
train grav: 0 images copied
train grav: 1 images copied
train grav: 2 images copied
train grav: 3 images copied
train grav: 4 images copied
train grav: 5 images copied
train grav: 6 images copied
train grav: 7 images copied
train grav: 8 images copied
train grav: 9 images copied
train grav: 10 images copied
train grav: 11 images copied
train grav: 12 images copied
train grav: 13 images copied
train grav: 14 images copied
train grav: 15 images copied
train grav: 16 images copied
train grav: 17 images copied
train grav: 18 images copied
train grav: 19 images copied
train grav: 20 images copied
train grav: 21 images copied
train grav: 22 images copied
train grav: 23 images copied
train grav: 24 images copied
train grav: 25 images copied
train grav: 26 images copied
train grav: 27 images copied
train grav: 28 images copied
train grav: 29 images copied
train grav: 30 images copied
train grav: 31 images copied
train grav: 32 images

#**フォルダとその中身を削除するスクリプト**

In [None]:
#フォルダを削除するスクリプト
directory = '/content/drive/MyDrive/Deep_learning/Olympia_dataset/dataset_500px_divided'
files = os.listdir(directory)
print(directory)
print(files)

for i in files:
    try:
        os.remove(directory+'/'+i)
    except IsADirectoryError:
        pass

shutil.rmtree(directory)