In [1]:
import sys
import os
import multiprocessing as mp
import shutil
import random

In [32]:
def make_data_directories(dir_name, labels):

    # dir_name - Top level directory name
    # labels - labels names. (e.g. cats, dogs, horses)
    try:
        os.mkdir(dir_name)  # making top level directory
    except OSError as e:
        print(e.filename, 'exists already')
        pass
    
    train_path = os.path.join(dir_name, 'training')
    testing_path = os.path.join(dir_name, 'testing')
    
    try:
        os.mkdir(train_path)  # making training subdirectory
    except FileExistsError as e:
        print(e.filename, 'exists already')
        pass
    
    try:
        os.mkdir(testing_path)  # making testing subdirectory
    except FileExistsError as e:
        print(e.filename, 'exists already')
        pass
        
    for i in range(len(labels)):  # making training labels and testing labels subdirectory
        try:
            os.mkdir(os.path.join(train_path, str(labels[i])))
        except OSError as e:
            print(e.filename, 'exists already')
            pass
        
        try:
            os.mkdir(os.path.join(testing_path, str(labels[i])))
        except OSError as e:
            print(e.filename, 'exists already')
            pass


def split_data(SOURCE, TRAINING, TESTING, SPLIT_SIZE):

    # SOURCE - Path location where the images are located
    # TRAINING - Path location where the training data will be moved to
    # TESTING - Path location where the testing data will be moved to
    # SPLIT_SIZE = The amount of images going to the training dataset by %.
    # (e.g. SPLIT_SIZE = .9: total images = 100, 90 images goes to training directory, 10 images goes to test directory)

    testing_file_names = []
    file_names = os.listdir(SOURCE)  #
    t = round(len(file_names) * (1-SPLIT_SIZE))
    random.shuffle(file_names)
    for i in range(int(t)):
        testing_file_names.append(file_names[i])  # adding the file names for the testing data set
        file_names.remove(file_names[i])  # removing the testing data file names
    
    for jpg in range(len(testing_file_names)):  # using the testing file names to move from source to testing directory
        shutil.copy2(os.path.join(SOURCE, testing_file_names[jpg]), TESTING)

    for jpg in range(len(file_names)):  # # using the file names to move from source to training directory
        shutil.copy2(os.path.join(SOURCE, file_names[jpg]), TRAINING)

        
def make_dataset(new_dir, labels, src_directories):
    make_data_directories(new_dir, labels)
    for i in range(len(labels)):
        split_data(SOURCE=src_directories[i],
                  TRAINING=os.path.join(new_dir, 'training', labels[i]),
                  TESTING=os.path.join(new_dir, 'testing', labels[i]),
                  SPLIT_SIZE=0.9)

In [64]:
def multi_process_dataset(new_dir, labels, src_directories):
    make_data_directories(new_dir, labels)
    processes = []
    for i in range(len(labels)):
        training_path = os.path.join(new_dir, 'training', labels[i])
        testing_path = os.path.join(new_dir, 'testing', labels[i])
        p = mp.Process(target=split_data, args=(src_directories[i], training_path, testing_path, 0.9))        
        processes.append(p)
        p.start()

    for process in processes:
        process.join()

In [47]:
new_dir = 'new_dir'
labels = ['rock','paper', 'scissor']
for i in range(len(labels)):
    print(os.path.join(new_dir, 'training', labels[i]))
    print(os.path.join(new_dir, 'testing', labels[i]))

new_dir\training\rock
new_dir\testing\rock
new_dir\training\paper
new_dir\testing\paper
new_dir\training\scissor
new_dir\testing\scissor


In [43]:
new_dir = 'new_dir' # the new directory name
paper_dir = 'rps/paper/' # The next three are the source directories
rock_dir = 'rps/rock/'
scissor_dir = 'rps/scissors/'

In [44]:
src = [paper_dir,rock_dir, scissor_dir]

In [45]:
labels = ['rock','paper', 'scissor']

In [48]:
import time
start_time = time.time()
make_dataset(new_dir, labels, src)
print("--- %s seconds ---" % (time.time() - start_time))

--- 2.9929604530334473 seconds ---


In [51]:
new_dir = 'blck_ball_dir' # the new directory name
blueball = 'Ball_Block_Proj/BlueBall/' 
blueblock = 'Ball_Block_Proj/BlueBlock/'
greenball = 'Ball_Block_Proj/GreenBall/'
greenblock = 'Ball_Block_Proj/GreenBlock/'
redball = 'Ball_Block_Proj/RedBall/' 
redblock = 'Ball_Block_Proj/RedBlock/'
yellowball = 'Ball_Block_Proj/YellowBall/'
yellowblock = 'Ball_Block_Proj/YellowBall/'
src = [blueball,blueblock,greenball,greenblock, redball, redblock, yellowball, yellowblock]
labels = ['blueball','blueblock', 'greenball', 'greenblock', 'redball','redblock', 'yellowball', 'yellowblock']

In [52]:
import time
start_time = time.time()
make_dataset(new_dir, labels, src)
print("--- %s seconds ---" % (time.time() - start_time))

--- 23.561182737350464 seconds ---


In [65]:
new_dir = 'blck_ball_dir2' # the new directory name
blueball = 'Ball_Block_Proj/BlueBall/' 
blueblock = 'Ball_Block_Proj/BlueBlock/'
greenball = 'Ball_Block_Proj/GreenBall/'
greenblock = 'Ball_Block_Proj/GreenBlock/'
redball = 'Ball_Block_Proj/RedBall/' 
redblock = 'Ball_Block_Proj/RedBlock/'
yellowball = 'Ball_Block_Proj/YellowBall/'
yellowblock = 'Ball_Block_Proj/YellowBall/'
src = [blueball,blueblock,greenball,greenblock, redball, redblock, yellowball, yellowblock]
labels = ['blueball','blueblock', 'greenball', 'greenblock', 'redball','redblock', 'yellowball', 'yellowblock']

In [67]:
src

['Ball_Block_Proj/BlueBall/',
 'Ball_Block_Proj/BlueBlock/',
 'Ball_Block_Proj/GreenBall/',
 'Ball_Block_Proj/GreenBlock/',
 'Ball_Block_Proj/RedBall/',
 'Ball_Block_Proj/RedBlock/',
 'Ball_Block_Proj/YellowBall/',
 'Ball_Block_Proj/YellowBall/']

In [68]:
import time
if __name__ == '__main__':
    start_time = time.time()
    multi_process_dataset(new_dir, labels, src)
    print("--- %s seconds ---" % (time.time() - start_time))

--- 0.14690876007080078 seconds ---
