In [None]:
# generate a list of all PNG files found under the "labeled" folder,
# flagging those directly inside a folder named "true" with the label 1
# and those in any other folder with the label 0

In [None]:
# import required dependencies
from os import getcwd, listdir, makedirs, mkdir
from os.path import basename, isdir, isfile, join
from shutil import copy2

import numpy as np
import pandas as pd

In [None]:
# Fractions of the labeled data assigned to the TRAINING and VALIDATION sets.
# If train_frac + valid_frac < 1, the remainder is allocated to the TEST set.
train_frac = 0.6
valid_frac = 0.3

# Each fraction must itself be a valid share of the data: a negative value would
# turn into a negative row count when it is multiplied by the number of files.
assert 0 <= train_frac <= 1, "Train fraction must be between 0 and 1..."
assert 0 <= valid_frac <= 1, "Validation fraction must be between 0 and 1..."

# ensure that fractions do not exceed 100%
assert (train_frac + valid_frac <= 1), "Train and validation fractions must sum to at most 1..."

In [None]:
# recursive search function
# returns a 2D list, each element is [filename,pathname]
# designed to dive recursively into directory structure from a starting point given by dirPath of the initial call
def getFileListing(dirPath):
    """Recursively list every file found below ``dirPath``.

    Parameters
    ----------
    dirPath : str
        Directory to start the search from.

    Returns
    -------
    list of [str, str]
        One ``[filename, directory]`` pair per file, where ``directory`` is the
        folder that directly contains the file. Entries that are neither a file
        nor a directory are skipped.

    Notes
    -----
    Errors raised by ``listdir`` (for example when ``dirPath`` does not exist)
    are not caught here and propagate to the caller.
    """
    thisFileList = []
    for f in listdir(dirPath):
        # join() uses the separator of the running OS instead of a hard-coded "\\",
        # which only resolves to a real path on Windows
        fullFileName = join(dirPath, f)
        if isfile(fullFileName):
            thisFileList.append([f, dirPath])
        elif isdir(fullFileName):
            # descend into the sub-folder and keep its files in the same flat list
            thisFileList.extend(getFileListing(fullFileName))
    return thisFileList

In [None]:
# get list of files starting from the "labeled" folder inside the current directory
# (join() picks the path separator of the running OS instead of a hard-coded "\\")
allFiles = getFileListing(join(getcwd(), "labeled"))

In [None]:
# split the [filename, path] pairs into two parallel 1-D lists
allFileNames = [fileName for fileName, _ in allFiles]
allFilePaths = [filePath for _, filePath in allFiles]

In [None]:
# collect one record per PNG image, then build the dataframe in a single step
# (DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration)
records = []
for thisFileName, thisFilePath in zip(allFileNames, allFilePaths):
    if thisFileName.endswith('.png'):
        # label 1 only when the folder that directly contains the file is named "true"
        label = 1 if basename(thisFilePath) == "true" else 0
        records.append({'Filename': thisFileName, 'Filepath': thisFilePath, 'Label': label})

# passing the column names keeps the expected columns even when no PNG file was found
df = pd.DataFrame(records, columns=['Filename', 'Filepath', 'Label'])

# sort files by name and write along with label to CSV file
df['Label'] = df['Label'].astype('int')
df = df.sort_values(by='Filename')
df.to_csv('all_labels.csv', columns=['Filename', 'Label'], index=False)

In [None]:
# partition data into training, validation, and maybe test sets
# NOTE: the shuffle draws from NumPy's global random state, so the split changes
# from run to run unless that state is seeded before this cell is executed
dfrand = df.loc[np.random.permutation(df.index)]
train_n = round(train_frac * len(df))
train = dfrand.iloc[:train_n]

if train_frac + valid_frac < 1:
    # when USING a TEST set: validation gets its own share, test gets the remainder
    valid_n = round(valid_frac * len(df))
    valid = dfrand.iloc[train_n:train_n + valid_n]
    test = dfrand.iloc[train_n + valid_n:]
else:
    # when NOT using a TEST set: validation gets every row after the training rows
    valid = dfrand.iloc[train_n:]
    # drop a `test` frame left over from an earlier run of this cell, otherwise the
    # TEST copy step (which checks whether `test` exists) would copy stale files
    if 'test' in globals():
        del test

In [None]:
# report the class balance (share of label 1 vs. label 0) of every subset in use
subsets = [("Train", train), ("Valid", valid)]
if train_frac + valid_frac < 1:
    subsets.append(("Test", test))

for subsetName, subset in subsets:
    total = len(subset)
    if total == 0:
        # an empty subset has no percentages to report; avoid dividing by zero
        print("%s: no files" % subsetName)
        continue
    n_true = len(subset.loc[subset['Label'] == 1])
    n_false = len(subset.loc[subset['Label'] == 0])
    print("%s True: %0.2f%%, %s False %0.2f%%" % (subsetName, 100*n_true/total, subsetName, 100*n_false/total))

In [None]:
# copy TRAINING files into train/true and train/false below the current directory
# NOTE: the destination keeps only the file name, so two source files that share
# a name end up at the same destination path
thisdir = join(getcwd(), "train")
for labelDir in ("true", "false"):
    # also creates thisdir itself when missing; existing folders are left untouched
    makedirs(join(thisdir, labelDir), exist_ok=True)

for index, row in train.iterrows():
    # label 1 goes to the "true" folder, every other label to "false"
    labelDir = "true" if row['Label'] == 1 else "false"
    copy2(join(row['Filepath'], row['Filename']), join(thisdir, labelDir, row['Filename']))
    

In [None]:
# copy VALIDATION files into valid/true and valid/false below the current directory
# NOTE: the destination keeps only the file name, so two source files that share
# a name end up at the same destination path
thisdir = join(getcwd(), "valid")
for labelDir in ("true", "false"):
    # also creates thisdir itself when missing; existing folders are left untouched
    makedirs(join(thisdir, labelDir), exist_ok=True)

for index, row in valid.iterrows():
    # label 1 goes to the "true" folder, every other label to "false"
    labelDir = "true" if row['Label'] == 1 else "false"
    copy2(join(row['Filepath'], row['Filename']), join(thisdir, labelDir, row['Filename']))
    

In [None]:
# copy TEST files into test/true and test/false below the current directory
# Only done when part of the data was set aside for a TEST set. The check uses the
# configured fractions (the same condition that creates `test`) rather than looking
# for a `test` variable, which could be a leftover from an earlier run.
if train_frac + valid_frac < 1:

    # NOTE: the destination keeps only the file name, so two source files that share
    # a name end up at the same destination path
    thisdir = join(getcwd(), "test")
    for labelDir in ("true", "false"):
        # also creates thisdir itself when missing; existing folders are left untouched
        makedirs(join(thisdir, labelDir), exist_ok=True)

    for index, row in test.iterrows():
        # label 1 goes to the "true" folder, every other label to "false"
        labelDir = "true" if row['Label'] == 1 else "false"
        copy2(join(row['Filepath'], row['Filename']), join(thisdir, labelDir, row['Filename']))