# Separate Validation Data from Folder
- class별 폴더로 구분되어 있는 이미지 데이터를 training data / validation data로 분리
- train_labels, validation_labels로 분리

In [1]:
def read_labeled_image_list_from_csv(image_list_file):
    """Read image file names and integer labels from a two-column CSV file.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line must have the form ``filename,label``.

    Args:
        image_list_file: path of the CSV file to read.

    Returns:
        A tuple ``(filenames, labels)``: a list of str and a list of int,
        both in file order.

    Raises:
        ValueError: if a data line does not contain exactly two
            comma-separated fields, or its label is not an integer.
    """
    filenames = []
    labels = []
    # `with` closes the handle even when a malformed line raises.
    with open(image_list_file, 'r') as f:
        next(f, None)  # skip the header line; harmless on an empty file
        for line in f:
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would eat a real digit when the final line has
            # no trailing newline.
            row = line.rstrip('\r\n')
            if not row:
                continue  # tolerate blank lines (e.g. at the end of the file)
            filename, label = row.split(',')
            filenames.append(filename)
            labels.append(int(label))

    return filenames, labels

- folder info 읽어오기

In [2]:
import os
# Every class is one sub-folder of ../Mushroom, and a class's position in this
# list is used as its integer label further down. os.listdir returns entries in
# arbitrary order, so sort the names to keep the labels stable between runs.
# Keep directories only, and skip "data": that is the output folder this
# notebook creates inside ../Mushroom, which a re-run would otherwise pick up
# as an extra (empty) class.
classes = sorted(
    name
    for name in os.listdir("../Mushroom")
    if name != "data" and os.path.isdir(os.path.join("../Mushroom", name))
)

In [3]:
# Show the class folder names; the position of a name in this list is the
# integer label assigned to that class when the file list is built.
print(classes)

['01_oyster mushroom', '02_Sarcodon aspratus', '03_matsutake mushroom', '04_shiitake mushroom', '05_Ramaria botrytis Ricken', '06_Macrolepiota procera Sing', '07_Boletopsis leucomelas', '08_Hygrophorus russula', '09_Neolentinus lepideus', '10_Phellinus linteus']


In [4]:
# Build two parallel lists over every .jpg found in the class folders:
# data_list[k] is the bare file name, label_list[k] the index of its class.
data_list = []
label_list = []
for idx, cls in enumerate(classes):
    for fname in os.listdir("../Mushroom/" + cls):
        # splitext must receive the single file name; handing it the whole
        # directory listing raises TypeError.
        ext = os.path.splitext(fname)[1]
        if ext == '.jpg':
            data_list.append(fname)
            label_list.append(idx)

- label_list에서 class별 index 추출

In [5]:
# Peek at the first class name (classes is already a list, so index it directly).
classes[0]

'01_oyster mushroom'

In [6]:
# Group sample positions by class: classesDataIdx[c] holds, in ascending
# order, every position k for which label_list[k] == c.
classesDataIdx = [[] for _ in classes]
for sample_pos, class_id in enumerate(label_list):
    # The label already is the bucket index, so there is no need to scan
    # every class id looking for a match.
    classesDataIdx[class_id].append(sample_pos)

- 데이터 중 20%를 validation데이터로 추출

In [7]:
import random
# Share of each class that is held out for validation.
VALIDATION_FRACTION = 0.2
# A dedicated, seeded generator makes the split repeatable for the same file
# listing. Without it every run picks different files while copies from
# earlier runs remain in the output folders, so an image could end up in both
# train and validation.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

# validationDataIdx[c]: sorted positions (into data_list) of the samples of
# class c chosen for validation; the count is rounded down.
validationDataIdx = [
    sorted(split_rng.sample(class_idx, int(len(class_idx) * VALIDATION_FRACTION)))
    for class_idx in classesDataIdx
]

- validation을 제외한 데이터를 train데이터로 추출

In [8]:
# trainDataIdx[c]: tuple of the positions of class c that were NOT picked for
# validation, in their original order.
trainDataIdx = []
for class_idx, held_out_idx in zip(classesDataIdx, validationDataIdx):
    # Build the lookup set once per class instead of once per tested element.
    held_out = set(held_out_idx)
    trainDataIdx.append(tuple(x for x in class_idx if x not in held_out))

- file을 train / validation 폴더로 복사하기

In [9]:
# Display the class folder names again; they are used below to build the
# source path of every copied image.
classes

['01_oyster mushroom',
 '02_Sarcodon aspratus',
 '03_matsutake mushroom',
 '04_shiitake mushroom',
 '05_Ramaria botrytis Ricken',
 '06_Macrolepiota procera Sing',
 '07_Boletopsis leucomelas',
 '08_Hygrophorus russula',
 '09_Neolentinus lepideus',
 '10_Phellinus linteus']

In [10]:
import os
import shutil
# Copy every image selected for validation into one flat output folder.
# NOTE(review): the variable is called directory_train although it holds the
# validation folder; the name is left untouched in case other cells rely on it.
directory_train = '../Mushroom/data/validation'
if not os.path.exists(directory_train):
    os.makedirs(directory_train)

for class_name, picked in zip(classes, validationDataIdx):
    source_dir = '../Mushroom/' + class_name + '/'
    for idx in picked:
        image_name = data_list[idx]
        shutil.copy(source_dir + image_name, directory_train + '/' + image_name)

In [11]:
import os
# Copy every image kept for training into one flat output folder.
# NOTE(review): the variable is called directory_validation although it holds
# the train folder; the name is left untouched in case other cells rely on it.
directory_validation = '../Mushroom/data/train'
if not os.path.exists(directory_validation):
    os.makedirs(directory_validation)

for class_name, kept in zip(classes, trainDataIdx):
    source_dir = '../Mushroom/' + class_name + '/'
    for idx in kept:
        image_name = data_list[idx]
        shutil.copy(source_dir + image_name, directory_validation + '/' + image_name)

- csv 파일 재작성

In [12]:
import csv
# Write one "file name,class index" row per image, first for the validation
# split and then for the train split.
# NOTE(review): no header row is written, whereas
# read_labeled_image_list_from_csv in this notebook skips the first line of
# the file it reads - confirm which reader consumes these CSVs.
for csv_path, idx_by_class in (
    ('../Mushroom/data/validation_labels.csv', validationDataIdx),
    ('../Mushroom/data/train_labels.csv', trainDataIdx),
):
    with open(csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        for class_id, sample_idx in enumerate(idx_by_class):
            writer.writerows([data_list[k], class_id] for k in sample_idx)