# Separate Validation Data
- training data / validation data가 한 파일(csv or txt)에 있을 경우 분리
- train_labels, validation_labels로 분리

In [1]:
def read_labeled_image_list_from_csv(image_list_file):
    """Read image names and integer labels from a two-column CSV file.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line must have the form ``<filename>,<label>``.

    Args:
        image_list_file: Path to the CSV file to read.

    Returns:
        A ``(filenames, labels)`` tuple of two parallel lists: the first
        column as strings and the second column converted to ``int``.

    Raises:
        ValueError: If a data line does not contain exactly two
            comma-separated fields, or its label is not an integer.
    """
    filenames = []
    labels = []
    # The context manager closes the file even when a line fails to parse.
    with open(image_list_file, 'r') as f:
        next(f, None)  # skip the header row (a no-op on an empty file)
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would corrupt a final line that has no trailing newline.
            line = line.rstrip('\r\n')
            if not line:
                continue  # tolerate blank lines, e.g. at the end of the file
            filename, label = line.split(',')
            filenames.append(filename)
            labels.append(int(label))

    return filenames, labels

- csv 파일 및 image 파일 path 읽어오기

In [2]:
# Path of the CSV file that holds the image names and labels to be split.
filename = '../Species/train_labels.csv'

# Read the image names and their integer labels as two parallel lists.
data_list, label_list = read_labeled_image_list_from_csv(filename)

In [3]:
# Distinct label values found in label_list; each one is treated as a class.
classes = set(label_list)

- label_list에서 class별 index 추출

In [4]:
# Group the positions of label_list by class: classesDataIdx[i] lists, in
# ascending order, every index whose label equals the i-th class, where the
# classes are numbered in the iteration order of the `classes` set.
class_order = list(classes)  # materialised once instead of once per comparison
class_position = {label: pos for pos, label in enumerate(class_order)}

classesDataIdx = [[] for _ in class_order]
for idx, val in enumerate(label_list):
    # Direct lookup replaces comparing every label against every class.
    classesDataIdx[class_position[val]].append(idx)

- 데이터 중 20%를 validation데이터로 추출

In [5]:
import random

# Draw the validation split class by class so that every class contributes
# 20% of its own samples (rounded down). Each per-class list of indices is
# stored in ascending order. No seed is set in this cell, so the selection
# is not fixed by anything shown here.
validationDataIdx = [
    sorted(random.sample(class_indices, int(len(class_indices) * 0.2)))
    for class_indices in classesDataIdx
]

- validation을 제외한 데이터를 train데이터로 추출

In [6]:
# Per class, the training indices are the ones that were not drawn for
# validation. trainDataIdx[i] is a tuple that keeps the order of
# classesDataIdx[i].
trainDataIdx = []
for class_indices, held_out in zip(classesDataIdx, validationDataIdx):
    # Build the lookup set once per class rather than once per element tested.
    held_out = set(held_out)
    trainDataIdx.append(tuple(x for x in class_indices if x not in held_out))

- image file을 validation / train 디렉터리로 복사하기

In [7]:
import os
import shutil

# NOTE(review): despite its name, directory_train holds the *validation*
# output directory. The name is kept so that anything else reading this
# variable keeps working.
directory_train = '../Species/data/validation'
# exist_ok=True creates the directory when missing without a separate
# existence check.
os.makedirs(directory_train, exist_ok=True)

# Copy every image selected for validation. The source files are left in
# place, and the destination is derived from the directory created above so
# the two cannot drift apart.
for class_indices in validationDataIdx:
    for idx in class_indices:
        image_file = data_list[idx] + '.jpg'
        shutil.copy('../Species/train/' + image_file,
                    directory_train + '/' + image_file)

In [8]:
import os
import shutil  # imported here so this cell does not depend on an earlier one

# NOTE(review): despite its name, directory_validation holds the *train*
# output directory. The name is kept so that anything else reading this
# variable keeps working.
directory_validation = '../Species/data/train'
# exist_ok=True creates the directory when missing without a separate
# existence check.
os.makedirs(directory_validation, exist_ok=True)

# Copy every image that stays in the training split. The source files are
# left in place, and the destination is derived from the directory created
# above so the two cannot drift apart.
for class_indices in trainDataIdx:
    for idx in class_indices:
        image_file = data_list[idx] + '.jpg'
        shutil.copy('../Species/train/' + image_file,
                    directory_validation + '/' + image_file)

- csv 파일 재작성

In [9]:
import csv

# Write one "<image name>,<label>" row per sample, grouped by class, for the
# validation split and then for the training split.
# NOTE(review): no header row is written. read_labeled_image_list_from_csv
# in this notebook skips the first line of its input, so reading these files
# back with it would drop the first sample - confirm what the consumer of
# these files expects.
class_labels = list(classes)  # materialised once instead of once per row

for labels_path, indices_by_class in (
    ('../Species/data/validation_labels.csv', validationDataIdx),
    ('../Species/data/train_labels.csv', trainDataIdx),
):
    with open(labels_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        # The i-th group of indices belongs to the i-th class label.
        for label, class_indices in zip(class_labels, indices_by_class):
            writer.writerows([data_list[idx], label] for idx in class_indices)