# Example 9 - Separate train, val and test data

In this notebook, we'll split the data into train, validation (val), and test sets using a 60% / 20% / 20% ratio. Open question: should we use only labeled echograms?

In [1]:
# What is labeled? Some files are negative, and a lot are unlabeled.
# Option 1: only use files with annotations. Option 2 (better): use a certain date range.
# Selected range: 09/25 - 11/14, 2019

In [27]:
import os
import glob
import random
import pickle

## Step 1. Get filenames

In this step, let's get all filenames and select the range of files to use. (The selected files are randomized in Step 2.)

In [4]:
# Directories holding the raw files and their matching bottom (.bot) files.
raw_dir = "../data/HB1906_EK60/rawfiles/"
bot_dir = "../data/HB1906_EK60/botfiles/"

# Collect every matching path, then order each list alphabetically.
raw_paths = glob.glob(raw_dir + '*.raw')
raw_paths.sort()
bot_paths = glob.glob(bot_dir + '*.bot')
bot_paths.sort()

In [7]:
# Number of .raw files found in raw_dir.
len(raw_paths)

1710

In [6]:
# Set the start of the range: look up where the first annotated raw file
# sits in the sorted list of raw paths.
first_annotated_raw = '../data/HB1906_EK60/rawfiles/D20190925-T220811.raw'
raw_paths.index(first_annotated_raw)

518

In [9]:
# Start the selection at the first annotated raw file. The index is computed
# from the file name rather than hard-coded, so the slice stays correct if
# files are added to or removed from the data directory.
# list.index raises ValueError if the anchor file is missing.
start_idx = raw_paths.index('../data/HB1906_EK60/rawfiles/D20190925-T220811.raw')

# The same index is applied to bot_paths; this relies on the raw and bot
# lists being aligned position by position.
sel_raw_paths = raw_paths[start_idx:]
sel_bot_paths = bot_paths[start_idx:]

In [10]:
# Sanity check: the first selected raw and bot paths should share a file stem.
print(sel_raw_paths[0], sel_bot_paths[0], sep="\n")

../data/HB1906_EK60/rawfiles/D20190925-T220811.raw
../data/HB1906_EK60/botfiles/D20190925-T220811.bot


## Step 2. Separate into train, val, test data

In this step, we'll separate selected filenames into train, val and test data. 

In [17]:
# Pair each raw file with its bottom file. The two sorted lists are paired
# purely by position, and zip() silently truncates to the shorter list, so
# verify that they line up before building the pairs.
if len(sel_raw_paths) != len(sel_bot_paths):
    raise ValueError(
        f"raw/bot count mismatch: {len(sel_raw_paths)} raw vs {len(sel_bot_paths)} bot files"
    )
for raw_path, bot_path in zip(sel_raw_paths, sel_bot_paths):
    # Compare file names without directory or extension.
    raw_stem = os.path.splitext(os.path.basename(raw_path))[0]
    bot_stem = os.path.splitext(os.path.basename(bot_path))[0]
    if raw_stem != bot_stem:
        raise ValueError(f"mismatched raw/bot pair: {raw_path} vs {bot_path}")

# Each entry is a two-element list: [raw_path, bot_path].
raw_bot_paths = [[raw_path, bot_path] for raw_path, bot_path in zip(sel_raw_paths, sel_bot_paths)]

In [18]:
# Peek at the first [raw, bot] pair (before shuffling).
raw_bot_paths[0]

['../data/HB1906_EK60/rawfiles/D20190925-T220811.raw',
 '../data/HB1906_EK60/botfiles/D20190925-T220811.bot']

In [19]:
# Restore sorted order before shuffling so this cell is idempotent: shuffle
# works in place, so re-running the cell would otherwise shuffle an already
# shuffled list and change the split. On a first run the list is already
# sorted by raw path, so the sort is a no-op and the result is unchanged.
raw_bot_paths.sort()

# Fixed seed so the shuffle (and therefore the split) is reproducible.
random.seed(1)
random.shuffle(raw_bot_paths)

In [20]:
# Peek at the first [raw, bot] pair after shuffling.
raw_bot_paths[0]

['../data/HB1906_EK60/rawfiles/D20191109-T150801.raw',
 '../data/HB1906_EK60/botfiles/D20191109-T150801.bot']

In [21]:
# Total number of [raw, bot] pairs to separate into train / val / test.
len(raw_bot_paths)

1192

In [22]:
# Split sizes: 60% of the pairs for training, 20% for the next split.
# int() truncates, so any leftover pairs fall outside these two counts.
n_pairs = len(raw_bot_paths)
train_len = int(n_pairs * 0.6)
test_len = int(n_pairs * 0.2)

In [23]:
# Show the computed 60% and 20% sizes.
print(f"{train_len} {test_len}")

715 238


In [24]:
# Cut the shuffled list into three consecutive slices:
# train = first train_len pairs, val = next test_len pairs, test = the rest.
val_end = train_len + test_len
train_li = raw_bot_paths[:train_len]
val_li = raw_bot_paths[train_len:val_end]
test_li = raw_bot_paths[val_end:]

In [25]:
# Report the size of each split (train, val, test).
print(*(len(split) for split in (train_li, val_li, test_li)))

715 238 239


## Step 3. Save

In this step, we save the train, val, and test lists as pickle files.

In [32]:
pkl_dir = "pkl/"
# Create the output directory if it does not exist yet; open() in "wb" mode
# raises FileNotFoundError when the parent directory is missing.
os.makedirs(pkl_dir, exist_ok=True)

# For compatibility, set the pickle protocol to 4, the highest protocol
# supported by Python 3.7.
for split_name, split_li in (
    ("train_li", train_li),
    ("val_li", val_li),
    ("test_li", test_li),
):
    # Writes pkl/train_li.pickle, pkl/val_li.pickle, pkl/test_li.pickle.
    with open(pkl_dir + split_name + ".pickle", "wb") as handle:
        pickle.dump(split_li, handle, protocol=4)