# Splitting the dataset into train, validation, and test sets for Keras

Here I take the data from the [1_make_dataset_usable](1_make_dataset_usable.ipynb) notebook and
split it for model training with Keras.

In [1]:
import os
import pickle

import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

### Get data from [1_make_dataset_usable](1_make_dataset_usable.ipynb)

In [2]:
# Load the pickled dictionary of (image file, label) data from the project-relative path below.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# files that you generated yourself.
file_path = os.path.join('data', 'notebooks', '1_make_dataset_usable', 'pair_dicto.pkl')
with open(file_path, 'rb') as file:
    pair_dicto = pickle.load(file)

In [3]:
# 'clean_pairs' holds a sequence of (image filename, label) tuples; show its
# size and first element as a quick sanity check.
pairs = pair_dicto['clean_pairs']
print('number of pairs', len(pairs))
print('first pair:', pairs[0])

number of pairs 16185
first pair: ('000001.jpg', 'AM-General_Hummer_SUV-2000')


### Split into 70% training, 15% validation, and 15% test sets using sklearn

Make the pairs into image names without extension and binarized labels that sklearn can use to randomly generate training, validation, and test sets.

In [4]:
# Separate the (filename, label) pairs into two parallel sequences.
prefixes, labels = zip(*pairs)

# Remove exactly one trailing '.jpg'. str.rstrip('.jpg') is NOT a suffix
# remover: it strips any trailing run of the characters '.', 'j', 'p', 'g',
# so a stem ending in one of those letters (e.g. 'jeep.jpg') would be
# truncated too. The extension is re-appended later when files are copied,
# so the stem must survive intact.
JPG_SUFFIX = '.jpg'
prefixes = [prefix[:-len(JPG_SUFFIX)] if prefix.endswith(JPG_SUFFIX) else prefix
            for prefix in prefixes]

labels = list(labels)
label_set = set(labels)
print('first 5 prefixes:', prefixes[:5])
print('first 5 labels:', labels[:5])
print('number of unique labels:', len(label_set))

first 5 prefixes: ['000001', '000002', '000003', '000004', '000005']
first 5 labels: ['AM-General_Hummer_SUV-2000', 'AM-General_Hummer_SUV-2000', 'AM-General_Hummer_SUV-2000', 'AM-General_Hummer_SUV-2000', 'AM-General_Hummer_SUV-2000']
number of unique labels: 196


In [5]:
# MultiLabelBinarizer expects one iterable of labels per sample, so every
# single label is wrapped in its own one-element set.
sklearn_label_setlist = [{label} for label in labels]

In [6]:
# Fit the binarizer on the one-label-per-sample sets and encode them in one step.
# The printed shape is (number of samples, number of distinct labels).
# `encoder` is kept so later cells can map binary rows back to label names.
encoder = MultiLabelBinarizer()
labels_array = encoder.fit_transform(sklearn_label_setlist)
print('labels_arr shape:', labels_array.shape)

labels_arr shape: (16185, 196)


This part shows how to go from a binarized label back to its human-readable label.

In [7]:
# inverse_transform wants a 2-D (n_samples, n_features) array, so the single
# 1-D row is reshaped into a one-row matrix before decoding.
feed_back = labels_array[1000].reshape(1, -1)
print(feed_back)
# One tuple of labels comes back per row; take the only label of the only row.
decoded_rows = encoder.inverse_transform(feed_back)
print(decoded_rows[0][0])

[[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0]]
Audi-A5_Coupe-2012


In [8]:
# Stage 1: keep 70% for training and hold out 30%. The seed is fixed so the
# split is reproducible.
# NOTE(review): no `stratify` argument is passed, so per-class proportions are
# not guaranteed to match across splits -- confirm this is intended.
train_x, val_test_x, train_y, val_test_y = train_test_split(
    prefixes,
    labels_array,
    test_size=0.3,
    random_state=7,
)
print('train samples:', len(train_x))

# Stage 2: cut the held-out 30% in half -> 15% validation and 15% test.
validation_x, test_x, validation_y, test_y = train_test_split(
    val_test_x,
    val_test_y,
    test_size=0.5,
    random_state=9,
)
print('validation samples: {}\ntest samples: {}'.format(len(validation_x), len(test_x)))

train samples: 11329
validation samples: 2428
test samples: 2428


Remap the new splits into (prefix, human-readable label) pairs

In [9]:
def _pair_prefixes_with_labels(split_prefixes, split_binary_labels, label_encoder):
    """Return a list of (prefix, human-readable label) tuples for one split.

    Parameters
    ----------
    split_prefixes : sequence of str
        Image names (without extension) belonging to the split.
    split_binary_labels : 2-D array-like
        One binarized label row per prefix, in the same order.
    label_encoder : fitted MultiLabelBinarizer
        The encoder that produced the binarized rows.

    Returns
    -------
    list of (str, str)
    """
    # Decode the whole split with a single call instead of one call per row;
    # inverse_transform returns one tuple of labels per row and every sample
    # here carries exactly one label, hence the [0].
    decoded = label_encoder.inverse_transform(np.asarray(split_binary_labels))
    return [(prefix, label_tuple[0])
            for prefix, label_tuple in zip(split_prefixes, decoded)]


train_x_y = _pair_prefixes_with_labels(train_x, train_y, encoder)
validation_x_y = _pair_prefixes_with_labels(validation_x, validation_y, encoder)
test_x_y = _pair_prefixes_with_labels(test_x, test_y, encoder)

# Keys double as the directory names created on disk for each split.
tvt_dict = {'train': train_x_y,
            'validation': validation_x_y,
            'test': test_x_y}

print(tvt_dict['train'][0])

('008845', 'Ford-Freestar_Minivan-2007')


### Split images into folders named with each class so that Keras can use its image generators

```
train/
    label1/
        img1.jpg
        ...
    label2/
        img2.jpg
        ...
validation/
    label1/
        img3.jpg
        ...
    label2/
        img4.jpg
        ...
test/
    label1/
        img5.jpg
        ...
    label2/
        img6.jpg
        ...
```

In [10]:
import os
import shutil

In [11]:
# All paths hang off the current working directory, so the notebook has to be
# run from the project root.
BASE_PATH = os.getcwd()
DATASET_PATH = os.path.join(BASE_PATH, 'data', 'cars_dataset')
# Source images as originally distributed.
STD_PATH = os.path.join(DATASET_PATH, 'original', 'images')
# Destination tree laid out as <split>/<label>/<image>.
KERAS_PATH = os.path.join(DATASET_PATH, 'keras')
# Small subset of the Keras tree used for quick test runs.
FAST_PATH = os.path.join(KERAS_PATH, 'fast')

In [12]:
# Build <KERAS_PATH>/<split>/<label>/ and copy every image into the folder of
# its label.
# NOTE: files left over from an earlier run with a different split are not
# removed; delete the split folders first if the split has changed.
for key, item in tvt_dict.items():
    # Make train, validation, and test directories
    temp_path = os.path.join(KERAS_PATH, key)
    os.makedirs(temp_path, exist_ok=True)

    # Make a label directory for every known class, even one that ends up
    # with no image in this split, so all splits expose the same class folders.
    for label in encoder.classes_:
        os.makedirs(os.path.join(temp_path, label), exist_ok=True)

    # Copy each file straight into its label folder. This is one pass over the
    # samples instead of rescanning the whole split once per class.
    for prefix, label in item:
        from_path = os.path.join(STD_PATH, prefix + '.jpg')
        to_path = os.path.join(temp_path, label)
        shutil.copy(from_path, to_path)

Create a reduced copy of the three folders for fast testing, with two images per class for training and one image per class for validation and test.

Use same nesting scheme as above.

In [13]:
def _visible_subdirectories(path):
    """Return the sorted names of the non-hidden directories directly under `path`."""
    with os.scandir(path) as entries:
        return sorted(entry.name for entry in entries
                      if not entry.name.startswith('.') and entry.is_dir())


def _visible_files(path):
    """Return the sorted names of the non-hidden regular files directly under `path`."""
    with os.scandir(path) as entries:
        return sorted(entry.name for entry in entries
                      if not entry.name.startswith('.') and entry.is_file())


# fresh start for testing
if os.path.isdir(FAST_PATH):
    shutil.rmtree(FAST_PATH)

# List the split directories BEFORE re-creating FAST_PATH: it lives inside
# KERAS_PATH and must not be treated as a split itself.
outer_directories = _visible_subdirectories(KERAS_PATH)
os.mkdir(FAST_PATH)

for outer_name in outer_directories:
    old_outer_directory = os.path.join(KERAS_PATH, outer_name)
    new_outer_directory = os.path.join(FAST_PATH, outer_name)
    os.mkdir(new_outer_directory)

    # copy 2 or 1 files depending on whether outer directory is train or not
    images_per_class = 2 if outer_name == 'train' else 1

    # through the class (car model) directories
    for model_name in _visible_subdirectories(old_outer_directory):
        old_model_directory = os.path.join(old_outer_directory, model_name)
        new_model_directory = os.path.join(new_outer_directory, model_name)
        os.mkdir(new_model_directory)

        # os.scandir yields entries in arbitrary order; the helpers sort the
        # names so the same images are selected on every run and machine.
        # Hidden files (e.g. OS metadata) are skipped so they are never
        # picked instead of an image.
        for image_filename in _visible_files(old_model_directory)[:images_per_class]:
            old_image_path = os.path.join(old_model_directory, image_filename)
            new_image_path = os.path.join(new_model_directory, image_filename)
            shutil.copy(old_image_path, new_image_path)