# Splitting the dataset into train, validation, and test sets for Keras

In [16]:
import os
import pickle

import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [3]:
# Assemble the relative path to the pickled pair dictionary piece by piece so the
# separator is chosen by the OS.
path_parts = ('data', 'notebooks', '1_make_dataset_usable', 'pair_dicto.pkl')
file_path = os.path.join(*path_parts)

# Pickle files must be opened in binary mode.
# NOTE(review): pickle.load can execute arbitrary code; only load files this
# project produced itself.
with open(file_path, mode='rb') as file:
    pair_dicto = pickle.load(file)

In [4]:
# 'clean_pairs' holds the (image filename, label) tuples used by the rest of the notebook.
pairs = pair_dicto['clean_pairs']

# Quick sanity check: how many pairs were loaded and what one looks like.
pair_count = len(pairs)
first_pair = pairs[0]
print('number of pairs', pair_count)
print('first pair:', first_pair)

number of pairs 16185
first pair: ('000001.jpg', 'AM-General_Hummer_SUV-2000')


Convert the pairs into filename prefixes and binarized labels that sklearn can use to randomly generate the training, validation, and test sets.

In [6]:
# Separate the (filename, label) pairs into two parallel sequences.
filenames, labels = zip(*pairs)

# Drop the '.jpg' extension to get each image's prefix. An explicit suffix check
# is used instead of str.rstrip('.jpg'): rstrip treats its argument as a *set of
# characters*, so it would also eat trailing '.', 'j', 'p' or 'g' characters that
# belong to the name itself (e.g. 'jeep.jpg' would become 'jee').
jpg_ext = '.jpg'
prefixes = [name[:-len(jpg_ext)] if name.endswith(jpg_ext) else name
            for name in filenames]

# zip() yields tuples; keep the labels as a list and collect the distinct classes.
labels = list(labels)
label_set = set(labels)

print('first 5 prefixes:', prefixes[:5])
print('first 5 labels:', labels[:5])
print('number of unique labels:', len(label_set))

first 5 prefixes: ['000001', '000002', '000003', '000004', '000005']
first 5 labels: ['AM-General_Hummer_SUV-2000', 'AM-General_Hummer_SUV-2000', 'AM-General_Hummer_SUV-2000', 'AM-General_Hummer_SUV-2000', 'AM-General_Hummer_SUV-2000']
number of unique labels: 196


In [9]:
# Each sample's single label is wrapped in its own one-element set, because the
# list is fed to MultiLabelBinarizer below, which takes one collection of labels
# per sample.
sklearn_label_setlist = [{label} for label in labels]

In [11]:
# Fit the binarizer on the per-sample label sets and encode them in one call.
# The fitted encoder is kept so encoded rows can be mapped back to label names.
encoder = MultiLabelBinarizer()
labels_array = encoder.fit_transform(sklearn_label_setlist)

# Report the dimensions of the encoded label matrix.
array_shape = labels_array.shape
print('labels_arr shape:', array_shape)

labels_arr shape: (16185, 196)


This part shows how to go from a binarized label back to its human-readable label.

In [13]:
# inverse_transform expects a 2-D (n_samples, n_features) array, so the single
# encoded row is reshaped from (n_features,) to (1, n_features) first.
sample_index = 1000
feed_back = labels_array[sample_index].reshape(1, -1)
print(feed_back)

# The decoded result holds one entry per sample, each a collection of labels;
# take the only label of the only sample.
decoded = encoder.inverse_transform(feed_back)
print(decoded[0][0])

[[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0]]
Audi-A5_Coupe-2012


### Split into 70% training, 15% validation, and 15% test sets using sklearn

In [20]:
# First split: keep 70% of the samples for training and hold out the remaining
# 30% to be divided between validation and test. The fixed random_state keeps
# the split repeatable across runs.
first_split = train_test_split(
    prefixes,
    labels_array,
    test_size=0.3,
    random_state=7,
)
train_x, val_test_x, train_y, val_test_y = first_split
print('train samples:', len(train_x))

# Second split: cut the held-out 30% in half, giving 15% validation and 15% test.
second_split = train_test_split(
    val_test_x,
    val_test_y,
    test_size=0.5,
    random_state=9,
)
validation_x, test_x, validation_y, test_y = second_split

split_summary = 'validation samples: {}\ntest samples: {}'.format(len(validation_x), len(test_x))
print(split_summary)

train samples: 11329
validation samples: 2428
test samples: 2428
