# Dataset split generation
Roughly 70% in train, 10% in val and 20% in test.  
One vehicle (`sedna`) is entirely removed from train and val, and only used in test. This is to measure the performance on an unseen vehicle, with different camera calibration.


__Count of number of samples per vehicle__  

| vehicle | poseidon | sedna | nammu | brizo | neptune | ebisu |
|:--------|:---------|:------|:------|:------|:--------|:------|
| count   | 6156     | 5191  | 5180  | 3347  | 1729    | 603   |

In [137]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

In [138]:
images = Path(
    '/mnt/remote/data/users/thomasssajot/yolo_dataset/traffic_lights_2020/distorted/images'
).glob(
    '**/*.jpeg'
)

images = list(images)
print(f'Found {len(images)} images.')

distrib = pd.DataFrame(
    [{
        'file':f, 
        'run_id': f.parent.relative_to(f.parent.parent.parent), 
        'vehicle': f.parent.parent.name
    } for f in images]
)
distrib.head()

Found 22206 images.


Unnamed: 0,file,run_id,vehicle
0,/mnt/remote/data/users/thomasssajot/yolo_datas...,nammu/2019-12-02--09-31-11--Neil,nammu
1,/mnt/remote/data/users/thomasssajot/yolo_datas...,nammu/2019-12-02--09-31-11--Neil,nammu
2,/mnt/remote/data/users/thomasssajot/yolo_datas...,nammu/2019-12-02--09-31-11--Neil,nammu
3,/mnt/remote/data/users/thomasssajot/yolo_datas...,nammu/2019-12-02--09-31-11--Neil,nammu
4,/mnt/remote/data/users/thomasssajot/yolo_datas...,nammu/2019-12-02--09-31-11--Neil,nammu


In [96]:
print('Count of samples per vehicle')
distrib.value_counts('vehicle')

Count of samples per vehicle


vehicle
poseidon    6156
sedna       5191
nammu       5180
brizo       3347
neptune     1729
ebisu        603
dtype: int64

In [142]:
def generate_tiny_dataset_split():
    size = 128
    train_set = distrib.sample(size, random_state=123)
    return train_set

In [148]:
def generate_dataset_split():
    # Define dataset sizes
    size = len(distrib)
    train_size = int(size * .7)
    test_size = int(size * .2)
    val_size = size - train_size - test_size
    print(f'Train/val/test split:        '
         f'({train_size:5g}, {val_size:5g}, {test_size:5g})')
    print(
        f'Train/val/test split (in %): '
        f'({train_size / size * 100 :5.1f}, {val_size / size * 100 :5.1f}, {test_size / size * 100 :5.1f})'
    )
    
    # Isolate `ebisu` vehicle for test set
    # ensure we have a car not included in the training set
    single_vehicle_test_set = distrib.query('vehicle == "ebisu"')
    remaining_test_set_size = test_size - len(single_vehicle_test_set)
    distrib_without_ebisu = distrib.query('vehicle != "ebisu"').reset_index(drop=True)
    
    stratification = distrib_without_ebisu['run_id'].astype(str)
    counts = stratification.value_counts()
    stratification[stratification.isin(counts[counts <= 10].index.tolist())] = 'few'

    # Generate remaining train/val/test split
    train_set, val_test_set = train_test_split(
        distrib_without_ebisu,
        test_size=val_size + remaining_test_set_size,
        train_size=train_size,
        random_state=123,
        shuffle=True,
        stratify=stratification
    )

    val_set, complementary_test_set = train_test_split(
        val_test_set,
        train_size=val_size,
        test_size=remaining_test_set_size,
        random_state=123,
        shuffle=True
    )

    test_set = pd.concat([single_vehicle_test_set, complementary_test_set])


    print(f'Actual Train/val/test split: '
         f'({len(train_set):5g}, {len(val_set):5g}, {len(test_set):5g})')

    assert (len(train_set), len(val_set), len(test_set)) == (train_size, val_size, test_size)
    return train_set, val_set, test_set

# Save files .txt

In [150]:
root = Path('/mnt/remote/data/users/thomasssajot/yolo_dataset/traffic_lights_2020/distorted/')
train_file = root / 'train.txt'
test_file = root / 'test.txt'
val_file = root / 'val.txt'


for file_name, dataset in zip([train_file, val_file, test_file], generate_dataset_split()):
    data = '\n'.join(dataset['file'].astype(str)) + '\n'
    with file_name.open('w') as f:
        f.write(data)

Train/val/test split:        (15544,  2221,  4441)
Train/val/test split (in %): ( 70.0,  10.0,  20.0)
Actual Train/val/test split: (15544,  2221,  4441)
