In [1]:
import time
import numpy as np
import datasets.bdd.NightDriveDataset as bdd
import pandas as pd
import os

In [2]:
root_dir = "/home/till/data/driving/BerkeleyDeepDrive/bdd100k"  # "/home/SharedFolder/CurrentDatasets/bdd100k"

# create data sets
ds_database = 'bdd_all'
ds_train = bdd.NightDriveDataset(root_dir, database=ds_database, split="test")

>> Loading BDD training label dataset
>> Loading BDD validation label dataset


In [3]:
data = ds_train.data # .sample(frac=20, replace=True)
data.head()

Unnamed: 0,labels,name,weather,timeofday,scene,split
0,"[{'category': 'traffic sign', 'attributes': {'...",/home/till/data/driving/BerkeleyDeepDrive/bdd1...,clear,night,city street,unassigned
1,"[{'category': 'traffic sign', 'attributes': {'...",/home/till/data/driving/BerkeleyDeepDrive/bdd1...,snowy,daytime,city street,unassigned
2,"[{'category': 'traffic light', 'attributes': {...",/home/till/data/driving/BerkeleyDeepDrive/bdd1...,clear,night,city street,unassigned
3,"[{'category': 'car', 'attributes': {'occluded'...",/home/till/data/driving/BerkeleyDeepDrive/bdd1...,clear,night,undefined,unassigned
4,"[{'category': 'person', 'attributes': {'occlud...",/home/till/data/driving/BerkeleyDeepDrive/bdd1...,snowy,daytime,city street,unassigned


In [263]:
cross_total = pd.crosstab(data['timeofday'], data['weather'])
cross_total = cross_total.reindex(sorted(cross_total.columns), axis=1)  # columns need to be in same order as sampler_table
cross_total

weather,clear,cloudy,rainy,snowy
timeofday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
daytime,14218,13490,2918,3284
night,26158,144,2494,2522


In [323]:
sampler_dict = {
    'train'     : {'n': 40000, 'class_dist': {'daytime': 1.,   'dawn/dusk': 0.,     'night': 0.},  'balancing': 'over', 'class_min': None},
    'train_dev' : {'n': 2.0e3, 'class_dist': {'daytime': 1.,   'dawn/dusk': 0.,     'night': 0.},  'balancing': 'over', 'class_min': None},
    'test'      : {'n': 2.0e3, 'class_dist': {'daytime': 1./2, 'dawn/dusk': 0.,   'night': 1./2},  'balancing': 'none', 'class_min': 50},
    'valid'     : {'n': 2.0e3, 'class_dist': {'daytime': 1./2, 'dawn/dusk': 0.,   'night': 1./2},  'balancing': 'none', 'class_min': 50},
}
_splits = sampler_dict.keys()
_timeofday_classes = data.timeofday.unique()
_weather_classes = np.sort(data.weather.unique())
_num_weather_classes = len(_weather_classes)
_num_timeofday_classes = len(_weather_classes)
# _dist_weather_classes = data.loc[:, 'weather'].value_counts().reindex(sorted(cross_total.columns), axis=1) / data.shape[0]
# _dist_weather_classes = cross_total / cross_total.sum(axis=1) # normalized by row
_dist_weather_classes = cross_total.div(cross_total.sum(axis=1), axis=0)
_dist_weather_classes

weather,clear,cloudy,rainy,snowy
timeofday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
daytime,0.419286,0.397818,0.086051,0.096845
night,0.835239,0.004598,0.079635,0.080529


In [352]:
# Inintialize empty sampler table
iterables = [_splits, _timeofday_classes]
index = pd.MultiIndex.from_product(iterables, names=['split', 'timeofday'])
sampler_table = pd.DataFrame(np.zeros([8,len(_weather_classes)]), index=index, columns=_weather_classes)
sampler_table = sampler_table.reindex(sorted(sampler_table.columns), axis=1)
over_table = sampler_table.copy()
sampler_table

Unnamed: 0_level_0,Unnamed: 1_level_0,clear,cloudy,rainy,snowy
split,timeofday,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
train,night,0.0,0.0,0.0,0.0
train,daytime,0.0,0.0,0.0,0.0
train_dev,night,0.0,0.0,0.0,0.0
train_dev,daytime,0.0,0.0,0.0,0.0
test,night,0.0,0.0,0.0,0.0
test,daytime,0.0,0.0,0.0,0.0
valid,night,0.0,0.0,0.0,0.0
valid,daytime,0.0,0.0,0.0,0.0


In [353]:
# First, we need to process all ocurrences of min_thr
# sampler_dict[()]
for s in _splits:
    if sampler_dict[s]['class_min'] is not None:
        sampler_table.loc[s] = sampler_dict[s]['class_min']
cross_avail = cross_total - sampler_table.groupby(level='timeofday').sum()
print(cross_avail.astype('int32'))
print(sampler_table.astype('int32'))
assert ~(cross_avail < 0).any().any(), 'Error, insufficient samples available to fullfil request.'

weather    clear  cloudy  rainy  snowy
timeofday                             
daytime    14118   13390   2818   3184
night      26058      44   2394   2422
                     clear  cloudy  rainy  snowy
split     timeofday                             
train     night          0       0      0      0
          daytime        0       0      0      0
train_dev night          0       0      0      0
          daytime        0       0      0      0
test      night         50      50     50     50
          daytime       50      50     50     50
valid     night         50      50     50     50
          daytime       50      50     50     50


In [388]:
s = sampler_table.index.get_level_values(level='timeofday').unique()
for i in s:
    print(i)

night
daytime
night
daytime
night
daytime
night
daytime


In [None]:
_splits = ['test', 'valid', 'train_dev', 'train']  # sampler_dict.keys()
for sp in _splits:   # sampler_table.index.get_level_values(level='split'):  # for each split set; note order matters here if reproducibility is needed, i.e. val and test need to go first
    for td in sampler_table.index.get_level_values(level='timeofday'):  # for each timeofday
        print(sp, td)

In [368]:
_splits = ['test', 'valid', 'train_dev', 'train']  # sampler_dict.keys()
for sp in _splits:   # sampler_table.index.get_level_values(level='split'):  # for each split set; note order matters here if reproducibility is needed, i.e. val and test need to go first
    for td in sampler_table.index.get_level_values(level='timeofday'):  # for each timeofday
        for we in sampler_table.columns:
            print(sp, td, we)

test night clear
test night cloudy
test night rainy
test night snowy
test daytime clear
test daytime cloudy
test daytime rainy
test daytime snowy
test night clear
test night cloudy
test night rainy
test night snowy
test daytime clear
test daytime cloudy
test daytime rainy
test daytime snowy
test night clear
test night cloudy
test night rainy
test night snowy
test daytime clear
test daytime cloudy
test daytime rainy
test daytime snowy
test night clear
test night cloudy
test night rainy
test night snowy
test daytime clear
test daytime cloudy
test daytime rainy
test daytime snowy
valid night clear
valid night cloudy
valid night rainy
valid night snowy
valid daytime clear
valid daytime cloudy
valid daytime rainy
valid daytime snowy
valid night clear
valid night cloudy
valid night rainy
valid night snowy
valid daytime clear
valid daytime cloudy
valid daytime rainy
valid daytime snowy
valid night clear
valid night cloudy
valid night rainy
valid night snowy
valid daytime clear
valid daytime c

In [294]:
# Second, case 'none' i.e. no balancing; note this case has priority over under and over
i = 0
for s in _splits:
    if sampler_dict[s]['balancing'] == 'none':
        for t, f in sampler_dict[s]['class_dist'].items():  # for each timeofday
            if f > 0.0:  # note this will intentionally exclude dusk/dawn
                i = i+1
                _wanted = sampler_dict[s]['n'] * f * _dist_weather_classes.loc[t]
                # correct for min_thr
                _ = _wanted - sampler_table.loc[s,t]
                __ = (sum(_wanted) + _.where(_ < 0).fillna(0).sum() ) / sum(_wanted)
                _wanted = _wanted * __
                if i == 9999:
                    print(_wanted)
                    _ = _wanted - sampler_table.loc[s,t]
                    __ = (sum(_wanted) + _.where(_ < 0).fillna(0).sum() ) / sum(_wanted)
                    _wanted = _wanted * __
                    print(_wanted)
                    asfasd
                sampler_table.loc[s,t] = np.maximum(sampler_table.loc[s,t], _wanted)  # maximum on min_thr and balanced
cross_avail = cross_total - sampler_table.groupby(level='timeofday').sum()
print(cross_avail.astype('int32'))
print(sampler_table.astype('int32'))
assert ~(cross_avail < 0).any().any(), 'Error, insufficient samples available to fullfil request.'

  return_indexers=True)


ValueError: operands could not be broadcast together with shapes (5,) (4,) 

In [286]:
# Third, we need to process all cases of undersampling ('under'); note this case has priority over 'over'
for s in _splits:
    if sampler_dict[s]['balancing'] == 'under':
        for t, f in sampler_dict[s]['class_dist'].items():  # for each timeofday
            if f > 0.0:  # note this will intentionally exclude dusk/dawn
                sampler_table.loc[s,t] = np.maximum(sampler_table.loc[s,t], sampler_dict[s]['n'] * f * np.ones(_num_weather_classes) / _num_weather_classes)  # maximum on min_thr and balanced
cross_avail = cross_total - sampler_table.groupby(level='timeofday').sum()
print(cross_avail.astype('int32'))
print(sampler_table.astype('int32'))
assert ~(cross_avail < 0).any().any(), 'Error, insufficient samples available to fullfil request.'

weather    clear  cloudy  rainy  snowy
timeofday                             
daytime    13379   12694   2745   3090
night      24563      44   2341   2368
                     clear  cloudy  rainy  snowy
split     timeofday                             
train     night          0       0      0      0
          daytime        0       0      0      0
train_dev night          0       0      0      0
          daytime        0       0      0      0
test      night        797      50     76     76
          daytime      419     397     86     96
valid     night        797      50     76     76
          daytime      419     397     86     96


In [354]:
# Fourth, we can process the cases of oversampling
# We oversample across all splits that want oversampling in proportion to their n
# Therefore, we need to know the 
n_total_over = sum(list({sampler_dict[k]['n'] for k in sampler_dict if sampler_dict[k]['balancing'] == 'over'}))
for s in _splits:
    if sampler_dict[s]['balancing'] == 'over':
        f_total_over = sampler_dict[s]['n'] / n_total_over  # fraction of total remaing samples per class going into this split
        for t, f in sampler_dict[s]['class_dist'].items():  # for each timeofday
            if f > 0.0:
                # get max number of remaining samples available (for each weather condition)
                _avail = cross_avail.loc[t]  # ! plus those that this split aleady has from min_thr, if any
                # get the assigned fraction of them for this split
                _assigned = _avail *  f_total_over
                _wanted = sampler_dict[s]['n'] * f * np.ones(_num_weather_classes) / _num_weather_classes
                _given = np.minimum(_assigned, _wanted)
                sampler_table.loc[s,t] = np.maximum(sampler_table.loc[s,t], _given)  # maximum on min_thr and balanced
                # store number of samples to oversample per class
                over_table.loc[s,t] = _wanted - sampler_table.loc[s,t]
                
cross_avail = cross_total - sampler_table.groupby(level='timeofday').sum()
print(cross_avail.astype('int32'))
print(sampler_table.astype('int32'))
assert ~(cross_avail < 0).any().any(), 'Error, insufficient samples available to fullfil request.'

weather    clear  cloudy  rainy  snowy
timeofday                             
daytime     3618    2890      0      0
night      26058      44   2394   2422
                     clear  cloudy  rainy  snowy
split     timeofday                             
train     night          0       0      0      0
          daytime    10000   10000   2683   3032
train_dev night          0       0      0      0
          daytime      500     500    134    151
test      night         50      50     50     50
          daytime       50      50     50     50
valid     night         50      50     50     50
          daytime       50      50     50     50


In [355]:
over_table, sampler_table

(                     clear  cloudy        rainy        snowy
 split     timeofday                                         
 train     night        0.0     0.0     0.000000     0.000000
           daytime      0.0     0.0  7316.190476  6967.619048
 train_dev night        0.0     0.0     0.000000     0.000000
           daytime      0.0     0.0   365.809524   348.380952
 test      night        0.0     0.0     0.000000     0.000000
           daytime      0.0     0.0     0.000000     0.000000
 valid     night        0.0     0.0     0.000000     0.000000
           daytime      0.0     0.0     0.000000     0.000000,
                        clear   cloudy        rainy        snowy
 split     timeofday                                            
 train     night          0.0      0.0     0.000000     0.000000
           daytime    10000.0  10000.0  2683.809524  3032.380952
 train_dev night          0.0      0.0     0.000000     0.000000
           daytime      500.0    500.0   134.190476   

In [338]:
over_table.loc['test','night'] = _wanted - sampler_table.loc['test','night']

Unnamed: 0_level_0,Unnamed: 1_level_0,clear,cloudy,rainy,snowy
split,timeofday,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
train,night,0.0,0.0,0.0,0.0
train,daytime,0.0,0.0,2779.047619,3127.619048
train_dev,night,0.0,0.0,0.0,0.0
train_dev,daytime,0.0,0.0,138.952381,156.380952
test,night,0.0,0.0,0.0,0.0
test,daytime,0.0,0.0,0.0,0.0
valid,night,0.0,0.0,0.0,0.0
valid,daytime,0.0,0.0,0.0,0.0


In [241]:
# cast to int
sampler_table = sampler_table.astype('int32')
sampler_table

Unnamed: 0_level_0,Unnamed: 1_level_0,clear,cloudy,rainy,snowy
split,timeofday,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
train,night,0,0,0,0
train,daytime,10000,10000,2615,2943
train_dev,night,0,0,0,0
train_dev,daytime,500,500,130,147
test,night,797,50,76,76
test,daytime,419,397,86,96
valid,night,797,50,76,76
valid,daytime,419,397,86,96


In [242]:
# show some summary stats
sampler_table_show = sampler_table.copy()
sampler_table_show['total'] = sampler_table_show.sum(axis=1)
print('\nMulti-variate sample distribution for each split:\n')
print(sampler_table_show.reindex(sorted(sampler_table_show.index), axis=0))
sampler_table_show = sampler_table_show.groupby(level='split').sum()
print('\nSample distribution grouped by split:\n')
print(sampler_table_show.reindex(sorted(sampler_table_show.index), axis=0))
print('\nData not used:\n')
print(cross_avail.reindex(sorted(cross_avail.index), axis=0).astype('int32'))


Multi-variate sample distribution for each split:

                     clear  cloudy  rainy  snowy  total
split     timeofday                                    
test      daytime      419     397     86     96    998
          night        797      50     76     76    999
train     daytime    10000   10000   2615   2943  25558
          night          0       0      0      0      0
train_dev daytime      500     500    130    147   1277
          night          0       0      0      0      0
valid     daytime      419     397     86     96    998
          night        797      50     76     76    999

Sample distribution grouped by split:

           clear  cloudy  rainy  snowy  total
split                                        
test        1216     447    162    172   1997
train      10000   10000   2615   2943  25558
train_dev    500     500    130    147   1277
valid       1216     447    162    172   1997

Data not used:

weather    clear  cloudy  rainy  snowy
timeofday       

In [124]:
# Fourth, we can process the cases of oversampling
# We oversample across all splits that want oversampling in proportion to their n
# Therefore, we need to know the 
n_total_over = sum(list({sampler_dict[k]['n'] for k in sampler_dict if sampler_dict[k]['balancing'] == 'over'}))
for s in _splits:
    if sampler_dict[s]['balancing'] == 'over':
        f_total_over = sampler_dict[s]['n'] / n_total_over  # fraction of total remaing samples per class going into this split
        for t, f in sampler_dict[s]['class_dist'].items():  # for each timeofday
            if f > 0.0:
                # get max number of remaining samples available
                cross_avail = cross_total.loc[t] - sampler_table.groupby(level='timeofday').sum().loc[t] + sampler_table.loc[s,t]  # plus those that this split aleady has from min_thr, if any
                cross_wanted = sampler_dict[s]['n'] * f_total_over * f * np.ones(_num_weather_classes)
                print('cross_wanted')
                print(cross_avail)
                print('cross_wanted')
                print(cross_total.loc[t])
                print('cross_wanted')
                print(sampler_table.groupby(level='timeofday').sum().loc[t])
                print('cross_wanted')
                print(sampler_table.loc[s,t])
                print('cross_wanted')
                print(cross_wanted)
                asfgsad
                cross_given = np.minimum(cross_avail, cross_wanted)
                sampler_table.loc[s,t] = np.maximum(sampler_table.loc[s,t], sampler_dict[s]['n'] * f * np.ones(_num_weather_classes) / _num_weather_classes)  # maximum on min_thr and balanced
cross_avail = cross_total - sampler_table.groupby(level='timeofday').sum()
print(cross_avail)
sampler_table

weather
clear     13782.052248
cloudy    13076.373950
rainy      2818.000000
snowy      3183.307046
dtype: float64
weather
clear     14218
cloudy    13490
rainy      2918
snowy      3284
Name: daytime, dtype: int64
clear     435.947752
cloudy    413.626050
rainy     100.000000
snowy     100.692954
Name: daytime, dtype: float64
clear     0.0
cloudy    0.0
rainy     0.0
snowy     0.0
Name: (train, daytime), dtype: float64
cross_wanted
[8333.33333333 8333.33333333 8333.33333333 8333.33333333]


NameError: name 'asfgsad' is not defined

### Accessing

In [254]:
sampler_table.index.get_level_values(level='split')
sampler_table.index.get_level_values(level='timeofday')
sampler_table.columns

Index(['clear', 'cloudy', 'rainy', 'snowy'], dtype='object')

### Verification

In [389]:
ds_database = 'bdd_all'
ds_train = bdd.WeatherClassifierDataset(root_dir, database=ds_database, split="test")


>> Loading BDD training label dataset
>> Loading BDD validation label dataset


In [390]:
ds_train[0]

(<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720 at 0x7F6DD3501940>,
 0)