In [None]:
#default_exp data.source

In [None]:
#export
from local.torch_basics import *
from local.test import *
from local.data.core import *
from local.data.transform import *
from local.data.pipeline import *
from local.notebook.showdoc import show_doc

# Data source
> Base container for all the items

## Convenience functions

In [None]:
#export core
def all_union(sets):
    "Single `set` holding every element found in any item of `sets` (each item is passed through `setify` first)"
    merged = set()
    # `update` accepts any iterable, just like `set.union` does
    for s in sets: merged.update(setify(s))
    return merged

In [None]:
# `2` is in both lists, so it shows up only once in the union
sets = [[1,2],[2,3]]
test_eq(all_union(sets), {1,2,3})

In [None]:
#export core
def all_disjoint(sets):
    "`True` iff no element appears in more than one item of `sets` (each item is `setify`-ed first)"
    # Every item goes through `setify` before comparing, so a duplicate *inside* one item
    # (e.g. `[[1,1],[2]]`) is not mistaken for an overlap *between* items, and items
    # do not need to support `len`.
    seen = set()
    for s in map(setify, sets):
        # Stop at the first element already contributed by an earlier item
        if not seen.isdisjoint(s): return False
        seen.update(s)
    return True

In [None]:
# `sets` is the overlapping `[[1,2],[2,3]]` example defined in the `all_union` test cell above
assert not all_disjoint(sets)
assert all_disjoint([[1,2],[3,4]])
# Edge cases: an empty item, a single item, and no items at all are all disjoint
assert all_disjoint([[1,2],[]])
assert all_disjoint([[1,2]])
assert all_disjoint([])

## DataSource -

In [None]:
#export
def _mk_subset(self, i):
    "Subset `i` of `self` as a `TfmdDS`, built from `self.filts[i]` with setup skipped and `filt=i`"
    # One pipeline per transformed list held in `self.tls`
    tfms = [tl.tfms for tl in self.tls]
    # `L._gets` is called unbound with `self`, so it picks the entries selected by filter `i`
    subset_items = L._gets(self, self.filts[i])
    return TfmdDS(subset_items, tfms=tfms, do_setup=False, filt=i)

In [None]:
# export
class _FiltTfmdList(TfmdList):
    "Like `TfmdList` but with filters and train/valid attribute, for proper setup"
    def __init__(self, items, tfms, filt_idx, do_setup=True, use_list=None):
        # `filt_idx` is stored before the parent constructor runs because `_get` reads it;
        # presumably setup (when `do_setup=True`) fetches items through `_get` - TODO confirm
        self.filt_idx = filt_idx
        super().__init__(items, tfms, do_setup=do_setup, as_item=True, use_list=use_list, filt=None)

    def _new(self, items, *args, **kwargs):
        # Extra `args`/`kwargs` are ignored: the new list reuses this one's `tfms` and `filt_idx` and skips setup
        return super()._new(
            items,
            tfms=self.tfms,
            filt_idx=self.filt_idx,
            do_setup=False,
            use_list=None,
        )

    def subset(self, i):
        # Same module-level helper that `DataSource.subset` uses
        return _mk_subset(self, i)

    def _get(self, i):
        # Record the filter index of item `i` on `self.filt` before the parent fetches the item
        self.filt = self.filt_idx[i]
        return super()._get(i)

# `train` and `valid` properties call `subset` with index 0 and 1 respectively
_FiltTfmdList.train,_FiltTfmdList.valid = add_props(lambda idx,o: o.subset(idx), 2)

In [None]:
# export
class DataSource(TfmdDS):
    "Applies a `tfm` to filtered subsets of `items`"
    def __init__(self, items, tfms=None, filts=None, do_setup=True):
        # `super(TfmdDS,self)` starts the lookup *after* `TfmdDS`, so `TfmdDS.__init__` itself is not run here
        super(TfmdDS,self).__init__(items, use_list=None)
        # Default: a single filter covering every item
        if filts is None:
            filts = [range_of(items)]
        self.filts = L(mask2idxs(f) for f in filts)

        # Create map from item id to filter id (items that belong to no filter keep `None`)
        assert all_disjoint(self.filts)
        self.filt_idx = L([None]*len(self.items))
        for filt_id,idxs in enumerate(self.filts):
            self.filt_idx[idxs] = filt_id
        # One `_FiltTfmdList` per entry in `tfms`, all sharing the same items and filter map
        self.tls = [_FiltTfmdList(self.items, t, self.filt_idx, do_setup=do_setup) for t in L(tfms)]

    def __repr__(self):
        # One line per subset, followed by the transformed lists
        subset_lines = '\n'.join(map(str,self.subsets()))
        return subset_lines + f'\ntls - {self.tls}'

    def subsets(self):
        return map(self.subset, range_of(self.filts))

    def subset(self, i):
        return _mk_subset(self, i)

    def _get(self, i):
        # Record the filter index of item `i` on `self.filt` before the parent fetches the item
        self.filt = self.filt_idx[i]
        return super()._get(i)

    @delegates(TfmdDL.__init__)
    def databunch(self, bs=16, val_bs=None, shuffle_train=True, **kwargs):
        n_other = len(self.filts)-1
        # Subsets after the first use `val_bs`, or twice `bs` when `val_bs` is not given
        other_bs = 2*bs if val_bs is None else val_bs
        bss = [bs] + [other_bs]*n_other
        # Only the first subset can be shuffled; `drop_last` follows the same flag
        shuffles = [shuffle_train] + [False]*n_other
        dls = [TfmdDL(self.subset(i), bs=b, shuffle=s, drop_last=s, **kwargs)
               for i,(b,s) in enumerate(zip(bss, shuffles))]
        return DataBunch(*dls)

# `train` and `valid` properties call `subset` with index 0 and 1 respectively
DataSource.train,DataSource.valid = add_props(lambda idx,o: o.subset(idx), 2)

In [None]:
# `subset` returns a `TfmdDS` (see `_mk_subset`), so its doc names that type
add_docs(DataSource,
         subset="Filtered `TfmdDS` `i`",
         subsets="Iterator for all subsets",
         databunch="Create a `DataBunch`",
         show="Show item `o` in `ctx`")

A `DataSource` provides filtering and transformation capabilities to a list of items. Although it has all the attributes of `TfmdDS` (since it's a subclass) they are mainly used internally; you will generally want to instead access its `subset`s (`DataSource.train` or `DataSource.valid` for instance).

If you don't pass any filters or transforms, it simply provides a single subset (of type `TfmdDS`) with the same behavior as an `L`.

In [None]:
# No `filts` given: the constructor falls back to a single filter spanning every item.
# A single entry is passed in `tfms`, and the expected items below are 1-tuples.
inp = [0,1,2,3,4]
dsrc = DataSource(inp, tfms=[None])

test_eq(len(dsrc.filts), 1)
test_eq(*dsrc[2], 2)          # Retrieve one item (subset 0 is the default)
test_eq(dsrc[1,2], [(1,),(2,)])    # Retrieve two items by index
mask = [True,False,False,True,False]
test_eq(dsrc[mask], [(0,),(3,)])   # Retrieve two items by mask

In [None]:
# Same checks with a `DataFrame` as the items; `dsrc` here is subset 0 itself, not the `DataSource`
# NOTE(review): `pd` and `itemgetter` are not imported explicitly in this notebook - presumably they come from the `local.*` star imports; confirm
inp = pd.DataFrame(dict(a=[5,1,2,3,4]))
dsrc = DataSource(inp, tfms=itemgetter(0)).subset(0)
test_eq(*dsrc[2], (2,))          # Retrieve one item from the subset
test_eq(dsrc[1,2], [(1,),(2,)])    # Retrieve two items by index
mask = [True,False,False,True,False]
test_eq(dsrc[mask], [(5,),(3,)])   # Retrieve two items by mask

Passing `filts` to the `DataSource` constructor allows you to create multiple subsets, each of type `TfmdDS`.

In [None]:
# filts can be indices (a tensor of indices and a plain list are both used here)
dsrc = DataSource(range(5), tfms=[None], filts=[tensor([0,2]), [1,3,4]])

test_eq(len(dsrc.filts), 2)
test_eq(dsrc.subset(0), [(0,),(2,)])
test_eq(dsrc.train, [(0,),(2,)])       # Subset 0 is aliased to `train`
test_eq(dsrc.subset(1), [(1,),(3,),(4,)])
test_eq(dsrc.valid, [(1,),(3,),(4,)])     # Subset 1 is aliased to `valid`
test_eq(*dsrc.valid[2], 4)
# The string form of the `DataSource` contains the contents of every subset
assert '[(1,),(3,),(4,)]' in str(dsrc) and '[(0,),(2,)]' in str(dsrc)
dsrc

(#2) [(0,),(2,)]
(#3) [(1,),(3,),(4,)]
tls - [_FiltTfmdList: [0, 1, 2, 3, 4]
tfms - [Transform: True {'object': 'noop'} {}]]

In [None]:
# filts can be boolean masks (they don't have to cover all items, but must be disjoint)
# Item 3 is `False` in both masks, so it belongs to neither subset
filts = [[False,True,True,False,True], [True,False,False,False,False]]
dsrc = DataSource(range(5), tfms=[None], filts=filts)

test_eq(dsrc.train, [(1,),(2,),(4,)])
test_eq(dsrc.valid, [(0,)])

Pass `tfms` to have transformations applied before returning items.

In [None]:
# apply transforms to all items
# One pipeline made of two functions; the expected values below are `x*2` followed by `+1`
tfm = [[lambda x: x*2,lambda x: x+1]]
filts = [[1,2],[0,3,4]]
dsrc = DataSource(range(5), tfm, filts=filts)
test_eq(dsrc.train,[(3,),(5,)])
test_eq(dsrc.valid,[(1,),(7,),(9,)])
# A subset can itself be indexed with a boolean mask
test_eq(dsrc.train[False,True], [(5,)])

The subset index is also passed to your transforms. If a transform is an instance of `Transform` with a `filt` attribute set, it is only applied to the subset whose index matches that `filt`.

In [None]:
# only transform subset 1
class _Tfm(Transform):
    # Subset index this transform is tied to (the tests below expect only subset 1 to be changed)
    filt=1

    def encodes(self, x):
        # Forward direction: double the item
        return x*2

    def decodes(self, x):
        # Reverse direction: halve with integer division and wrap the result in `Str`
        return Str(x//2)

In [None]:
# `_Tfm.filt` is 1: subset 0 (`train`) comes back unchanged, subset 1 (`valid`) is doubled
dsrc = DataSource(range(5), [_Tfm()], filts=[[1,2],[0,3,4]])
test_eq(dsrc.train,[(1,),(2,)])
test_eq(dsrc.valid,[(0,),(6,),(8,)])
test_eq(dsrc.train[False,True], [(2,)])
dsrc

(#2) [(1,),(2,)]
(#3) [(0,),(6,),(8,)]
tls - [_FiltTfmdList: [0, 1, 2, 3, 4]
tfms - [_Tfm: True {'object': 'encodes'} {'object': 'decodes'}]]

In [None]:
#hide
#Test setup works with train attribute
def _lbl(o): return o.split('_')[0]

test_fns = ['dog_0.jpg','cat_0.jpg','cat_2.jpg','cat_1.jpg','dog_1.jpg']
tcat = Categorize()
# Pipeline: `_lbl` and `tcat` in one list; subset 0 is items 1,2,4 and subset 1 is items 0,3
dsrc = DataSource(test_fns, [[tcat,_lbl]], filts=[[1,2,4], [0,3]])
# Constructing the `DataSource` (with `do_setup` left at its default) is expected to fill in `tcat.vocab`
test_eq(tcat.vocab, ['cat','dog'])
test_eq(dsrc.train, [(0,),(0,),(1,)])
test_eq(dsrc.valid, [(1,),(0,)])
test_stdout(lambda: dsrc.train.show_at(0), "cat")
#test_eq(dsrc.vocab, ['cat','dog'])

In [None]:
#hide
#Test DataSource pickles
# Round-trip the `dsrc` from the previous cell through pickle and compare both subsets
# NOTE(review): `pickle` is not imported explicitly in this notebook - presumably provided by the `local.*` star imports; confirm
dsrc1 = pickle.loads(pickle.dumps(dsrc))
test_eq(dsrc.train, dsrc1.train)
test_eq(dsrc.valid, dsrc1.valid)
#test_eq(dsrc1.vocab, ['cat','dog'])

In [None]:
# Two entries in `tfms` give pairs: the first element goes through `_Tfm` (doubled only in subset 1),
# the second goes through `noop` and is left as-is
dsrc = DataSource(range(5), [_Tfm(),noop], filts=[[1,2],[0,3,4]])
test_eq(dsrc.train,[(1,1),(2,2)])
test_eq(dsrc.valid,[(0,0),(6,3),(8,4)])

### `DataSource` Methods

You won't need to use many methods of `DataSource`, since normally you'll be accessing subsets, and therefore will be using `TfmdDS` methods. However, there are a few `DataSource` methods that may be useful:

In [None]:
show_doc(DataSource.databunch)

<h4 id="DataSource.databunch" class="doc_header"><code>DataSource.databunch</code><a href="https://github.com/fastai/fastai_dev/tree/master/dev/__main__.py#L22" class="source_link" style="float:right">[source]</a></h4>

> <code>DataSource.databunch</code>(**`bs`**=*`16`*, **`val_bs`**=*`None`*, **`shuffle_train`**=*`True`*, **`shuffle`**=*`False`*, **`num_workers`**=*`None`*, **`drop_last`**=*`False`*, **`indexed`**=*`None`*, **`pin_memory`**=*`False`*, **`timeout`**=*`0`*, **`wif`**=*`None`*, **`before_iter`**=*`None`*, **`create_batches`**=*`None`*, **`sampler`**=*`None`*, **`create_item`**=*`None`*, **`after_item`**=*`None`*, **`before_batch`**=*`None`*, **`create_batch`**=*`None`*, **`retain`**=*`None`*, **`after_batch`**=*`None`*, **`after_iter`**=*`None`*)

Create a [`DataBunch`](/data.core.html#DataBunch)

In [None]:
show_doc(DataSource.subset)

<h4 id="DataSource.subset" class="doc_header"><code>DataSource.subset</code><a href="https://github.com/fastai/fastai_dev/tree/master/dev/__main__.py#L17" class="source_link" style="float:right">[source]</a></h4>

> <code>DataSource.subset</code>(**`i`**)

Filtered `DsrcSubset` `i`

Subset 0 is aliased to the `train` property, and subset 1 is aliased to the `valid` property.

In [None]:
dsrc.subset(1)

(#3) [(0, 0),(6, 3),(8, 4)]

In [None]:
show_doc(DataSource.subsets)

<h4 id="DataSource.subsets" class="doc_header"><code>DataSource.subsets</code><a href="https://github.com/fastai/fastai_dev/tree/master/dev/__main__.py#L16" class="source_link" style="float:right">[source]</a></h4>

> <code>DataSource.subsets</code>()

Iterator for all subsets

## Export -

In [None]:
#hide
from local.notebook.export import notebook2script
notebook2script(all_fs=True)

Converted 00_test.ipynb.
Converted 01_core.ipynb.
Converted 01a_torch_core.ipynb.
Converted 01b_script.ipynb.
Converted 01c_dataloader.ipynb.
Converted 02_data_transforms.ipynb.
Converted 03_data_pipeline.ipynb.
Converted 05_data_core.ipynb.
Converted 06_data_source.ipynb.
Converted 07_vision_core.ipynb.
Converted 08_pets_tutorial.ipynb.
Converted 09_vision_augment.ipynb.
Converted 11_layers.ipynb.
Converted 11a_vision_models_xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_learner.ipynb.
Converted 14_callback_schedule.ipynb.
Converted 15_callback_hook.ipynb.
Converted 16_callback_progress.ipynb.
Converted 17_callback_tracker.ipynb.
Converted 18_callback_fp16.ipynb.
Converted 19_callback_mixup.ipynb.
Converted 20_metrics.ipynb.
Converted 21_tutorial_imagenette.ipynb.
Converted 30_text_core.ipynb.
Converted 31_text_data.ipynb.
Converted 32_text_models_awdlstm.ipynb.
Converted 33_text_models_core.ipynb.
Converted 34_callback_rnn.ipynb.
Converted 35_tutorial_wikitext.ipynb.
Conve