In [None]:
#default_exp tabular.core

In [None]:
#export
from local.torch_basics import *
from local.test import *
from local.core import *
from local.data.all import *

In [None]:
from local.notebook.showdoc import *

In [None]:
#export
# Make pandas raise (instead of warn) on chained assignment, so writes that would land on a temporary copy fail loudly
pd.set_option('mode.chained_assignment','raise')

# Tabular core

> Basic functions to preprocess tabular data before assembling it in a `DataBunch`.

## Tabular -

In [None]:
#export
class _TabIloc:
    "Get/set rows by iloc and cols by name"
    def __init__(self,to): self.to = to
    def __getitem__(self, idxs):
        df = self.to.items
        if isinstance(idxs,tuple):
            rows,cols = idxs
            cols = df.columns.isin(cols) if is_listy(cols) else df.columns.get_loc(cols)
        else: rows,cols = idxs,slice(None)
        return self.to.new(df.iloc[rows, cols])

In [None]:
#export
class Tabular(CollBase, GetAttr, FilteredBase):
    "A `DataFrame` wrapper that knows which cols are cont/cat/y, and returns rows in `__getitem__`"
    # Presumably tells `GetAttr` which attribute unknown lookups are forwarded to — TODO confirm against `GetAttr`
    _default='items'
    def __init__(self, df, procs=None, cat_names=None, cont_names=None, y_names=None, is_y_cat=True, splits=None, do_setup=True):
        # `splits` is a list of row-position lists; without it every row belongs to a single split
        if splits is None: splits=[range_of(df)]
        # Rows are stored in split order (all of splits[0], then the following splits) and copied,
        # so the caller's dataframe is not modified by the procs
        df = df.iloc[sum(splits, [])].copy()
        super().__init__(df)

        store_attr(self, 'y_names,is_y_cat')
        self.cat_names,self.cont_names,self.procs = L(cat_names),L(cont_names),Pipeline(procs, as_item=True)
        # The target is listed with the categorical columns when `is_y_cat`, otherwise with the continuous ones
        self.cat_y  = None if not is_y_cat else y_names
        self.cont_y = None if     is_y_cat else y_names
        # Number of rows of the first split; `subset` uses it as the boundary between subset 0 and the rest
        self.split = len(splits[0])
        if do_setup: self.procs.setup(self)

    # Subset 0 is the first `self.split` rows; any other `i` returns all the remaining rows
    def subset(self, i): return self.new(self.items[slice(0,self.split) if i==0 else slice(self.split,len(self))])
    # Replaces `items` with a copy in place and returns `self` (not a new object)
    def copy(self): self.items = self.items.copy(); return self
    # Wrap `df` in the same type, sharing procs and column names, without running setup
    def new(self, df): return type(self)(df, do_setup=False, **attrdict(self, 'procs','cat_names','cont_names','y_names','is_y_cat'))
    # Display the first `max_n` rows of the declared columns; `kwargs` is accepted but unused
    def show(self, max_n=10, **kwargs): display_df(self.all_cols[:max_n])
    def setup(self): self.procs.setup(self)
    # Apply the procs to this object without calling their setup
    def process(self): self.procs(self)
    def iloc(self): return _TabIloc(self)
    def targ(self): return self.items[self.y_names]
    def all_cont_names(self): return self.cont_names + self.cont_y
    def all_cat_names (self): return self.cat_names  + self.cat_y
    def all_col_names (self): return self.all_cont_names + self.all_cat_names
    def n_subsets(self): return 2

# NOTE(review): `properties` is defined elsewhere; presumably it converts the listed methods into properties,
# which is why `all_col_names` above reads `self.all_cont_names` without calling it — confirm
properties(Tabular,'iloc','targ','all_cont_names','all_cat_names','all_col_names','n_subsets')

In [None]:
#export
class TabularPandas(Tabular):
    "A `Tabular` whose columns are transformed through `.transform`"
    def transform(self, cols, f):
        "Call `.transform(f)` on the selection `self[cols]` and write the result back to the same columns"
        updated = self[cols].transform(f)
        self[cols] = updated

In [None]:
#export
def _add_prop(cls, nm):
    @property
    def f(o): return o[list(getattr(o,nm+'_names'))]
    @f.setter
    def fset(o, v): o[getattr(o,nm+'_names')] = v
    setattr(cls, nm+'s', f)
    setattr(cls, nm+'s', fset)

# Give `Tabular` the `cats`, `all_cats`, `conts`, `all_conts` and `all_cols` accessors,
# each backed by the matching `*_names` attribute/property
_add_prop(Tabular, 'cat')
_add_prop(Tabular, 'all_cat')
_add_prop(Tabular, 'cont')
_add_prop(Tabular, 'all_cont')
_add_prop(Tabular, 'all_col')

In [None]:
# `b` is present in the dataframe but not declared as cat/cont/y
df = pd.DataFrame({'a':[0,1,2,0,2], 'b':[0,0,0,0,1]})
to = TabularPandas(df, cat_names='a')
# A `TabularPandas` must survive a pickle round trip
t = pickle.loads(pickle.dumps(to))
test_eq(t.items,to.items)
test_eq(to.all_cols,to[['a']])
to.show() # only shows 'a' since that's the only column declared through `cat_names`/`cont_names`/`y_names`

Unnamed: 0,a
0,0
1,1
2,2
3,0
4,2


In [None]:
#export
class TabularProc(InplaceTransform):
    "Base class to write a non-lazy tabular processor for dataframes"
    def setup(self, items=None):
        # State is fitted on the `train` attribute when `items` has one, otherwise on `items` itself
        fit_on = getattr(items,'train',items)
        super().setup(fit_on)
        # Procs are called as soon as data is available
        apply_to = items.items if isinstance(items,DataSource) else items
        return self(apply_to)

In [None]:
#export
class Categorify(TabularProc):
    "Transform the categorical variables to that type."
    # NOTE(review): presumably the position of this proc when a `Pipeline` sorts its transforms — confirm
    order = 1
    def setups(self, to):
        # One `CategoryMap` per categorical column (inputs and categorical target);
        # `add_na` is only requested for the input columns in `cat_names`, not for the target
        self.classes = {n:CategoryMap(to.iloc[:,n].items, add_na=(n in to.cat_names)) for n in to.all_cat_names}
    # Columns that already have a pandas categorical dtype use their category codes shifted by `add`;
    # any other column is mapped through the `o2i` lookup of its `CategoryMap` (and `add` is not used)
    def _apply_cats (self, add, c): return c.cat.codes+add if is_categorical_dtype(c) else c.map(self[c.name].o2i)
    # Inverse mapping: integer code -> class, using the position of each class in the `CategoryMap`
    def _decode_cats(self, c): return c.map(dict(enumerate(self[c.name].items)))
    def encodes(self, to):
        # Input columns are shifted by 1, the target by 0 — this mirrors `add_na` in `setups`
        to.transform(to.cat_names, partial(self._apply_cats,1))
        to.transform(L(to.cat_y),  partial(self._apply_cats,0))
    def decodes(self, to): to.transform(to.all_cat_names, self._decode_cats)
    # `proc['col']` returns the `CategoryMap` fitted for that column
    def __getitem__(self,k): return self.classes[k]

In [None]:
show_doc(Categorify, title_level=3)

<h3 id="Categorify" class="doc_header"><code>class</code> <code>Categorify</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>Categorify</code>(**`enc`**=*`None`*, **`dec`**=*`None`*, **`split_idx`**=*`None`*, **`as_item`**=*`False`*) :: [`TabularProc`](/tabular.core.html#TabularProc)

Transform the categorical variables to that type.

In [None]:
df = pd.DataFrame({'a':[0,1,2,0,2]})
to = TabularPandas(df, Categorify, 'a')
cat = to.procs.categorify
# `#na#` comes first in the classes of an input column, so the encoded values start at 1
test_eq(cat['a'], ['#na#',0,1,2])
test_eq(to.a, [1,2,3,1,3])

In [None]:
df1 = pd.DataFrame({'a':[1,0,3,-1,2]})
# `new` shares the already fitted procs; `process` applies them without running setup again
to1 = to.new(df1)
to1.process()
#Values that weren't in the training df are sent to 0 (na)
test_eq(to1.a, [2,1,0,0,3])
# Decoding turns code 0 back into the `#na#` placeholder
to2 = cat.decode(to1)
test_eq(to2.a, [1,0,'#na#','#na#',2])

In [None]:
#test with splits
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2]})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]])
# Classes come from the first split only (rows 0,1,2), so the value 3 of the second split is encoded as 0
test_eq(cat['a'], ['#na#',0,1,2])
test_eq(to['a'], [1,2,3,0,3])

In [None]:
# A column that is already a pandas categorical keeps its declared category order (H, M, L)
df = pd.DataFrame({'a':pd.Categorical(['M','H','L','M'], categories=['H','M','L'], ordered=True)})
to = TabularPandas(df, Categorify, 'a')
cat = to.procs.categorify
test_eq(cat['a'], ['#na#','H','M','L'])
test_eq(to.a, [2,1,3,2])
# Round trip: decoding gives back the original labels
to2 = cat.decode(to)
test_eq(to2.a, ['M','H','L','M'])

In [None]:
#export
class Normalize(TabularProc):
    "Normalize the continuous variables."
    order = 2
    def setups(self, dsrc):
        # Per-column mean and population (ddof=0) standard deviation of the continuous columns
        self.means = dsrc.conts.mean()
        # The small constant keeps the division in `encodes` away from zero
        self.stds = dsrc.conts.std(ddof=0)+1e-7
    def encodes(self, to):
        centered = to.conts-self.means
        to.conts = centered / self.stds
    def decodes(self, to):
        rescaled = to.conts*self.stds
        to.conts = rescaled + self.means

In [None]:
show_doc(Normalize, title_level=3)

<h3 id="Normalize" class="doc_header"><code>class</code> <code>Normalize</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>Normalize</code>(**`enc`**=*`None`*, **`dec`**=*`None`*, **`split_idx`**=*`None`*, **`as_item`**=*`False`*) :: [`TabularProc`](/tabular.core.html#TabularProc)

Normalize the continuous variables.

In [None]:
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a')
x = np.array([0,1,2,3,4])
# `np.std` defaults to ddof=0, the same convention `Normalize` uses
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(to.a.values, (x-m)/s)

In [None]:
# New data is normalized with the statistics computed on the original dataframe (`m`, `s`)
df1 = pd.DataFrame({'a':[5,6,7]})
to1 = to.new(df1)
to1.process()
test_close(to1['a'].values, (np.array([5,6,7])-m)/s)
# Decoding undoes the normalization
to2 = norm.decode(to1)
test_close(to2.a.values, [5,6,7])

In [None]:
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a', splits=[[0,1,2],[3,4]])
# Statistics come from the first split only (rows 0,1,2) ...
x = np.array([0,1,2])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
# ... but they are applied to every row
test_close(to['a'].values, (np.array([0,1,2,3,4])-m)/s)

In [None]:
#export
class FillStrategy:
    "Namespace containing the various filling strategies."
    # Every strategy takes the column `c` and the user supplied constant `fill` and returns the fill value.
    # They are static so they work both as `FillStrategy.median` and when reached through an instance.
    @staticmethod
    def median  (c,fill):
        "Median of `c` (`fill` is ignored)"
        return c.median()
    @staticmethod
    def constant(c,fill):
        "The constant `fill` (`c` is ignored)"
        return fill
    @staticmethod
    def mode    (c,fill):
        "Most frequent non-missing value of `c` (`fill` is ignored)"
        return c.dropna().value_counts().idxmax()

In [None]:
#export
class FillMissing(TabularProc):
    "Fill the missing values in continuous columns."
    def __init__(self, fill_strategy=FillStrategy.median, add_col=True, fill_vals=None):
        # `fill_vals` maps a column name to the constant handed to the strategy;
        # by default every column gets 0
        if fill_vals is None: fill_vals = defaultdict(int)
        store_attr(self, 'fill_strategy,add_col,fill_vals')

    def setups(self, dsrc):
        # `any()` yields one boolean per continuous column. Its keys are *all* the columns,
        # so the values have to be checked to keep only the columns that really contain missing values.
        has_na = pd.isnull(dsrc.conts).any()
        self.na_dict = {n:self.fill_strategy(dsrc[n], self.fill_vals[n])
                        for n,flag in has_na.items() if flag}

    def encodes(self, to):
        missing = pd.isnull(to.conts)
        # A column with missing values that had none at setup has no fill value: fail explicitly
        for n,flag in missing.any().items():
            if flag: assert n in self.na_dict, f"nan values in `{n}` but not in setup training set"
        # Process every column learned at setup (even when this data has no missing value in it),
        # so that all the data processed by this proc ends up with the same `_na` columns
        for n in self.na_dict:
            # Assign the filled column back instead of filling a selected column in place
            to[n] = to[n].fillna(self.na_dict[n])
            if self.add_col:
                to.loc[:,n+'_na'] = missing[n]
                if n+'_na' not in to.cat_names: to.cat_names.append(n+'_na')

In [None]:
show_doc(FillMissing, title_level=3)

<h3 id="FillMissing" class="doc_header"><code>class</code> <code>FillMissing</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>FillMissing</code>(**`fill_strategy`**=*`'median'`*, **`add_col`**=*`True`*, **`fill_vals`**=*`None`*) :: [`TabularProc`](/tabular.core.html#TabularProc)

Fill the missing values in continuous columns.

In [None]:
# One proc per strategy, each applied to its own copy of the same data
fill1,fill2,fill3 = (FillMissing(fill_strategy=s) 
                     for s in [FillStrategy.median, FillStrategy.constant, FillStrategy.mode])
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4]})
df1 = df.copy(); df2 = df.copy()
tos = TabularPandas(df, fill1, cont_names='a'),TabularPandas(df1, fill2, cont_names='a'),TabularPandas(df2, fill3, cont_names='a')
# Fill values: median of the non-missing values, the default constant 0, and the most frequent value
test_eq(fill1.na_dict, {'a': 1.5})
test_eq(fill2.na_dict, {'a': 0})
test_eq(fill3.na_dict, {'a': 1.0})

# The indicator column is registered as a categorical column
for t in tos: test_eq(t.cat_names, ['a_na'])

for to_,v in zip(tos, [1.5, 0., 1.]):
    test_eq(to_.a.values, np.array([0, 1, v, 1, 2, 3, 4]))
    # Only the row that was missing is flagged
    test_eq(to_.a_na.values, np.array([0, 0, 1, 0, 0, 0, 0]))

In [None]:
# New data is filled with the values learned at setup (1.5, 0 and 1), not with statistics of `dfa`
dfa = pd.DataFrame({'a':[np.nan,0,np.nan]})
tos = [t.new(o) for t,o in zip(tos,(dfa,dfa.copy(),dfa.copy()))]
for t in tos: t.process()
for to_,v in zip(tos, [1.5, 0., 1.]):
    test_eq(to_.a.values, np.array([v, 0, v]))
    test_eq(to_.a_na.values, np.array([1, 0, 1]))

## TabularPandas Pipelines -

In [None]:
# NOTE(review): the procs are not listed in the order they must run; the expected values below only hold
# if missing values are filled before normalizing and `b_na` exists before categorifying —
# presumably `Pipeline` sorts by `order`, confirm
procs = [Normalize, Categorify, FillMissing, noop]
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4]})
to = TabularPandas(df, procs, cat_names='a', cont_names='b')

#Test setup and apply on df
test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to.a, [1,2,3,2,2,3,1])
test_eq(to.b_na, [1,1,2,1,1,1,1])
# 1.5 is the median that replaced the missing value before normalization
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to.b.values, (x-m)/s)
test_eq(to.procs.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})

In [None]:
#Test apply on y_names
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')

test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to.a, [1,2,3,2,2,3,1])
test_eq(to.b_na, [1,1,2,1,1,1,1])
# The target is categorified too, but without `#na#`, so its codes start at 0
test_eq(to.c, [1,0,1,0,0,1,0])
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to.b.values, (x-m)/s)
test_eq(to.procs.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True], 'c': ['a','b']})

In [None]:
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')

test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to.a, [1,2,3,2,2,3,1])
# The caller's dataframe is left untouched: the procs work on the copy held by `to`
test_eq(df.a.dtype,int)
test_eq(to.b_na, [1,1,2,1,1,1,1])
test_eq(to.c, [1,0,1,0,0,1,0])

In [None]:
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,np.nan,1,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, cat_names='a', cont_names='b', y_names='c', splits=[[0,1,4,6], [2,3,5]])

# Rows are stored in split order (0,1,4,6 then 2,3,5), which is the order of the expected values below.
# The value 2 of `a` only occurs in the second split, so it is encoded as 0.
test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to.a, [1,2,2,1,0,2,0])
test_eq(df.a.dtype,int)
test_eq(to.b_na, [1,2,1,1,1,1,1])
test_eq(to.c, [1,0,0,0,1,0,1])

In [None]:
#export
class ReadTabBatch(ItemTransform):
    "Turn a tabular object into a (cats, conts, targ) tuple of tensors, and such a tuple back into a decoded tabular object"
    def __init__(self, to): self.to = to
    # TODO: use float for cont targ
    def encodes(self, to):
        cat_t  = tensor(to.cats).long()
        cont_t = tensor(to.conts).float()
        targ_t = tensor(to.targ).long()
        return cat_t,cont_t,targ_t

    def decodes(self, o):
        cats,conts,targs = to_np(o)
        col_names = self.to.cat_names+self.to.cont_names+self.to.y_names
        # The target gets a trailing axis so it can sit next to the other two arrays as one more column
        stacked = np.concatenate([cats,conts,targs[:,None]], axis=1)
        frame = pd.DataFrame(stacked, columns=col_names)
        # Wrap with the stored object's procs and column names, then undo the processing
        return self.to.procs.decode(self.to.new(frame))

In [None]:
#export
@typedispatch
def show_batch(x: Tabular, y, its, max_n=10, ctxs=None):
    "Show at most `max_n` rows of the tabular batch `x`"
    # `y`, `its` and `ctxs` belong to the dispatched signature but are not needed here.
    # `max_n` is forwarded so that the requested number of rows is honoured.
    x.show(max_n=max_n)

In [None]:
#export
@delegates()
class TabDataLoader(TfmdDL):
    "A `TfmdDL` for tabular objects: a batch is a row selection of the dataset, converted to tensors by `ReadTabBatch`"
    # No per-item processing: batches are built directly from row indices in `create_batch`
    do_item = noops
    def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
        # `ReadTabBatch` is appended after any user supplied `after_batch` transforms
        after_batch = L(after_batch)+ReadTabBatch(dataset)
        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)

    # `b` holds the row positions of the batch; one `iloc` lookup fetches them all at once
    def create_batch(self, b): return self.dataset.iloc[b]

# NOTE(review): presumably read by `databunch()` to choose the loader class — confirm
TabularPandas._dl_type = TabDataLoader

## Integration example

In [None]:
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# First 10000 rows for training/validation, the remaining rows as a test set; copies keep them independent of `df`
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [None]:
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
# NOTE(review): no seed is passed, so the split (and the outputs below) presumably change between runs — confirm
splits = RandomSplitter()(range_of(df_main))

In [None]:
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="salary", splits=splits)

CPU times: user 206 ms, sys: 245 µs, total: 206 ms
Wall time: 205 ms


In [None]:
dbch = to.databunch()
# The batch is shown decoded: original category labels and un-normalized continuous values
dbch.valid_dl.show_batch()

Unnamed: 0,age,fnlwgt,education-num,workclass,education,marital-status,occupation,relationship,race,age_na,fnlwgt_na,education-num_na,salary
0,58.999999,247552.003168,9.0,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,False,False,False,<50k
1,26.0,118497.002064,9.0,Private,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,False,False,False,<50k
2,36.0,176634.000068,13.0,Private,Bachelors,Married-civ-spouse,Sales,Wife,White,False,False,False,>=50k
3,18.0,216508.000263,9.0,?,HS-grad,Never-married,?,Own-child,White,False,False,False,<50k
4,35.0,328465.997477,5.0,Private,9th,Married-civ-spouse,Other-service,Husband,White,False,False,False,<50k
5,42.0,118685.99927,10.0,Private,Some-college,Divorced,Prof-specialty,Unmarried,White,False,False,False,<50k
6,48.0,173938.000765,9.0,Private,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,False,False,False,<50k
7,33.0,136330.997268,9.0,Private,HS-grad,Married-civ-spouse,Handlers-cleaners,Husband,White,False,False,False,<50k
8,23.0,69846.995196,13.0,Private,Bachelors,Never-married,Prof-specialty,Own-child,Asian-Pac-Islander,False,False,False,<50k
9,61.0,120939.00165,15.0,Private,Prof-school,Married-civ-spouse,Transport-moving,Husband,White,False,False,False,>=50k


In [None]:
# The test set is processed with the procs fitted on `df_main`; `new` does not run setup again
to_tst = to.new(df_test)
to_tst.process()
to_tst.all_cols.head()

Unnamed: 0,age,fnlwgt,education-num,workclass,education,marital-status,occupation,relationship,race,age_na,fnlwgt_na,education-num_na,salary
10000,0.464358,1.335795,1.156903,5,10,3,2,1,2,1,1,1,0
10001,-0.929807,1.249181,-0.430342,5,12,3,15,1,4,1,1,1,0
10002,1.051375,0.151684,-1.223964,5,2,1,9,2,5,1,1,1,0
10003,0.537735,-0.279386,-0.430342,5,12,7,2,5,5,1,1,1,0
10004,0.757866,1.437848,0.36328,6,9,3,5,1,5,1,1,1,1


## Not being used now - for multi-modal

In [None]:
class TensorTabular(Tuple):
    "Tuple of tabular tensors that can build and display its own show contexts"
    def get_ctxs(self, max_n=10, **kwargs):
        "One empty row per sample, capped at `max_n`"
        n_rows = min(self[0].shape[0], max_n)
        blank = pd.DataFrame(index = range(n_rows))
        return [blank.iloc[pos] for pos in range(n_rows)]

    def display(self, ctxs):
        "Show the contexts as a single dataframe"
        display_df(pd.DataFrame(ctxs))

class TabularLine(pd.Series):
    "A line of a dataframe that knows how to show itself"
    def show(self, ctx=None, **kwargs):
        "Return `self` when there is no `ctx`, otherwise `ctx` followed by the entries of `self`"
        if ctx is None: return self
        # `Series.append` was removed in pandas 2.0; `pd.concat` gives the same result for a `Series` context.
        # NOTE(review): `ctx` is assumed to be a `Series` (as produced by `TensorTabular.get_ctxs`)
        return pd.concat([ctx, self])

class ReadTabLine(ItemTransform):
    "Encode one dataframe row as a `TensorTabular` of (categorical, continuous) tensors (not used at the moment)"
    def __init__(self, proc): self.proc = proc

    def encodes(self, row):
        # Look up the value of `row` for every categorical, then every continuous column name
        cats,conts = (o.map(row.__getitem__) for o in (self.proc.cat_names,self.proc.cont_names))
        return TensorTabular(tensor(cats).long(),tensor(conts).float())

    def decodes(self, o):
        # NOTE(review): this call passes the column names positionally, which does not line up with the
        # `Tabular.__init__(df, procs, cat_names, cont_names, y_names, ...)` signature defined in this notebook
        # (`cat_names` would be received as `procs`). It looks like leftover code from an earlier API —
        # confirm before reviving this class.
        to = TabularPandas(o, self.proc.cat_names, self.proc.cont_names, self.proc.y_names)
        to = self.proc.decode(to)
        # NOTE(review): `to.items[0]+to.items[1]` presumably expects `items` to be a (cats, conts) pair — confirm
        return TabularLine(pd.Series({c: v for v,c in zip(to.items[0]+to.items[1], self.proc.cat_names+self.proc.cont_names)}))

class ReadTabTarget(ItemTransform):
    "Read the target of a dataframe row as an integer, and turn it back into a `Category`"
    def __init__(self, proc): self.proc = proc
    def encodes(self, row):
        target = row[self.proc.y_names]
        return target.astype(np.int64)
    def decodes(self, o):
        # Classes fitted for the target column, indexed by the integer code
        vocab = self.proc.classes[self.proc.y_names]
        return Category(vocab[o])

In [None]:
# tds = TfmdDS(to.items, tfms=[[ReadTabLine(proc)], ReadTabTarget(proc)])
# enc = tds[1]
# test_eq(enc[0][0], tensor([2,1]))
# test_close(enc[0][1], tensor([-0.628828]))
# test_eq(enc[1], 1)

# dec = tds.decode(enc)
# assert isinstance(dec[0], TabularLine)
# test_close(dec[0], pd.Series({'a': 1, 'b_na': False, 'b': 1}))
# test_eq(dec[1], 'a')

# test_stdout(lambda: print(show_at(tds, 1)), """a               1
# b_na        False
# b               1
# category        a
# dtype: object""")

## Export -

In [None]:
#hide
from local.notebook.export import notebook2script
notebook2script(all_fs=True)

Converted 00_test.ipynb.
Converted 01_core.ipynb.
Converted 01a_utils.ipynb.
Converted 01b_dispatch.ipynb.
Converted 01c_torch_core.ipynb.
Converted 02_script.ipynb.
Converted 03_dataloader.ipynb.
Converted 04_transform.ipynb.
Converted 05_data_core.ipynb.
Converted 06_data_transforms.ipynb.
Converted 07_vision_core.ipynb.
Converted 08_pets_tutorial.ipynb.
Converted 09_vision_augment.ipynb.
Converted 10_data_block.ipynb.
Converted 11_layers.ipynb.
Converted 11a_vision_models_xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_learner.ipynb.
Converted 14_callback_schedule.ipynb.
Converted 14a_callback_data.ipynb.
Converted 15_callback_hook.ipynb.
Converted 16_callback_progress.ipynb.
Converted 17_callback_tracker.ipynb.
Converted 18_callback_fp16.ipynb.
Converted 19_callback_mixup.ipynb.
Converted 20_metrics.ipynb.
Converted 21_tutorial_imagenette.ipynb.
Converted 22_vision_learner.ipynb.
Converted 23_tutorial_transfer_learning.ipynb.
Converted 30_text_core.ipynb.
Converted 31_tex