In [None]:
#export
from local.torch_basics import *
from local.test import *
from local.tabular.core import *

In [None]:
from local.notebook.showdoc import *

In [None]:
#default_exp tabular.model

# Tabular model

> A basic model that can be used on tabular data

## Model

In [None]:
def emb_sz_rule(n_cat):
    "Heuristic embedding width for a categorical variable with `n_cat` levels (never more than 600)"
    suggested = round(1.6 * n_cat**0.56)
    # Cap the width so very high-cardinality columns do not get oversized embeddings
    return suggested if suggested < 600 else 600

In [None]:
def _one_emb_sz(classes, n, sz_dict=None):
    "Return `(n_categories, embedding width)` for column `n`; a width listed in `sz_dict` wins over the rule of thumb."
    overrides = ifnone(sz_dict, {})
    cardinality = len(classes[n])
    # Fallback width from the rule of thumb, used only when `n` has no entry in `sz_dict`
    default_width = int(emb_sz_rule(cardinality))
    width = overrides.get(n, default_width)
    return cardinality,width

In [None]:
def get_emb_sz(to, sz_dict=None):
    "Get one `(n_categories, embedding width)` pair per name in `to.cat_names`, using `sz_dict` for any widths it specifies"
    emb_szs = []
    for name in to.cat_names:
        # Category vocabularies are looked up on the processors attached to `to`
        emb_szs.append(_one_emb_sz(to.procs.classes, name, sz_dict))
    return emb_szs

In [None]:
class TabularModel(Module):
    "Basic model for tabular data."
    def __init__(self, emb_szs, n_cont, out_sz, layers, ps=None, embed_p=0., y_range=None, use_bn=True, bn_final=False):
        # emb_szs:  `(n_categories, embedding_width)` pairs; one `Embedding` is built per pair
        # n_cont:   number of continuous input features (size of the input batchnorm)
        # out_sz:   width of the final linear block
        # layers:   widths of the hidden linear blocks
        # ps:       dropout probability per hidden block; a scalar is repeated for every block, `None` means 0
        # embed_p:  dropout probability applied to the concatenated embeddings
        # y_range:  optional `(low, high)`; when given, outputs are squashed into it with a sigmoid
        # use_bn:   passed as `bn` to every block except the first one
        # bn_final: append a `BatchNorm1d` over the outputs
        #
        # Copy `layers`/`ps` into plain lists so the `+` concatenations below work for any
        # sequence type (e.g. tuples), not only `list`.
        layers = list(layers)
        ps = ifnone(ps, [0]*len(layers))
        ps = list(ps) if is_listy(ps) else [ps]*len(layers)
        self.embeds = nn.ModuleList([Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(embed_p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        n_emb = sum(e.embedding_dim for e in self.embeds)
        self.n_emb,self.n_cont,self.y_range = n_emb,n_cont,y_range
        # Input width is all embedding widths plus the continuous features
        sizes = [n_emb + n_cont] + layers + [out_sz]
        # ReLU after every hidden block, no activation after the output block
        actns = [nn.ReLU(inplace=True) for _ in range(len(sizes)-2)] + [None]
        # The first block gets dropout 0 and no batchnorm (`i!=0`); hidden-block dropouts follow from `ps`
        _layers = [BnDropLin(sizes[i], sizes[i+1], bn=use_bn and i!=0, p=p, act=a)
                       for i,(p,a) in enumerate(zip([0.]+ps,actns))]
        if bn_final: _layers.append(nn.BatchNorm1d(sizes[-1]))
        self.layers = nn.Sequential(*_layers)

    def forward(self, x_cat, x_cont):
        # x_cat: column `i` is fed to the `i`-th embedding; x_cont: continuous features
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            x_cont = self.bn_cont(x_cont)
            # Without embeddings the normalized continuous features are the whole input
            x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont
        x = self.layers(x)
        if self.y_range is not None:
            # Rescale sigmoid output from (0, 1) to (y_range[0], y_range[1])
            x = (self.y_range[1]-self.y_range[0]) * torch.sigmoid(x) + self.y_range[0]
        return x

## Integration example with training

In [None]:
from local.data.all import *
from local.tabular.core import *
from local.optimizer import *
from local.learner import *
from local.metrics import *
from local.callback.all import *

In [None]:
# Fetch the Adult sample dataset via `untar_data` and load its CSV into a DataFrame
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

In [None]:
# Columns of `df` used as categorical and continuous inputs
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
# Preprocessing steps handed to `TabularPandas` in the next cell
procs = [Categorify, FillMissing, Normalize]
# Split computed by `RandomSplitter` over `range_of(df)`; passed as `splits` below
splits = RandomSplitter()(range_of(df))

In [None]:
# Bundle the DataFrame with its processors, column roles, target column ("salary") and splits
to = TabularPandas(df, procs, cat_names, cont_names, y_names="salary", splits=splits)

In [None]:
# Training loader: shuffled batches of 64, dropping the last incomplete batch
trn_dl = TabDataLoader(to.train, bs=64, num_workers=0, shuffle=True, drop_last=True)
# Validation loader: batches of 128 with default ordering
val_dl = TabDataLoader(to.valid, bs=128, num_workers=0)
dbunch = DataBunch(trn_dl, val_dl)
# Display a sample batch as a sanity check
dbunch.show_batch()

Unnamed: 0,age,fnlwgt,education-num,workclass,education,marital-status,occupation,relationship,race,age_na,fnlwgt_na,education-num_na,salary
0,39.0,167777.000751,10.0,Private,11th,Married-civ-spouse,Craft-repair,Husband,White,False,False,True,<50k
1,18.0,331510.994855,10.0,?,Some-college,Never-married,?,Own-child,White,False,False,False,<50k
2,19.0,247297.997987,8.0,Private,12th,Married-spouse-absent,Other-service,Own-child,Other,False,False,False,<50k
3,41.0,141327.001318,14.0,Self-emp-not-inc,Masters,Divorced,Prof-specialty,Unmarried,White,False,False,False,<50k
4,23.999999,191072.999956,9.0,Federal-gov,HS-grad,Never-married,Armed-Forces,Own-child,White,False,False,False,<50k
5,38.0,478829.000011,15.0,Self-emp-inc,Prof-school,Married-civ-spouse,Prof-specialty,Husband,White,False,False,False,>=50k
6,59.0,108495.999087,10.0,Private,Masters,Married-civ-spouse,Exec-managerial,Husband,White,False,False,True,>=50k
7,90.000001,313748.998304,13.0,Private,Bachelors,Never-married,Prof-specialty,Own-child,White,False,False,False,<50k
8,25.0,470202.989434,13.0,Private,Bachelors,Never-married,Prof-specialty,Not-in-family,White,False,False,False,<50k
9,34.0,137900.001016,9.0,Private,HS-grad,Never-married,Sales,Not-in-family,White,False,False,False,<50k


In [None]:
# Embedding sizes come from `to`; 2 outputs and hidden blocks of width 200 and 100
model = TabularModel(get_emb_sz(to), len(to.cont_names), 2, [200,100])

In [None]:
# Adam with weight decay 0.01 and eps 1e-5, bound once so the Learner can build the optimizer
opt_func = partial(Adam, wd=0.01, eps=1e-5)
# Train with flattened cross-entropy loss and report accuracy
learn = Learner(dbunch, model, CrossEntropyLossFlat(), opt_func=opt_func, metrics=accuracy)

In [None]:
# Train for a single epoch
learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,accuracy,time
0,0.409279,0.364448,0.829699,00:25


In [None]:
#export
@typedispatch
def show_results(x:Tabular, y:Tabular, its, ctxs=None, max_n=10, **kwargs):
    "Display up to `max_n` rows of `x` with the matching values of `y` added as `<target>_pred` column(s)"
    # Copy the slice before adding columns so pandas does not emit SettingWithCopyWarning
    # (assumes `x.all_cols` is a pandas DataFrame, as the displayed output suggests)
    df = x.all_cols[:max_n].copy()
    # Take the target name(s) from `x` itself: the function must not rely on a notebook-level
    # variable, which does not exist once this cell is exported to a module.
    y_names = [x.y_names] if isinstance(x.y_names, str) else x.y_names
    for n in y_names: df[n+'_pred'] = y[n][:max_n].values
    display_df(df)

In [None]:
# Show inputs, targets and predictions side by side
learn.show_results()

Unnamed: 0,age,fnlwgt,education-num,workclass,education,marital-status,occupation,relationship,race,age_na,fnlwgt_na,education-num_na,salary,salary_pred
0,36.0,210452.000142,13.0,Private,Bachelors,Divorced,Prof-specialty,Unmarried,White,False,False,False,<50k,<50k
1,40.0,333529.998393,14.0,Local-gov,Masters,Married-civ-spouse,Prof-specialty,Wife,White,False,False,False,>=50k,>=50k
2,39.0,178947.999776,9.0,Self-emp-not-inc,HS-grad,Married-civ-spouse,Farming-fishing,Wife,White,False,False,False,<50k,<50k
3,55.000001,325007.005137,12.0,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Husband,White,False,False,False,<50k,>=50k
4,34.0,110553.997633,9.0,Private,HS-grad,Divorced,Sales,Own-child,White,False,False,False,<50k,<50k
5,30.0,203833.999595,13.0,?,Bachelors,Never-married,?,Not-in-family,Asian-Pac-Islander,False,False,False,<50k,<50k
6,38.0,33982.996384,10.0,Private,Some-college,Married-civ-spouse,Transport-moving,Husband,White,False,False,False,<50k,<50k
7,59.0,188002.999998,13.0,Self-emp-not-inc,Bachelors,Widowed,Prof-specialty,Not-in-family,White,False,False,False,>=50k,<50k
8,67.999999,123652.996893,3.0,Private,5th-6th,Separated,Other-service,Not-in-family,White,False,False,False,<50k,<50k
9,22.0,65703.997362,9.0,Private,HS-grad,Never-married,Sales,Own-child,White,False,False,False,<50k,<50k


## Export -

In [None]:
#hide
from local.notebook.export import notebook2script
# Run the notebook-to-script export over all notebooks (see the "Converted ..." log below)
notebook2script(all_fs=True)

Converted 00_test.ipynb.
Converted 01_core.ipynb.
Converted 01a_torch_core.ipynb.
Converted 02_script.ipynb.
Converted 03_dataloader.ipynb.
Converted 04_transform.ipynb.
Converted 05_data_core.ipynb.
Converted 06_data_transforms.ipynb.
Converted 07_vision_core.ipynb.
Converted 08_pets_tutorial.ipynb.
Converted 09_vision_augment.ipynb.
Converted 11_layers.ipynb.
Converted 11a_vision_models_xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_learner.ipynb.
Converted 14_callback_schedule.ipynb.
Converted 15_callback_hook.ipynb.
Converted 16_callback_progress.ipynb.
Converted 17_callback_tracker.ipynb.
Converted 18_callback_fp16.ipynb.
Converted 19_callback_mixup.ipynb.
Converted 20_metrics.ipynb.
Converted 21_tutorial_imagenette.ipynb.
Converted 22_vision_learner.ipynb.
Converted 23_tutorial_transfer_learning.ipynb.
Converted 30_text_core.ipynb.
Converted 31_text_data.ipynb.
Converted 32_text_models_awdlstm.ipynb.
Converted 33_text_models_core.ipynb.
Converted 34_callback_rnn.ipynb.