In [5]:
from fastai2.test import *
from fastai2.torch_basics import *
from fastai2.basics import *
from fastai2.callback.all import *
from fastai2.vision.all import *
from fastai2.tabular.core import *
from fastai2.layers import *

In [6]:
from fastai2.notebook.showdoc import *

In [7]:
pd.set_option('mode.chained_assignment','raise')

In [8]:
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [9]:
df.shape

(32561, 15)

In [10]:
class Normalize(TabularProc):
    "Normalize the continuous variables."
    order = 2
    def setups(self, dsrc): self.means,self.stds = dsrc.conts.mean(),dsrc.conts.std(ddof=0)+1e-7
    def encodes(self, to): to.conts = (to.conts-self.means) / self.stds
    def decodes(self, to): to.conts = (to.conts*self.stds ) + self.means

In [11]:
class FillStrategy:
    "Namespace containing the various filling strategies."
    def median        (c,fill):    return c.median()
    def constant      (c,fill):    return fill
    def mode          (c,fill):    return c.dropna().value_counts().idxmax()

In [12]:
class FillMissing(TabularProc):
    "Fill the missing values in continuous columns."
    def __init__(self, fill_strategy=FillStrategy.median, add_col=True, fill_vals=None):
        if fill_vals is None: fill_vals = defaultdict(int)
        store_attr(self, 'fill_strategy,add_col,fill_vals')

    def setups(self, dsrc):
        self.na_dict = {n:self.fill_strategy(dsrc[n], self.fill_vals[n])
                        for n in pd.isnull(dsrc.conts).any().keys()}

    def encodes(self, to):
        missing = pd.isnull(to.conts)
        for n in missing.any().keys():
            assert n in self.na_dict, f"nan values in `{n}` but not in setup training set"
            to[n].fillna(self.na_dict[n], inplace=True)
            if self.add_col:
                to.loc[:,n+'_na'] = missing[n]
                if n+'_na' not in to.cat_names: to.cat_names.append(n+'_na')

In [13]:
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))

In [14]:
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names='salary',  splits=splits)

CPU times: user 148 ms, sys: 3.11 ms, total: 151 ms
Wall time: 161 ms


In [15]:
trn_dl = TabDataLoader(to.train, bs=64, shuffle=True, drop_last=True)
val_dl = TabDataLoader(to.valid, bs=128)

dbunch = DataBunch(trn_dl, val_dl)

In [16]:
dbunch.valid_dl.show()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,age_na,fnlwgt_na,education-num_na,age,fnlwgt,education-num,salary
4556,6,13,1,11,2,5,1,1,1,1.205986,-0.66417,1.564507,0
3841,5,13,5,2,4,5,1,1,1,0.099949,-0.747393,1.564507,0
5749,5,12,3,14,1,3,1,1,1,1.353458,-0.221012,-0.432184,1
9301,5,16,5,13,4,5,1,1,1,-1.227296,-1.289638,-0.032846,0
901,5,12,3,0,1,5,1,1,1,0.394892,-0.733707,-0.432184,0
595,5,10,5,0,2,5,1,1,1,-1.006089,-1.428941,1.165169,0
5104,8,12,3,2,1,5,1,1,1,1.279722,-0.761784,-0.432184,0
5807,5,16,5,13,4,5,1,1,1,-1.448504,-0.217085,-0.032846,0
7944,5,9,3,4,1,5,1,1,1,-0.342466,0.453039,0.366493,0
4433,3,9,3,14,6,5,1,1,1,-0.711145,1.67371,0.366493,1


In [24]:
def emb_sz_rule(n_cat): 
    "Rule of thumb to pick embedding size corresponding to `n_cat`"
    return min(600, round(1.6 * n_cat**0.56))

def _one_emb_sz(classes, n, sz_dict=None):
    "Pick an embedding size for `n` depending on `classes` if not given in `sz_dict`."
    sz_dict = ifnone(sz_dict, {})
    n_cat = len(classes[n])
    sz = sz_dict.get(n, int(emb_sz_rule(n_cat)))  # rule of thumb
    return n_cat,sz

def get_emb_sz(to, sz_dict=None):
    "Get default embedding size from `TabularPreprocessor` `proc` or the ones in `sz_dict`"
    return [_one_emb_sz(to.procs.classes, n, sz_dict) for n in to.cat_names]

class TabularModel(Module):
    "Basic model for tabular data."
    def __init__(self, emb_szs, n_cont, out_sz, layers, ps=None, embed_p=0., y_range=None, use_bn=True, bn_final=False):
        ps = ifnone(ps, [0]*len(layers))
        if not is_listy(ps): ps = [ps]*len(layers)
        self.embeds = nn.ModuleList([Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(embed_p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        n_emb = sum(e.embedding_dim for e in self.embeds)
        self.n_emb,self.n_cont,self.y_range = n_emb,n_cont,y_range
        sizes = [n_emb + n_cont] + layers + [out_sz]
        actns = [nn.ReLU(inplace=True) for _ in range(len(sizes)-2)] + [None]
        _layers = [LinBnDrop(sizes[i], sizes[i+1], bn=use_bn and i!=0, p=p, act=a)
                       for i,(p,a) in enumerate(zip([0.]+ps,actns))]
        if bn_final: _layers.append(nn.BatchNorm1d(sizes[-1]))
        self.layers = nn.Sequential(*_layers)
    
    def forward(self, x_cat, x_cont):
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            x_cont = self.bn_cont(x_cont)
            x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont
        x = self.layers(x)
        if self.y_range is not None:
            x = (self.y_range[1]-self.y_range[0]) * torch.sigmoid(x) + self.y_range[0]

        return x


In [25]:

emb_szs = get_emb_sz(to);emb_szs

[(10, 6), (17, 8), (8, 5), (16, 8), (7, 5), (6, 4), (2, 2), (2, 2), (3, 3)]

In [26]:

cont_len = len(to.cont_names); cont_len

3

In [27]:
net = TabularModel(emb_szs, cont_len, 2, [200,100]);net

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(10, 6)
    (1): Embedding(17, 8)
    (2): Embedding(8, 5)
    (3): Embedding(16, 8)
    (4): Embedding(7, 5)
    (5): Embedding(6, 4)
    (6): Embedding(2, 2)
    (7): Embedding(2, 2)
    (8): Embedding(3, 3)
  )
  (emb_drop): Dropout(p=0.0, inplace=False)
  (bn_cont): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): LinBnDrop(
      (0): Linear(in_features=46, out_features=200, bias=True)
      (1): ReLU(inplace=True)
    )
    (1): LinBnDrop(
      (0): Linear(in_features=200, out_features=100, bias=False)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): LinBnDrop(
      (0): Linear(in_features=100, out_features=2, bias=False)
      (1): BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
)

In [28]:
Adam

<function fastai2.optimizer.Adam(params, lr, mom=0.9, sqr_mom=0.99, eps=1e-05, wd=0.0, decouple_wd=True)>

In [29]:
opt_func = partial(Adam, wd=0.01, eps=1e-5)
learn = Learner(dbunch, net, CrossEntropyLossFlat(), opt_func=opt_func, metrics=accuracy)

In [30]:
learn.fit(1)

epoch,train_loss,valid_loss,accuracy,time
0,0.46988,0.414378,0.8155,00:03
