In [None]:
# default_exp data.tabular

# Data Tabular

> Main tabular functions used throughout the library. They are helpful when your time series come with additional data, such as metadata or time series features.

In [None]:
#export
from tsai.imports import *
from tsai.utils import *
from fastai.tabular.all import *

In [None]:
#export
@delegates(TabularPandas.__init__)
def get_tabular_ds(df, procs=[Categorify, FillMissing, Normalize], cat_names=None, cont_names=None, y_names=None,
                   y_block=None, splits=None, do_setup=True, inplace=False, reduce_memory=True, device=None, **kwargs):
    """Create a `TabularPandas` object from the columns of `df` listed in `cat_names`, `cont_names` and `y_names`.

    Args:
        df: source dataframe; only the referenced columns are handed to `TabularPandas`.
        procs: processing steps passed to `TabularPandas`.
        cat_names, cont_names, y_names: a single column name or a list of names (normalised with `str2list`).
        y_block: block used for the target(s). When it is None and `y_names` is given, a `CategoryBlock`
            is chosen if any target column is non-numeric, otherwise a `RegressionBlock`.
            An explicitly supplied block is passed through unchanged.
        splits, do_setup, inplace, reduce_memory: forwarded as-is to `TabularPandas`.
        device: target device; falls back to `default_device()` when None.
        **kwargs: accepted so callers can share one kwargs dict; not forwarded by this function.

    Returns:
        The constructed `TabularPandas` object.
    """
    device = ifnone(device, default_device())
    cat_names = str2list(cat_names)
    cont_names = str2list(cont_names)
    y_names = str2list(y_names)
    # Collect every referenced column exactly once, keeping first-seen order so that the
    # column layout does not depend on string hashing (as it would with a set).
    cols = []
    for _cols in [cat_names, cont_names, y_names]:
        if _cols is not None: cols.extend(_cols)
    cols = list(dict.fromkeys(cols))
    if y_names is None: y_block = None
    elif y_block is None:
        # No block requested: infer it from the dtype of the target column(s)
        num_cols = df._get_numeric_data().columns
        y_block = CategoryBlock() if any(n not in num_cols for n in y_names) else RegressionBlock()
    # Otherwise the caller's y_block is used as given.
    # Process-wide pandas setting (not restored afterwards): silences the chained-assignment warning.
    pd.options.mode.chained_assignment=None
    to = TabularPandas(df[cols], procs=procs, cat_names=cat_names, cont_names=cont_names, y_names=y_names, y_block=y_block,
                       splits=splits, do_setup=do_setup, inplace=inplace, reduce_memory=reduce_memory, device=device)
    return to

In [None]:
#export
@delegates(DataLoaders.__init__)
def get_tabular_dls(df, procs=[Categorify, FillMissing, Normalize], cat_names=None, cont_names=None, y_names=None, bs=64,
                    y_block=None, splits=None, do_setup=True, inplace=False, reduce_memory=True, device=None, **kwargs):
    """Build a tabular dataset with `get_tabular_ds` and return its dataloaders.

    The batch size is capped so it never exceeds the number of available rows:
    the length of the first split when `splits` is given, otherwise `len(df)`.
    `**kwargs` are handed both to `get_tabular_ds` and to `.dataloaders(...)`.
    """
    tabular = get_tabular_ds(df, procs=procs, cat_names=cat_names, cont_names=cont_names, y_names=y_names,
                             y_block=y_block, splits=splits, do_setup=do_setup, inplace=inplace,
                             reduce_memory=reduce_memory, device=device, **kwargs)
    # Rows the batch size is measured against: first split if provided, whole dataframe otherwise
    n_rows = len(df) if splits is None else len(splits[0])
    return tabular.dataloaders(device=device, bs=min(n_rows, bs), **kwargs)

In [None]:
# Fetch the ADULT_SAMPLE dataset and load its csv file into a dataframe
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# df['salary'] = np.random.rand(len(df)) # uncomment to simulate a cont dependent variable

cat_names = ['workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
             'capital-gain', 'capital-loss', 'native-country']
cont_names = ['age', 'fnlwgt', 'hours-per-week']
target = ['salary']
# Split the row indices of df with RandomSplitter (default arguments)
splits = RandomSplitter()(range_of(df))

# Use the `target` list defined above instead of repeating the column name
dls = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=512)
dls.show_batch()

Unnamed: 0,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,native-country,age,fnlwgt,hours-per-week,salary
0,State-gov,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,Asian-Pac-Islander,Male,0,0,United-States,28.0,73211.002357,20.0,<50k
1,?,Some-college,10,Divorced,?,Not-in-family,White,Female,0,0,United-States,40.0,341538.99976,30.0,<50k
2,Federal-gov,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,United-States,44.0,113597.002477,55.0,>=50k
3,Private,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,United-States,54.000001,186116.999978,45.0,>=50k
4,?,7th-8th,4,Married-civ-spouse,?,Husband,White,Male,0,0,United-States,69.000001,168794.000089,48.0,<50k
5,Private,9th,5,Never-married,Sales,Own-child,Black,Male,0,0,United-States,17.0,230789.000349,22.0,<50k
6,Federal-gov,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,5178,0,United-States,41.0,168293.999985,40.0,>=50k
7,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,United-States,47.0,188081.000041,40.0,>=50k
8,Private,Bachelors,13,Married-civ-spouse,Other-service,Husband,White,Male,0,0,United-States,29.0,464536.008586,40.0,<50k
9,Private,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,United-States,34.0,561334.001646,50.0,>=50k


In [None]:
# Choose the metric from the dataloaders: mae when dls.c == 1, accuracy otherwise
if dls.c == 1:
    metrics = mae
else:
    metrics = accuracy
learn = tabular_learner(dls, layers=[200, 100], y_range=None, metrics=metrics)
# Train for 1 epoch with a learning rate of 1e-2
learn.fit(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.345188,0.302588,0.866247,00:04


In [None]:
# Pull one batch from the learner's dataloaders to inspect it (shown as the cell output)
learn.dls.one_batch()

(tensor([[ 5, 16, 10,  ...,  1,  1, 40],
         [ 5, 10, 13,  ...,  1,  1, 40],
         [ 5, 16, 10,  ...,  1,  1, 40],
         ...,
         [ 3, 13, 14,  ...,  1, 50, 40],
         [ 5, 10, 13,  ...,  1, 44, 30],
         [ 5,  7,  5,  ...,  1,  1, 40]]),
 tensor([[-0.0419, -0.2263,  0.7724],
         [ 0.2514, -1.0469, -0.0336],
         [ 0.3248, -1.2777, -0.0336],
         ...,
         [ 1.0581,  0.0993,  1.1754],
         [-0.3352,  0.3181, -0.0336],
         [-0.3352, -1.5428,  0.7724]]),
 tensor([[0],
         [1],
         [0],
         [1],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [1],
         [1],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [1],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [1],
    

In [None]:
# Display the learner's model (its repr is shown as the cell output)
learn.model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(10, 6)
    (1): Embedding(17, 8)
    (2): Embedding(17, 8)
    (3): Embedding(8, 5)
    (4): Embedding(16, 8)
    (5): Embedding(7, 5)
    (6): Embedding(6, 4)
    (7): Embedding(3, 3)
    (8): Embedding(115, 23)
    (9): Embedding(91, 20)
    (10): Embedding(43, 13)
  )
  (emb_drop): Dropout(p=0.0, inplace=False)
  (bn_cont): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): LinBnDrop(
      (0): Linear(in_features=106, out_features=200, bias=False)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): LinBnDrop(
      (0): Linear(in_features=200, out_features=100, bias=False)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): LinBnDrop(
      (0): Linear(in_features=100, out_features=2, bias=True)
   

In [None]:
#hide
from tsai.imports import create_scripts
from tsai.export import get_nb_name
# NOTE(review): presumably resolves the name of the notebook being run — confirm against tsai.export
nb_name = get_nb_name()
# NOTE(review): presumably exports this notebook's `#export` cells to library scripts — confirm against tsai.imports
create_scripts(nb_name);