In [None]:
# default_exp data.tabular

# Data Tabular

> Main tabular functions used throughout the library. These are helpful when your time series come with additional data, such as metadata or time series features.

In [None]:
#export
from tsai.imports import *
from tsai.utils import *
from fastai.tabular.all import *

In [None]:
#export
@delegates(DataLoaders.__init__)
def get_tabular_dls(df, procs=[Categorify, FillMissing, Normalize], cat_names=None, cont_names=None, y_names=None, bs=64,
                    y_block=None, splits=None, do_setup=True, inplace=False, reduce_memory=True, device=None, **kwargs):
    """Create tabular dataloaders from the selected columns of a DataFrame.

    Args:
        df: source pandas DataFrame.
        procs: processing steps forwarded to `TabularPandas`.
        cat_names: name (str) or list of names of the categorical columns.
        cont_names: name (str) or list of names of the continuous columns.
        y_names: name (str) or list of names of the target column(s).
        bs: requested batch size; capped at `len(splits[0])` when `splits` is given,
            otherwise at `len(df)`.
        y_block: block used for the target. When it is None and `y_names` is given,
            `CategoryBlock()` is used if any target column is non-numeric, else
            `RegressionBlock()`. An explicitly passed block is kept as is.
        splits: index splits forwarded to `TabularPandas`.
        do_setup, inplace, reduce_memory: forwarded to `TabularPandas`.
        device: device for the data; defaults to `default_device()`.
        **kwargs: forwarded to `TabularPandas.dataloaders`.

    Returns:
        The object returned by `TabularPandas(...).dataloaders(...)`.
    """
    device = ifnone(device, default_device())
    # Accept either a single column name or a list of names.
    cat_names = str2list(cat_names)
    cont_names = str2list(cont_names)
    y_names = str2list(y_names)
    # Gather every column in use, de-duplicated in first-seen order. A plain `set` would
    # make the column order depend on string hashing and therefore vary between runs.
    cols = list(dict.fromkeys(c for names in (cat_names, cont_names, y_names) if names is not None for c in names))
    if y_names is None: y_block = None
    elif y_block is None:
        # Infer the target block: any non-numeric target column means classification.
        num_cols = df._get_numeric_data().columns
        y_block = CategoryBlock() if any(n not in num_cols for n in y_names) else RegressionBlock()
    # else: the caller supplied a y_block explicitly, so it is used unchanged.
    # NOTE: this sets a global pandas option (silences chained-assignment warnings) and is not restored.
    pd.options.mode.chained_assignment=None
    to = TabularPandas(df[cols], procs=procs, cat_names=cat_names, cont_names=cont_names, y_names=y_names, y_block=y_block,
                       splits=splits, do_setup=do_setup, inplace=inplace, reduce_memory=reduce_memory, device=device)
    # Never request a batch larger than the available data.
    if splits is not None: bs = min(len(splits[0]), bs)
    else: bs = min(len(df), bs)
    return to.dataloaders(device=device, bs=bs, **kwargs)

In [None]:
# Load the ADULT_SAMPLE csv located via `untar_data` into a DataFrame.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# df['salary'] = np.random.rand(len(df)) # uncomment to simulate a continuous dependent variable

cat_names = ['workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
             'capital-gain', 'capital-loss', 'native-country']
cont_names = ['age', 'fnlwgt', 'hours-per-week']
target = ['salary']
splits = RandomSplitter()(range_of(df))

# Pass `target` rather than repeating the literal column name, so the target is defined in one place.
dls = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=512)
dls.show_batch()

Unnamed: 0,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,native-country,age,fnlwgt,hours-per-week,salary
0,Private,Some-college,10.0,Married-spouse-absent,Craft-repair,Not-in-family,White,Male,0,0,United-States,35.0,169103.99926,40.0,<50k
1,Private,Assoc-voc,11.0,Never-married,Tech-support,Not-in-family,White,Female,0,0,United-States,38.0,81965.002066,40.0,<50k
2,Private,Some-college,10.0,Never-married,Adm-clerical,Own-child,White,Female,0,0,United-States,22.0,174460.999724,14.999999,<50k
3,Self-emp-not-inc,9th,5.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,Portugal,26.0,117124.998195,40.0,<50k
4,Private,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,United-States,43.0,427382.004489,50.0,>=50k
5,Federal-gov,Bachelors,13.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0,1887,United-States,39.0,99146.002607,60.0,>=50k
6,Private,HS-grad,9.0,Never-married,Sales,Own-child,Amer-Indian-Eskimo,Female,0,0,United-States,18.0,301867.001281,20.0,<50k
7,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,United-States,45.0,34126.997287,50.0,>=50k
8,Private,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,United-States,64.999999,95302.999546,40.0,>=50k
9,Private,Some-college,10.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,United-States,46.0,165138.00006,45.0,>=50k


In [None]:
# Choose the metric from the number of outputs reported by the dataloaders:
# `mae` when dls.c == 1, `accuracy` otherwise.
if dls.c == 1:
    metrics = mae
else:
    metrics = accuracy
learn = tabular_learner(dls, metrics=metrics, layers=[200, 100], y_range=None)
learn.fit(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.344403,0.284031,0.869011,00:04


In [None]:
# Pull a single batch from the learner's dataloaders to inspect the tensors it yields.
learn.dls.one_batch()

(tensor([[ 5,  8, 12,  ...,  1,  1, 40],
         [ 5, 16, 10,  ...,  1,  1, 40],
         [ 5, 12,  9,  ...,  1,  1, 40],
         ...,
         [ 5, 16, 10,  ...,  1,  1, 40],
         [ 5,  7,  5,  ...,  1,  1, 20],
         [ 7,  8, 12,  ...,  1,  1, 40]]),
 tensor([[ 0.3952,  1.0213,  0.7707],
         [-0.9215, -0.4102, -0.0330],
         [ 0.0294, -0.8546, -0.0330],
         ...,
         [-0.4095, -0.7308, -0.0330],
         [ 0.4683, -0.1372, -0.0330],
         [ 0.1026, -0.1787,  0.7707]]),
 tensor([[0],
         [0],
         [0],
         [1],
         [1],
         [0],
         [0],
         [0],
         [1],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [1],
         [0],
         [0],
         [0],
         [0],
         [1],
         [0],
         [1],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [1],
    

In [None]:
# Display the architecture of the model built by `tabular_learner`.
learn.model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(10, 6)
    (1): Embedding(17, 8)
    (2): Embedding(17, 8)
    (3): Embedding(8, 5)
    (4): Embedding(16, 8)
    (5): Embedding(7, 5)
    (6): Embedding(6, 4)
    (7): Embedding(3, 3)
    (8): Embedding(120, 23)
    (9): Embedding(90, 20)
    (10): Embedding(43, 13)
  )
  (emb_drop): Dropout(p=0.0, inplace=False)
  (bn_cont): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): LinBnDrop(
      (0): Linear(in_features=106, out_features=200, bias=False)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): LinBnDrop(
      (0): Linear(in_features=200, out_features=100, bias=False)
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): LinBnDrop(
      (0): Linear(in_features=100, out_features=2, bias=True)
   

In [None]:
#hide
from tsai.imports import create_scripts
from tsai.export import get_nb_name
# NOTE(review): presumably resolves this notebook's name and exports it to library scripts — confirm against tsai.export / tsai.imports.
nb_name = get_nb_name()
create_scripts(nb_name);