In [None]:
# default_exp data.tabular

# Data Tabular

> Main tabular functions used throughout the library. These are helpful when you have additional data alongside your time series, such as metadata or time series features.

In [None]:
#export
from tsai.imports import *
from fastai.tabular.all import *

In [None]:
# # hide
# class TabularDataset():
#     "A `Numpy` dataset from a `TabularPandas` object"
#     def __init__(self, to, n_inp=None):
#         self.cats = to.cats.to_numpy().astype(np.long)
#         self.conts = to.conts.to_numpy().astype(np.float32)
#         self.ys = to.ys.to_numpy()
#         self.cat_names = to.cat_names
#         self.classes = to.classes
#         self.cont_names = to.cont_names
#         self.loss_func = MSELossFlat() if isinstance(self.ys[0][0], float) else CrossEntropyLossFlat()
#         self.n_inp = ifnone(n_inp, 1)
#     def __getitem__(self, idx): return self.cats[idx], self.conts[idx], self.ys[idx]
#     def __len__(self): return len(self.cats)
#     @property
#     def c(self): 
#         return 0 if self.ys is None else 1 if isinstance(self.ys[0][0].cpu(), (torch.FloatTensor, torch.DoubleTensor)) else len(np.unique(self.ys.cpu()))

# class TabularDataLoader(DataLoader):
#     def __init__(self, dataset, bs=1, **kwargs):
#         super().__init__(dataset, bs=min(bs, len(dataset)), **kwargs)
#         shuffle = kwargs.get('shuffle',None)
#         device = ifnone(kwargs.get('device',None), default_device())
#         self.device, self.shuffle = device, shuffle
#         self.n_inp = self.dataset.n_inp
#     def create_item(self, s): return s
#     def get_idxs(self):
#         idxs = Inf.count if self.indexed else Inf.nones
#         if self.n is not None: idxs = list(range(len(self.dataset)))
#         if self.shuffle: self.shuffle_fn()
#         return idxs
#     def create_batch(self, b):
#         return self.dataset[b[0]:b[0]+self.bs]
#     def shuffle_fn(self):
#         "Shuffle dataset after each epoch"
#         rng = np.random.permutation(len(self.dataset))
#         self.dataset.cats = self.dataset.cats[rng]
#         self.dataset.conts = self.dataset.conts[rng]
#         self.dataset.ys = self.dataset.ys[rng]
#     def to(self, device): 
#         self.device = device
#         self.dataset.cats = tensor(self.dataset.cats).to(device=self.device)
#         self.dataset.conts = tensor(self.dataset.conts).to(device=self.device)
#         self.dataset.ys = tensor(self.dataset.ys).to(device=self.device)

In [None]:
#hide
# train_ds = TabularDataset(to.train)
# valid_ds = TabularDataset(to.valid)
# train_dl = TabularDataLoader(train_ds, bs=512, train=True)
# valid_dl = TabularDataLoader(valid_ds, bs=512)
# dls = DataLoaders(train_dl,valid_dl)
# dls.c, dls.loss_func

In [None]:
# hide
# emb_szs = get_emb_sz(to)
# net = TabularModel(emb_szs, len(dls.cont_names), dls.c, layers=[200,100], y_range=None)#.cuda()
# metrics=mae if dls.c == 1 else accuracy
# learn = Learner(dls, net, metrics=metrics, loss_func=MSELossFlat() if dls.c == 1 else CrossEntropyLossFlat())
# learn.fit(1, 1e-2)

In [None]:
#export
def _names_to_list(names):
    "Normalise column `names`: `None` and list/`L` pass through, a pandas `Index` becomes a list, anything else is wrapped in a list."
    if names is None or isinstance(names, (list, L)): return names
    if isinstance(names, pd.Index): return names.tolist()
    return [names]

@delegates(DataLoaders.__init__)
def get_tabular_dls(df, procs=[Categorify, FillMissing, Normalize], cat_names=None, cont_names=None, y_names=None,
                    y_block=None, splits=None, do_setup=True, inplace=False, reduce_memory=True, device=None, **kwargs):
    """Create tabular dataloaders from `df` by building a `TabularPandas` object and calling its `dataloaders` method.

    `cat_names`, `cont_names` and `y_names` may each be `None`, a single column name, a list/`L` of names
    or a pandas `Index`; they are normalised to lists before being passed on.

    When `y_block` is `None` and `y_names` is given, the block is inferred from the last target value:
    a float (Python or NumPy) selects `RegressionBlock`, anything else selects `CategoryBlock`.
    When `y_names` is `None`, no inference or overlap check is performed and `y_block` is passed on unchanged.

    `device` defaults to `default_device()`. `procs`, `splits`, `do_setup`, `inplace` and `reduce_memory`
    are forwarded to `TabularPandas`; `**kwargs` (e.g. `bs`) are forwarded to `to.dataloaders`.

    A warning is emitted if any target column is also listed in `cat_names` or `cont_names`.
    """
    device = ifnone(device, default_device())
    cat_names, cont_names, y_names = map(_names_to_list, (cat_names, cont_names, y_names))
    if y_names is not None:
        # A target that is also used as a feature leaks the label, so warn about it
        if cat_names is not None and any(name in y_names for name in cat_names):
            warnings.warn('y_names are included in cat_names!')
        if cont_names is not None and any(name in y_names for name in cont_names):
            warnings.warn('y_names are included in cont_names!')
        if y_block is None:
            # `is_float` accepts Python floats as well as NumPy floating scalars such as float32
            last_y = df[y_names].values.flatten()[-1]
            y_block = RegressionBlock() if pd.api.types.is_float(last_y) else CategoryBlock()
    # NOTE: this changes a global pandas option and is not restored afterwards
    pd.options.mode.chained_assignment=None
    to = TabularPandas(df, procs=procs, cat_names=cat_names, cont_names=cont_names, y_names=y_names, y_block=y_block,
                       splits=splits, do_setup=do_setup, inplace=inplace, reduce_memory=reduce_memory, device=device)
    return to.dataloaders(device=device, **kwargs)

In [None]:
# Fetch the fastai Adult sample and read it into a DataFrame
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# df['salary'] = np.random.rand(len(df)) # uncomment to simulate a cont dependent variable

# Columns treated as categorical, continuous and target respectively
cat_names = [
    'workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship',
    'race', 'sex', 'capital-gain', 'capital-loss', 'native-country',
]
cont_names = ['age', 'fnlwgt', 'hours-per-week']
target = ['salary']

# Random train/valid split over the row indices of `df`
splits = RandomSplitter()(range_of(df))

dls = get_tabular_dls(
    df,
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=target,
    splits=splits,
    bs=512,
)
dls.show_batch()

Unnamed: 0,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,native-country,age,fnlwgt,hours-per-week,salary
0,Private,Assoc-voc,11,Divorced,Sales,Unmarried,White,Female,0,0,United-States,36.0,133973.997706,40.0,<50k
1,State-gov,Masters,14,Never-married,Adm-clerical,Not-in-family,Black,Female,0,0,United-States,24.0,163480.000142,40.0,<50k
2,Private,7th-8th,4,Divorced,Craft-repair,Not-in-family,White,Male,0,1590,United-States,41.0,48086.998553,40.0,<50k
3,Private,HS-grad,9,Divorced,Exec-managerial,Not-in-family,White,Female,0,2258,United-States,48.0,175070.000064,40.0,>=50k
4,Self-emp-inc,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,United-States,40.0,111482.998203,40.0,<50k
5,Private,Some-college,10,Never-married,Sales,Own-child,White,Male,0,0,United-States,17.999999,90934.000222,28.0,<50k
6,Private,Some-college,10,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,United-States,28.0,266069.998456,40.0,<50k
7,Federal-gov,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Male,0,1741,United-States,24.0,314525.003945,45.0,<50k
8,?,HS-grad,9,Married-civ-spouse,?,Not-in-family,White,Female,0,0,United-States,47.0,186804.99998,35.0,<50k
9,?,HS-grad,9,Never-married,?,Own-child,White,Female,0,0,United-States,17.999999,236089.999732,40.0,<50k


In [None]:
# Choose the metric from the number of outputs: `mae` when `dls.c == 1`, otherwise `accuracy`
if dls.c == 1:
    metrics = mae
else:
    metrics = accuracy

# Build a tabular learner with two hidden layers and train it for one epoch
learn = tabular_learner(dls, layers=[200, 100], y_range=None, metrics=metrics)
learn.fit(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.297967,0.312385,0.858722,00:06


In [None]:
# Display the model held by the learner created with `tabular_learner`
learn.model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(10, 6)
    (1): Embedding(17, 8)
    (2): Embedding(17, 8)
    (3): Embedding(8, 5)
    (4): Embedding(16, 8)
    (5): Embedding(7, 5)
    (6): Embedding(6, 4)
    (7): Embedding(3, 3)
    (8): Embedding(120, 23)
    (9): Embedding(90, 20)
    (10): Embedding(43, 13)
  )
  (emb_drop): Dropout(p=0.0, inplace=False)
  (bn_cont): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): LinBnDrop(
      (0): BatchNorm1d(106, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=106, out_features=200, bias=False)
      (2): ReLU(inplace=True)
    )
    (1): LinBnDrop(
      (0): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=200, out_features=100, bias=False)
      (2): ReLU(inplace=True)
    )
    (2): LinBnDrop(
      (0): Linear(in_features=100, out_features=2, bias=True)
   

In [None]:
#hide
# Judging by the captured output, `create_scripts()` converts the project notebooks;
# its result is passed to `beep` — presumably an audible completion signal, TODO confirm.
beep(create_scripts())

<IPython.core.display.Javascript object>

Converted 000_utils.ipynb.
Converted 000b_data.validation.ipynb.
Converted 000c_data.preparation.ipynb.
Converted 001_data.external.ipynb.
Converted 002_data.core.ipynb.
Converted 003_data.preprocessing.ipynb.
Converted 003b_data.transforms.ipynb.
Converted 003c_data.mixed_augmentation.ipynb.
Converted 003d_data.image.ipynb.
Converted 003e_data.features.ipynb.
Converted 005_data.tabular.ipynb.
Converted 006_data.mixed.ipynb.
Converted 007_metrics.ipynb.
Converted 008_learner.ipynb.
Converted 008b_tslearner.ipynb.
Converted 009_optimizer.ipynb.
Converted 010_callback.core.ipynb.
Converted 011_callback.noisy_student.ipynb.
Converted 012_callback.gblend.ipynb.
Converted 013_callback.MVP.ipynb.
Converted 014_callback.PredictionDynamics.ipynb.
Converted 100_models.layers.ipynb.
Converted 100b_models.utils.ipynb.
Converted 100c_models.explainability.ipynb.
Converted 101_models.ResNet.ipynb.
Converted 101b_models.ResNetPlus.ipynb.
Converted 102_models.InceptionTime.ipynb.
Converted 102b_model