In [None]:
# default_exp data.tabular

# Data Tabular

> Main tabular functions used throughout the library. They are helpful when your time series come with additional tabular data such as metadata, time series features, etc.

In [None]:
#export
from tsai.imports import *
from fastai.tabular.all import *

In [None]:
# hide
class TabularDataset():
    "A `Numpy` dataset from a `TabularPandas` object"
    def __init__(self, to, n_inp=None):
        """Copy the categorical, continuous and target columns of `to` into arrays.

        `to` must expose `cats`, `conts` and `ys` (each with `to_numpy()`), plus
        `cat_names`, `cont_names` and `classes`. `n_inp` defaults to 1 when None.
        """
        # Explicit 64-bit integers: `np.long` is not available in every NumPy
        # release (it was removed in 1.24) and is platform-dependent where it exists.
        self.cats = to.cats.to_numpy().astype(np.int64)
        self.conts = to.conts.to_numpy().astype(np.float32)
        self.ys = to.ys.to_numpy()
        self.cat_names = to.cat_names
        self.classes = to.classes
        self.cont_names = to.cont_names
        # Floating point targets -> regression loss, anything else -> classification loss.
        self.loss_func = MSELossFlat() if self._is_float_target(self.ys) else CrossEntropyLossFlat()
        self.n_inp = ifnone(n_inp, 1)

    @staticmethod
    def _is_float_target(ys):
        "Whether `ys` (numpy array or torch tensor) holds floating point targets"
        if isinstance(ys, torch.Tensor): return ys.is_floating_point()
        ys = np.asarray(ys)
        # dtype check also catches np.float32, which is not a subclass of Python `float`
        if np.issubdtype(ys.dtype, np.floating): return True
        # object arrays may still hold plain Python floats
        return ys.dtype == object and ys.size > 0 and isinstance(ys.flat[0], float)

    def __getitem__(self, idx): return self.cats[idx], self.conts[idx], self.ys[idx]
    def __len__(self): return len(self.cats)
    @property
    def c(self):
        "0 when there are no targets, 1 for floating point targets, else the number of unique target values"
        if self.ys is None: return 0
        if self._is_float_target(self.ys): return 1
        # `ys` is a numpy array until it has been converted to a tensor; accept both
        ys = self.ys.detach().cpu().numpy() if isinstance(self.ys, torch.Tensor) else self.ys
        return len(np.unique(ys))

class TabularDataLoader(DataLoader):
    "A `DataLoader` that builds each batch by slicing the arrays held by its dataset"
    def __init__(self, dataset, bs=1, **kwargs):
        # The batch size is capped at the dataset length
        super().__init__(dataset, bs=min(bs, len(dataset)), **kwargs)
        shuffle = kwargs.get('shuffle',None)
        device = ifnone(kwargs.get('device',None), default_device())
        self.device, self.shuffle = device, shuffle
        self.n_inp = self.dataset.n_inp
    def create_item(self, s): return s
    def get_idxs(self):
        "Return the item indices; when `shuffle` is set the dataset itself is permuted first"
        idxs = Inf.count if self.indexed else Inf.nones
        if self.n is not None: idxs = list(range(len(self.dataset)))
        # The indices stay in order: shuffling is done by permuting the dataset arrays in place
        if self.shuffle: self.shuffle_fn()
        return idxs
    def create_batch(self, b):
        "Slice `bs` consecutive items out of the dataset, starting at the first index in `b`"
        return self.dataset[b[0]:b[0]+self.bs]
    def shuffle_fn(self):
        "Apply one random permutation to `cats`, `conts` and `ys` (called from `get_idxs`)"
        rng = np.random.permutation(len(self.dataset))
        self.dataset.cats = self.dataset.cats[rng]
        self.dataset.conts = self.dataset.conts[rng]
        self.dataset.ys = self.dataset.ys[rng]
    def to(self, device):
        "Convert the dataset arrays to tensors on `device`; returns `self` so calls can be chained"
        self.device = device
        self.dataset.cats = tensor(self.dataset.cats).to(device=self.device)
        self.dataset.conts = tensor(self.dataset.conts).to(device=self.device)
        self.dataset.ys = tensor(self.dataset.ys).to(device=self.device)
        return self

In [None]:
#hide
# train_ds = TabularDataset(to.train)
# valid_ds = TabularDataset(to.valid)
# train_dl = TabularDataLoader(train_ds, bs=512, train=True)
# valid_dl = TabularDataLoader(valid_ds, bs=512)
# dls = DataLoaders(train_dl,valid_dl)
# dls.c, dls.loss_func

In [None]:
# hide
# emb_szs = get_emb_sz(to)
# net = TabularModel(emb_szs, len(dls.cont_names), dls.c, layers=[200,100], y_range=None)#.cuda()
# metrics=mae if dls.c == 1 else accuracy
# learn = Learner(dls, net, metrics=metrics, loss_func=MSELossFlat() if dls.c == 1 else CrossEntropyLossFlat())
# learn.fit(1, 1e-2)

In [None]:
# Read the ADULT_SAMPLE csv into a DataFrame
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# df['salary'] = np.random.rand(len(df)) # uncomment to simulate a cont dependent variable

# Column roles: categorical inputs, continuous inputs and the dependent variable
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
y_names = ['salary']
procs = [Categorify, FillMissing, Normalize]

# A float dependent variable is treated as regression, anything else as classification
if isinstance(df['salary'].values[0], float):
    y_block = RegressionBlock()
else:
    y_block = CategoryBlock()

splits = RandomSplitter()(range_of(df))
# Chained-assignment warnings are switched off before processing `df` in place
pd.options.mode.chained_assignment=None
to = TabularPandas(
    df,
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=y_names,
    y_block=y_block,
    splits=splits,
    inplace=True,
    reduce_memory=False,
)
to.show(5)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary
23319,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,White,False,39.0,115076.0,10.0,<50k
9998,Local-gov,HS-grad,Never-married,Adm-clerical,Not-in-family,White,False,45.0,255559.0,9.0,<50k
22895,Private,Assoc-acdm,Married-civ-spouse,Exec-managerial,Wife,White,False,29.0,446559.0,12.0,<50k
28874,Self-emp-not-inc,HS-grad,Never-married,Craft-repair,Own-child,White,False,26.0,102476.0,9.0,<50k
25759,Private,HS-grad,Never-married,Handlers-cleaners,Own-child,White,False,20.0,243178.0,9.0,<50k


In [None]:
# Build the DataLoaders from the TabularPandas object (batch size 512, default device)
# and display a sample batch.
dls = to.dataloaders(bs=512, device=default_device())
dls.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary
0,Private,HS-grad,Never-married,Adm-clerical,Not-in-family,White,False,31.0,96479.998834,9.0,<50k
1,Private,Bachelors,Married-civ-spouse,Machine-op-inspct,Husband,White,False,38.0,136629.000744,13.0,<50k
2,State-gov,Doctorate,Married-civ-spouse,Prof-specialty,Husband,White,False,41.0,116519.999588,16.0,>=50k
3,?,Bachelors,Married-civ-spouse,?,Wife,White,False,44.0,109912.001219,13.0,>=50k
4,Private,HS-grad,Married-civ-spouse,Sales,Husband,White,False,25.0,166970.999606,9.0,<50k
5,Private,HS-grad,Married-civ-spouse,Transport-moving,Husband,White,False,61.0,163393.000679,9.0,<50k
6,Private,Some-college,Never-married,Sales,Own-child,White,False,24.000001,155817.999705,10.0,<50k
7,?,Bachelors,Never-married,?,Not-in-family,White,False,61.0,42937.996961,13.0,>=50k
8,Local-gov,Bachelors,Married-civ-spouse,Protective-serv,Husband,White,False,52.0,175339.00039,13.0,>=50k
9,?,Some-college,Never-married,?,Own-child,White,False,18.999999,60688.004701,10.0,<50k


In [None]:
# Metric follows dls.c: mae when dls.c == 1, accuracy otherwise.
metrics = mae if dls.c == 1 else accuracy
# Hidden layers of 200 and 100 units (see the printed model further down).
learn = tabular_learner(dls, layers=[200, 100], y_range=None, metrics=metrics)
# Train for a single epoch (one row in the results table below).
learn.fit(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.376641,0.357149,0.833692,00:05


In [None]:
# Display the model created by tabular_learner.
learn.model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(10, 6)
    (1): Embedding(17, 8)
    (2): Embedding(8, 5)
    (3): Embedding(16, 8)
    (4): Embedding(7, 5)
    (5): Embedding(6, 4)
    (6): Embedding(3, 3)
  )
  (emb_drop): Dropout(p=0.0, inplace=False)
  (bn_cont): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): LinBnDrop(
      (0): BatchNorm1d(42, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=42, out_features=200, bias=False)
      (2): ReLU(inplace=True)
    )
    (1): LinBnDrop(
      (0): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=200, out_features=100, bias=False)
      (2): ReLU(inplace=True)
    )
    (2): LinBnDrop(
      (0): Linear(in_features=100, out_features=2, bias=True)
    )
  )
)

In [None]:
#hide
# NOTE(review): presumably exports the notebooks to scripts and signals when done — confirm against the tsai helpers.
beep(create_scripts())