In [None]:
# default_exp data.tabular

# Data Tabular

> Main Tabular functions used throughout the library.

In [None]:
#export
from tsai.imports import *
from fastai.tabular.all import *

In [None]:
# hide
class TabularDataset():
    "A `Numpy` dataset from a `TabularPandas` object"
    def __init__(self, to, n_inp=None):
        "Copy categorical, continuous and target columns of `to` into numpy arrays; `n_inp` defaults to 1"
        # `np.int64` rather than `np.long`: that alias was removed in NumPy 1.24 and,
        # where it exists, its width depends on the platform's C `long`.
        self.cats = to.cats.to_numpy().astype(np.int64)
        self.conts = to.conts.to_numpy().astype(np.float32)
        self.ys = to.ys.to_numpy()
        self.cat_names = to.cat_names
        self.classes = to.classes
        self.cont_names = to.cont_names
        # Decide regression vs. classification from the dtype of the whole target array.
        # Inspecting `self.ys[0][0]` with `isinstance(..., float)` misses `np.float32`
        # (not a `float` subclass) and raises on an empty target array.
        self.loss_func = MSELossFlat() if np.issubdtype(self.ys.dtype, np.floating) else CrossEntropyLossFlat()
        self.n_inp = ifnone(n_inp, 1)
    def __getitem__(self, idx): return self.cats[idx], self.conts[idx], self.ys[idx]
    def __len__(self): return len(self.cats)
    @property
    def c(self):
        "Number of outputs: 0 without targets, 1 for floating-point targets, else the number of distinct target values"
        ys = self.ys
        if ys is None: return 0
        # After `TabularDataLoader.to` the targets are tensors; before that they are numpy arrays.
        # Both representations must work, so never call `.cpu()` unconditionally.
        if isinstance(ys, torch.Tensor):
            return 1 if ys.is_floating_point() else len(torch.unique(ys))
        ys = np.asarray(ys)
        return 1 if np.issubdtype(ys.dtype, np.floating) else len(np.unique(ys))

class TabularDataLoader(DataLoader):
    "`DataLoader` over a `TabularDataset` that batches by slicing the dataset's arrays"
    # Names of the dataset attributes that are permuted / moved together
    _fields = ('cats', 'conts', 'ys')
    def __init__(self, dataset, bs=1, **kwargs):
        "Cap `bs` at the dataset length and record `shuffle`, `device` and `n_inp`"
        capped_bs = min(bs, len(dataset))
        super().__init__(dataset, bs=capped_bs, **kwargs)
        shuffle_flag = kwargs.get('shuffle', None)
        target_device = ifnone(kwargs.get('device', None), default_device())
        self.device = target_device
        self.shuffle = shuffle_flag
        self.n_inp = self.dataset.n_inp
    def create_item(self, s):
        "Items are passed through unchanged"
        return s
    def get_idxs(self):
        "Return sequential indices (or an infinite stream when `n` is None), shuffling the dataset first if requested"
        if self.n is not None: idxs = list(range(len(self.dataset)))
        elif self.indexed: idxs = Inf.count
        else: idxs = Inf.nones
        # Indices stay in order; randomness comes from permuting the dataset itself
        if self.shuffle: self.shuffle_fn()
        return idxs
    def create_batch(self, b):
        "Slice `bs` consecutive rows from the dataset, starting at the first index in `b`"
        start = b[0]
        return self.dataset[start:start+self.bs]
    def shuffle_fn(self):
        "Shuffle dataset after each epoch"
        perm = np.random.permutation(len(self.dataset))
        # Apply the same permutation to every field so rows stay aligned
        for name in self._fields:
            setattr(self.dataset, name, getattr(self.dataset, name)[perm])
    def to(self, device):
        "Record `device` and convert every dataset field to a tensor on it"
        self.device = device
        for name in self._fields:
            setattr(self.dataset, name, tensor(getattr(self.dataset, name)).to(device=self.device))

In [None]:
#hide
# train_ds = TabularDataset(to.train)
# valid_ds = TabularDataset(to.valid)
# train_dl = TabularDataLoader(train_ds, bs=512, train=True)
# valid_dl = TabularDataLoader(valid_ds, bs=512)
# dls = DataLoaders(train_dl,valid_dl)
# dls.c, dls.loss_func

In [None]:
# hide
# emb_szs = get_emb_sz(to)
# net = TabularModel(emb_szs, len(dls.cont_names), dls.c, layers=[200,100], y_range=None)#.cuda()
# metrics=mae if dls.c == 1 else accuracy
# learn = Learner(dls, net, metrics=metrics, loss_func=MSELossFlat() if dls.c == 1 else CrossEntropyLossFlat())
# learn.fit(1, 1e-2)

In [None]:
@patch
def cws(self:TabDataLoader):
    "Normalized inverse-frequency class weights for integer targets (one weight per distinct target value), else `None`"
    if isinstance(self.dataset.ys.values[0][0], Integral):
        # Read the targets from this loader's own dataset. The previous code used the
        # notebook-level `dls` variable, which breaks when the method is called on any
        # other loader or before `dls` exists.
        target = tensor(self.dataset.ys).flatten().to(dtype=torch.int64)
        # Count samples per distinct class in one pass (counts follow the sorted unique values)
        _, class_sample_count = torch.unique(target, sorted=True, return_counts=True)
        weights = 1. / class_sample_count.float()
        # Normalize so the weights sum to 1
        return (weights / weights.sum()).to(default_device())
    else: return None

In [None]:
# Locate the ADULT_SAMPLE dataset and load its CSV into a DataFrame
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# df['salary'] = np.random.rand(len(df)) # uncomment to simulate a cont dependent variable
procs = [Categorify, FillMissing, Normalize]
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
y_names = ['salary']
# Choose the target block from the column dtype rather than from the Python type of the
# first value: this also recognises float32 columns and does not fail on an empty frame.
y_block = RegressionBlock() if pd.api.types.is_float_dtype(df['salary']) else CategoryBlock()
splits = RandomSplitter()(range_of(df))
# Disable pandas' chained-assignment warning before the in-place processing below
pd.options.mode.chained_assignment=None
to = TabularPandas(df, procs=procs, cat_names=cat_names, cont_names=cont_names,
                   y_names=y_names, y_block=y_block, splits=splits, inplace=True,
                   reduce_memory=False)
to.show(5)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary
9706,Private,Bachelors,Divorced,Tech-support,Not-in-family,White,False,36.0,215392.0,13.0,<50k
20331,Private,Some-college,Married-civ-spouse,Transport-moving,Husband,White,False,49.0,252079.0,10.0,>=50k
28470,Self-emp-inc,HS-grad,Divorced,Craft-repair,Not-in-family,White,False,41.0,60949.0,9.0,<50k
26354,Private,Assoc-voc,Widowed,Tech-support,Unmarried,Black,False,32.0,257978.0,11.0,<50k
14569,Private,HS-grad,Never-married,Craft-repair,Own-child,White,False,19.0,248339.0,9.0,<50k


In [None]:
# Build the dataloaders from `to` with a batch size of 512 on the default device,
# then display a sample batch (rendered as the table below).
dls = to.dataloaders(bs=512, device=default_device())
dls.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary
0,Private,11th,Divorced,Prof-specialty,Unmarried,Amer-Indian-Eskimo,False,46.0,176552.000154,7.0,>=50k
1,Self-emp-inc,Assoc-voc,Married-civ-spouse,Sales,Husband,White,False,56.0,24127.005397,11.0,>=50k
2,Self-emp-not-inc,11th,Married-civ-spouse,Farming-fishing,Husband,White,False,44.0,315406.005525,7.0,<50k
3,Private,Assoc-voc,Married-civ-spouse,Tech-support,Husband,White,False,28.0,176683.000022,11.0,>=50k
4,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,False,44.0,230683.999742,13.0,>=50k
5,Self-emp-inc,Bachelors,Divorced,Sales,Not-in-family,White,False,23.0,284650.997145,13.0,<50k
6,Private,HS-grad,Divorced,Machine-op-inspct,Unmarried,White,False,30.0,103435.002398,9.0,<50k
7,Private,HS-grad,Married-civ-spouse,Transport-moving,Husband,White,False,50.0,145408.999996,9.0,>=50k
8,Self-emp-inc,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,False,52.0,77392.005619,9.0,<50k
9,Self-emp-not-inc,HS-grad,Never-married,Farming-fishing,Not-in-family,White,False,37.0,154641.001065,9.0,<50k


In [None]:
# Class weights from the patched `cws` method defined earlier in this notebook
dls.cws()

tensor([0.2401, 0.7599])

In [None]:
# Take the first item yielded by the training dataloader (presumably one batch — confirm against `first`)
b = first(dls.train)

In [None]:
# A single output (`dls.c == 1`) is treated as regression and scored with `mae`;
# anything else is treated as classification and scored with `accuracy`.
metrics=mae if dls.c == 1 else accuracy
# Tabular model with hidden layers of 200 and 100 units and no output range constraint
learn = tabular_learner(dls, layers=[200,100], y_range=None, metrics=metrics)
# Train for one epoch, passing 1e-2 as the learning rate
learn.fit(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.376322,0.364071,0.83016,00:07


In [None]:
# Display the architecture of the model built by `tabular_learner`
learn.model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(10, 6)
    (1): Embedding(17, 8)
    (2): Embedding(8, 5)
    (3): Embedding(16, 8)
    (4): Embedding(7, 5)
    (5): Embedding(6, 4)
    (6): Embedding(3, 3)
  )
  (emb_drop): Dropout(p=0.0, inplace=False)
  (bn_cont): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): LinBnDrop(
      (0): BatchNorm1d(42, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=42, out_features=200, bias=False)
      (2): ReLU(inplace=True)
    )
    (1): LinBnDrop(
      (0): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=200, out_features=100, bias=False)
      (2): ReLU(inplace=True)
    )
    (2): LinBnDrop(
      (0): Linear(in_features=100, out_features=2, bias=True)
    )
  )
)

In [None]:
#hide
# Run the notebook conversion step (it logs one "Converted <notebook>" line per file)
# and pass its result to `beep` — presumably an audible completion signal; confirm against `beep`.
beep(create_scripts())

<IPython.core.display.Javascript object>

Converted 000_utils.ipynb.
Converted 000b_data.validation.ipynb.
Converted 001_data.external.ipynb.
Converted 002_data.core.ipynb.
Converted 003_data.transforms.ipynb.
Converted 003b_data.image.ipynb.
Converted 005_data.tabular.ipynb.
Converted 006_data.mixed.ipynb.
Converted 007_metrics.ipynb.
Converted 008_learner.ipynb.
Converted 009_optimizer.ipynb.
Converted 010_callback.ipynb.
Converted 100_models.utils.ipynb.
Converted 100b_models.layers.ipynb.
Converted 101_models.ResNet.ipynb.
Converted 101b_models.ResNetPlus.ipynb.
Converted 102_models.InceptionTime.ipynb.
Converted 102b_models.InceptionTimePlus.ipynb.
Converted 103_models.FCN.ipynb.
Converted 103b_models.FCNPlus.ipynb.
Converted 104_models.ResCNN.ipynb.
Converted 105_models.RNN.ipynb.
Converted 105_models.RNNPlus.ipynb.
Converted 106_models.XceptionTime.ipynb.
Converted 106b_models.XceptionTimePlus.ipynb.
Converted 107_models.RNN_FCN.ipynb.
Converted 107b_models.RNN_FCNPlus.ipynb.
Converted 108_models.TransformerModel.ipynb.