=> Our input data are tensors produced by cuDF preprocessing

In [None]:
import os
import torch
GPU_id = 1
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_id)

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd 
data = pd.read_pickle("/datasets/trivago/preproc/data_pair_all.pkl")

In [3]:
data.head(3)

Unnamed: 0,row_id,candidate_order,item_id,price,row_id_count,item_count,user_id,session_id,timestamp,step,...,item_past_6,item_past_7,item_past_8,item_past_9,num_past_items,has_past,search_destination,search_poi,nb_shared_filters,compliance
0,147528,1.050011,193861,-0.441959,25,-0.508469,B3QVP6A5RRMD,fd460d6f31ffe,1541392347,2,...,0,0,0,0,0,False,"Lima, Peru",Miraflores,0.0,0.0
1,147528,-0.767499,1276522,-0.522123,25,-0.537149,B3QVP6A5RRMD,fd460d6f31ffe,1541392347,2,...,0,0,0,0,0,False,"Lima, Peru",Miraflores,0.0,0.0
2,147528,0.910203,6007390,-0.436233,25,-0.298804,B3QVP6A5RRMD,fd460d6f31ffe,1541392347,2,...,0,0,0,0,0,False,"Lima, Peru",Miraflores,0.0,0.0


In [4]:
%%time
# Rows whose clickout is known form the training set; rows with a missing clickout form the test set.
train = data.loc[data.clickout_missing == 0]
test = data.loc[data.clickout_missing > 0]
print(train.shape, test.shape)

(42739624, 257) (5760337, 257)
CPU times: user 41 s, sys: 34.1 s, total: 1min 15s
Wall time: 1min 15s


In [5]:
%%time
from fastai.basic_data import *
from fastai.tabular import *
# Categorical inputs: the six id-like columns plus every boolean flag column ("is_*").
# The validation flag created further down in this cell ('is_va') also starts with "is_", so it is
# excluded explicitly: without that, re-running the cell would add the split flag to the features.
cat_names = ['user_id','item_id','platform','city','device','current_filters'] + [i for i in train.columns if i.startswith('is_') and i != 'is_va']
# Continuous inputs: price / position / popularity plus every count, rank and delta column.
cont_names = ['price','candidate_order','item_count'] + [i for i in train.columns if i.startswith('count') or 'rank' in i or i.startswith('delta_')]

print('got cols')

procs = [FillMissing, Categorify]
# Hold out every row whose row_id is a multiple of 5 as validation data.
train['is_va'] = train.row_id%5 == 0

print('got valids')

test_list = TabularList.from_df(test, path='./', cat_names=cat_names, cont_names=cont_names)
print('got tests')


got cols
got valids
got tests
CPU times: user 2.86 s, sys: 4.08 s, total: 6.94 s
Wall time: 8.67 s


In [6]:
%%time
# Build the fastai data object from `train` using the column lists and `procs` chosen above:
# split on the boolean `is_va` column (presumably True marks validation rows — confirm against the
# fastai data block API), label with the `target` column and attach `test_list` as the test set.
# NOTE(review): this rebinds `data`, which until now held the raw DataFrame read from the pickle,
# so the train/test split cell cannot be re-run after this one without reloading the file.
data = (TabularList.from_df(train, cat_names=cat_names, cont_names=cont_names, procs=procs)
                           .split_from_df('is_va')
                           .label_from_df(cols='target')
                           .add_test(test_list))
print('got data')

got data
CPU times: user 7min 55s, sys: 7min 7s, total: 15min 2s
Wall time: 15min 2s


- Get the vocabulary size of each categorical variable

In [8]:
data.train.y[0].data

0

In [12]:
# Print the number of classes recorded for each categorical column of the training set.
for name, vocab in data.train.x.classes.items():
    print("key %s : %s" % (name, len(vocab)))

key user_id : 715920
key item_id : 850794
key platform : 56
key city : 32763
key device : 4
key current_filters : 27842
key is_count_item_user_id_session_id_null : 3
key is_count_item_user_id_null : 3
key is_last_viewed_item_reference_any : 3
key is_last_viewed_item_reference_interaction item rating : 3
key is_last_viewed_item_reference_interaction item image : 3
key is_last_viewed_item_reference_interaction item info : 3
key is_last_viewed_item_reference_interaction item deals : 3
key is_1 Star : 3
key is_2 Star : 3
key is_3 Star : 3
key is_4 Star : 3
key is_5 Star : 3
key is_Accessible Hotel : 3
key is_Accessible Parking : 3
key is_Adults Only : 3
key is_Air Conditioning : 3
key is_Airport Hotel : 3
key is_Airport Shuttle : 3
key is_All Inclusive (Upon Inquiry) : 3
key is_Balcony : 3
key is_Bathtub : 3
key is_Beach : 3
key is_Beach Bar : 3
key is_Beauty Salon : 3
key is_Bed & Breakfast : 3
key is_Bike Rental : 3
key is_Boat Rental : 3
key is_Body Treatments : 3
key is_Boutique Hotel 

<h1> <center> New Data Bunch </center> </h1> 

- For the rest of the notebook, we assume the following processed tensors have already been built (using cuDF):
    - train : cat_tensor, cont_tensor, label_tensor 
    
    - valid : cat_tensor, cont_tensor, label_tensor 
    
    - test : cat_tensor, cont_tensor, label_tensor 
    
- The vocabulary size of each categorical variable is also known.

In [30]:
# Number of training rows copied into plain tensors for the demo below.
N_SAMPLES = 10000

# Index every row once; each `.data` is used here as a (categorical, continuous) pair of tensors.
rows = [data.train.x[i].data for i in range(N_SAMPLES)]

# Concatenate the per-row tensors and fold them back into one row per sample. The feature width is
# inferred (-1) instead of hard-coded, so the cell keeps working if the column lists change.
cat_tensors = torch.cat([row[0] for row in rows], dim=0).reshape(N_SAMPLES, -1)
cont_tensors = torch.cat([row[1] for row in rows], dim=0).reshape(N_SAMPLES, -1)
labels = tensor([data.train.y[i].data for i in range(N_SAMPLES)]).reshape(N_SAMPLES,)

### Define the tensor batch loader:

In [69]:
class BatchDataset(object):
    """An abstract class representing a Batch Dataset.

    All other batch datasets should subclass this. Subclasses should override:

    * ``__len__``: the number of batches in the dataset,
    * ``__getitem__``: integer indexing of whole batches, ``0 <= index < len(self)``,
    * ``__add__``: extend the dataset with additional data,
    * ``shuffle``: randomly shuffle the data, generally called once per epoch.

    Batch datasets are meant to be iterated over in order rather than randomly accessed,
    so the randomization has to happen first.
    """

    def __getitem__(self, index):
        raise NotImplementedError

    def __len__(self):
        raise NotImplementedError

    def __add__(self, other):
        # Takes the right-hand operand so that ``dataset + x`` reaches this method and raises
        # NotImplementedError instead of a TypeError about the number of arguments.
        raise NotImplementedError

    def shuffle(self):
        raise NotImplementedError


class TensorBatchDataset(BatchDataset):
    """Batch Dataset wrapping Tensors.

    Arguments:
        tensors (sequence of Tensor): tensors that have the same size of the first dimension.
        batch_size (int): the size of the batch to return; must be >= 1.
        pin_memory (bool, optional): If ``True``, the tensors are replaced by copies in pinned
            memory for faster copy to GPU. The original author saw no performance improvement
            from doing so, but results may vary.

    Raises:
        AssertionError: if the tensors do not share the same first dimension.
        ValueError: if ``batch_size`` is smaller than 1.
    """

    def __init__(self, tensors, batch_size=1, pin_memory=False):
        assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors)
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

        # Tensor.pin_memory() returns a pinned copy rather than pinning in place, so the result
        # has to be stored; discarding it would leave the dataset unpinned.
        self.tensors = [tensor.pin_memory() for tensor in tensors] if pin_memory else tensors
        self.batch_size = batch_size
        self.num_samples = tensors[0].size(0)

    def __len__(self):
        """Number of batches, counting a trailing partial batch."""
        return (self.num_samples + self.batch_size - 1) // self.batch_size

    def __getitem__(self, item):
        """Return batch number ``item`` as a list with one slice per wrapped tensor.

        Negative indices count from the last batch. The final batch is shorter than
        ``batch_size`` when the number of samples is not a multiple of it.

        Raises:
            IndexError: if ``item`` is outside ``[-len(self), len(self))``.
        """
        num_batches = len(self)
        if item < 0:
            item += num_batches
        if not 0 <= item < num_batches:
            raise IndexError("batch index out of range")

        start = item * self.batch_size
        # Slicing past the end is clamped, which yields the shorter last batch.
        return [tensor[start:start + self.batch_size] for tensor in self.tensors]

    def __add__(self, tensors):
        """Append ``tensors`` (one per wrapped tensor) to this dataset in place.

        The dataset itself is returned so that ``dataset + tensors`` evaluates to the extended
        dataset instead of ``None``.

        Raises:
            AssertionError: if the new tensors disagree with each other on the first dimension,
                differ in number from the wrapped tensors, or have a different per-sample shape.
        """
        assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors)
        assert len(self.tensors) == len(tensors)
        # Compare per-sample shapes without indexing element 0, so empty tensors are accepted.
        assert all(self_tensor.shape[1:] == tensor.shape[1:]
                   for self_tensor, tensor in zip(self.tensors, tensors))

        # Concatenate first: if it fails, the sample count has not been touched yet.
        merged = [torch.cat((self_tensor, tensor)) for self_tensor, tensor in zip(self.tensors, tensors)]
        self.tensors = merged
        self.num_samples = self.num_samples + tensors[0].size(0)
        return self

    def shuffle(self):
        """Apply one random permutation to all tensors, keeping samples aligned across them."""
        idx = torch.randperm(self.num_samples, dtype=torch.int64)
        self.tensors = [tensor[idx] for tensor in self.tensors]
        
        
import torch
from torch import _utils

class BatchDataLoader(object):
    """Loader that hands out whole batches from a batch dataset.

    Iterating over the loader creates a fresh ``_BatchDataLoaderIter`` each time.

    Arguments:
        batchdataset (BatchDataset): dataset from which to load the data; its ``batch_size``
            is reused by the loader.
        shuffle (bool, optional): stored on the loader and read by the iterator, which
            reshuffles the dataset when it is ``True`` (default: ``False``).
        pin_memory (bool, optional): stored on the loader and read by the iterator to decide
            whether batches are pinned before being returned (default: ``False``).
        drop_last (bool, optional): set to ``True`` to drop the last incomplete batch,
            if the dataset size is not divisible by the batch size. If ``False`` and
            the size of dataset is not divisible by the batch size, then the last batch
            will be smaller. (default: ``False``)
        device: device used by the iterator for the returned batches (default: ``'cuda'``).
    """

    def __init__(self, batchdataset, shuffle=False,
                 pin_memory=False, drop_last=False, device='cuda'):
        self.dataset = batchdataset
        self.batch_size = batchdataset.batch_size
        self.shuffle, self.pin_memory, self.drop_last = shuffle, pin_memory, drop_last
        self.device = device

    def __iter__(self):
        return _BatchDataLoaderIter(self)

    def __len__(self):
        """Number of batches one pass yields, excluding a dropped partial last batch."""
        has_partial_tail = self.drop_last and self.dataset.num_samples % self.batch_size != 0
        num_batches = len(self.dataset)
        return num_batches - 1 if has_partial_tail else num_batches

    
class _BatchDataLoaderIter(object):
    """Iterates once over the BatchDataLoader's batchdataset, shuffling if requested"""
    def __init__(self, loader):
        self.batchdataset = loader.dataset
        self.batch_size = loader.batch_size
        self.pin_memory = loader.pin_memory and torch.cuda.is_available()
        self.drop_last = loader.drop_last
        self.device = loader.device

        if loader.shuffle:
            self.batchdataset.shuffle()

        self.idx = 0

    def __len__(self):
        if self.drop_last and self.batchdataset.num_samples%self.batch_size != 0:
            return len(self.batchdataset)-1
        else:
            return len(self.batchdataset)
         
    
    def __next__(self):
        if self.idx >= len(self):
            raise StopIteration
        batch = self.batchdataset[self.idx]
        # Note Pinning memory was ~10% _slower_ for the test examples I explored
        if self.pin_memory:
            batch = _utils.pin_memory.pin_memory_batch(batch)
        self.idx = self.idx+1
        # move the batch data to device 
        batch = to_device(batch, self.device)
        # return in the form of : xb,yb = (x_cat, x_cont), y
        return (batch[0],batch[1]), batch[2]

    next = __next__  # Python 2 compatibility

    def __iter__(self):
        return self



- test : 

In [74]:
batchdataset = TensorBatchDataset([cat_tensors, cont_tensors, labels], batch_size=10)
dataloader = BatchDataLoader(batchdataset, device='cuda')

In [75]:
# Fetch the first batch through the standard iterator protocol.
t = next(iter(dataloader))

In [76]:
t[0][0].shape, t[0][1].shape, t[1].shape

(torch.Size([10, 171]), torch.Size([10, 37]), torch.Size([10]))

In [77]:
t[0][0]

tensor([[259235,  74035,      3,  ...,      1,      2,      1],
        [259235, 153471,      3,  ...,      2,      2,      1],
        [259235, 479807,      3,  ...,      2,      2,      1],
        ...,
        [259235, 105181,      3,  ...,      2,      2,      1],
        [259235,  53309,      3,  ...,      2,      2,      1],
        [259235, 146892,      3,  ...,      2,      2,      1]],
       device='cuda:0')

## Define a custom fastai DataBunch that takes a BatchDataLoader instead of the standard DataLoader

In [85]:
from fastai.basic_data import DataBunch
class BatchDataBunch(DataBunch):
    """``DataBunch`` subclass whose loaders are ``BatchDataLoader`` objects over in-memory tensors."""

    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs=None, 
                      num_workers:int=defaults.cpus, device:torch.device=None,
                      collate_fn:Callable=data_collate, size:int=None, **kwargs)->'BatchDataBunch':
        """Wrap the given tensor collections in batch loaders and attach them to the class.

        Arguments:
            train_ds, valid_ds: tensor collections handed to ``TensorBatchDataset``.
            test_ds: optional tensor collection for the test loader.
            path: stored on the class as a ``Path``.
            bs: training batch size.
            val_bs: batch size for validation and test; falls back to ``bs`` when ``None``.
            device: device passed to every loader; ``defaults.device`` when ``None``.
            num_workers, collate_fn, size, **kwargs: accepted but not used by this method.

        Only the training loader shuffles and drops its last incomplete batch.

        NOTE(review): the loaders, device and path are written to *class* attributes and the
        class itself is returned, so every call overwrites the previous one — confirm this is
        intended before creating more than one bunch.
        """
        val_bs = ifnone(val_bs, bs)
        with_test = test_ds is not None

        # One (tensors, batch size) pair per split, in train / valid / test order.
        split_specs = [(train_ds, bs), (valid_ds, val_bs)]
        if with_test:
            split_specs.append((test_ds, val_bs))
        batch_datasets = [TensorBatchDataset(tensors, batch_size=n) for tensors, n in split_specs]

        cls.device = device if device is not None else defaults.device

        loaders = []
        for position, batch_ds in enumerate(batch_datasets):
            is_train = position == 0
            loaders.append(BatchDataLoader(batch_ds, shuffle=is_train, pin_memory=False,
                                           drop_last=is_train, device=cls.device))

        cls.dls = loaders

        # The loaders must stay raw BatchDataLoaders, not fastai DeviceDataLoader wrappers.
        assert not isinstance(loaders[0], DeviceDataLoader)

        cls.train_dl, cls.valid_dl = loaders[0], loaders[1]
        if with_test:
            cls.test_dl = loaders[2]

        cls.path = Path(path)
        return cls


**Test**

In [86]:
train = [cat_tensors, cont_tensors, labels]
validation = [cat_tensors, cont_tensors, labels]
test = [cat_tensors, cont_tensors, labels]
databunch = BatchDataBunch.create(train, validation, device='cuda')

In [87]:
from fastai.tabular import TabularModel

# One (vocabulary size, embedding width) pair per embedded categorical column; the sizes match the
# class counts printed earlier for user_id, item_id, platform, city, device and current_filters.
# NOTE(review): cat_tensors has 171 columns but only six embeddings are declared here — presumably
# the remaining `is_*` columns are not used by the model; confirm this is intended.
#
# `y_range` is deliberately not passed: it rescales the outputs through a sigmoid, which is meant
# for bounded regression targets. The two outputs here are class logits trained with
# CrossEntropyLoss, and confining them to [0, 1] would cap how confident the model can become.
model = TabularModel(emb_szs = [(715920,16), (850794,16), (56,4), (32763,8), (4,1), (27842,8)], n_cont=37, 
                     out_sz=2, layers=[128,64], ps=0.1, emb_drop=0.2)

learn =  Learner(databunch, model,metrics=None,)

In [88]:
model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(715920, 16)
    (1): Embedding(850794, 16)
    (2): Embedding(56, 4)
    (3): Embedding(32763, 8)
    (4): Embedding(4, 1)
    (5): Embedding(27842, 8)
  )
  (emb_drop): Dropout(p=0.2)
  (bn_cont): BatchNorm1d(37, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=90, out_features=128, bias=True)
    (1): ReLU(inplace)
    (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.1)
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): ReLU(inplace)
    (6): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.1)
    (8): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [89]:
learn.loss_func = torch.nn.CrossEntropyLoss()

In [90]:
learn.fit_one_cycle(1, 1e-3)

epoch,train_loss,valid_loss,time
0,0.542712,#na#,00:01
