# Prerequisites

## Get the repo

```shell
git clone https://github.com/z-a-f/tabformer.git
cd tabformer
git submodule update --init --recursive
git submodule sync
```

## Install the prerequisites

```shell
conda install numpy, pandas, einops -c conda-forge
conda install transformers, datasets
```


In [127]:
import os
import sys
import json
import re
from operator import itemgetter
from collections import defaultdict
from pprint import pprint
from functools import partial

import numpy as np
import pandas as pd

import torch
from torch import nn

sys.path.append(os.path.abspath('third-party/tab-transformer/tab_transformer_pytorch'))

from tab_transformer_pytorch import TabTransformer

demo_data_path = 'demo_data'
csv_path = os.path.join(demo_data_path, 'data.csv')

os.makedirs(demo_data_path, exist_ok=True)

# All columns
all_columns = [
    'page_protocol',
    'page_host',
    'api_protocol',
    'api_host',
    'api_path',
    'api_method',
    'psm',
    'scope',
    'field_name',
    'field_path',
    'field_sample',
    'data_type_name',
    'some_cat_column',
    'some_num_column',
    'some_float_column',
]

# Hash size as value
categorical_hash_columns = {
    'page_protocol': 128,
    'page_host': 128,
    'api_protocol': 128,
    'api_host': 128,
    'api_path': 128,
    'api_method': 128,
    'psm': 128,
    'scope': 128,
    'field_name': 128,
    'field_path': 128,
    'field_sample': 128,
}

# Number of categories as value
categorical_columns = [
    'some_cat_column',
]

numerical_columns = [
    'some_num_column',
    'some_float_column',
]

target_column = ('data_type_name', 32)  # Maximum number of classes


In [5]:
data = pd.read_csv(csv_path)
data

Unnamed: 0,page_protocol,page_host,api_protocol,api_host,api_path,api_method,psm,scope,field_name,field_path,field_sample,data_type_name,some_cat_column,some_num_column,some_float_column
0,https,market-au.example.com,https,market-au.example.com,/api/v1/product/prohibited/words/check,post,oec.product.product_api,query,browser_language,browser_language,en-US,Language,1,3,-0.3
1,https,market-au.example.com,https,market-au.example.com,/api/v1/product/prohibited/words/check,post,oec.product.product_api,query,browser_language,browser_language,en-US,Language,2,2,-0.2
2,https,market-au.example.com,https,market-au.example.com,/api/v1/product/prohibited/words/check,post,oec.product.product_api,query,browser_language,browser_language,en-US,Language,1,1,-0.1
3,https,market-au.example.com,https,open-api.example.com,/v0/oauth/check_qr,get,abcd.openapi.gateway,query,token,token,abcd_aueast3,Account Setting,3,0,0.0
4,https,market-au.example.com,https,open-api.example.com,/v0/oauth/check_qr,get,abcd.openapi.gateway,query,token,token,wxyz_aueast3,Account Setting,1,1,0.2
5,https,market-au.example.com,https,market-au.example.com,/api/v1/product/list/seller/warehouses,get,oec.product.product_api,query,browser_version,browser_version,5.0%20%28Windows%20NT%2010.0%3B%20Win64%3B%20x...,User Agent,5,2,0.4
6,https,market-au.example.com,https,market-au.example.com,/api/v1/seller/holiday_mode/list,get,oec.seller.profile_api,query,locale,locale,en,Language,8,3,0.6


In [205]:
# Utility functions
import copy
import hashlib

class Tokenizer:
    def __init__(self,
                 categorical_hash_columns=None,
                 categorical_columns=None,
                 numerical_columns=None,
                 target_columns=None,
                 shared_categories=False):
        # We will be accessing by index, which means the keys don't matter
        self.categorical_hash_columns = categorical_hash_columns
        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns
        self.target_columns = target_columns
        self.shared_categories = shared_categories

        self.hash_functions = [
            np.vectorize(lambda value: self._hash_fn(value, hash_mod))
            for hash_mod in self.categorical_hash_columns.values()
        ]

        self.target_mapping = {}
        self.target_indexing = []

        # Need to fit the data
        self.UNK = '<UNK>'

        if self.shared_categories:
            self.cat_mapping = {self.UNK: 0}
            self.cat_indexing = [self.UNK]
            self.cat_functions = np.vectorize(lambda value: self.cat_mapping.get(value, self.cat_mapping[self.UNK]))
        else:
            self.cat_mapping = [
                {self.UNK: 0}
                for _ in range(len(categorical_columns))
            ]
            self.cat_indexing = [
                [self.UNK]
                for _ in range(len(categorical_columns))
            ]
            self.cat_functions = [
                np.vectorize(lambda value: self.cat_mapping[idx].get(value, self.cat_mapping[idx][self.UNK]))
                for idx in range(len(categorical_columns))
            ]

    def __call__(self, hash_data, cat_data, num_data, target_data=None):
        hash_data, cat_data, num_data, target_data = self.tokenize(
            hash_data=hash_data, cat_data=cat_data,
            num_data=num_data, target_data=target_data
        )
        X_cats = np.hstack([hash_data, cat_data])
        X_nums = num_data
        y = target_data
        return self.to_tensor(X_cats, X_nums, y)

    @staticmethod
    def to_tensor(X_cat, X_num, y=None):
        X_cat_tensor = torch.tensor(X_cat, dtype=torch.long)
        X_num_tensor = torch.tensor(X_num, dtype=torch.float32)
    
        if y is not None:
            y_tensor = torch.tensor(y, dtype=torch.long)
        
            if y_tensor.ndim == 2 and y_tensor.shape[1] == 1:
                y_tensor = y_tensor.flatten()
            if y_tensor.ndim != 1:
                raise ValueError(f'Multitarget is not supported yet')
        
            return X_cat_tensor, X_num_tensor, y_tensor
        else:
            return X_cat_tensor, X_num_tensor
                                            
    def tokenize(self, *, hash_data=None, cat_data=None, num_data=None, target_data=None):
        return (
            self.tokenize_hash(hash_data) if hash_data is not None else None,
            self.tokenize_cats(cat_data) if cat_data is not None else None,
            num_data.to_numpy() if isinstance(num_data, pd.DataFrame) else num_data,
            self.tokenize_target(target_data) if target_data is not None else None
        )

    def tokenize_hash(self, data):
        if isinstance(data, pd.DataFrame):
            data = data.to_numpy()
        assert data.shape[1] == len(self.categorical_hash_columns)
        result = np.vstack([
            self.hash_functions[idx](data[:, idx])
            for idx in range(data.shape[1])
        ]).T
        return result

    def tokenize_cats(self, data):
        if isinstance(data, pd.DataFrame):
            data = data.to_numpy()
        assert data.shape[1] == len(self.categorical_columns)
        if self.shared_categories:
            result = [
                self.cat_functions(data[:, idx])
                for idx in range(data.shape[1])
            ]
        else:
            result = [
                self.cat_functions[idx](data[:, idx])
                for idx in range(data.shape[1])
            ]
        return np.vstack(result).T

    def tokenize_target(self, target):
        if isinstance(target, pd.DataFrame):
            target = target.to_numpy()
        return np.vectorize(lambda value: self.target_mapping[value])(target).reshape(target.shape)

    def decode_target(self, target_tokens):
        return np.vectorize(lambda value: self.target_indexing[value])(target_tokens).reshape(target_tokens.shape)
    
    @staticmethod
    def _hash_fn(value: str, hash_mod: int):
        value = str(value).encode()
        # Applies hash to a single string
        # Using multiple different hash algorithms reduces the collision rate
        value = ( int(hashlib.sha1(value).hexdigest(), 16) % hash_mod
                + int(hashlib.md5(value).hexdigest(), 16) % hash_mod
                + int(hashlib.sha256(value).hexdigest(), 16) % hash_mod
        ) % hash_mod
        return value

    def fit_categorical(self, data):
        if isinstance(data, pd.DataFrame):
            data = data.to_numpy()
        if self.shared_categories:
            unique_values = set(np.unique(data))
            current_tokens = set(self.cat_indexing)
            new_tokens = unique_values - current_tokens
            self.cat_indexing.extend(new_tokens)
            self.cat_mapping = {key: idx for idx, key in enumerate(self.cat_indexing)}
        else:
            unique_values = [set(np.unique(data[:, idx])) for idx in range(data.shape[1])]
            current_tokens = [set(toks) for toks in self.cat_indexing]
            new_tokens = [unique_values[idx] - current_tokens[idx] for idx in range(data.shape[1])]
            [self.cat_indexing[idx].extend(new_tokens[idx]) for idx in range(data.shape[1])]
            self.cat_mapping = [
                {key: idx for ci in self.cat_indexing for idx, key in enumerate(ci)}
            ]

    def fit_target(self, targets):
        unique_values = set(np.unique(targets))
        new_tokens = unique_values - set(self.target_indexing)
        self.target_indexing.extend(new_tokens)
        self.target_mapping = {key: idx for idx, key in enumerate(self.target_indexing)}

# Create tokenizer
tokenizer = Tokenizer(hash_columns, cat_columns, num_columns, target_column[0], shared_categories=False)

# Split the data into hashable, categorical, and numerical data
hash_data = data[list(categorical_hash_columns.keys())]
cat_data = data[list(categorical_columns)]
num_data = data[list(numerical_columns)]
target_data = data[target_column[0]]

# Learn the categories from the categorical data
tokenizer.fit_categorical(cat_data)
tokenizer.fit_target(target_data)

# Tokenize the data
tokenizer(hash_data=hash_data, cat_data=cat_data, num_data=num_data, target_data=target_data)

(tensor([[220, 104, 220, 104,  86, 159,  69, 164,  42,  42,  35,   1],
         [220, 104, 220, 104,  86, 159,  69, 164,  42,  42,  35,   2],
         [220, 104, 220, 104,  86, 159,  69, 164,  42,  42,  35,   1],
         [220, 104, 220,  34, 201, 124, 136, 164,  94,  94, 177,   3],
         [220, 104, 220,  34, 201, 124, 136, 164,  94,  94, 164,   1],
         [220, 104, 220, 104, 191, 124,  69, 164, 153, 153, 160,   4],
         [220, 104, 220, 104,  52, 124,  72, 164, 136, 136,  46,   5]]),
 tensor([[ 3.0000, -0.3000],
         [ 2.0000, -0.2000],
         [ 1.0000, -0.1000],
         [ 0.0000,  0.0000],
         [ 1.0000,  0.2000],
         [ 2.0000,  0.4000],
         [ 3.0000,  0.6000]]),
 tensor([0, 0, 0, 1, 1, 2, 0]))

# Step 0. Load the CSV file + Statistical information

In [206]:
data = pd.read_csv(csv_path)

# Step 1. Create dataloaders

In [207]:
class TabularCSVDataset(torch.utils.data.Dataset):
    r'''PyTorch dataset to load CSV files

    Args:
        csv_path: Path for the CSV file
        tokenizer: Tokenizer that is used to tokenize the data
        transform: Transformation callable to apply to the data before returning

    Methods:
        __getitem__:
            Returns a dict with keys 'X_cat', 'X_num', 'y_cat'

    Example:

        >>> dataset = TabularCSVDataset(csv_path, meta_path)
        >>> batch = dataset[:10]
    '''
    def __init__(self,
                 csv_path,
                 tokenizer,
                 transform=None):
        super().__init__()
        self.csv_path = csv_path
        self.tokenizer = tokenizer
        self.transform = transform

        # We don't have to open the CSV -- we can just load the data iteratively
        # Load the data
        data = pd.read_csv(self.csv_path)
        # Separate the data into columns
        data_hash = data[list(tokenizer.categorical_hash_columns.keys())]
        data_cats = data[list(tokenizer.categorical_columns)]
        data_nums = data[list(tokenizer.numerical_columns)]
        data_target = data[tokenizer.target_columns]

        self.X_cat, self.X_num, self.y = self.tokenizer(
            hash_data=data_hash, cat_data=data_cats, num_data=data_nums, target_data=data_target)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        data = {'X_cat': self.X_cat[idx], 'X_num': self.X_num[idx], 'y': self.y[idx]}
        if self.transform is not None:
            data = self.transform(data)
        return data

    

training_set = TabularCSVDataset(csv_path, tokenizer)
training_loader = torch.utils.data.DataLoader(training_set, batch_size=512, shuffle=True)

batch = next(iter(training_loader))
# batch_df = training_set.from_numeric(**batch)

print(f'Numeric batch')
print(batch)
print(f'DataFrame for the same numeric values')
# batch_df

Numeric batch
{'X_cat': tensor([[220, 104, 220, 104, 191, 124,  69, 164, 153, 153, 160,   4],
        [220, 104, 220,  34, 201, 124, 136, 164,  94,  94, 164,   1],
        [220, 104, 220, 104,  52, 124,  72, 164, 136, 136,  46,   5],
        [220, 104, 220,  34, 201, 124, 136, 164,  94,  94, 177,   3],
        [220, 104, 220, 104,  86, 159,  69, 164,  42,  42,  35,   1],
        [220, 104, 220, 104,  86, 159,  69, 164,  42,  42,  35,   2],
        [220, 104, 220, 104,  86, 159,  69, 164,  42,  42,  35,   1]]), 'X_num': tensor([[ 2.0000,  0.4000],
        [ 1.0000,  0.2000],
        [ 3.0000,  0.6000],
        [ 0.0000,  0.0000],
        [ 1.0000, -0.1000],
        [ 2.0000, -0.2000],
        [ 3.0000, -0.3000]]), 'y': tensor([2, 1, 0, 1, 0, 0, 0])}
DataFrame for the same numeric values


# Step 3. Create model

In [208]:
# Hyperparameters from the original paper
architecture_params = {
    'dim': 32,
    'depth': 6,
    'heads': 6,
    'attn_dropout': 0.1,
    'ff_dropout': 0.1,
    'mlp_hidden_mults': (4, 2),
    'mlp_act': nn.ReLU(),  # Can be reused, as this is stateless
    'use_shared_categ_embed': True,
}

data_params = {
    # [Hash, Cat]
    'categories': [num_cats for num_cats in tokenizer.categorical_hash_columns.values()] + ([len(cm) for cm in tokenizer.cat_mapping]),
    'num_continuous': len(tokenizer.numerical_columns),
    'dim_out': len(tokenizer.target_mapping)
}

model = TabTransformer(
    **data_params,
    **architecture_params,
)

model

TabTransformer(
  (category_embed): Embedding(2824, 28)
  (norm): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
  (transformer): Transformer(
    (layers): ModuleList(
      (0-5): 6 x ModuleList(
        (0): PreNorm(
          (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
          (fn): Attention(
            (to_qkv): Linear(in_features=32, out_features=288, bias=False)
            (to_out): Linear(in_features=96, out_features=32, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (1): PreNorm(
          (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
          (fn): FeedForward(
            (net): Sequential(
              (0): Linear(in_features=32, out_features=256, bias=True)
              (1): GEGLU()
              (2): Dropout(p=0.1, inplace=False)
              (3): Linear(in_features=128, out_features=32, bias=True)
            )
          )
        )
      )
    )
  )
  (mlp): MLP(
    (mlp): Sequ

In [209]:
from pathlib import Path

# Compute rough number of parameters
num_parameters = 0
for param in model.parameters():
    num_parameters += param.numel()
print(f'Model number of parameters: {num_parameters}')

# Get the model size, as saved
model_path = Path('/tmp/tab-transformer-temp.pt')
torch.save(model.cpu().state_dict(), model_path)
model_size_mb = model_path.stat().st_size / 1_000_000
print(f'Model size: {model_size_mb:.2f} Mb')

Model number of parameters: 2021855
Model size: 8.12 Mb


# Step 4. Train the model

**Note** We will be doing the supervised learning instead of unsupervised.

In [221]:
# Random output
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# model(X_cat_tensor, X_num_tensor)

batch = next(iter(training_loader))
with torch.no_grad():
    X_cat, X_num = batch['X_cat'], batch['X_num']
    X_cat = X_cat.to(device)
    X_num = X_num.to(device)
    y_hat = model(X_cat, X_num)
    y_hat = y_hat.argmax(-1)

    print(y_hat, batch['y'])

    predictions = tokenizer.decode_target(y_hat.cpu()).tolist()
    expectations = tokenizer.decode_target(batch['y'].cpu()).tolist()


print(f'{predictions=}')
print(f'{expectations=}')

tensor([1, 1, 1, 1, 1, 1, 1], device='cuda:0') tensor([1, 1, 0, 0, 2, 0, 0])
predictions=['Account Setting', 'Account Setting', 'Account Setting', 'Account Setting', 'Account Setting', 'Account Setting', 'Account Setting']
expectations=['Account Setting', 'Account Setting', 'Language', 'Language', 'User Agent', 'Language', 'Language']


In [222]:
from torch import nn

model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

num_epochs = 5

for epoch in range(num_epochs):
    for batch in training_loader:
        X_cat_tensor = batch['X_cat'].to(device)
        X_num_tensor = batch['X_num'].to(device)
        y_tensor = batch['y'].to(device)
    
        optimizer.zero_grad()
        
        y_hat = model(X_cat_tensor, X_num_tensor)
        loss = criterion(y_hat, y_tensor)
        loss.backward()
        optimizer.step()

# Step 5. Inference

In [225]:
batch = next(iter(training_loader))
with torch.no_grad():
    X_cat, X_num = batch['X_cat'], batch['X_num']
    X_cat = X_cat.to(device)
    X_num = X_num.to(device)
    y_hat = model(X_cat, X_num)
    y_hat = y_hat.argmax(-1)

    print(y_hat, batch['y'])

    predictions = tokenizer.decode_target(y_hat.cpu()).tolist()
    expectations = tokenizer.decode_target(batch['y'].cpu()).tolist()


print(f'{predictions=}')
print(f'{expectations=}')

tensor([0, 0, 0, 1, 1, 0, 2], device='cuda:0') tensor([0, 0, 0, 1, 1, 0, 2])
predictions=['Language', 'Language', 'Language', 'Account Setting', 'Account Setting', 'Language', 'User Agent']
expectations=['Language', 'Language', 'Language', 'Account Setting', 'Account Setting', 'Language', 'User Agent']


# Step 6. Save the trained model (Pure Torch)

In [226]:
save_path = Path('/tmp/tab-transformer-temp.pt')  # This is where the model will be saved to
torch.save(model.cpu(), model_path)

# Step 7. Load the model from the pretrained version (Pure Torch)

This step will be done on target machine (inference server, edge device, etc.)

In [229]:
inference_model = torch.load(save_path).to(device)

batch = next(iter(training_loader))
with torch.no_grad():
    X_cat, X_num = batch['X_cat'], batch['X_num']
    X_cat = X_cat.to(device)
    X_num = X_num.to(device)
    y_hat = inference_model(X_cat, X_num)
    y_hat = y_hat.argmax(-1)

    print(y_hat, batch['y'])

    predictions = tokenizer.decode_target(y_hat.cpu()).tolist()
    expectations = tokenizer.decode_target(batch['y'].cpu()).tolist()


print(f'{predictions=}')
print(f'{expectations=}')

tensor([0, 1, 2, 0, 0, 0, 1], device='cuda:0') tensor([0, 1, 2, 0, 0, 0, 1])
predictions=['Language', 'Account Setting', 'User Agent', 'Language', 'Language', 'Language', 'Account Setting']
expectations=['Language', 'Account Setting', 'User Agent', 'Language', 'Language', 'Language', 'Account Setting']


# JIT PyTorch

* This model does not support JIT scripting
* This model does not support JIT tracing