# Prerequisites

## Get the repo

```shell
git clone https://github.com/z-a-f/tabformer.git
cd tabformer
git submodule update --init --recursive
git submodule sync
```

## Install the prerequisites

```shell
conda install -c conda-forge numpy pandas einops
conda install transformers datasets
```


In [1]:
import os
import json
from collections import defaultdict
from pprint import pprint

import numpy as np
import pandas as pd

import torch
from torch import nn

from tab_transformer_pytorch import TabTransformer

# Location of the demo inputs: the raw CSV and the metadata JSON stored next to it
demo_data_path = 'demo_data'
os.makedirs(demo_data_path, exist_ok=True)  # no-op when the directory already exists

csv_path, meta_path = (
    os.path.join(demo_data_path, file_name)
    for file_name in ('data.csv', 'meta.json')
)

In [2]:
# Utility functions

# This is a support function to showcase how the metadata is stored
def make_metadata(df, target_columns):
    '''Creates metadata based on some data frame

    Columns of dtype `category` / `object` are treated as categorical, every
    other column is treated as numerical. The data frame is only read --
    its columns and dtypes are left untouched.

    Args:
        df: Pandas DataFrame to describe
        target_columns: List of column names that are prediction targets;
            all the remaining columns are treated as features

    Returns:
        A dict made of plain dicts / lists with the keys
            'num_targets': number of target columns
            'num_outputs': column_name : number of outputs for each target
            'columns': 'features' / 'targets', each of them split into
                'categorical' (column_name : list of unique values -- similar to vocabulary)
                and 'numerical' (list of column names)
    '''
    # Get categorical
    categorical = df.select_dtypes(['category', 'object']).columns
    numerical = df.columns[~df.columns.isin(categorical)]

    def vocabulary(column_name):
        # Convert a copy of the column, so that the caller's frame keeps its dtypes
        return df[column_name].astype('category').cat.categories.tolist()

    # Get unique values
    feature_columns = df.columns.difference(target_columns)
    columns = {}
    for name, column_list in zip(['features', 'targets'], [feature_columns, target_columns]):
        columns[name] = {
            'categorical': {cat: vocabulary(cat) for cat in categorical.intersection(column_list)},
            'numerical': numerical.intersection(column_list).tolist(),
        }

    # Get the output stats
    num_outputs = {}
    for name, values in columns['targets']['categorical'].items():
        # Binary classification is described by a single output
        num_outputs[name] = 1 if len(values) == 2 else len(values)
    for name in columns['targets']['numerical']:
        num_outputs[name] = 1  # Regression

    return {
        'num_targets': len(num_outputs),
        'num_outputs': num_outputs,
        'columns': columns,
    }

# def df_inference(model, data, as_category=True):
#     '''Runs inference on a model'''
#     model_device = list(model.parameters())[0].device
#     # Get the tensor representation -- note that target_columns could be empty
#     X, y = to_feature_target(data, feature_columns, [])
#     X_cat, X_num = to_cat_num(X, categorical, numerical)
#     X_cat_tensor, X_num_tensor = to_tensor(X_cat, X_num, None)

#     with torch.no_grad():
#         X_cat_tensor = X_cat_tensor.to(model_device)
#         X_num_tensor = X_num_tensor.to(model_device)
#         y_hat = model(X_cat_tensor, X_num_tensor)

#         predictions = y_hat.argmax(-1)
#     if as_category:
#         # predictions = predictions.tolist()
#         return data[target_columns[0]].cat.categories[predictions.cpu()].tolist()
#     return predictions

# def to_feature_target(data, feature_columns, target_columns):
#     '''
#     Args:
#         data: Pandas DataFrame
#         feature_columns: Columns that are representation of features
#         target_columns: Column names that are representation of prediction targets
#     '''
#     Xy = data.copy()
#     Xy[categorical] = Xy[categorical].apply(lambda column: column.cat.codes)
    
#     X = Xy[feature_columns]
#     y = Xy[target_columns]

#     return X, y

# def to_cat_num(data, categorical, numerical):
#     '''
#     Args:
#         data: Pandas DataFrame
#         categorical: Columns that are categorical
#         numerical: Columns that are numerical
#     '''
#     data_cat = data[categorical.intersection(feature_columns)]
#     data_num = data[numerical.intersection(feature_columns)]
#     return data_cat, data_num

# def to_tensor(X_cat, X_num, y=None):
#     X_cat_tensor = torch.tensor(X_cat.values, dtype=torch.long)
#     X_num_tensor = torch.tensor(X_num.values, dtype=torch.float32)

#     if y is not None:
#         y_tensor = torch.tensor(y.values, dtype=torch.long)
    
#         if y_tensor.ndim == 2 and y_tensor.shape[1] == 1:
#             y_tensor = y_tensor.flatten()
#         if y_tensor.ndim != 1:
#             raise ValueError(f'Multitarget is not supported yet')
    
#         return X_cat_tensor, X_num_tensor, y_tensor
#     else:
#         return X_cat_tensor, X_num_tensor

# Step 0. Load the CSV file + Statistical information

In [3]:
target_column = 'data_type_name'

# Reuse the cached metadata when it exists; otherwise derive it from the CSV and cache it
if not os.path.isfile(meta_path):
    data = pd.read_csv(csv_path)
    metadata = make_metadata(data, target_columns=[target_column])
    with open(meta_path, 'w') as f:
        json.dump(metadata, f)
else:
    with open(meta_path, 'r') as f:
        metadata = json.load(f)

pprint(metadata)

{'columns': {'features': {'categorical': {'api_host': ['market-au.example.com',
                                                       'open-api.example.com'],
                                          'api_method': ['get', 'post'],
                                          'api_path': ['/api/v1/product/list/seller/warehouses',
                                                       '/api/v1/product/prohibited/words/check',
                                                       '/api/v1/seller/holiday_mode/list',
                                                       '/v0/oauth/check_qr'],
                                          'api_protocol': ['https'],
                                          'field_name': ['browser_language',
                                                         'browser_version',
                                                         'locale',
                                                         'token'],
                                          'field

# Step 1. Create dataloaders

In [4]:
# Split the feature columns by kind, as recorded in the metadata
categorical_features, numerical_features = (
    metadata['columns']['features'][kind] for kind in ('categorical', 'numerical')
)

# Only a single prediction target is handled
if metadata['num_targets'] > 1:
    raise ValueError(f'Multitarget not supported yet')

# Number of outputs of the first (and only) target
output_dimensions = [*metadata['num_outputs'].values()][0]

In [16]:
class TabularCSVDataset(torch.utils.data.Dataset):
    r'''PyTorch dataset to load CSV files

    This class loads the tabular data based on the metadata in the `meta_path`.
    If the metadata file is not available, please generate it using the `make_metadata()` function.

    Categorical values are encoded as their position in the vocabulary stored in the
    metadata, so the encoding does not depend on which values happen to be present
    in a particular data frame. Values that are missing from the vocabulary
    (including NaN) are encoded as -1, and -1 is decoded back to NaN.

    Args:
        csv_path: Path for the CSV file
        meta_path: Path to the metadata JSON file
        transform: Transformation callable to apply to the data before returning

    Raises:
        ValueError: If `meta_path` does not point to an existing file

    Methods:
        __getitem__:
            Returns a dict with keys 'X_cat', 'X_num', 'y_cat', 'y_num'
        to_numeric:
            Converts a pandas DataFrame into numerical representation according to the metadata.
            This returns `X_cat, X_num, y_cat, y_num` -- categorical / numerical features (X) and targets (y)
        from_numeric:
            Inverse of the `to_numeric` -- converts `X_cat, X_num, y_cat, y_num` into pandas representation.
            This also converts the numerical categories into their labels.

    Example:

        >>> dataset = TabularCSVDataset(csv_path, meta_path)
        >>> batch = dataset[:10]
    '''
    def __init__(self, csv_path, meta_path, transform=None):
        super().__init__()
        self.csv_path = csv_path
        self.meta_path = meta_path
        self.transform = transform

        if not os.path.isfile(meta_path):
            raise ValueError(f'Cannot find the metadata')

        with open(meta_path, 'r') as f:
            self.metadata = json.load(f)
        # Features stats
        self.num_categorical = [len(v) for v in self.metadata['columns']['features']['categorical'].values()]
        self.num_numerical = len(self.metadata['columns']['features']['numerical'])
        self.num_targets = self.metadata['num_targets']
        self.num_outputs = list(self.metadata['num_outputs'].values())

        # Preload the data
        self.data = pd.read_csv(self.csv_path)
        categorical = list(self.metadata['columns']['features']['categorical'].keys())
        self.data[categorical] = self.data[categorical].astype('category')
        categorical = list(self.metadata['columns']['targets']['categorical'].keys())
        self.data[categorical] = self.data[categorical].astype('category')

        # Preprocess the data
        self.X_cat, self.X_num, self.y_cat, self.y_num = self.to_numeric(self.data, astensor=True)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = {'X_cat': self.X_cat[idx], 'X_num': self.X_num[idx], 'y_cat': self.y_cat[idx], 'y_num': self.y_num[idx]}
        if self.transform is not None:
            data = self.transform(data)
        return data

    @staticmethod
    def _encode(df, vocabularies):
        '''Encodes the columns named in `vocabularies` as int64 vocabulary positions of shape (rows, columns)'''
        codes = np.empty((len(df), len(vocabularies)), dtype=np.int64)
        for position, (name, vocabulary) in enumerate(vocabularies.items()):
            # `get_indexer` returns -1 for the values that are not in the vocabulary
            codes[:, position] = pd.Index(vocabulary).get_indexer(df[name])
        return codes

    @staticmethod
    def _decode(codes, vocabularies):
        '''Replaces the vocabulary positions in `codes` with the labels from `vocabularies`'''
        names = list(vocabularies.keys())
        frame = pd.DataFrame(TabularCSVDataset._as_array(codes), columns=names)
        for name in names:
            labels = pd.Categorical.from_codes(frame[name].to_numpy(), categories=vocabularies[name])
            frame[name] = labels.astype(object)  # plain labels; the code -1 becomes NaN
        return frame

    @staticmethod
    def _as_array(values):
        '''Converts a tensor (on any device) into a numpy array; other inputs are returned as they are'''
        if isinstance(values, torch.Tensor):
            return values.detach().cpu().numpy()
        return values

    @staticmethod
    def _flatten_single_column(values):
        '''Turns an array of shape (rows, 1) into shape (rows,); other shapes are kept'''
        if values.ndim == 2 and values.shape[1] == 1:
            return values.flatten()
        return values

    def to_numeric(self, df, astensor=False):
        '''Converts `df` into `X_cat, X_num, y_cat, y_num` according to the metadata

        Args:
            df: Pandas DataFrame that has all the feature and target columns listed in the metadata
            astensor: If True, the results are torch tensors; otherwise numpy arrays

        Returns:
            Tuple `X_cat, X_num, y_cat, y_num`. The categorical parts are int64, the numerical
            parts are float32. A target part that has exactly one column is flattened to 1-D.
        '''
        columns = self.metadata['columns']

        # Features
        X_cat = self._encode(df, columns['features']['categorical'])
        X_num = df[columns['features']['numerical']].to_numpy().astype(np.float32)

        # Targets
        y_cat = self._flatten_single_column(self._encode(df, columns['targets']['categorical']))
        y_num = self._flatten_single_column(df[columns['targets']['numerical']].to_numpy().astype(np.float32))

        if astensor:
            X_cat = torch.tensor(X_cat)
            X_num = torch.tensor(X_num)
            y_cat = torch.tensor(y_cat)
            y_num = torch.tensor(y_num)

        return X_cat, X_num, y_cat, y_num

    def from_numeric(self, X_cat=None, X_num=None, y_cat=None, y_num=None):
        '''Converts the numerical representation back into a pandas DataFrame

        Every argument is optional (tensor or numpy array); only the columns of the
        given parts are present in the result. Categorical codes are replaced by
        their labels from the metadata vocabulary.
        '''
        columns = self.metadata['columns']
        result = pd.DataFrame()
        if X_cat is not None:
            categorical = list(columns['features']['categorical'].keys())
            result[categorical] = self._decode(X_cat, columns['features']['categorical'])
        if X_num is not None:
            numerical = list(columns['features']['numerical'])
            result[numerical] = pd.DataFrame(self._as_array(X_num), columns=numerical)
        if y_cat is not None:
            categorical = list(columns['targets']['categorical'].keys())
            result[categorical] = self._decode(y_cat, columns['targets']['categorical'])
        if y_num is not None:
            numerical = list(columns['targets']['numerical'])
            result[numerical] = pd.DataFrame(self._as_array(y_num), columns=numerical)
        return result

# Wrap the CSV in a dataset and serve it in shuffled mini-batches
training_set = TabularCSVDataset(csv_path=csv_path, meta_path=meta_path)
training_loader = torch.utils.data.DataLoader(
    dataset=training_set,
    shuffle=True,
    batch_size=512,
)

# Take one batch and convert it back to labels to check the numeric <-> label round trip
batch = next(iter(training_loader))
batch_df = training_set.from_numeric(**batch)

print(f'Numeric batch', batch, sep='\n')
print(f'DataFrame for the same numeric values')
batch_df

Numeric batch
{'X_cat': tensor([[0, 0, 0, 0, 2, 0, 2, 0, 2, 2, 2],
        [0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 3],
        [0, 0, 0, 1, 3, 0, 0, 0, 3, 3, 4],
        [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0],
        [0, 0, 0, 1, 3, 0, 0, 0, 3, 3, 1],
        [0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 3],
        [0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 3]]), 'X_num': tensor([], size=(7, 0)), 'y_cat': tensor([1, 1, 0, 2, 0, 1, 1]), 'y_num': tensor([], size=(7, 0))}
DataFrame for the same numeric values


Unnamed: 0,page_protocol,page_host,api_protocol,api_host,api_path,api_method,psm,scope,field_name,field_path,field_sample,data_type_name
0,https,market-au.example.com,https,market-au.example.com,/api/v1/seller/holiday_mode/list,get,oec.seller.profile_api,query,locale,locale,en,Language
1,https,market-au.example.com,https,market-au.example.com,/api/v1/product/prohibited/words/check,post,oec.product.product_api,query,browser_language,browser_language,en-US,Language
2,https,market-au.example.com,https,open-api.example.com,/v0/oauth/check_qr,get,abcd.openapi.gateway,query,token,token,wxyz_aueast3,Account Setting
3,https,market-au.example.com,https,market-au.example.com,/api/v1/product/list/seller/warehouses,get,oec.product.product_api,query,browser_version,browser_version,5.0%20%28Windows%20NT%2010.0%3B%20Win64%3B%20x...,User Agent
4,https,market-au.example.com,https,open-api.example.com,/v0/oauth/check_qr,get,abcd.openapi.gateway,query,token,token,abcd_aueast3,Account Setting
5,https,market-au.example.com,https,market-au.example.com,/api/v1/product/prohibited/words/check,post,oec.product.product_api,query,browser_language,browser_language,en-US,Language
6,https,market-au.example.com,https,market-au.example.com,/api/v1/product/prohibited/words/check,post,oec.product.product_api,query,browser_language,browser_language,en-US,Language


# ~Step 2. Convert the data to numerical values~

Don't need this anymore

In [6]:
# X, y = to_feature_target(data, feature_columns, target_columns)
# X_cat, X_num = to_cat_num(X, categorical, numerical)
# X_cat_tensor, X_num_tensor, y_tensor = to_tensor(X_cat, X_num, y)

# # Need the number of categories in each categorical column for the embedding layers
# X_num_categorical = [len(X[column].unique()) for column in categorical if column not in target_columns]

# Step 3. Create model

In [7]:
# Shape of the categorical feature tensor of the batch sampled above
batch['X_cat'].shape

torch.Size([7, 11])

In [8]:
# Model hyperparameters (described by the notebook author as taken from the original paper)
params = dict(
    dim=32,
    depth=6,
    heads=6,
    attn_dropout=0.1,
    ff_dropout=0.1,
    mlp_hidden_mults=(4, 2),
    mlp_act=nn.ReLU(),  # Can be reused, as this is stateless
    use_shared_categ_embed=True,
)

# The input / output sizes come from the dataset statistics
model = TabTransformer(
    categories=training_set.num_categorical,
    num_continuous=training_set.num_numerical,
    dim_out=training_set.num_outputs[0],  # No support for multi-target, so just take the first output
    # continuous_mean_std = mean_std,
    **params,
)

In [9]:
from pathlib import Path

# Compute rough number of parameters
num_parameters = sum(param.numel() for param in model.parameters())
print(f'Model number of parameters: {num_parameters}')

# Get the model size, as saved: write the state dict to disk and measure the file
model_path = Path('/tmp/tab-transformer-temp.pt')
state_dict = model.cpu().state_dict()
torch.save(state_dict, model_path)
saved_bytes = model_path.stat().st_size
model_size_mb = saved_bytes / 1_000_000
print(f'Model size: {model_size_mb:.2f} Mb')

Model number of parameters: 1642103
Model size: 6.60 Mb


# Step 4. Train the model

**Note:** We will be doing supervised learning rather than unsupervised learning.

In [10]:
# Random output -- predictions of the model before any training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# model(X_cat_tensor, X_num_tensor)

batch = next(iter(training_loader))

was_training = model.training
model.eval()  # Disable dropout (attn_dropout / ff_dropout are non-zero) for the forward pass
with torch.no_grad():
    # The inputs must live on the same device as the model
    y_hat = model(batch['X_cat'].to(device), batch['X_num'].to(device))
    y_hat = y_hat.argmax(-1)
model.train(was_training)  # Restore the previous mode

predictions = training_set.from_numeric(y_cat=y_hat)
expectations = training_set.from_numeric(y_cat=batch['y_cat'])

# Put the predicted and the expected labels of every target column side by side
comparison_df = pd.DataFrame()
for col in predictions.columns:
    comparison_df[col + ' (Predicted)'] = predictions[col]
    comparison_df[col + ' (Expected)'] = expectations[col]

comparison_df

Unnamed: 0,data_type_name (Predicted),data_type_name (Expected)
0,Language,Account Setting
1,Language,Account Setting
2,Language,Language
3,Language,Language
4,Language,User Agent
5,Language,Language
6,Language,Language


In [11]:
from torch import nn

model = model.to(device)
# Notebook cells get re-run out of order: make sure that the model is in training
# mode (dropout enabled) even if another cell switched it to eval mode
model.train()

# NOTE(review): `CrossEntropyLoss` expects one logit per class, while `make_metadata()`
# stores a single output for binary targets -- confirm before using a two-class target
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

num_epochs = 5

for epoch in range(num_epochs):
    for batch in training_loader:
        # Move the batch to the device of the model
        X_cat_tensor = batch['X_cat'].to(device)
        X_num_tensor = batch['X_num'].to(device)
        y_tensor = batch['y_cat'].to(device)

        optimizer.zero_grad()

        y_hat = model(X_cat_tensor, X_num_tensor)
        loss = criterion(y_hat, y_tensor)
        loss.backward()
        optimizer.step()

# Step 5. Inference

In [12]:
batch = next(iter(training_loader))

# Run on whichever device the model currently lives on
model_device = next(model.parameters()).device
was_training = model.training
model.eval()  # Disable dropout, so that the predictions are deterministic
with torch.no_grad():
    y_hat = model(batch['X_cat'].to(model_device), batch['X_num'].to(model_device))
    y_hat = y_hat.argmax(-1)
model.train(was_training)  # Restore the previous mode

predictions = training_set.from_numeric(y_cat=y_hat)
expectations = training_set.from_numeric(y_cat=batch['y_cat'])

# Put the predicted and the expected labels of every target column side by side
comparison_df = pd.DataFrame()
for col in predictions.columns:
    comparison_df[col + ' (Predicted)'] = predictions[col]
    comparison_df[col + ' (Expected)'] = expectations[col]

comparison_df

Unnamed: 0,data_type_name (Predicted),data_type_name (Expected)
0,Language,Language
1,Language,Language
2,Language,Language
3,Account Setting,Account Setting
4,Account Setting,Account Setting
5,Language,Language
6,User Agent,User Agent


# Step 6. Save the trained model (Pure Torch)

In [13]:
save_path = Path('/tmp/tab-transformer-temp.pt')  # This is where the model will be saved to
# NOTE: this pickles the whole module (not only its `state_dict`), so the class of
# the model has to be importable wherever the file is loaded
torch.save(model.cpu(), save_path)

# Step 7. Load the model from the pretrained version (Pure Torch)

This step will be done on the target machine (inference server, edge device, etc.)

In [14]:
# The file holds a fully pickled module, which needs `weights_only=False`
# (recent PyTorch versions default to `weights_only=True` and refuse to load it).
# SECURITY: unpickling can execute arbitrary code -- only load files that you trust.
inference_model = torch.load(save_path, weights_only=False)
inference_model.eval()  # Inference only: disable dropout

batch = next(iter(training_loader))
model_device = next(inference_model.parameters()).device
with torch.no_grad():
    y_hat = inference_model(batch['X_cat'].to(model_device), batch['X_num'].to(model_device))
    y_hat = y_hat.argmax(-1)

predictions = training_set.from_numeric(y_cat=y_hat)
expectations = training_set.from_numeric(y_cat=batch['y_cat'])

# Put the predicted and the expected labels of every target column side by side
comparison_df = pd.DataFrame()
for col in predictions.columns:
    comparison_df[col + ' (Predicted)'] = predictions[col]
    comparison_df[col + ' (Expected)'] = expectations[col]

comparison_df

Unnamed: 0,data_type_name (Predicted),data_type_name (Expected)
0,User Agent,User Agent
1,Language,Language
2,Account Setting,Account Setting
3,Language,Language
4,Language,Language
5,Language,Language
6,Account Setting,Account Setting


# JIT PyTorch

* This model does not support JIT scripting
* This model does not support JIT tracing