In [1]:
data_csv = '''
page_protocol,page_host,api_protocol,api_host,api_path,api_method,psm,scope,field_name,field_path,field_sample,data_type_name
https,market-au.example.com,https,market-au.example.com,/api/v1/product/prohibited/words/check,post,oec.product.product_api,query,browser_language,browser_language,en-US,Language
https,market-au.example.com,https,market-au.example.com,/api/v1/product/prohibited/words/check,post,oec.product.product_api,query,browser_language,browser_language,en-US,Language
https,market-au.example.com,https,market-au.example.com,/api/v1/product/prohibited/words/check,post,oec.product.product_api,query,browser_language,browser_language,en-US,Language
https,market-au.example.com,https,open-api.example.com,/v0/oauth/check_qr,get,abcd.openapi.gateway,query,token,token,abcd_aueast3,Account Setting
https,market-au.example.com,https,open-api.example.com,/v0/oauth/check_qr,get,abcd.openapi.gateway,query,token,token,wxyz_aueast3,Account Setting
https,market-au.example.com,https,market-au.example.com,/api/v1/product/list/seller/warehouses,get,oec.product.product_api,query,browser_version,browser_version,5.0%20%28Windows%20NT%2010.0%3B%20Win64%3B%20x64%29%20AppleWebKit%2F537.36%20%28KHTML%2C%20like%20Gecko%29%20Chrome%2F120.0.0.0%20Safari%2F537.36,User Agent
https,market-au.example.com,https,market-au.example.com,/api/v1/seller/holiday_mode/list,get,oec.seller.profile_api,query,locale,locale,en,Language
'''

In [2]:
import pandas as pd
from io import StringIO

data = pd.read_csv(StringIO(data_csv))
target_column = 'data_type_name'
feature_columns = data.columns[data.columns != target_column]
target_columns = data.columns[data.columns == target_column]  # Keep this to maintain the DataFrame

if len(target_columns) > 1:
    raise ValueError(f'Multilabel not implemented yet')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   page_protocol   7 non-null      object
 1   page_host       7 non-null      object
 2   api_protocol    7 non-null      object
 3   api_host        7 non-null      object
 4   api_path        7 non-null      object
 5   api_method      7 non-null      object
 6   psm             7 non-null      object
 7   scope           7 non-null      object
 8   field_name      7 non-null      object
 9   field_path      7 non-null      object
 10  field_sample    7 non-null      object
 11  data_type_name  7 non-null      object
dtypes: object(12)
memory usage: 804.0+ bytes


# Step 1. Collect some info

TODO: Remove NaN

In [3]:
# Collect the statistics for categorical and numerical columns in X
# categorical = data[feature_columns].select_dtypes(['category', 'object']).columns
categorical = data.select_dtypes(['category', 'object']).columns
numerical = data.columns[~data.columns.isin(categorical)]
data[categorical] = data[categorical].astype('category')

target_column_dtype = data[target_columns[0]].dtype  # Only a single target for now
if target_column_dtype in ('object', 'category'):
    # Classification
    output_dimensions = len(target_column_dtype.categories)
    if output_dimensions == 2:
        output_dimensions = 1  # Binary classification can be done with a single output
else:
    # Regression
    output_dimensions = 1

# Step 2. Convert the data to numerical values

In [4]:
import torch

def to_feature_target(data, feature_columns, target_columns):
    '''
    Args:
        data: Pandas DataFrame
        feature_columns: Columns that are representation of features
        target_columns: Column names that are tepresentation of prediction targets
    '''
    Xy = data.copy()
    Xy[categorical] = Xy[categorical].apply(lambda column: column.cat.codes)
    
    X = Xy[feature_columns]
    y = Xy[target_columns]

    return X, y

def to_cat_num(data, categorical, numerical):
    '''
    Args:
        data: Pandas DataFrame
        categorical: Columns that are categorical
        numerical: Columns that are numerical
    '''
    data_cat = data[categorical.intersection(feature_columns)]
    data_num = data[numerical.intersection(feature_columns)]
    return data_cat, data_num

def to_tensor(X_cat, X_num, y=None):
    X_cat_tensor = torch.tensor(X_cat.values, dtype=torch.long)
    X_num_tensor = torch.tensor(X_num.values, dtype=torch.float32)

    if y is not None:
        y_tensor = torch.tensor(y.values, dtype=torch.long)
    
        if y_tensor.ndim == 2 and y_tensor.shape[1] == 1:
            y_tensor = y_tensor.flatten()
        if y_tensor.ndim != 1:
            raise ValueError(f'Multitarget is not supported yet')
    
        return X_cat_tensor, X_num_tensor, y_tensor
    else:
        return X_cat_tensor, X_num_tensor

In [5]:
X, y = to_feature_target(data, feature_columns, target_columns)
X_cat, X_num = to_cat_num(X, categorical, numerical)
X_cat_tensor, X_num_tensor, y_tensor = to_tensor(X_cat, X_num, y)

# Need the number of categories in each categorical column for the embedding layers
X_num_categorical = [len(X[column].unique()) for column in categorical if column not in target_columns]

# Step 3. Create model

In [6]:
from torch import nn

from tab_transformer_pytorch import TabTransformer

# Hyperparameters from the original paper
params = {
    'dim': 32,
    'depth': 6,
    'heads': 6,
    'attn_dropout': 0.1,
    'ff_dropout': 0.1,
    'mlp_hidden_mults': (4, 2),
    'mlp_act': nn.ReLU(),  # Can be reused, as this is stateless
    'use_shared_categ_embed': True,
}


model = TabTransformer(
    categories = X_num_categorical,
    num_continuous = len(numerical),
    dim_out = output_dimensions,
    # continuous_mean_std = mean_std,
    **params,
)

model

TabTransformer(
  (category_embed): Embedding(30, 28)
  (transformer): Transformer(
    (layers): ModuleList(
      (0-5): 6 x ModuleList(
        (0): PreNorm(
          (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
          (fn): Attention(
            (to_qkv): Linear(in_features=32, out_features=288, bias=False)
            (to_out): Linear(in_features=96, out_features=32, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (1): PreNorm(
          (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
          (fn): FeedForward(
            (net): Sequential(
              (0): Linear(in_features=32, out_features=256, bias=True)
              (1): GEGLU()
              (2): Dropout(p=0.1, inplace=False)
              (3): Linear(in_features=128, out_features=32, bias=True)
            )
          )
        )
      )
    )
  )
  (mlp): MLP(
    (mlp): Sequential(
      (0): Linear(in_features=352, out_features=1408, bi

In [7]:
from pathlib import Path

# Compute rough number of parameters
num_parameters = 0
for param in model.parameters():
    num_parameters += param.numel()
print(f'Model number of parameters: {num_parameters}')

# Get the model size, as saved
model_path = Path('/tmp/tab-transformer-temp.pt')
torch.save(model.cpu().state_dict(), model_path)
model_size_mb = model_path.stat().st_size / 1_000_000
print(f'Model size: {model_size_mb:.2f} Mb')

Model number of parameters: 1642103
Model size: 6.60 Mb


# Step 4. Train the model

**Note** We will be doing the supervised learning instead of unsupervised.

In [8]:
def df_inference(model, data, as_category=True):
    '''Runs inference on a model'''
    model_device = list(model.parameters())[0].device
    # Get the tensor representaition -- note that target_columns could be empty
    X, y = to_feature_target(data, feature_columns, [])
    X_cat, X_num = to_cat_num(X, categorical, numerical)
    X_cat_tensor, X_num_tensor = to_tensor(X_cat, X_num, None)

    with torch.no_grad():
        X_cat_tensor = X_cat_tensor.to(model_device)
        X_num_tensor = X_num_tensor.to(model_device)
        y_hat = model(X_cat_tensor, X_num_tensor)

        predictions = y_hat.argmax(-1)
    if as_category:
        # predictions = predictions.tolist()
        return data[target_columns[0]].cat.categories[predictions.cpu()].tolist()
    return predictions

In [9]:
# Random output
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# model(X_cat_tensor, X_num_tensor)

predictions = df_inference(model, data)
for idx in range(len(data)):
    print(f'Sample {idx}...')
    print(f'  Predicted: "{predictions[idx]}"')
    print(f'  Expected : "{data.loc[idx, target_columns[0]]}"')
    print()

Sample 0...
  Predicted: "Account Setting"
  Expected : "Language"

Sample 1...
  Predicted: "User Agent"
  Expected : "Language"

Sample 2...
  Predicted: "Account Setting"
  Expected : "Language"

Sample 3...
  Predicted: "Account Setting"
  Expected : "Account Setting"

Sample 4...
  Predicted: "Account Setting"
  Expected : "Account Setting"

Sample 5...
  Predicted: "Account Setting"
  Expected : "User Agent"

Sample 6...
  Predicted: "Account Setting"
  Expected : "Language"



In [10]:
from torch import nn

model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

num_epochs = 5

for epoch in range(num_epochs):
    # No batching -- dataset too small
    X_cat_tensor = X_cat_tensor.to(device)
    X_num_tensor = X_num_tensor.to(device)
    y_tensor = y_tensor.to(device)

    optimizer.zero_grad()
    
    y_hat = model(X_cat_tensor, X_num_tensor)
    loss = criterion(y_hat, y_tensor)
    loss.backward()
    optimizer.step()

# Step 5. Inference

In [11]:
predictions = df_inference(model, data)
for idx in range(len(data)):
    print(f'Sample {idx}...')
    print(f'  Predicted: "{predictions[idx]}"')
    print(f'  Expected : "{data.loc[idx, target_columns[0]]}"')
    print()

Sample 0...
  Predicted: "Language"
  Expected : "Language"

Sample 1...
  Predicted: "Language"
  Expected : "Language"

Sample 2...
  Predicted: "Language"
  Expected : "Language"

Sample 3...
  Predicted: "Account Setting"
  Expected : "Account Setting"

Sample 4...
  Predicted: "Account Setting"
  Expected : "Account Setting"

Sample 5...
  Predicted: "User Agent"
  Expected : "User Agent"

Sample 6...
  Predicted: "Language"
  Expected : "Language"



# Step 6. Save the trained model (Pure Torch)

In [12]:
save_path = Path('/tmp/tab-transformer-temp.pt')  # This is where the model will be saved to
torch.save(model.cpu(), model_path)

# Step 7. Load the model from the pretrained version (Pure Torch)

This step will be done on target machine (inference server, edge device, etc.)

In [13]:
inference_model = torch.load(save_path)

predictions = df_inference(inference_model, data)
for idx in range(len(data)):
    print(f'Sample {idx}...')
    print(f'  Predicted: "{predictions[idx]}"')
    print(f'  Expected : "{data.loc[idx, target_columns[0]]}"')
    print()

Sample 0...
  Predicted: "Language"
  Expected : "Language"

Sample 1...
  Predicted: "Language"
  Expected : "Language"

Sample 2...
  Predicted: "Language"
  Expected : "Language"

Sample 3...
  Predicted: "Account Setting"
  Expected : "Account Setting"

Sample 4...
  Predicted: "Account Setting"
  Expected : "Account Setting"

Sample 5...
  Predicted: "User Agent"
  Expected : "User Agent"

Sample 6...
  Predicted: "Language"
  Expected : "Language"



# JIT PyTorch

* This model does not support JIT scripting
* This model does not support JIT tracing