In [1]:
data_csv = '''
page_protocol,page_host,api_protocol,api_host,api_path,api_method,psm,scope,field_name,field_path,field_sample,data_type_name
https,market-au.example.com,https,market-au.example.com,/api/v1/product/prohibited/words/check,post,oec.product.product_api,query,browser_language,browser_language,en-US,Language
https,market-au.example.com,https,market-au.example.com,/api/v1/product/prohibited/words/check,post,oec.product.product_api,query,browser_language,browser_language,en-US,Language
https,market-au.example.com,https,market-au.example.com,/api/v1/product/prohibited/words/check,post,oec.product.product_api,query,browser_language,browser_language,en-US,Language
https,market-au.example.com,https,open-api.example.com,/v0/oauth/check_qr,get,abcd.openapi.gateway,query,token,token,abcd_aueast3,Account Setting
https,market-au.example.com,https,open-api.example.com,/v0/oauth/check_qr,get,abcd.openapi.gateway,query,token,token,wxyz_aueast3,Account Setting
https,market-au.example.com,https,market-au.example.com,/api/v1/product/list/seller/warehouses,get,oec.product.product_api,query,browser_version,browser_version,5.0%20%28Windows%20NT%2010.0%3B%20Win64%3B%20x64%29%20AppleWebKit%2F537.36%20%28KHTML%2C%20like%20Gecko%29%20Chrome%2F120.0.0.0%20Safari%2F537.36,User Agent
https,market-au.example.com,https,market-au.example.com,/api/v1/seller/holiday_mode/list,get,oec.seller.profile_api,query,locale,locale,en,Language
'''

In [2]:
import pandas as pd
from io import StringIO

data = pd.read_csv(StringIO(data_csv))
target_column = 'data_type_name'
feature_columns = data.columns[data.columns != target_column]
target_columns = data.columns[data.columns == target_column]  # Keep this to maintain the DataFrame

if len(target_columns) > 1:
    raise ValueError(f'Multilabel not implemented yet')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   page_protocol   7 non-null      object
 1   page_host       7 non-null      object
 2   api_protocol    7 non-null      object
 3   api_host        7 non-null      object
 4   api_path        7 non-null      object
 5   api_method      7 non-null      object
 6   psm             7 non-null      object
 7   scope           7 non-null      object
 8   field_name      7 non-null      object
 9   field_path      7 non-null      object
 10  field_sample    7 non-null      object
 11  data_type_name  7 non-null      object
dtypes: object(12)
memory usage: 804.0+ bytes


# Step 1. Collect some info

TODO: Remove NaN

In [3]:
# Collect the statistics for categorical and numerical columns in X
# categorical = data[feature_columns].select_dtypes(['category', 'object']).columns
categorical = data.select_dtypes(['category', 'object']).columns
numerical = data.columns[~data.columns.isin(categorical)]
data[categorical] = data[categorical].astype('category')

target_column_dtype = data[target_columns[0]].dtype  # Only a single target for now
if target_column_dtype in ('object', 'category'):
    # Classification
    output_dimensions = len(target_column_dtype.categories)
    if output_dimensions == 2:
        output_dimensions = 1  # Binary classification can be done with a single output
else:
    # Regression
    output_dimensions = 1

# Step 2. Convert the data to numerical values

In [4]:
# Create the X, y data
Xy = data.copy()
Xy[categorical] = Xy[categorical].apply(lambda column: column.cat.codes)

X = Xy[feature_columns]
y = Xy[target_columns]

X_num_categorical = [len(X[column].unique()) for column in categorical if column not in target_columns]

In [5]:
# Split numerical and categorical inputs
X_cat = X[categorical.intersection(feature_columns)]
X_num = X[numerical.intersection(feature_columns)]

In [6]:
import torch

X_cat_tensor = torch.tensor(X_cat.values, dtype=torch.long)
X_num_tensor = torch.tensor(X_num.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

print(X_cat_tensor.shape, X_num_tensor.shape, y_tensor.shape)

torch.Size([7, 11]) torch.Size([7, 0]) torch.Size([7, 1])


In [7]:
X_cat_tensor

tensor([[0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 3],
        [0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 3],
        [0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 3],
        [0, 0, 0, 1, 3, 0, 0, 0, 3, 3, 1],
        [0, 0, 0, 1, 3, 0, 0, 0, 3, 3, 4],
        [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0],
        [0, 0, 0, 0, 2, 0, 2, 0, 2, 2, 2]])

# Step 3. Create model

In [8]:
from torch import nn

from tab_transformer_pytorch import TabTransformer

# Hyperparameters from the original paper
params = {
    'dim': 32,
    'depth': 6,
    'heads': 8,
    'attn_dropout': 0.1,
    'ff_dropout': 0.1,
    'mlp_hidden_mults': (4, 2),
    'mlp_act': nn.ReLU(),  # Can be reused, as this is stateless
    'use_shared_categ_embed': True,
}


model = TabTransformer(
    categories = X_num_categorical,
    num_continuous = len(numerical),
    dim_out = output_dimensions,
    # continuous_mean_std = mean_std,
    **params,
)

model

TypeError: TabTransformer.__init__() got an unexpected keyword argument 'use_shared_categ_embed'

# Step 4. Train the model

**Note** We will be doing the supervised learning instead of unsupervised.

In [None]:
model(X_cat_tensor, X_num_tensor)

In [None]:
model.use_shared_categ_embed

In [None]:
type(model)