## Feature extraction from TabularFM

This notebook demonstrates how to extract features from pretrained TabularFM in a zero-shot settings and use those features as additional information for regression or classification downstream tasks

### Feature extraction from a pretrained models

### Download models and datasets

In [2]:
# download model
! mkdir models
! wget stvae_gittables.zip "https://huggingface.co/lamthuy/stvae_gittables/resolve/main/stvae_gittables.zip" -P models/
! unzip -o models/stvae_gittables.zip -d models/

# download dataset
! mkdir datasets
! wget gittables_v4.zip https://huggingface.co/datasets/lamthuy/TabularFM-GitTables/resolve/main/gittables_v4.zip -P datasets/
! unzip -o datasets/gittables_v4.zip -d datasets/

mkdir: cannot create directory ‘models’: File exists
--2024-08-12 09:07:39--  http://stvae_gittables.zip/
Resolving stvae_gittables.zip (stvae_gittables.zip)... failed: Name or service not known.
wget: unable to resolve host address ‘stvae_gittables.zip’
--2024-08-12 09:07:39--  https://huggingface.co/lamthuy/stvae_gittables/resolve/main/stvae_gittables.zip
Resolving huggingface.co (huggingface.co)... 3.162.58.97, 3.162.58.119, 3.162.58.5, ...
Connecting to huggingface.co (huggingface.co)|3.162.58.97|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.huggingface.co/repos/77/0d/770d0fc2aee89278e27c59c53914d40b5573008c4690325b85b83733dcaf7337/497a62cb326d130677e7dbc05e2ce005e466bc11a856403f75ccbed8e93b7dc8?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27stvae_gittables.zip%3B+filename%3D%22stvae_gittables.zip%22%3B&response-content-type=application%2Fzip&Expires=1723712859&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTG

### Load models

In [1]:
import sys
sys.path.append('tabularfm')

from tabularfm.utils.cli import get_config, create_model, create_model_config
from tabularfm.utils.processing import load_model_weights

# configs
model_type = 'stvae'
config_path = 'configs/stvae.yaml'
# data_path =  # kaggle path
data_path = 'datasets/gittables_v4' # gittables
finetune_path = 'models/stvae_gittables/'

# load and create config
configs = get_config(config_path)
model_config_finetune = create_model_config(data_path, configs, model_type, config_type = "finetune")

# load model
stvae_model = create_model(model_type, model_config_finetune)
stvae_model = load_model_weights(model_type, stvae_model, finetune_path, suffix=None)


  from .autonotebook import tqdm as notebook_tqdm
  model.encoder.load_state_dict(torch.load(os.path.join(path, 'encoder_weights.pt'), map_location=torch.device('cpu')))
  model.decoder.load_state_dict(torch.load(os.path.join(path, 'decoder_weights.pt'), map_location=torch.device('cpu')))


In [3]:
# # download downstream datasets
# ! mkdir downstream_datasets
# ! wget gittables_v4.zip https://huggingface.co/datasets/lamthuy/TabularFM-GitTables/resolve/main/gittables_v4.zip -P downstream_datasets/
# ! unzip -o downstream_datasets/gittables_v4.zip -d downstream_datasets/

### Transform

In [2]:
import torch
import pandas as pd
from tabularfm.utils.processing import get_df
from sklearn.model_selection import train_test_split
from tabularfm.utils.processing import get_metadata, add_padding
from tabularfm.ctgan.data_transformer import DataTransformerV2
from sklearn.preprocessing import LabelEncoder

def transform(path, input_dim, cat_encoder='label', test_size=0.3, random_state=121):
    
    df = get_df(path)
    
    data = df.drop(columns=df.columns[-1])
    labels = df[[df.columns[-1]]]

    categorical_columns = data.select_dtypes(include=['object', 'category']).columns

    ori_data = data

    # transform original data
    for col in categorical_columns:
        ori_data[col] = LabelEncoder().fit_transform(ori_data[col])

    transformer = DataTransformerV2()
    metadata = get_metadata(path)
    
    discrete_cols = [k for k,v in metadata['columns'].items() if v['sdtype'] != 'numerical']
    
    transformer.fit(data, discrete_columns=discrete_cols)

    tabfm_data = transformer.transform(data)
    
    tabfm_train_data, tabfm_val_data, train_labels, val_labels = train_test_split(tabfm_data, labels, test_size=0.3, random_state=121)
    train_data, val_data, _, _ = train_test_split(ori_data, labels, test_size=0.3, random_state=121)

    tabfm_train_data = add_padding(tabfm_train_data, max_dim=input_dim)
    tabfm_val_data = add_padding(tabfm_val_data, max_dim=input_dim)
    
    return (train_data, val_data), (tabfm_train_data, tabfm_val_data), (train_labels, val_labels), transformer

def extract_tvae_based(model, train_data, val_data):
    
    model.encoder.eval()
    train_embedding = model.encoder.seq(torch.from_numpy(train_data).float()).detach().cpu()
    val_embedding = model.encoder.seq(torch.from_numpy(val_data).float()).detach().cpu()
    
    return (train_embedding, val_embedding)

def prepare_data(ori_train_data, ori_val_data, train_embedding, val_embedding):
    # original data
    ori_train_data = torch.from_numpy(ori_train_data.values).float()
    ori_val_data = torch.from_numpy(ori_val_data.values).float()

    # concat with embedding extracted from preptrained model
    concat_train_data = torch.concat([ori_train_data, train_embedding], dim=1).float()
    concat_val_data = torch.concat([ori_val_data, val_embedding], dim=1).float()
    
    return (ori_train_data, ori_val_data), (concat_train_data, concat_val_data)

def standard_scale(data):
    return (data - data.mean()) / data.std()

In [3]:
from sklearn.preprocessing import StandardScaler

# path to csv dataset folder
path = 'downstream_datasets/processed_dataset/-social-networking-ads'

# ori_data, tabfm_data, labels, transformer = transform(path, input_dim=stvae_model.input_dim, cat_encoder='label', test_size=0.3, random_state=121)
ori_data, tabfm_data, labels, transformer = transform(path, input_dim=stvae_model.input_dim, cat_encoder='label', test_size=0.3, random_state=121)
train_embedding, val_embedding = extract_tvae_based(stvae_model, tabfm_data[0], tabfm_data[1])

# # standard scaling
# ori_train_data = standard_scale(ori_data[0])
# ori_val_data = standard_scale(ori_data[1])

# (ori_train_data, ori_val_data), (concat_train_data, concat_val_data) = prepare_data(ori_train_data, ori_val_data, train_embedding, val_embedding)


### Classification

In [4]:
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score
import lightgbm as lgb
import xgboost as xgb

def evaluate_cls_lgb(train_data, train_labels, val_data, val_labels):
    model = lgb.LGBMClassifier()
    
    # training
    model.fit(train_data, train_labels)
    
    # predicting
    y_pred = model.predict(val_data)
    accuracy = accuracy_score(val_labels, y_pred)
    f1 = f1_score(val_labels, y_pred, average='weighted')
    
    return accuracy, f1

def evaluate_cls_xgb(train_data, train_labels, val_data, val_labels):
    model = xgb.XGBClassifier()
    
    # training
    model.fit(train_data, train_labels)

    # Predict and evaluate
    y_pred = model.predict(val_data)
    accuracy = accuracy_score(val_labels, y_pred)
    f1 = f1_score(val_labels, y_pred, average='weighted')
    
    return accuracy, f1


In [5]:
train_labels, val_labels = labels[0], labels[1]

ori_acc, ori_f1 = evaluate_cls_lgb(ori_train_data, train_labels, ori_val_data, val_labels)
concat_acc, concat_f1 = evaluate_cls_lgb(concat_train_data, train_labels, concat_val_data, val_labels)
emb_acc, emb_f1 = evaluate_cls_lgb(train_embedding, train_labels, val_embedding, val_labels)

print('LGB')
print('ori results: ', ori_acc, ori_f1)
print('concat results: ', concat_acc, concat_f1)
print('emb results: ', emb_acc, emb_f1)


ori_acc, ori_f1 = evaluate_cls_xgb(ori_train_data, train_labels, ori_val_data, val_labels)
concat_acc, concat_f1 = evaluate_cls_xgb(concat_train_data, train_labels, concat_val_data, val_labels)
emb_acc, emb_f1 = evaluate_cls_xgb(train_embedding, train_labels, val_embedding, val_labels)

print('XGB')
print('ori results: ', ori_acc, ori_f1)
print('concat results: ', concat_acc, concat_f1)
print('emb results: ', emb_acc, emb_f1)


[LightGBM] [Info] Number of positive: 97, number of negative: 183
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 111
[LightGBM] [Info] Number of data points in the train set: 280, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.346429 -> initscore=-0.634775
[LightGBM] [Info] Start training from score -0.634775
[LightGBM] [Info] Number of positive: 97, number of negative: 183
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000102 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3949
[LightGBM] [Info] Number of data points in the train set: 280, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.346429 -> initscore=-0.634775
[LightGBM] [Info] Start training from score -0.634775
[LightGBM] [Info] Number of 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


### Regression

In [15]:
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb
import xgboost as xgb

def evaluate_reg_lgb(train_data, train_labels, val_data, val_labels):
    model = lgb.LGBMRegressor()
    
    # training
    model.fit(train_data, train_labels)
    
    # predicting
    y_pred = model.predict(val_data)
    mse = mean_squared_error(val_labels, y_pred)
    r2 = r2_score(val_labels, y_pred)
    
    return mse, r2

def evaluate_reg_xgb(train_data, train_labels, val_data, val_labels):
    model = xgb.XGBRegressor()
    
    # training
    model.fit(train_data, train_labels)

    # Predict and evaluate
    y_pred = model.predict(val_data)
    mse = mean_squared_error(val_labels, y_pred)
    r2 = r2_score(val_labels, y_pred)
    
    return mse, r2


In [16]:
train_labels, val_labels = labels[0], labels[1]

ori_mse, ori_r2 = evaluate_reg_lgb(ori_train_data, train_labels, ori_val_data, val_labels)
concat_mse, concat_r2 = evaluate_reg_lgb(concat_train_data, train_labels, concat_val_data, val_labels)
emb_mse, emb_r2 = evaluate_reg_lgb(train_embedding, train_labels, val_embedding, val_labels)

print('LGB')
print('ori results: ', ori_mse, ori_r2)
print('concat results: ', concat_mse, concat_r2)
print('emb results: ', emb_mse, emb_r2)

ori_mse, ori_r2 = evaluate_reg_xgb(ori_train_data, train_labels, ori_val_data, val_labels)
concat_mse, concat_r2 = evaluate_reg_xgb(concat_train_data, train_labels, concat_val_data, val_labels)
emb_mse, emb_r2 = evaluate_reg_xgb(train_embedding, train_labels, val_embedding, val_labels)

print('XGB')
print('ori results: ', ori_mse, ori_r2)
print('concat results: ', concat_mse, concat_r2)
print('emb results: ', emb_mse, emb_r2)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000076 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 280, number of used features: 3
[LightGBM] [Info] Start training from score 0.346429
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000274 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12182
[LightGBM] [Info] Number of data points in the train set: 280, number of used features: 131
[LightGBM] [Info] Start training from score 0.346429
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000246 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12072
[LightGBM] [Info] Number of data points in the train set: 280, number of used features: 128
[LightGBM] [Info] Start trainin

### Regression or Classification

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import lightgbm as lgb
import numpy as np

In [None]:
# Load your data
data = pd.read_csv('data.csv')

# Assuming the last column is the target variable
target_column = data.columns[-1]

# Separate features and target
X = data.drop(columns=[target_column])
y = data[target_column]

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object', 'category']).columns

# Handle categorical columns differently for XGBoost and LightGBM
# Encode categorical columns using Label Encoding for XGBoost
for col in categorical_columns:
    X[col] = LabelEncoder().fit_transform(X[col])

# Split the data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Determine the problem type: classification or regression
if y.nunique() > 20 and y.dtype in [np.float64, np.float32, np.int64, np.int32]:
    problem_type = 'regression'
else:
    problem_type = 'classification'

# Train a model based on the problem type
if problem_type == 'classification':
    # Use LightGBM for classification, directly handling categorical features
    model = lgb.LGBMClassifier()
    
    # For LightGBM, specify the categorical features
    model.fit(X_train, y_train, categorical_feature=categorical_columns.tolist())
    
    # Predict and evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f'Classification Accuracy: {accuracy:.4f}')
    print(f'Classification F1 Score: {f1:.4f}')

else:
    # Use XGBoost for regression, where categorical features are label encoded
    model = xgb.XGBRegressor()
    model.fit(X_train, y_train)
    
    # Predict and evaluate
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f'Regression Mean Squared Error: {mse:.4f}')
    print(f'Regression R^2 Score: {r2:.4f}')