## Feature extraction from TabularFM

This notebook demonstrates how to extract features from pretrained TabularFM in a zero-shot settings and use those features as additional information for regression or classification downstream tasks

### Feature extraction from a pretrained models

### Download models and datasets

In [1]:
# download model
! mkdir models
! wget great_gittables.zip "https://huggingface.co/lamthuy/great_gittables/resolve/main/great_gittables.zip" -P models/
! unzip -o models/great_gittables.zip -d models/

# # download dataset
# ! mkdir datasets
# ! wget gittables_v4.zip https://huggingface.co/datasets/lamthuy/TabularFM-GitTables/resolve/main/gittables_v4.zip -P datasets/
# ! unzip -o datasets/gittables_v4.zip -d datasets/

mkdir: cannot create directory ‘models’: File exists
--2024-08-13 09:09:48--  http://great_gittables.zip/
Resolving great_gittables.zip (great_gittables.zip)... failed: Name or service not known.
wget: unable to resolve host address ‘great_gittables.zip’
--2024-08-13 09:09:48--  https://huggingface.co/lamthuy/great_gittables/resolve/main/great_gittables.zip
Resolving huggingface.co (huggingface.co)... 3.162.58.119, 3.162.58.5, 3.162.58.97, ...
Connecting to huggingface.co (huggingface.co)|3.162.58.119|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.huggingface.co/repos/14/9a/149a66ebd949fc579b646b0c108db7fd95e6eaaa37a0c1ec7127f0f68a899cdb/13134d36742dc9182b400ea7d89aec125c994fe1c6b206ae571cd3fb39d8d030?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27great_gittables.zip%3B+filename%3D%22great_gittables.zip%22%3B&response-content-type=application%2Fzip&Expires=1723799388&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlT

### Load models

In [1]:
import os
import sys
sys.path.append('tabularfm')

from tabularfm.utils.cli import get_config, create_model, create_model_config
from tabularfm.utils.processing import load_model_weights
from transformers import AutoTokenizer

# configs
model_type = 'great'
config_path = 'configs/great.yaml'
# data_path =  # kaggle path
data_path = 'datasets/gittables_v4' # gittables
finetune_path = 'models/great_gittables'

# load and create config
configs = get_config(config_path)
model_config_finetune = create_model_config(data_path, configs, model_type, config_type = "finetune")
model_config_finetune['pretrained_llm'] = os.path.join(finetune_path, 'weights')

# load model
great_model = create_model(model_type, model_config_finetune)


  from .autonotebook import tqdm as notebook_tqdm


### Transform

In [2]:
import numpy as np

def get_tokens(great_ds):
    tokens = [great_ds_train[i]['input_ids'] for i in range(len(great_ds))]
    
    return tokens

def zero_pad_list_of_lists(list_of_lists, max_length):
    zero_padded_list = []
    for sublist in list_of_lists:
        padded_sublist = sublist + [0] * (max_length - len(sublist))
        zero_padded_list.append(padded_sublist)
    return zero_padded_list

def standardize_tokens(train_tokens, val_tokens):
    
    max_len = np.max([len(k) for k in train_tokens] + [len(k) for k in val_tokens])

    train_tokens = zero_pad_list_of_lists(train_tokens, max_len)
    train_tokens = np.array(train_tokens)
    train_tokens.shape

    val_tokens = zero_pad_list_of_lists(val_tokens, max_len)
    val_tokens = np.array(val_tokens)
    val_tokens.shape


    train_tokens = torch.from_numpy(train_tokens).long()
    val_tokens = torch.from_numpy(val_tokens).long()
    
    return train_tokens, val_tokens


In [3]:
import torch
import pandas as pd
from tabularfm.utils.processing import get_df
from sklearn.model_selection import train_test_split
from tabularfm.utils.processing import get_metadata, add_padding
from tabularfm.ctgan.data_transformer import DataTransformerV2
from sklearn.preprocessing import LabelEncoder

def transform(path, test_size=0.3, random_state=121):
    
    df = get_df(path)
    
    data = df.drop(columns=df.columns[-1])
    labels = df[[df.columns[-1]]]

    categorical_columns = data.select_dtypes(include=['object', 'category']).columns

    ori_data = data

    # transform original data
    for col in categorical_columns:
        ori_data[col] = LabelEncoder().fit_transform(ori_data[col])

    df_train_data, df_val_data, train_labels, val_labels = train_test_split(data, labels, test_size=test_size, random_state=random_state)
    ori_train_data, ori_val_data, _, _ = train_test_split(ori_data, labels, test_size=test_size, random_state=random_state)
    
    return (ori_train_data, ori_val_data), (df_train_data, df_val_data),(train_labels, val_labels)

def prepare_data(ori_train_data, ori_val_data, train_embedding, val_embedding):
    # original data
    ori_train_data = torch.from_numpy(ori_train_data.values).float()
    ori_val_data = torch.from_numpy(ori_val_data.values).float()

    # concat with embedding extracted from preptrained model
    concat_train_data = torch.concat([ori_train_data, train_embedding], dim=1).float()
    concat_val_data = torch.concat([ori_val_data, val_embedding], dim=1).float()
    
    return (ori_train_data, ori_val_data), (concat_train_data, concat_val_data)

def standard_scale(data):
    return (data - data.mean()) / data.std()

def extract_great_based(great_model, train_tokens, val_tokens, mode='cls'):
    train_outputs = great_model.model(train_tokens)
    val_outputs = great_model.model(val_tokens)
    
    if mode == 'cls':
        return train_outputs.logits[:, 0].detach().cpu(), val_outputs.logits[:, 0].detach().cpu()



In [4]:
from sklearn.preprocessing import StandardScaler
from tabularfm.be_great.great_dataset import GReaTDataset
from tabularfm.utils.processing import get_df
from sklearn.model_selection import train_test_split

path = 'downstream_datasets/processed_dataset/-social-networking-ads'

# ori_data, tabfm_data, labels, transformer = transform(path, input_dim=stvae_model.input_dim, cat_encoder='label', test_size=0.3, random_state=121)
ori_data, df_data, labels = transform(path, test_size=0.3, random_state=121)

ori_train_data, ori_val_data = ori_data
df_train_data, df_val_data = df_data
great_model.init_column_info(df_train_data)

great_ds_train = GReaTDataset.from_pandas(df_train_data)
great_ds_val = GReaTDataset.from_pandas(df_val_data)

great_ds_train.set_tokenizer(great_model.tokenizer)
great_ds_val.set_tokenizer(great_model.tokenizer)

train_tokens = get_tokens(great_ds_train)
val_tokens = get_tokens(great_ds_val)

train_tokens, val_tokens = standardize_tokens(train_tokens, val_tokens)

train_embedding, val_embedding = extract_great_based(great_model, train_tokens, val_tokens)


# standard scaling
# ori_train_data = standard_scale(ori_data[0])
# ori_val_data = standard_scale(ori_data[1])

(ori_train_data, ori_val_data), (concat_train_data, concat_val_data) = prepare_data(ori_train_data, ori_val_data, train_embedding, val_embedding)


### Classification

In [5]:
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score
import lightgbm as lgb
import xgboost as xgb

def evaluate_cls_lgb(train_data, train_labels, val_data, val_labels):
    model = lgb.LGBMClassifier()
    
    # training
    model.fit(train_data, train_labels)
    
    # predicting
    y_pred = model.predict(val_data)
    accuracy = accuracy_score(val_labels, y_pred)
    f1 = f1_score(val_labels, y_pred, average='weighted')
    
    return accuracy, f1

def evaluate_cls_xgb(train_data, train_labels, val_data, val_labels):
    model = xgb.XGBClassifier()
    
    # training
    model.fit(train_data, train_labels)

    # Predict and evaluate
    y_pred = model.predict(val_data)
    accuracy = accuracy_score(val_labels, y_pred)
    f1 = f1_score(val_labels, y_pred, average='weighted')
    
    return accuracy, f1


In [6]:
train_labels, val_labels = labels[0], labels[1]

ori_acc, ori_f1 = evaluate_cls_lgb(ori_train_data, train_labels, ori_val_data, val_labels)
concat_acc, concat_f1 = evaluate_cls_lgb(concat_train_data, train_labels, concat_val_data, val_labels)
emb_acc, emb_f1 = evaluate_cls_lgb(train_embedding, train_labels, val_embedding, val_labels)

print('LGB')
print('ori results: ', ori_acc, ori_f1)
print('concat results: ', concat_acc, concat_f1)
print('emb results: ', emb_acc, emb_f1)


ori_acc, ori_f1 = evaluate_cls_xgb(ori_train_data, train_labels, ori_val_data, val_labels)
concat_acc, concat_f1 = evaluate_cls_xgb(concat_train_data, train_labels, concat_val_data, val_labels)
emb_acc, emb_f1 = evaluate_cls_xgb(train_embedding, train_labels, val_embedding, val_labels)

print('XGB')
print('ori results: ', ori_acc, ori_f1)
print('concat results: ', concat_acc, concat_f1)
print('emb results: ', emb_acc, emb_f1)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 97, number of negative: 183
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000050 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 280, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.346429 -> initscore=-0.634775
[LightGBM] [Info] Start training from score -0.634775
[LightGBM] [Info] Number of positive: 97, number of negative: 183
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.331901 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 251395
[LightGBM] [Info] Number of data points in the train set: 280, number of used features: 50260
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.346429 -> initscore=-0.634775
[LightGBM] [Info] Start training from score -0.634775


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 97, number of negative: 183
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.334445 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 251285
[LightGBM] [Info] Number of data points in the train set: 280, number of used features: 50257
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.346429 -> initscore=-0.634775
[LightGBM] [Info] Start training from score -0.634775
LGB
ori results:  0.8916666666666667 0.89032379235219
concat results:  0.9083333333333333 0.9076989855395986
emb results:  0.6166666666666667 0.470446735395189
XGB
ori results:  0.9083333333333333 0.9076989855395986
concat results:  0.9333333333333333 0.9333333333333333
emb results:  0.6166666666666667 0.470446735395189


### Regression

In [15]:
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb
import xgboost as xgb

def evaluate_reg_lgb(train_data, train_labels, val_data, val_labels):
    model = lgb.LGBMRegressor()
    
    # training
    model.fit(train_data, train_labels)
    
    # predicting
    y_pred = model.predict(val_data)
    mse = mean_squared_error(val_labels, y_pred)
    r2 = r2_score(val_labels, y_pred)
    
    return mse, r2

def evaluate_reg_xgb(train_data, train_labels, val_data, val_labels):
    model = xgb.XGBRegressor()
    
    # training
    model.fit(train_data, train_labels)

    # Predict and evaluate
    y_pred = model.predict(val_data)
    mse = mean_squared_error(val_labels, y_pred)
    r2 = r2_score(val_labels, y_pred)
    
    return mse, r2


In [16]:
train_labels, val_labels = labels[0], labels[1]

ori_mse, ori_r2 = evaluate_reg_lgb(ori_train_data, train_labels, ori_val_data, val_labels)
concat_mse, concat_r2 = evaluate_reg_lgb(concat_train_data, train_labels, concat_val_data, val_labels)
emb_mse, emb_r2 = evaluate_reg_lgb(train_embedding, train_labels, val_embedding, val_labels)

print('LGB')
print('ori results: ', ori_mse, ori_r2)
print('concat results: ', concat_mse, concat_r2)
print('emb results: ', emb_mse, emb_r2)

ori_mse, ori_r2 = evaluate_reg_xgb(ori_train_data, train_labels, ori_val_data, val_labels)
concat_mse, concat_r2 = evaluate_reg_xgb(concat_train_data, train_labels, concat_val_data, val_labels)
emb_mse, emb_r2 = evaluate_reg_xgb(train_embedding, train_labels, val_embedding, val_labels)

print('XGB')
print('ori results: ', ori_mse, ori_r2)
print('concat results: ', concat_mse, concat_r2)
print('emb results: ', emb_mse, emb_r2)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000076 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 280, number of used features: 3
[LightGBM] [Info] Start training from score 0.346429
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000274 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12182
[LightGBM] [Info] Number of data points in the train set: 280, number of used features: 131
[LightGBM] [Info] Start training from score 0.346429
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000246 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12072
[LightGBM] [Info] Number of data points in the train set: 280, number of used features: 128
[LightGBM] [Info] Start trainin