The detailed code explanation for this article is available at the following link:

https://www.daniweb.com/programming/computer-science/tutorials/542075/tabular-data-classification-with-hugging-face-meta-tree-transformer

For my other articles on Daniweb.com, please see this link:

https://www.daniweb.com/members/1235222/usmanmalik57

## Installing and Importing Required Libraries

In [None]:
!pip install metatreelib
!pip install --upgrade scikit-learn
!pip install imodels

Collecting metatreelib
  Downloading metatreelib-0.1.0-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.34.0 (from metatreelib)
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets (from metatreelib)
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate (from metatreelib)
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate (from metatreelib)
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━

In [None]:
from metatree.model_metatree import LlamaForMetaTree as MetaTree
from metatree.decision_tree_class import DecisionTree, DecisionTreeForest
from metatree.run_train import preprocess_dimension_patch
from transformers import AutoConfig
from sklearn.metrics import accuracy_score
import imodels # pip install imodels
import sklearn
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import random

  and should_run_async(code)


## Loading and Preprocessing the Dataset

In [None]:

# Read the banknote authentication CSV into a DataFrame
file_path = "/content/BankNote_Authentication.csv"  # location of the CSV file
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top five records
df.head(n=5)


  and should_run_async(code)


Unnamed: 0,variance,skewness,curtosis,entropy,class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [None]:
# Separate the label column from the predictor columns
y = df['class']
X = df.drop(columns=['class'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

  and should_run_async(code)


## DataLoader for Batching

In [None]:
class TabularDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label row.

    Args:
        features: Indexable, sized collection of per-sample feature vectors
            (for example a 2-D NumPy array).
        labels: Indexable, sized collection of per-sample labels, aligned
            position-by-position with ``features``.

    Raises:
        ValueError: If ``features`` and ``labels`` do not have the same length.
    """

    def __init__(self, features, labels):
        # Fail fast on misaligned inputs: __len__ only looks at the features, so a
        # length mismatch would otherwise surface later as an index error or as
        # silently ignored labels.
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(feature, label)`` pair at ``idx`` as float32 tensors."""
        feature = self.features[idx]
        label = self.labels[idx]
        return torch.tensor(feature, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)

# Convert the training split to NumPy: raw feature rows plus one-hot encoded labels
train_features = train_X.values
train_labels = torch.nn.functional.one_hot(torch.tensor(train_y.values), num_classes=2).float().numpy()

# Create Dataset
train_dataset = TabularDataset(train_features, train_labels)

# Parameters
batch_size = 256

# Seed the shuffle so the batches are drawn in the same order every time this
# cell and the loop that consumes the loader are run from scratch
# (42 mirrors the random_state used for the train/test split).
shuffle_generator = torch.Generator().manual_seed(42)

# Create DataLoader
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=shuffle_generator,
)

  and should_run_async(code)


## Setting Up the Meta Tree Transformer Model

In [None]:

# Initialize Model
model_name_or_path = "yzhuang/MetaTree"

config = AutoConfig.from_pretrained(model_name_or_path)
# Override config parameters to match your dataset
config.n_feature = train_X.shape[1]
config.n_class = 2

# With the overridden sizes, some layers no longer match the checkpoint shapes and
# are newly initialized on load (see the warning this cell prints). Seed PyTorch's
# global RNG first so that this initialization is the same on every run.
torch.manual_seed(42)

model = MetaTree.from_pretrained(
    model_name_or_path,
    config=config,
    ignore_mismatched_sizes=True,  # load the checkpoint despite the shape mismatches
)

# Container that will collect the generated decision trees
decision_tree_forest = DecisionTreeForest()

# Set the depth of the model
model.depth = 2


  and should_run_async(code)
Some weights of LlamaForMetaTree were not initialized from the model checkpoint at yzhuang/MetaTree and are newly initialized because the shapes did not match:
- class_proj.weight: found shape torch.Size([768, 10]) in the checkpoint and torch.Size([768, 2]) in the model instantiated
- emb_bias: found shape torch.Size([10, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training the Model with Batches

In [None]:
# Build one decision tree per mini-batch and collect the trees in the forest
for batch_features, batch_labels in train_loader:
    # Package the mini-batch as a dict and run it through preprocess_dimension_patch
    batch = preprocess_dimension_patch(
        {"input_x": batch_features, "input_y": batch_labels, "input_y_clean": batch_labels},
        n_feature=train_X.shape[1],
        n_class=2,
    )

    # Ask the model to generate a tree of the configured depth for this batch
    outputs = model.generate_decision_tree(batch['input_x'], batch['input_y'], depth=model.depth)

    # Wrap the model outputs in a DecisionTree and add it to the forest
    tree = DecisionTree(
        auto_dims=outputs.metatree_dimensions,
        auto_thresholds=outputs.tentative_splits,
        input_x=batch['input_x'],
        input_y=batch['input_y'],
        depth=model.depth,
    )
    decision_tree_forest.add_tree(tree)

    # Show the argmax of each metatree_dimensions entry and the tentative splits
    split_features = [x.argmax(dim=-1) for x in outputs.metatree_dimensions]
    print("Decision Tree Features: ", split_features)
    print("Decision Tree Thresholds: ", outputs.tentative_splits)



  and should_run_async(code)


Decision Tree Features:  [tensor([0]), tensor([1]), tensor([1])]
Decision Tree Thresholds:  [tensor([[1.0725]]), tensor([[6.7738]]), tensor([[-3.2239]])]
Decision Tree Features:  [tensor([0]), tensor([0]), tensor([1])]
Decision Tree Thresholds:  [tensor([[1.6636]]), tensor([[0.1303]]), tensor([[-2.3517]])]
Decision Tree Features:  [tensor([0]), tensor([1]), tensor([1])]
Decision Tree Thresholds:  [tensor([[0.6576]]), tensor([[4.9849]]), tensor([[0.3011]])]
Decision Tree Features:  [tensor([0]), tensor([1]), tensor([1])]
Decision Tree Thresholds:  [tensor([[0.9736]]), tensor([[6.0237]]), tensor([[-2.4676]])]
Decision Tree Features:  [tensor([0]), tensor([3]), tensor([1])]
Decision Tree Thresholds:  [tensor([[-0.2199]]), tensor([[-0.3073]]), tensor([[3.3367]])]


## Evaluating the Model

In [None]:
# Run the held-out feature rows through the forest
test_X_tensor = torch.tensor(test_X.to_numpy(), dtype=torch.float32)
tree_pred = decision_tree_forest.predict(test_X_tensor)

# Take the argmax over the last dimension, drop singleton dimensions, convert to NumPy
tree_pred = torch.argmax(tree_pred, dim=-1).squeeze().numpy()

# Score the predictions against the true test labels
accuracy = accuracy_score(y_true=test_y, y_pred=tree_pred)
print("MetaTree Test Accuracy: ", accuracy)


MetaTree Test Accuracy:  0.8727272727272727


  and should_run_async(code)
