# Tabular data classification LightAutoML vs TabPFN

Last week a new method TabPFN was published. Authors made a lot of hyped claims about TabPFN including that it may 'revolutionize data science'

A few hours later prominent researchers promptly debunked these claims stating that on tiny datasets quality of TabPFN is comparable to that of  LightGBM, but at the cost of >> 1000x compute for inference. On larger datasets TabPFN is significantly worse.

In this notebook we take TabPFN for a simple ride in the garage forcourt to see if it can even outperform XGBoost.

In [4]:
# Install TabPFN

!pip install tabpfn

Collecting tabpfn
  Downloading tabpfn-0.1.3-py3-none-any.whl (136 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m136.1/136.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting configspace>=0.4.21
  Downloading ConfigSpace-0.6.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting gpytorch>=1.5.0
  Downloading gpytorch-1.8.1-py2.py3-none-any.whl (361 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m361.8/361.8 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openml>=0.12.2
  Downloading openml-0.12.2.tar.gz (119 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting liac-arff>=2.4.0
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing

In [5]:
!pip install categorical_encoding

Collecting categorical_encoding
  Downloading categorical_encoding-0.4.1-py3-none-any.whl (19 kB)
Collecting category-encoders==2.0.0
  Downloading category_encoders-2.0.0-py2.py3-none-any.whl (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.8/87.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category-encoders, categorical_encoding
  Attempting uninstall: category-encoders
    Found existing installation: category-encoders 2.5.1.post0
    Uninstalling category-encoders-2.5.1.post0:
      Successfully uninstalled category-encoders-2.5.1.post0
Successfully installed categorical_encoding-0.4.1 category-encoders-2.0.0
[0m

In [6]:
import os
import time
import re

import sys

import numpy as np
import pandas as pd

import torch
from pathlib import Path

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from category_encoders import *

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from pathlib import Path

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

from tabpfn.scripts.transformer_prediction_interface import TabPFNClassifier
from tabpfn.scripts.decision_boundary import DecisionBoundaryDisplay

In [7]:
%%time
train_data = pd.read_csv('../input/titanic/train.csv')
train_data.head()

CPU times: user 6.16 ms, sys: 1.33 ms, total: 7.49 ms
Wall time: 14.5 ms


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
test_data = pd.read_csv('../input/titanic/test.csv')
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [9]:
submission = pd.read_csv('../input/titanic/gender_submission.csv')
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


### Add new features

In [10]:
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

def create_extra_features(data):
    data['Ticket_type'] = data['Ticket'].map(lambda x: x[0:3])
    data['Name_Words_Count'] = data['Name'].map(lambda x: len(x.split()))
    data['Has_Cabin'] = data["Cabin"].map(lambda x: 1 - int(type(x) == float))
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    
    data['CategoricalFare'] = pd.qcut(data['Fare'], 5).astype(str)
    data['CategoricalAge'] = pd.cut(data['Age'], 5).astype(str)
    
    data['Title'] = data['Name'].apply(get_title).replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    data['Title'] = data['Title'].replace('Mlle', 'Miss')
    data['Title'] = data['Title'].replace('Ms', 'Miss')
    data['Title'] = data['Title'].replace('Mme', 'Mrs')
    data['Title'] = data['Title'].map({"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}).fillna(0)
    return data

train_data = create_extra_features(train_data)
test_data = create_extra_features(test_data)

In [11]:
train_data.isnull().sum()

PassengerId           0
Survived              0
Pclass                0
Name                  0
Sex                   0
Age                 177
SibSp                 0
Parch                 0
Ticket                0
Fare                  0
Cabin               687
Embarked              2
Ticket_type           0
Name_Words_Count      0
Has_Cabin             0
FamilySize            0
CategoricalFare       0
CategoricalAge        0
Title                 0
dtype: int64

In [12]:
test_data.isnull().sum()

PassengerId           0
Pclass                0
Name                  0
Sex                   0
Age                  86
SibSp                 0
Parch                 0
Ticket                0
Fare                  1
Cabin               327
Embarked              0
Ticket_type           0
Name_Words_Count      0
Has_Cabin             0
FamilySize            0
CategoricalFare       0
CategoricalAge        0
Title                 0
dtype: int64

In [13]:
train_data.drop(['Age', 'Cabin','PassengerId', 'Name','Ticket'], axis = 1, inplace = True)
test_data.drop(['Age', 'Cabin','PassengerId', 'Name','Ticket'], axis = 1, inplace = True)

In [14]:
train_data.isnull().sum()

Survived            0
Pclass              0
Sex                 0
SibSp               0
Parch               0
Fare                0
Embarked            2
Ticket_type         0
Name_Words_Count    0
Has_Cabin           0
FamilySize          0
CategoricalFare     0
CategoricalAge      0
Title               0
dtype: int64

In [15]:
train_data.dropna(inplace = True)

In [16]:
test_data.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked,Ticket_type,Name_Words_Count,Has_Cabin,FamilySize,CategoricalFare,CategoricalAge,Title
0,3,male,0,0,7.8292,Q,330,3,0,1,"(7.796, 11.025]","(30.502, 45.668]",1
1,3,female,1,0,7.0,S,363,5,0,2,"(-0.001, 7.796]","(45.668, 60.834]",3
2,2,male,0,0,9.6875,Q,240,4,0,1,"(7.796, 11.025]","(60.834, 76.0]",1
3,3,male,0,0,8.6625,S,315,3,0,1,"(7.796, 11.025]","(15.336, 30.502]",1
4,3,female,1,1,12.2875,S,310,6,0,3,"(11.025, 21.438]","(15.336, 30.502]",3


In [17]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Survived          889 non-null    int64  
 1   Pclass            889 non-null    int64  
 2   Sex               889 non-null    object 
 3   SibSp             889 non-null    int64  
 4   Parch             889 non-null    int64  
 5   Fare              889 non-null    float64
 6   Embarked          889 non-null    object 
 7   Ticket_type       889 non-null    object 
 8   Name_Words_Count  889 non-null    int64  
 9   Has_Cabin         889 non-null    int64  
 10  FamilySize        889 non-null    int64  
 11  CategoricalFare   889 non-null    object 
 12  CategoricalAge    889 non-null    object 
 13  Title             889 non-null    int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 104.2+ KB


In [18]:
cat_columns = train_data.select_dtypes(include=['object']).columns.to_list()
cat_columns

['Sex', 'Embarked', 'Ticket_type', 'CategoricalFare', 'CategoricalAge']

In [19]:
for col in cat_columns:
    te=TargetEncoder()
    train_data[col] = te.fit_transform(train_data[col], train_data['Survived'])
    test_data[col] = te.transform(test_data[col])

  elif pd.api.types.is_categorical(cols):


In [20]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked,Ticket_type,Name_Words_Count,Has_Cabin,FamilySize,CategoricalFare,CategoricalAge,Title
0,0,3,0.188908,1,0,7.25,0.336957,0.117647,4,0,2,0.217877,0.369942,1
1,1,1,0.740385,1,0,71.2833,0.553571,0.65,7,1,2,0.637931,0.40107,3
2,1,3,0.740385,0,0,7.925,0.336957,0.444444,3,0,1,0.201087,0.369942,2
3,1,1,0.740385,1,0,53.1,0.336957,0.444444,7,1,2,0.637931,0.40107,3
4,0,3,0.188908,0,0,8.05,0.336957,0.382452,4,0,1,0.201087,0.40107,1


In [22]:
test_data.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked,Ticket_type,Name_Words_Count,Has_Cabin,FamilySize,CategoricalFare,CategoricalAge,Title
0,3,0.188908,0,0,7.8292,0.38961,0.636352,3,0,1,0.382452,0.382452,1
1,3,0.740385,1,0,7.0,0.336957,0.399684,5,0,2,0.382452,0.382452,3
2,2,0.188908,0,0,9.6875,0.38961,0.382452,4,0,1,0.382452,0.382452,1
3,3,0.188908,0,0,8.6625,0.336957,0.142858,3,0,1,0.382452,0.382452,1
4,3,0.740385,1,1,12.2875,0.336957,0.263158,6,0,3,0.382452,0.382452,3


In [23]:
# create training and test data
y_train = train_data['Survived'].values

x_train = train_data.drop('Survived', axis = 1).values
x_test = test_data.values

**TabPFN**

In [24]:
classifier = TabPFNClassifier(device='cuda')

We have to download the TabPFN, as there is no checkpoint at  /opt/conda/lib/python3.7/site-packages/tabpfn/models_diff/prior_diff_real_checkpoint_n_0_epoch_100.cpkt
It has about 100MB, so this might take a moment.
Loading models_diff/prior_diff_real_checkpoint_n_0_epoch_100.cpkt
Loading....
Using style prior: True
MODEL BUILDER <module 'tabpfn.priors.differentiable_prior' from '/opt/conda/lib/python3.7/site-packages/tabpfn/priors/differentiable_prior.py'> <function get_model.<locals>.make_get_batch.<locals>.new_get_batch at 0x7f7e1eda5440>
Using cuda device
init dist
Not using distributed
DataLoader.__dict__ {'num_steps': 8192, 'get_batch_kwargs': {'batch_size': 1, 'eval_pos_seq_len_sampler': <function train.<locals>.eval_pos_seq_len_sampler at 0x7f7e1eda5a70>, 'seq_len_maximum': 10, 'device': 'cuda', 'num_features': 100, 'hyperparameters': {'lr': 0.0001, 'dropout': 0.0, 'emsize': 512, 'batch_size': 1, 'nlayers': 12, 'num_features': 100, 'nhead': 4, 'nhid_factor': 2, 'bptt': 10, 'eval

In [25]:
%%time
classifier.fit(x_train, y_train)
y_pred, p_pred = classifier.predict(x_test, return_winning_probability=True)

CPU times: user 650 ms, sys: 8.04 ms, total: 658 ms
Wall time: 710 ms


In [26]:
y_pred

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [28]:
submission['Survived'] = y_pred
submission.to_csv('TabPFN_categorical.csv', index = False)