In [2]:
# !pip install torch torchvision torchaudio
# !pip install pytorch_tabular[all]

## Prepare utility functions
from sklearn.datasets import make_classification
def make_mixed_classification(n_samples, n_features, n_categories):
    """Build a synthetic classification dataset with numeric and categorical columns.

    Features and labels come from ``sklearn.datasets.make_classification``
    (fixed ``random_state=42``, ``n_informative=5``). ``n_categories`` distinct
    feature columns are then picked with the global ``random`` module state and
    replaced by their quartile bin codes (``pd.qcut(..., q=4)`` -> 0..3), which
    turns them into ordinal "categorical" columns. The codes are written back
    into the same array as the continuous features.

    Args:
        n_samples: Number of rows to generate.
        n_features: Total number of feature columns.
        n_categories: Number of feature columns to convert to categorical.
            Values outside ``[0, n_features]`` are clamped to that range.

    Returns:
        A tuple ``(data, cat_col_names, num_col_names)`` where ``data`` is a
        DataFrame holding every feature column (named ``cat_col_<i>`` or
        ``num_col_<i>`` by original position) plus a ``target`` column, and the
        two lists hold the categorical and numeric column names in column order.
    """
    X, y = make_classification(
        n_samples=n_samples, n_features=n_features, random_state=42, n_informative=5
    )
    total_cols = X.shape[-1]

    # Sample WITHOUT replacement: random.choices can return the same index
    # twice, which silently produced fewer categorical columns than requested.
    # Clamp so out-of-range requests degrade gracefully instead of raising.
    n_cat = max(0, min(n_categories, total_cols))
    cat_cols = set(random.sample(range(total_cols), k=n_cat))

    for col in sorted(cat_cols):
        # Replace the continuous values with their quartile index (0..3).
        X[:, col] = pd.qcut(X[:, col], q=4).codes.astype(int)

    col_names = []
    num_col_names = []
    cat_col_names = []
    for i in range(total_cols):
        if i in cat_cols:
            name = f"cat_col_{i}"
            cat_col_names.append(name)
        else:
            name = f"num_col_{i}"
            num_col_names.append(name)
        col_names.append(name)

    features = pd.DataFrame(X, columns=col_names)
    target = pd.Series(y, name="target")
    data = features.join(target)
    return data, cat_col_names, num_col_names

## Obtain training data
from sklearn.model_selection import train_test_split
import random
import pandas as pd

# Seed Python's `random` module first: make_mixed_classification picks the
# categorical columns with it, so without a seed the column layout changes on
# every fresh-kernel run.
random.seed(42)
data, cat_col_names, num_col_names = make_mixed_classification(n_samples=100, n_features=20, n_categories=4)
# First split holds out the test set (train_test_split's default test fraction).
train, test = train_test_split(data, random_state=42)
# Second split carves the validation set out of the remaining training rows.
train, val = train_test_split(train, random_state=42)

## Define a machine learning model using Pytorch Tabular
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig

# Data configuration: which column is the label and which feature columns are
# continuous vs. categorical (name lists produced by make_mixed_classification).
data_config = DataConfig(
    target=['target'], # Always pass a list. Multiple targets are only supported for regression; multi-task classification is not implemented.
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
# Trainer configuration.
# - batch_size=1024 exceeds the number of training rows here, so each epoch is
#   a single batch (see the "number of training batches (1)" warning in the log).
# - NOTE(review): the previous comment described `devices` as a GPU index
#   (0 = CPU). In current pytorch_tabular it appears to be the NUMBER of devices
#   to train on -- confirm against the TrainerConfig documentation.
trainer_config = TrainerConfig(
    auto_lr_find=True, # Run the LR finder before training to derive a learning rate
    batch_size=1024,
    max_epochs=100,
    devices=1, # presumably the number of devices to use -- see NOTE above
)
# Default optimizer settings.
optimizer_config = OptimizerConfig()

# Model configuration: a feed-forward network over embedded categorical and
# continuous features.
# With auto_lr_find=True the run log shows the LR finder setting the learning
# rate itself ("Learning rate set to ..."), so learning_rate below is not the
# value training ends up using.
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1024-512-512",  # Number of nodes in each hidden layer, dash-separated
    activation="LeakyReLU", # Activation applied between layers
    learning_rate = 1e-2
)

# Bundle the four configs into the high-level model wrapper.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

## Start learning
# see https://stackoverflow.com/questions/43769068/jupyter-notebook-widget-javascript-not-detected if error occurs
# Train on `train`, passing `val` as the validation set.
tabular_model.fit(train=train, validation=val)
# Evaluate on the held-out test rows; `result` is displayed in a later cell.
result = tabular_model.evaluate(test)
# Predictions for the same test rows.
pred_df = tabular_model.predict(test)
# Save the fitted model under the relative path "examples/basic".
tabular_model.save_model("examples/basic")

Seed set to 42


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs





D:\anaconda3\envs\py310\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory D:\GitHub\Adult-Income-Analysis\saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
D:\anaconda3\envs\py310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
D:\anaconda3\envs\py310\lib\site-packages\pytorch_lightning\loops\fit_loop.py:293: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
D:\anaconda3\envs\py310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increa

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

LR finder stopped early after 91 steps due to diverging loss.
Learning rate set to 0.00017378008287493763
Restoring states from the checkpoint path at D:\GitHub\Adult-Income-Analysis\.lr_find_372332ee-c05d-4e0a-b941-91c31db08684.ckpt
Restored all states from the checkpoint at D:\GitHub\Adult-Income-Analysis\.lr_find_372332ee-c05d-4e0a-b941-91c31db08684.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Output()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Output()

D:\anaconda3\envs\py310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


In [3]:
# Test-set metrics returned by tabular_model.evaluate(test).
result

[{'test_loss': 0.6946454644203186, 'test_accuracy': 0.4399999976158142}]

In [5]:
# Preview the full synthetic dataset; cat_col_* columns hold quartile codes (0-3) stored as floats.
data.head()

Unnamed: 0,num_col_0,num_col_1,num_col_2,cat_col_3,num_col_4,num_col_5,num_col_6,num_col_7,num_col_8,num_col_9,...,num_col_11,num_col_12,num_col_13,cat_col_14,cat_col_15,num_col_16,num_col_17,num_col_18,cat_col_19,target
0,0.074206,0.269669,0.72254,2.0,0.222833,0.950503,-0.028925,0.736467,1.176131,-0.229403,...,-0.810924,1.308912,-0.342976,0.0,1.0,-0.079385,0.707716,0.505502,2.0,1
1,-1.014419,0.424234,0.611144,2.0,0.379947,-0.965498,-2.402754,-3.51821,1.00162,0.78287,...,-0.3067,-1.047075,0.030162,3.0,2.0,0.095497,1.479079,-0.256588,3.0,1
2,-2.249096,-1.156744,-1.82407,3.0,0.363878,0.059142,-0.019935,-1.016913,1.395276,-0.682992,...,-0.057784,2.650452,1.126117,3.0,0.0,2.30466,1.445385,-0.454244,3.0,1
3,0.641931,-0.352013,2.105202,3.0,-1.856569,2.687534,1.474056,-0.795152,-0.175284,0.653283,...,0.295619,-0.472407,-1.810032,2.0,3.0,0.584861,0.23001,-0.687934,0.0,1
4,-0.940414,-0.937356,-1.241501,3.0,-1.247805,0.252756,-0.657233,2.088315,-2.797189,0.234652,...,1.486723,0.135038,1.646397,3.0,0.0,1.138604,1.764687,-1.007089,0.0,0


In [6]:
# Preview the final training split (after both train_test_split calls; original row index is kept).
train.head()

Unnamed: 0,num_col_0,num_col_1,num_col_2,cat_col_3,num_col_4,num_col_5,num_col_6,num_col_7,num_col_8,num_col_9,...,num_col_11,num_col_12,num_col_13,cat_col_14,cat_col_15,num_col_16,num_col_17,num_col_18,cat_col_19,target
74,-0.805372,-0.303662,1.30597,0.0,-1.574877,0.270975,0.350785,-2.566409,1.095867,0.596442,...,-0.757797,-0.383026,0.304193,2.0,0.0,0.306238,-0.165399,1.807876,1.0,1
50,-0.994106,1.168218,-0.444946,0.0,-0.710398,-1.960158,-0.040089,-0.732537,-0.242779,-0.030939,...,-1.181683,-1.749469,2.060559,0.0,0.0,-1.897121,-2.302697,-1.19306,0.0,0
67,0.164698,0.104735,0.884101,0.0,0.805801,0.5838,-0.850556,1.53758,-2.416135,-0.362535,...,1.386351,1.255611,0.543499,0.0,0.0,1.739014,-1.25734,1.08361,0.0,0
34,0.033459,2.117856,0.694998,1.0,-0.407292,0.701373,0.462626,0.36277,-0.50175,-0.621818,...,0.53705,1.399536,1.031063,0.0,0.0,1.620539,-1.233468,0.108543,3.0,1
97,-0.075144,0.383972,0.499637,3.0,2.052837,0.063688,1.206153,2.934273,-5.518044,-0.578963,...,5.578709,-1.131236,0.471961,3.0,1.0,2.656914,0.401192,0.139215,0.0,1


In [8]:
# Preview the validation split taken from the training rows.
val.head()

Unnamed: 0,num_col_0,num_col_1,num_col_2,cat_col_3,num_col_4,num_col_5,num_col_6,num_col_7,num_col_8,num_col_9,...,num_col_11,num_col_12,num_col_13,cat_col_14,cat_col_15,num_col_16,num_col_17,num_col_18,cat_col_19,target
72,0.480146,-0.88622,-2.162219,3.0,0.729718,0.377351,-0.194332,-2.613506,0.566719,2.730867,...,-1.892458,-2.130572,-0.18049,1.0,2.0,-2.42748,0.103544,-0.779152,1.0,1
23,0.07377,0.602477,1.161934,0.0,-1.954635,0.057013,-0.628485,-0.708539,-0.295064,1.397226,...,0.819933,0.171998,-0.647241,0.0,1.0,1.541361,-0.510436,-0.058882,0.0,1
5,0.125847,0.834233,-0.966312,0.0,0.55792,-1.316749,-1.21209,-0.725942,-0.744303,1.028683,...,-0.262295,-1.249062,0.723611,0.0,2.0,-0.94576,0.650382,0.667461,2.0,1
15,0.95644,-0.649266,-2.773563,1.0,-3.143736,0.82785,1.115902,2.466316,-3.396234,-0.62479,...,0.654254,-1.821179,-0.007043,3.0,3.0,-0.815159,0.682034,1.312409,2.0,0
56,-0.158101,-1.434365,-0.798439,0.0,-2.416953,0.383614,0.198558,1.084541,-1.579066,0.806593,...,0.527332,1.767058,-2.369721,2.0,3.0,2.232166,-1.524015,-0.348013,3.0,0
