In [9]:
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch
from sklearn.model_selection import train_test_split

In [10]:
# Download the dataset with id 2 from the UCI ML repository (network access required).
adult = fetch_ucirepo(id=2)
# `adult_df` is a reference to the fetched `original` table, not a copy, so
# in-place edits made to `adult_df` also change `adult.data.original`.
adult_df = adult.data.original

In [90]:
# Turn the '?' placeholder into a real missing value. The result is bound to
# a new frame instead of using inplace=True, so the object returned by
# fetch_ucirepo is left untouched and this cell can be re-run safely.
adult_df = adult_df.replace('?', np.nan)
# Copy of the data restricted to rows without any missing value.
adult_pred = adult_df.dropna()

In [91]:
# Encode the income label as 0/1. The label appears both with and without a
# trailing period, so both spellings are mapped.
# The result is assigned back to the column: calling
# `adult_df.income.replace(..., inplace=True)` operates on an intermediate
# Series and is not guaranteed to update `adult_df` (it warns on recent
# pandas and is a no-op under Copy-on-Write).
adult_df["income"] = adult_df["income"].replace(
    {"<=50K": 0, "<=50K.": 0, ">50K": 1, ">50K.": 1}
)

In [61]:
def data_process_tabular(dataset_name, test_size=0.2, random_state=None):
    """Load a tabular dataset and return a class-balanced train/test split.

    Only ``'adult'`` (UCI repository id 2) is supported. The pipeline is:
    drop rows with missing values, encode ``income`` as 0/1, one-hot encode
    the object-typed columns, undersample so both classes have the same
    number of rows, convert to tensors and split.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset to prepare. Anything other than ``'adult'``
        yields ``(None, None)``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Seed used for both the undersampling and the train/test split.
        ``None`` keeps the previous non-deterministic behaviour.

    Returns
    -------
    tuple
        ``(train_data, test_data)`` as produced by ``train_test_split`` on a
        ``TensorDataset`` of ``(float features, long label)`` pairs, or
        ``(None, None)`` for an unsupported ``dataset_name``.

    Raises
    ------
    ValueError
        If the ``income`` column contains a label other than the four
        known spellings.
    """
    if dataset_name != 'adult':
        return None, None

    # Fetch the dataset (network access required).
    adult = fetch_ucirepo(id=2)

    # '?' marks a missing value. Build a new frame rather than editing the
    # fetched object in place, then drop every incomplete row.
    adult_clean = adult.data.original.replace('?', np.nan).dropna()

    # The label appears with and without a trailing period; map both to 0/1.
    label_map = {"<=50K": 0, "<=50K.": 0, ">50K": 1, ">50K.": 1}
    adult_label = adult_clean['income'].map(label_map)
    if adult_label.isna().any():
        unknown = sorted(adult_clean.loc[adult_label.isna(), 'income'].astype(str).unique())
        raise ValueError(f"Unexpected income labels: {unknown}")
    adult_label = adult_label.astype('int64')

    # One-hot encode categorical columns directly as integers so no
    # bool -> int replacement pass over the whole frame is needed.
    adult_data = adult_clean.drop(columns=['income'])
    adult_cat_1hot = pd.get_dummies(adult_data.select_dtypes('object'), dtype=int)
    adult_non_cat = adult_data.select_dtypes(exclude='object')
    adult_data_1hot = pd.concat([adult_non_cat, adult_cat_1hot, adult_label],
                                axis=1, join='inner')

    # Balance the two classes by undersampling to the size of the smaller
    # one; the size is derived from the data instead of being hard-coded.
    negatives = adult_data_1hot.loc[adult_data_1hot['income'] == 0]
    positives = adult_data_1hot.loc[adult_data_1hot['income'] == 1]
    n_per_class = min(len(negatives), len(positives))
    balanced_df = pd.concat([
        negatives.sample(n=n_per_class, random_state=random_state),
        positives.sample(n=n_per_class, random_state=random_state),
    ])

    feature_values = balanced_df.drop(columns=['income']).to_numpy(dtype='float32')
    label_values = balanced_df['income'].to_numpy()
    adult_data_tensor = TensorDataset(torch.tensor(feature_values, dtype=torch.float),
                                      torch.tensor(label_values, dtype=torch.long))

    train_data, test_data = train_test_split(adult_data_tensor,
                                             test_size=test_size,
                                             random_state=random_state)
    return train_data, test_data

In [62]:
# Build the balanced train/test splits for the Adult dataset (re-fetches the data).
train_dataset, test_dataset = data_process_tabular('adult')

In [63]:
# Spot check: feature tensor (element 0 of the pair) of the training example at index 17.
train_dataset[17][0]

tensor([7.0000e+01, 2.6883e+05, 1.1000e+01, 0.0000e+00, 0.0000e+00, 2.4000e+01,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        1.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+

In [64]:
# Second feature value and the label of the same example.
# Presumably the second feature is `fnlwgt` (numeric columns come first) — confirm.
train_dataset[17][0][1], train_dataset[17][1]

(tensor(268832.), tensor(1))

In [65]:
# Look up the raw rows that share a given fnlwgt value.
# NOTE(review): 197552 is hard-coded and differs from the fnlwgt in the saved
# output of the preceding cell (268832) — presumably left over from an earlier run.
adult_df.query("fnlwgt == 197552")

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
11036,24,Local-gov,197552,HS-grad,9,Never-married,Tech-support,Not-in-family,White,Female,0,0,40,United-States,0
21552,23,Private,197552,Some-college,10,Never-married,Sales,Own-child,White,Female,0,0,35,United-States,0


## Try pytorch_tabular

In [92]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
)

In [104]:
# Names of the non-object (numeric) feature columns, used as continuous inputs.
num_col_names = adult.data.features.select_dtypes(exclude='object').columns.tolist()

In [105]:
# Names of the object-typed feature columns, used as categorical inputs.
cat_col_names = adult.data.features.select_dtypes('object').columns.tolist()

In [106]:
# Display the categorical column names for a quick sanity check.
cat_col_names

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [107]:
# Describe the data: which column is the label and how each feature is treated.
data_config = DataConfig(
    target=['income'], # target should always be a list. Multi-targets are only supported for regression; multi-task classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
# Training-loop settings.
trainer_config = TrainerConfig(
    auto_lr_find=True, # Runs the LR finder to derive a learning rate automatically
    batch_size=1024,
    max_epochs=100,
    devices=1, # NOTE(review): per the pytorch_tabular docs this is the number of devices to train on, not a GPU index — confirm for the installed version
)
# Optimizer settings are left at the library defaults.
optimizer_config = OptimizerConfig()

# Network definition: an MLP over category embeddings plus continuous inputs.
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1024-512-512",  # Number of nodes in each layer
    activation="LeakyReLU", # Activation between layers
    learning_rate = 1e-2  # starting value only: with auto_lr_find=True the LR finder replaces it (the saved training log shows the rate being reset)
)

# Bundle the four configs into the model object that is trained later.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)



In [108]:
# train = adult.data.features
# val = adult.data.targets

In [109]:
# adult_df

In [110]:
# First hold out the test set, then carve a validation set out of the rest.
# Both splits use the same fixed seed so the partition is reproducible.
train_val, test = train_test_split(adult_df, random_state=42)
train, val = train_test_split(train_val, random_state=42)

In [111]:
# Display the training split (the notebook shows a truncated view).
train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
5840,22,Private,203894,Some-college,10,Never-married,Transport-moving,Not-in-family,White,Female,0,0,24,United-States,0
24530,60,Private,143932,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
6806,18,Private,118376,Some-college,10,Never-married,Sales,Own-child,White,Female,0,0,15,United-States,0
39241,34,Private,161153,10th,6,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,35,United-States,0
44878,64,Local-gov,209899,Some-college,10,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,35,United-States,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27929,28,Private,148429,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,0
6770,30,Private,378723,Some-college,10,Divorced,Adm-clerical,Own-child,White,Female,0,0,55,United-States,0
937,41,Private,132222,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,2415,40,United-States,1
44886,39,Private,312271,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,60,United-States,0


In [114]:
# Confirm the model object exists; the repr carries no configuration details.
tabular_model

<pytorch_tabular.tabular_model.TabularModel at 0x21792c33670>

In [112]:
# Validate on the dedicated `val` split. Passing `test` here would let the
# test rows influence training-time decisions made from validation data
# (e.g. early stopping / checkpoint selection) and bias the evaluation
# that is later run on `test`.
tabular_model.fit(train=train, validation=val)

# tabular_model.save_model("examples/basic")
# loaded_model = TabularModel.load_model("examples/basic")

Seed set to 42


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs





D:\anaconda3\envs\py310\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:639: Checkpoint directory D:\GitHub\Adult-Income-Analysis\saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
D:\anaconda3\envs\py310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
D:\anaconda3\envs\py310\lib\site-packages\pytorch_lightning\loops\fit_loop.py:293: The number of training batches (27) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
D:\anaconda3\envs\py310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider incre

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

LR finder stopped early after 91 steps due to diverging loss.
Learning rate set to 3.9810717055349735e-05
Restoring states from the checkpoint path at D:\GitHub\Adult-Income-Analysis\.lr_find_b9979888-61f8-4754-a1fa-866ea5cb6a17.ckpt
Restored all states from the checkpoint at D:\GitHub\Adult-Income-Analysis\.lr_find_b9979888-61f8-4754-a1fa-866ea5cb6a17.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Output()

<pytorch_lightning.trainer.trainer.Trainer at 0x21792c9f4f0>

In [113]:
# Score the trained model on the held-out test split.
result = tabular_model.evaluate(test)
# Per-row predictions for the same split (presumably returned as a DataFrame — confirm).
pred_df = tabular_model.predict(test)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Output()

D:\anaconda3\envs\py310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
