<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/modeling_TabTransformer/test_sample_TabTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! git clone https://github.com/vishal-burman/PyTorch-Architectures.git
%cd PyTorch-Architectures/modeling_TabTransformer/

fatal: destination path 'PyTorch-Architectures' already exists and is not an empty directory.
/content/PyTorch-Architectures/modeling_TabTransformer


In [2]:
! wget https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data

--2021-01-04 08:45:36--  https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3974305 (3.8M) [application/x-httpd-php]
Saving to: ‘adult.data.8’


2021-01-04 08:45:37 (7.38 MB/s) - ‘adult.data.8’ saved [3974305/3974305]



In [3]:
import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from model import TabTransformer

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
dataset = pd.read_csv('adult.data', names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'target'])

In [5]:
dataset.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
cont_classes = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
cat_classes = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']


def transform_df(list_col):
  for col in list_col:
    le = LabelEncoder()
    sample = dataset[col]
    le.fit(sample)
    dataset[col] = le.transform(dataset[col])

transform_df(cat_classes)

In [7]:
dataset.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,<=50K
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,<=50K
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,<=50K
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,<=50K
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,<=50K


In [8]:
dataset['target'] = dataset['target'].apply(lambda x: 0 if x == " <=50K" else 1)
dataset.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0


In [9]:
list_columns = dataset.columns.tolist()
cat_classes_index = []
cont_classes_index = []

for col in cat_classes:
  cat_classes_index.append(list_columns.index(col))

for col in cont_classes:
  cont_classes_index.append(list_columns.index(col))

cat_dataset = dataset.iloc[:, cat_classes_index]
cont_dataset = dataset.iloc[:, cont_classes_index]

# Convert to numpy arrays
cat_dataset_numpy = cat_dataset.to_numpy()
cont_dataset_numpy = cont_dataset.to_numpy()
target_numpy = dataset['target'].to_numpy()
assert len(cat_dataset) == len(cont_dataset)
assert len(cont_dataset) == len(target_numpy)

In [10]:
class TabDataset(Dataset):
  def __init__(self, cat_dataset_numpy, cont_dataset_numpy, targets):
    self.cat_dataset = cat_dataset_numpy
    self.cont_dataset = cont_dataset_numpy
    self.targets = targets
    self.length = len(targets)
    self.list_samples = []
    self.build()
  
  def __len__(self):
    return len(self.list_samples)
  
  def __getitem__(self, idx):
    sample = self.list_samples[idx]
    return {
        'cat_tensor': torch.tensor(sample['cat_list']),
        'cont_tensor': torch.tensor(sample['cont_list'], dtype=torch.float),
        'target': torch.tensor(sample['target']),
    }

  def build(self):
    for i in range(self.length):
      cat = self.cat_dataset[i]
      cont = self.cont_dataset[i]
      target = self.targets[i]
      self.list_samples.append({
          'cat_list': cat,
          'cont_list': cont,
          'target': target,
      })

In [11]:
# Split --> 80% train 20% valid
len_train = 80 * len(cat_dataset) // 100

cat_dataset_numpy_train = cat_dataset_numpy[:len_train]
cat_dataset_numpy_valid = cat_dataset_numpy[len_train:]

cont_dataset_numpy_train = cont_dataset_numpy[:len_train]
cont_dataset_numpy_valid = cont_dataset_numpy[len_train:]

target_train = target_numpy[:len_train]
target_valid = target_numpy[len_train:]

In [12]:
train_dataset = TabDataset(cat_dataset_numpy_train, cont_dataset_numpy_train, target_train)
valid_dataset = TabDataset(cat_dataset_numpy_valid, cont_dataset_numpy_valid, target_valid)

In [13]:
BATCH_SIZE = 8

train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=False)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("Length of Train Loader: ", len(train_loader))
print("Length of Valid Loader:", len(valid_loader), "\n")

# Check dataset
for sample1, sample2 in zip(train_loader, valid_loader):
  print(sample1['cat_tensor'].shape, sample2['cont_tensor'].shape)
  break

Length of Train Loader:  3256
Length of Valid Loader: 815 

torch.Size([8, 8]) torch.Size([8, 6])


In [14]:
model = TabTransformer(
    categories=(9, 16, 7, 15, 6, 5, 2, 42),
    num_continuous=6,
    dim=32,
    dim_out=2,
    depth=6,
    heads=8,
    attn_dropout=0.1,
    ff_dropout=0.1,
    mlp_hidden_mults=(4, 2),
    mlp_act=nn.ReLU(),
)
model.to(device)

params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total trainable parameters = ", params)

Total trainable parameters =  220110


In [15]:
LEARNING_RATE = 3e-4
EPOCHS = 2
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [16]:
def compute_accuracy(model, data_loader, device):
  correct_preds, total_examples = 0, 0
  with torch.set_grad_enabled(False):
    for sample in data_loader:
      cat_tensor = sample['cat_tensor'].to(device)
      cont_tensor = sample['cont_tensor'].to(device)
      target = sample['target'].to(device)

      logits = model(cat_tensor, cont_tensor)
      probas = F.softmax(logits, dim=1)
      _, pred_labels = torch.max(probas, 1)
      correct_preds += (pred_labels == target).sum()
      total_examples += target.size(0)
    return correct_preds.float() / total_examples * 100

start_time = time.time()
for epoch in range(EPOCHS):
  model.train()
  for idx, sample in enumerate(train_loader):
    cat_tensor = sample['cat_tensor'].to(device)
    cont_tensor = sample['cont_tensor'].to(device)
    target = sample['target'].to(device)

    optimizer.zero_grad()
    logits = model(cat_tensor, cont_tensor)
    loss = F.cross_entropy(logits, target)
    loss.backward()
    optimizer.step()

    # LOGGING
    if idx % 500 == 0:
      print("Batch: %04d/%04d || Epoch: %04d/%04d || Loss: %.3f" % (idx, len(train_loader), epoch+1, EPOCHS, loss.item()))
  
  model.eval()
  with torch.set_grad_enabled(False):
    train_acc = compute_accuracy(model, train_loader, device)
    valid_acc = compute_accuracy(model, valid_loader, device)
    print('Train Accuracy: %.3f%%' % (train_acc))
    print('Valid Accuracy: %.3f%%' % (valid_acc))
  epoch_elapsed_time = (time.time() - start_time) / 60
  print('Epoch Elapsed Time: ', epoch_elapsed_time)
total_training_time = (time.time() - start_time) / 60
print('Total Training Time: ', total_training_time)

Batch: 0000/3256 || Epoch: 0001/0002 || Loss: 0.650
Batch: 0500/3256 || Epoch: 0001/0002 || Loss: 0.835
Batch: 1000/3256 || Epoch: 0001/0002 || Loss: 0.265
Batch: 1500/3256 || Epoch: 0001/0002 || Loss: 0.566
Batch: 2000/3256 || Epoch: 0001/0002 || Loss: 0.565
Batch: 2500/3256 || Epoch: 0001/0002 || Loss: 0.190
Batch: 3000/3256 || Epoch: 0001/0002 || Loss: 0.311
Train Accuracy: 83.150%
Valid Accuracy: 83.080%
Epoch Elapsed Time:  1.6121505180994669
Batch: 0000/3256 || Epoch: 0002/0002 || Loss: 0.699
Batch: 0500/3256 || Epoch: 0002/0002 || Loss: 0.603
Batch: 1000/3256 || Epoch: 0002/0002 || Loss: 0.262
Batch: 1500/3256 || Epoch: 0002/0002 || Loss: 0.571
Batch: 2000/3256 || Epoch: 0002/0002 || Loss: 0.582
Batch: 2500/3256 || Epoch: 0002/0002 || Loss: 0.185
Batch: 3000/3256 || Epoch: 0002/0002 || Loss: 0.310
Train Accuracy: 83.584%
Valid Accuracy: 83.249%
Epoch Elapsed Time:  3.2195717891057334
Total Training Time:  3.2195752461751304
