# Phase 1 Experiment — Dataset 5: Adult Census

**Task:** Income Prediction (>50K or <=50K)  
**Type:** TABULAR (14 features, treated as a sequence of 14 tokens)  
**Classes:** 2 (binary)  
**Source:** UCI ML Repository / Kaggle

**Novel aspect:** Each tabular feature is treated as a token at a fixed position 0..13.  
The positional encoding (PE) here encodes **feature identity** (which feature is at this position), not word order.  
This experiment tests whether PE helps transformers distinguish between different tabular feature positions.

In [None]:
import os, sys, math, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

# Put the directory two levels above the current working directory on the
# import path (once), ahead of every other entry.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from PE.sinusoidal_pe import SinusoidalPositionalEncoding
from PE.binary_pe    import BinaryPositionalEncoding
from PE.rope         import RoPEPositionalEncoding
from PE.learned_pe   import LearnedPositionalEncoding
from PE.dape         import DAPEPositionalEncoding

# Use CUDA when PyTorch reports it as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device: {device}')

In [None]:
class Config:
    """Dataset, model and optimisation settings shared by every cell of this experiment."""

    # Dataset — tabular, each feature = one token position
    n_classes: int = 2
    n_features: int = 14      # Adult Census has 14 features -> seq_len = 14
    max_seq_len: int = 14
    test_size: float = 0.2
    val_size: float = 0.1

    # Model (each feature projected to d_model)
    d_model: int = 64         # smaller model for tabular data
    n_heads: int = 4
    n_layers: int = 2
    d_ff: int = 128
    dropout: float = 0.1

    # Optimisation
    batch_size: int = 128
    lr: float = 1e-3
    epochs: int = 30
    seed: int = 42


cfg = Config()

# Seed NumPy and PyTorch from the same configured value.
np.random.seed(cfg.seed)
torch.manual_seed(cfg.seed)
print(f'Config: seq_len={cfg.max_seq_len} (14 features as tokens), n_classes={cfg.n_classes}')

In [None]:
# Load the Adult Census dataset with pandas.
# The UCI copy is tried first; if that fails for any reason, the OpenML copy is
# fetched through scikit-learn instead.
ADULT_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

COLUMNS = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income'
]

print('Loading Adult Census dataset...')
try:
    # skipinitialspace=True strips the blank that follows each comma, so the
    # missing-value marker is seen as '?' (not ' ?'). Matching ' ?' would leave
    # every '?' in place and the later dropna() would remove nothing.
    df = pd.read_csv(ADULT_URL, names=COLUMNS, na_values='?', skipinitialspace=True)
    print(f'Loaded from UCI. Shape: {df.shape}')
except Exception as exc:
    # Deliberate best-effort fallback, but report why the primary source failed.
    print(f'UCI load failed ({type(exc).__name__}: {exc}); falling back to OpenML.')
    from sklearn.datasets import fetch_openml
    adult = fetch_openml('adult', version=2, as_frame=True)
    df = adult.frame
    df.columns = [c.lower().replace('-', '_') for c in df.columns]
    # The OpenML copy labels its target column 'class'; the preprocessing that
    # follows reads df['income'], so align the name here.
    if 'income' not in df.columns and 'class' in df.columns:
        df = df.rename(columns={'class': 'income'})
    print(f'Loaded from sklearn. Shape: {df.shape}')

# Preprocess: discard rows with missing values, then separate target and features
df = df.dropna()
income_as_text = df['income'].astype(str)
y = income_as_text.str.contains('>50K').astype(int).values   # 1 when the label contains '>50K'
X = df.drop(columns=['income'])

# Encode categoricals: swap every object/category column for its integer codes
categorical_columns = X.select_dtypes(include=['object', 'category']).columns
for col in categorical_columns:
    codes, _uniques = pd.factorize(X[col])
    X[col] = codes

X = np.nan_to_num(X.values.astype(np.float32), nan=0.0)

# Keep the config in step with the number of feature columns actually present
cfg.n_features = cfg.max_seq_len = X.shape[1]

print(f'Features: {cfg.n_features} | Positive rate: {y.mean():.2%}')

# Split: hold out the test set first, then take the validation set from the
# remainder; both splits are stratified on the label.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=cfg.test_size, random_state=cfg.seed, stratify=y,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=cfg.val_size, random_state=cfg.seed, stratify=y_temp,
)

# Scale: the scaler is fitted on the training split only and then applied to all three
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (scaler.transform(part) for part in (X_train, X_val, X_test))

print(f'Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}')

In [None]:
# Tabular dataset — each sample is (n_features,), treated as a sequence of n_features tokens
class TabularDataset(Dataset):
    """Map-style dataset that serves each row as a sequence of scalar feature tokens."""

    def __init__(self, X, y):
        # A trailing axis of size 1 is appended to X so that every feature
        # value becomes its own length-1 token vector.
        features = torch.FloatTensor(X)
        self.X = features.unsqueeze(-1)
        self.y = torch.FloatTensor(y)

    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, idx):
        sample, target = self.X[idx], self.y[idx]
        return sample, target

# Wrap each split in a TabularDataset
train_ds, val_ds, test_ds = (
    TabularDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Only the training loader shuffles; validation and test keep dataset order
train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=cfg.batch_size, shuffle=False)
    for split in (val_ds, test_ds)
)

print(f'Train batches: {len(train_loader)} | Val batches: {len(val_loader)}')
print(f'Input shape: {train_ds[0][0].shape}  (n_features=seq_len, feature_val=1)')

In [None]:
# Model for tabular data
# Each feature value (scalar) is projected to d_model, then PE is added
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention without masking.

    Args:
        d_model: Width of the input and output vectors.
        n_heads: Number of attention heads; must be positive and divide
            ``d_model`` evenly.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``n_heads`` is not positive or does not divide ``d_model``.
    """

    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        # Fail here with a clear message instead of later inside view(),
        # where an uneven split only surfaces as an opaque shape error.
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f'd_model ({d_model}) must be divisible by a positive n_heads ({n_heads})'
            )
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over the second axis of ``x`` (B, T, D) and return a tensor of the same shape."""
        B, T, D = x.shape
        # Project, then split the last axis into heads: (B, T, D) -> (B, n_heads, T, d_k)
        Q = self.W_q(x).view(B, T, self.n_heads, self.d_k).transpose(1, 2)
        K = self.W_k(x).view(B, T, self.n_heads, self.d_k).transpose(1, 2)
        V = self.W_v(x).view(B, T, self.n_heads, self.d_k).transpose(1, 2)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        weights = self.dropout(F.softmax(scores, dim=-1))
        out = torch.matmul(weights, V)
        # Merge the heads back into one axis before the output projection
        return self.W_o(out.transpose(1, 2).contiguous().view(B, T, D))

class FeedForward(nn.Module):
    """Two-layer MLP: Linear(d_model -> d_ff), GELU, dropout, Linear(d_ff -> d_model)."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        layers = [
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

class TransformerBlock(nn.Module):
    """Encoder block: self-attention then feed-forward, each wrapped as LayerNorm(x + dropout(sublayer(x)))."""

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        # Attention sub-layer with residual connection, normalised afterwards
        attended = self.drop(self.attn(x))
        x = self.ln1(x + attended)
        # Feed-forward sub-layer, same residual + norm pattern
        transformed = self.drop(self.ff(x))
        return self.ln2(x + transformed)

class TabularTransformer(nn.Module):
    """
    Transformer encoder over the columns of a tabular row.

    Every scalar feature value goes through one shared Linear(1 -> d_model)
    projection; the positional encoding built from `pe_class` is then applied
    so that feature positions can be told apart. The encoded tokens are
    mean-pooled, layer-normalised and mapped to a single logit.

    `pe_class` is called as `pe_class(d_model, n_features, dropout)`.
    """

    def __init__(self, n_features, d_model, n_heads, n_layers, d_ff, pe_class, dropout=0.1):
        super().__init__()
        # Shared scalar -> d_model projection used for every feature value
        self.feature_proj = nn.Linear(1, d_model)
        # Positional encoding over the n_features token positions
        self.pe = pe_class(d_model, n_features, dropout)
        encoder_layers = [
            TransformerBlock(d_model, n_heads, d_ff, dropout)
            for _ in range(n_layers)
        ]
        self.blocks = nn.ModuleList(encoder_layers)
        self.norm = nn.LayerNorm(d_model)
        # One output unit: a binary-classification logit
        self.head = nn.Linear(d_model, 1)

    def forward(self, x):
        # x: (B, n_features, 1)
        tokens = self.feature_proj(x)   # (B, n_features, d_model)
        tokens = self.pe(tokens)
        for block in self.blocks:
            tokens = block(tokens)
        pooled = tokens.mean(dim=1)     # average over the feature axis
        return self.head(self.norm(pooled))

print('TabularTransformer defined.')

In [None]:
def train_epoch(model, loader, criterion, optimizer):
    """Train `model` for one pass over `loader`.

    Returns a `(mean_loss, accuracy)` pair, where the loss is averaged per
    sample and accuracy thresholds the sigmoid of the logits at 0.5.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0, 0, 0
    for X, y in loader:
        X = X.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        logits = model(X).squeeze(-1)
        loss = criterion(logits, y)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Weight the batch loss by batch size so the final mean is per sample
        loss_sum += loss.item() * X.size(0)
        predicted = (torch.sigmoid(logits) > 0.5).long()
        n_correct += (predicted == y.long()).sum().item()
        n_seen += y.size(0)
    return loss_sum / n_seen, n_correct / n_seen

def evaluate(model, loader):
    """Score `model` on `loader` without gradients; return `(accuracy, binary F1)`."""
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for X, y in loader:
            logits = model(X.to(device)).squeeze(-1)
            # Threshold the sigmoid output at 0.5 to get hard 0/1 predictions
            batch_preds = (torch.sigmoid(logits) > 0.5).long().cpu()
            predictions += batch_preds.tolist()
            targets += y.long().tolist()
    accuracy = accuracy_score(targets, predictions)
    f1 = f1_score(targets, predictions, average='binary')
    return accuracy, f1

print('Training/eval functions defined.')

In [None]:
# Positional-encoding variants to compare, keyed by the label used in the results.
PE_METHODS = {
    'sinusoidal': SinusoidalPositionalEncoding,
    'binary':     BinaryPositionalEncoding,
    'rope':       RoPEPositionalEncoding,
    'learned':    LearnedPositionalEncoding,
    'dape':       DAPEPositionalEncoding,
}

# results[pe_name] holds:
#   'accuracy' / 'f1'          -> validation metrics at the best-validation epoch
#   'test_accuracy' / 'test_f1' -> held-out test metrics of that same checkpoint
#   'time_s'                   -> wall-clock time of the training loop
results = {}
criterion = nn.BCEWithLogitsLoss()

for pe_name, pe_class in PE_METHODS.items():
    print(f'\n=== {pe_name.upper()} PE ===')
    # Re-seed so every PE variant starts from the same RNG state.
    torch.manual_seed(cfg.seed)
    model = TabularTransformer(
        n_features=cfg.n_features, d_model=cfg.d_model,
        n_heads=cfg.n_heads, n_layers=cfg.n_layers,
        d_ff=cfg.d_ff, pe_class=pe_class, dropout=cfg.dropout
    ).to(device)
    print(f'  Parameters: {sum(p.numel() for p in model.parameters()):,}')
    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=cfg.epochs)
    best_acc, best_f1, best_state, t0 = 0, 0, None, time.time()
    for epoch in range(cfg.epochs):
        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
        val_acc, val_f1 = evaluate(model, val_loader)
        scheduler.step()
        if val_acc > best_acc:
            best_acc, best_f1 = val_acc, val_f1
            # Snapshot the weights: validation accuracy is the selection
            # criterion, so the unbiased number has to come from scoring this
            # exact checkpoint on the test split afterwards.
            best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
        if (epoch + 1) % 5 == 0:
            print(f'  Ep {epoch+1:02d} | loss {train_loss:.4f} | train {train_acc:.4f} | val_acc {val_acc:.4f}')
    elapsed = time.time() - t0

    # Restore the best-validation checkpoint (if any epoch improved on 0) and
    # evaluate it once on the held-out test split.
    if best_state is not None:
        model.load_state_dict(best_state)
    test_acc, test_f1 = evaluate(model, test_loader)

    results[pe_name] = {
        'accuracy': best_acc, 'f1': best_f1, 'time_s': elapsed,
        'test_accuracy': test_acc, 'test_f1': test_f1,
    }
    print(f'  Done in {elapsed:.1f}s — best val acc: {best_acc:.4f}')
    print(f'  Test @ best-val checkpoint — acc: {test_acc:.4f} | f1: {test_f1:.4f}')

In [None]:
# ---- Text summary: one row per PE method, best accuracy flagged ----
rule = '=' * 65
print('\n' + rule)
print('PHASE 1 RESULTS — Adult Census (TABULAR, 14 features as sequence)')
print(rule)
print(f'{"PE Method":<15} {"Accuracy":>10} {"F1":>10} {"Time (s)":>10}')
print('-' * 65)
best_acc_val = max(v['accuracy'] for v in results.values())
for pe_name, m in results.items():
    marker = ''
    if m['accuracy'] == best_acc_val:
        marker = ' <-- BEST'
    print(f'{pe_name:<15} {m["accuracy"]:>10.4f} {m["f1"]:>10.4f} {m["time_s"]:>10.1f}{marker}')
print(rule)

# ---- Bar charts: accuracy (left) and F1 (right), one bar per PE method ----
names  = list(results.keys())
accs   = [results[n]['accuracy'] for n in names]
f1s    = [results[n]['f1']       for n in names]
colors = ['#4C72B0', '#DD8452', '#55A868', '#C44E52', '#8172B2']

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = (
    (axes[0], accs, 'Accuracy — Adult Census'),
    (axes[1], f1s,  'F1 Score — Adult Census'),
)
for ax, values, title in panels:
    ax.bar(names, values, color=colors)
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=15)
plt.suptitle('Phase 1: PE Comparison on Adult Census (TABULAR — 14 feature positions)', fontsize=11)
plt.tight_layout()
plt.savefig('results_adult.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved: results_adult.png')