# Deep Learning for tabular data

__Problem__: Given certain features about a customer, predict that customer's credit status (good or bad).

__Data__: 13 categorical variables, 7 numerical variables, and 1 outcome variable (the target).

In [2]:
import pandas as pd
# The file is space-separated and has no header row, so pandas labels the
# columns with integers until they are renamed.
raw_data = pd.read_csv('german.data', sep = ' ', header = None)

In [3]:
raw_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [4]:
print(raw_data.shape)

(1000, 21)


In [5]:
# Replace the auto-generated integer labels with descriptive names, listed in
# file order. The final column ('Target') is the outcome to predict.
raw_data.columns = ['CheckingAccount',
                    'Duration',
                    'CreditHistory',
                    'Purpose',
                    'CreditAmount',
                    'Savings',
                    'YearofJob',
                    'InstallmentRate',
                    'SexMaritalStatus',
                    'Guarantors',
                    'ResidentDuration',
                    'Property',
                    'Age',
                    'OtherPlans',
                    'Housing',
                    'ExistingCredits',
                    'Job',
                    'Dependents',
                    'Telephone',
                    'ForeignWorker',
                    'Target']

In [6]:
display(raw_data)

Unnamed: 0,CheckingAccount,Duration,CreditHistory,Purpose,CreditAmount,Savings,YearofJob,InstallmentRate,SexMaritalStatus,Guarantors,...,Property,Age,OtherPlans,Housing,ExistingCredits,Job,Dependents,Telephone,ForeignWorker,Target
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,A14,12,A32,A42,1736,A61,A74,3,A92,A101,...,A121,31,A143,A152,1,A172,1,A191,A201,1
996,A11,30,A32,A41,3857,A61,A73,4,A91,A101,...,A122,40,A143,A152,1,A174,1,A192,A201,1
997,A14,12,A32,A43,804,A61,A75,4,A93,A101,...,A123,38,A143,A152,1,A173,1,A191,A201,1
998,A11,45,A32,A43,1845,A61,A73,4,A93,A101,...,A124,23,A143,A153,1,A173,1,A192,A201,2


In [7]:
# info() writes the summary to stdout itself and returns None, so wrapping it
# in print() would only append a stray "None" line.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   CheckingAccount   1000 non-null   object
 1   Duration          1000 non-null   int64 
 2   CreditHistory     1000 non-null   object
 3   Purpose           1000 non-null   object
 4   CreditAmount      1000 non-null   int64 
 5   Savings           1000 non-null   object
 6   YearofJob         1000 non-null   object
 7   InstallmentRate   1000 non-null   int64 
 8   SexMaritalStatus  1000 non-null   object
 9   Guarantors        1000 non-null   object
 10  ResidentDuration  1000 non-null   int64 
 11  Property          1000 non-null   object
 12  Age               1000 non-null   int64 
 13  OtherPlans        1000 non-null   object
 14  Housing           1000 non-null   object
 15  ExistingCredits   1000 non-null   int64 
 16  Job               1000 non-null   object
 17  Dependents     

In [8]:
# Missing Values: None
raw_data.isna().sum()

CheckingAccount     0
Duration            0
CreditHistory       0
Purpose             0
CreditAmount        0
Savings             0
YearofJob           0
InstallmentRate     0
SexMaritalStatus    0
Guarantors          0
ResidentDuration    0
Property            0
Age                 0
OtherPlans          0
Housing             0
ExistingCredits     0
Job                 0
Dependents          0
Telephone           0
ForeignWorker       0
Target              0
dtype: int64

In [9]:
# Categorical, Numerical Variables
Cate_names = ['CheckingAccount', 'CreditHistory', 'Purpose', 'Savings', 'YearofJob', 'SexMaritalStatus', 'Guarantors', 'Property',
              'OtherPlans', 'Housing', 'Job', 'Telephone', 'ForeignWorker']

Num_names = ['Duration', 'CreditAmount', 'InstallmentRate', 'ResidentDuration',
             'Age', 'ExistingCredits', 'Dependents']


In [10]:
# Cate Variables

# Show the frequency table of every categorical column, in list order.
for name in Cate_names:
  print(raw_data[name].value_counts())

# Note: There are no instances with A95: female, single

A14    394
A11    274
A12    269
A13     63
Name: CheckingAccount, dtype: int64
A32    530
A34    293
A33     88
A31     49
A30     40
Name: CreditHistory, dtype: int64
A43     280
A40     234
A42     181
A41     103
A49      97
A46      50
A45      22
A44      12
A410     12
A48       9
Name: Purpose, dtype: int64
A61    603
A65    183
A62    103
A63     63
A64     48
Name: Savings, dtype: int64
A73    339
A75    253
A74    174
A72    172
A71     62
Name: YearofJob, dtype: int64
A93    548
A92    310
A94     92
A91     50
Name: SexMaritalStatus, dtype: int64
A101    907
A103     52
A102     41
Name: Guarantors, dtype: int64
A123    332
A121    282
A122    232
A124    154
Name: Property, dtype: int64
A143    814
A141    139
A142     47
Name: OtherPlans, dtype: int64
A152    713
A151    179
A153    108
Name: Housing, dtype: int64
A173    630
A172    200
A174    148
A171     22
Name: Job, dtype: int64
A191    596
A192    404
Name: Telephone, dtype: int64
A201    963
A202     37
Name: For

In [11]:
# Target
raw_data['Target'].value_counts()

1    700
2    300
Name: Target, dtype: int64

 # 1. Label Encoding with Categorical Variables

 We label-encode the categorical variables instead of one-hot encoding them, because the model's embedding layers take integer category indices as input.

In [12]:
from sklearn.preprocessing import LabelEncoder
# Replace each categorical code string with an integer index, fitting a
# fresh encoder for every column.
for name in Cate_names:
  column_encoder = LabelEncoder()
  raw_data[name] = column_encoder.fit_transform(raw_data[name])



In [13]:
raw_data.head()

Unnamed: 0,CheckingAccount,Duration,CreditHistory,Purpose,CreditAmount,Savings,YearofJob,InstallmentRate,SexMaritalStatus,Guarantors,...,Property,Age,OtherPlans,Housing,ExistingCredits,Job,Dependents,Telephone,ForeignWorker,Target
0,0,6,4,4,1169,4,4,4,2,0,...,0,67,2,1,2,2,1,1,0,1
1,1,48,2,4,5951,0,2,2,1,0,...,0,22,2,1,1,2,1,0,0,2
2,3,12,4,7,2096,0,3,2,2,0,...,0,49,2,1,1,1,2,0,0,1
3,0,42,2,3,7882,0,3,2,2,2,...,1,45,2,2,1,2,2,0,0,1
4,0,24,3,0,4870,0,2,3,2,0,...,3,53,2,2,2,2,2,0,0,2


In [14]:
from sklearn.preprocessing import LabelEncoder

# Mark the label-encoded columns as pandas categoricals so that their
# cardinality can be read from `.cat.categories` when sizing the embeddings.
for name in Cate_names:
  raw_data[name] = raw_data[name].astype('category')

# The outcome is the final column; every column before it is a feature.
X, y = raw_data.iloc[:, :-1], raw_data.iloc[:, -1]

# LabelEncoder numbers the sorted class values from 0, so the raw targets
# 1 and 2 become 0 and 1 respectively.
y = LabelEncoder().fit_transform(y)

In [15]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions the same in both splits; the fixed random_state makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state= 39, stratify = y )

## Embedding categorical columns

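- Size of embedding: each categorical column gets an embedding of dimension `min(50, (n_categories + 1) // 2)`.
  This rule is borrowed from lesson 2 of
  https://www.usfca.edu/data-institute/certificates/fundamentals-deep-learning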

In [16]:
# Define size of embedding

from collections import Counter

# Class indices as produced by LabelEncoder on the raw Target column, which
# sorts the raw values before numbering them:
#   raw 1 (good credit) -> 0,  raw 2 (bad credit) -> 1.
target_dict = {'Good': 0,
               'Bad': 1
               }

# Number of distinct categories per categorical column, i.e. the vocabulary
# size each embedding table must cover.
embedded_cols = {name: len(X[name].cat.categories) for name in Cate_names}

embedded_cols

{'CheckingAccount': 4,
 'CreditHistory': 5,
 'Purpose': 10,
 'Savings': 5,
 'YearofJob': 5,
 'SexMaritalStatus': 4,
 'Guarantors': 3,
 'Property': 4,
 'OtherPlans': 3,
 'Housing': 3,
 'Job': 4,
 'Telephone': 2,
 'ForeignWorker': 2}

In [17]:
# One (n_categories, embedding_dim) pair per categorical column. The dimension
# is half the cardinality, rounded up, and capped at 50.
embedding_sizes = [(n_cat, min(50, (n_cat + 1) // 2)) for n_cat in embedded_cols.values()]
embedding_sizes


[(4, 2),
 (5, 3),
 (10, 5),
 (5, 3),
 (5, 3),
 (4, 2),
 (3, 2),
 (4, 2),
 (3, 2),
 (3, 2),
 (4, 2),
 (2, 1),
 (2, 1)]

In [18]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as torch_optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import numpy as np

In [19]:
class GermanDataset(Dataset):
  """Dataset that yields (categorical codes, numeric features, label) per row.

  Args:
    X: DataFrame holding both the categorical and the numeric feature columns.
    y: Labels aligned positionally with the rows of X. May be a NumPy array,
      a list, or a pandas Series (its index is ignored).
    Cate_names: Names of the categorical columns in X. Every other column of X
      is treated as numeric.

  Raises:
    ValueError: If X and y do not have the same number of rows.
  """

  def __init__(self, X, y, Cate_names):
    if len(X) != len(y):
      raise ValueError("X and y must have the same number of rows")
    # astype() already returns fresh arrays, so no defensive copy of X is needed.
    self.X1 = X.loc[:, Cate_names].values.astype(np.int64) # Cate Columns
    self.X2 = X.drop(columns = Cate_names).values.astype(np.float32) # numeric columns
    # Store labels as a plain int64 array so that __getitem__ is always
    # positional, even when y arrives as a Series with a non-default index.
    self.y = np.asarray(y, dtype = np.int64)

  def __len__(self):
    return len(self.y)

  def __getitem__(self, idx):
    return self.X1[idx], self.X2[idx], self.y[idx]

In [20]:
train_ds = GermanDataset(X_train, y_train, Cate_names)
valid_ds = GermanDataset(X_test, y_test, Cate_names)

In [21]:
valid_ds.X2

array([[6.000e+00, 9.320e+02, 3.000e+00, ..., 2.400e+01, 1.000e+00,
        1.000e+00],
       [1.100e+01, 1.393e+03, 4.000e+00, ..., 3.500e+01, 2.000e+00,
        1.000e+00],
       [1.800e+01, 2.899e+03, 4.000e+00, ..., 4.300e+01, 1.000e+00,
        2.000e+00],
       ...,
       [4.800e+01, 2.751e+03, 4.000e+00, ..., 3.800e+01, 2.000e+00,
        2.000e+00],
       [1.200e+01, 2.171e+03, 4.000e+00, ..., 3.800e+01, 2.000e+00,
        1.000e+00],
       [1.200e+01, 1.108e+03, 4.000e+00, ..., 2.800e+01, 2.000e+00,
        1.000e+00]], dtype=float32)

In [22]:
class GermanModel(nn.Module):
  """Feed-forward classifier over embedded categorical and numeric inputs.

  Each categorical column is passed through its own embedding table. The
  embeddings are concatenated with the batch-normalised numeric features and
  fed through two hidden layers and a linear output layer.

  Args:
    embedding_sizes: Sequence of (n_categories, embedding_dim) pairs, one per
      categorical column, in the same column order as `x_cat`.
    n_cont: Number of continuous input features.
    hidden_sizes: Widths of the two hidden layers.
    n_classes: Number of output logits.
    emb_dropout: Dropout probability applied to the concatenated embeddings.
    dropout: Dropout probability applied after each hidden activation.
  """

  def __init__(self, embedding_sizes, n_cont, hidden_sizes=(200, 70), n_classes=2,
               emb_dropout=0.6, dropout=0.3):
    super().__init__()
    hidden1, hidden2 = hidden_sizes
    # Sub-modules are created in a fixed order so that seeded initialisation
    # and state_dict key order stay stable.
    self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories, size in embedding_sizes])
    n_emb = sum(e.embedding_dim for e in self.embeddings) # length of all embedding vectors
    self.n_emb, self.n_cont = n_emb, n_cont
    self.lin1 = nn.Linear(self.n_emb + self.n_cont, hidden1)
    self.lin2 = nn.Linear(hidden1, hidden2)
    self.lin3 = nn.Linear(hidden2, n_classes)
    self.bn1 = nn.BatchNorm1d(self.n_cont)
    self.bn2 = nn.BatchNorm1d(hidden1)
    self.bn3 = nn.BatchNorm1d(hidden2)
    self.emb_drop = nn.Dropout(emb_dropout)
    self.drops = nn.Dropout(dropout)

  def forward(self, x_cat, x_cont):
    """Return unnormalised class logits of shape (batch, n_classes).

    Args:
      x_cat: Integer tensor of shape (batch, n_categorical); column i is
        looked up in the i-th embedding table.
      x_cont: Float tensor of shape (batch, n_cont).
    """
    x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]
    x = torch.cat(x, 1)
    x = self.emb_drop(x)
    # Numeric features are only batch-normalised, not otherwise scaled.
    x2 = self.bn1(x_cont)
    x = torch.cat([x, x2], 1)
    x = F.relu(self.lin1(x))
    x = self.drops(x)
    x = self.bn2(x)
    x = F.relu(self.lin2(x))
    x = self.drops(x)
    x = self.bn3(x)
    # No softmax here: the training code uses F.cross_entropy, which expects logits.
    x = self.lin3(x)
    return x

In [23]:
from torchvision import models
from torchsummary import summary

# Take the number of continuous inputs from the numeric column list instead
# of hard-coding it, so the model stays in step with the feature definition.
model = GermanModel(embedding_sizes, len(Num_names))
model

GermanModel(
  (embeddings): ModuleList(
    (0): Embedding(4, 2)
    (1): Embedding(5, 3)
    (2): Embedding(10, 5)
    (3-4): 2 x Embedding(5, 3)
    (5): Embedding(4, 2)
    (6): Embedding(3, 2)
    (7): Embedding(4, 2)
    (8-9): 2 x Embedding(3, 2)
    (10): Embedding(4, 2)
    (11-12): 2 x Embedding(2, 1)
  )
  (lin1): Linear(in_features=37, out_features=200, bias=True)
  (lin2): Linear(in_features=200, out_features=70, bias=True)
  (lin3): Linear(in_features=70, out_features=2, bias=True)
  (bn1): BatchNorm1d(7, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(70, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.6, inplace=False)
  (drops): Dropout(p=0.3, inplace=False)
)

In [24]:
def get_optimizer(model, lr = 0.001, wd = 0.0):
  """Build an Adam optimizer over the trainable parameters of `model`.

  Args:
    model: Module whose parameters with `requires_grad` set are optimised.
    lr: Learning rate.
    wd: Weight-decay coefficient passed to Adam.

  Returns:
    The configured Adam optimizer.
  """
  trainable = [param for param in model.parameters() if param.requires_grad]
  return torch_optim.Adam(trainable, lr = lr, weight_decay = wd)

In [25]:
def train_model(model, optim, train_dl):
    """Run one training epoch and return the sample-weighted mean loss.

    Args:
        model: Network called as `model(x_cat, x_cont)`; put into train mode.
        optim: Optimizer stepped once per batch.
        train_dl: Iterable yielding `(x_cat, x_cont, y)` batches.

    Returns:
        The cross-entropy loss averaged over every sample seen in the epoch.
    """
    model.train()
    n_seen = 0
    loss_total = 0
    for cat_batch, cont_batch, targets in train_dl:
        optim.zero_grad()
        logits = model(cat_batch, cont_batch)
        batch_loss = F.cross_entropy(logits, targets)
        batch_loss.backward()
        optim.step()
        # Weight by batch size so a shorter final batch does not skew the mean.
        n_batch = targets.shape[0]
        n_seen += n_batch
        loss_total += n_batch * batch_loss.item()
    return loss_total / n_seen

In [26]:
def val_loss(model, valid_dl):
    """Evaluate `model` on a validation loader and print/return loss and accuracy.

    Args:
        model: Network called as `model(x_cat, x_cont)`; put into eval mode.
        valid_dl: Iterable yielding `(x_cat, x_cont, y)` batches.

    Returns:
        Tuple `(mean_loss, accuracy)`, both averaged over all samples.
    """
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    # Evaluation needs no gradients; disabling autograd avoids building a
    # graph for every batch.
    with torch.no_grad():
        for x1, x2, y in valid_dl:
            current_batch_size = y.shape[0]
            out = model(x1, x2)
            loss = F.cross_entropy(out, y)
            # Weight by batch size so a shorter final batch does not skew the mean.
            sum_loss += current_batch_size*(loss.item())
            total += current_batch_size
            pred = out.argmax(dim=1)
            correct += (pred == y).float().sum().item()
    print("valid loss %.3f and accuracy %.3f" % (sum_loss/total, correct/total))
    return sum_loss/total, correct/total

In [27]:
def train_loop(model, epochs, lr=0.01, wd=0.0, train_loader=None, valid_loader=None):
    """Train `model` for `epochs` epochs, printing metrics after each one.

    Args:
        model: Network to optimise in place.
        epochs: Number of passes over the training loader.
        lr: Learning rate handed to `get_optimizer`.
        wd: Weight decay handed to `get_optimizer`.
        train_loader: Iterable of training batches. Defaults to the
            notebook-level `train_dl` when omitted.
        valid_loader: Iterable of validation batches. Defaults to the
            notebook-level `valid_dl` when omitted.
    """
    # Use the notebook globals only as a fallback, so the loop can also be
    # run on explicitly supplied loaders.
    if train_loader is None:
        train_loader = train_dl
    if valid_loader is None:
        valid_loader = valid_dl
    optim = get_optimizer(model, lr = lr, wd = wd)
    for _ in range(epochs):
        loss = train_model(model, optim, train_loader)
        print("training loss: ", loss)
        val_loss(model, valid_loader)

In [28]:
batch_size = 20
# Shuffle the training data only. Validation loss and accuracy are averaged
# over all samples, so batch order there is irrelevant and is kept fixed.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False)

In [39]:
train_loop(model, epochs=50, lr=0.02, wd=0.00001)

training loss:  0.5095998637378216
valid loss 0.467 and accuracy 0.770
training loss:  0.504590280354023
valid loss 0.459 and accuracy 0.785
training loss:  0.5005371019244194
valid loss 0.470 and accuracy 0.820
training loss:  0.4964611895382404
valid loss 0.472 and accuracy 0.775
training loss:  0.5186251930892467
valid loss 0.475 and accuracy 0.825
training loss:  0.5117400601506233
valid loss 0.467 and accuracy 0.815
training loss:  0.4923439107835293
valid loss 0.503 and accuracy 0.735
training loss:  0.5002353370189667
valid loss 0.473 and accuracy 0.795
training loss:  0.5381275959312916
valid loss 0.493 and accuracy 0.750
training loss:  0.49375179037451744
valid loss 0.490 and accuracy 0.780
training loss:  0.5051627472043038
valid loss 0.513 and accuracy 0.730
training loss:  0.5108834229409694
valid loss 0.505 and accuracy 0.730
training loss:  0.512208903580904
valid loss 0.470 and accuracy 0.825
training loss:  0.4914290800690651
valid loss 0.488 and accuracy 0.775
trainin