In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [3]:
data = pd.read_csv("../data/clean/clean_bank_full.csv")
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,subscribed,has_been_contacted
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,1
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,1
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,1
3,40,admin,married,basic.6y,no,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,1
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,1


In [27]:
x_emb_cols = ["job", "marital", "education", "default", "housing", "loan", "month", "day_of_week", "poutcome"]
x_bin_col = ["contact", "has_been_contacted"]
y_col = "subscribed"
x_numerical = ["age", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx",
               "euribor3m", "nr.employed"]
cats = x_emb_cols + x_bin_col
all_cols = x_emb_cols + x_bin_col + x_numerical

In [28]:
data[cats] = data[cats].astype("category")

In [29]:
from bank_telemarketing.preprocessing.preprocess import MultiLabelEncoder, CustomScaler
from sklearn.pipeline import Pipeline

x_pipe = Pipeline(
    steps=
    [
        ("label_encoder", MultiLabelEncoder(cols=cats)),
        ("scaler", CustomScaler(cols=x_numerical))
    ]
)
y_mapping = {"no": 0, "yes": 1}

In [30]:
from sklearn.model_selection import train_test_split

y = data.subscribed
X = data.drop(["subscribed", "duration"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=56)

In [31]:
# Reindexing table
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

y_train = y_train.replace(y_mapping)
X_train = x_pipe.fit_transform(X_train)
X_train

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,has_been_contacted
0,-0.967873,0,2,3,0,2,0,1,3,0,-0.569289,0.195492,-0.349599,1,0.838905,0.591943,-0.477034,0.770974,0.847611,1
1,-0.007520,0,2,6,0,0,2,0,1,2,0.864498,0.195492,-0.349599,1,0.838905,-0.226505,0.946185,0.772704,0.847611,1
2,-0.007520,1,2,2,0,0,0,1,6,3,0.506051,0.195492,-0.349599,1,0.647877,0.723171,0.881494,0.711598,0.333001,1
3,0.184551,5,1,2,1,0,0,1,4,1,-0.210842,0.195492,-0.349599,1,0.838905,1.536440,-0.282959,0.716786,0.847611,1
4,0.568692,10,1,5,0,0,0,0,3,3,-0.569289,0.195492,-0.349599,1,0.838905,0.591943,-0.477034,0.772127,0.847611,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28818,0.760762,9,0,5,0,2,0,0,3,3,0.506051,0.195492,-0.349599,1,0.838905,0.591943,-0.477034,0.772704,0.847611,1
28819,-1.063909,7,2,3,0,2,0,0,7,4,-0.569289,0.195492,1.690588,0,-0.116235,-0.647817,-0.326086,0.287318,0.399582,1
28820,1.433010,5,1,5,0,2,0,1,6,2,-0.569289,0.195492,-0.349599,1,0.647877,0.723171,0.881494,0.713904,0.333001,1
28821,-1.255979,1,1,2,0,2,0,0,6,3,-0.569289,-5.088711,1.690588,2,-1.198728,-1.177909,-1.231771,-1.343510,-0.941732,0


In [32]:
from bank_telemarketing.preprocessing.embedding import CategoricalEmbeddingSizes

# Get embedding size
emb = CategoricalEmbeddingSizes()
emb_size = emb.get_cat_emb_dims(data=X_train, cat_cols=cats)
emb_size

([(13, 7), (5, 3), (9, 5), (4, 2), (4, 2), (4, 2), (11, 6), (6, 3), (4, 2)],
 ['job',
  'marital',
  'education',
  'default',
  'housing',
  'loan',
  'month',
  'day_of_week',
  'poutcome'])

In [37]:
emb_sizes = emb_size[0]
emb_cols = emb_size[1]

In [34]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import numpy as np
import torch

class CustomDataset(Dataset):
    def __init__(self, emb_cols, x, y):
        super(CustomDataset, self).__init__()
        self.cat = emb_cols
        _emb = x.loc[:, emb_cols]
        self.emb_data = np.stack(
            [c.values for _, c in _emb.items()], axis=1
        ).astype(np.int64)

        # Continuous data
        other_data = x.drop(emb_cols, axis=1)
        self.cont_data = np.stack(
            [c.values for _, c in other_data.items()], axis=1
        ).astype(np.float32)
        self.y = y.values.astype(np.int32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, item):
        x_cont = self.cont_data[item]
        x_emb = self.emb_data[item]
        y = np.asarray(self.y[item])

        out = {
            "x_cont": torch.from_numpy(x_cont),
            "x_emb": torch.from_numpy(x_emb),
            "y": torch.tensor(y, dtype=torch.long)
        }
        return out

In [35]:
import torch.nn as nn

class Classifier(nn.Module):
    def __init__(self, hidden, output_size, continuous_size, embedding_sizes, dropout):
        super(Classifier, self).__init__()

        self.emb_dims = embedding_sizes
        self.cont_dims = continuous_size
        self.dropout = nn.Dropout(dropout)
        self.bn = nn.BatchNorm1d(self.cont_dims)

        # Embedding layers for categorical columns
        self.embedding_layers = nn.ModuleList(
            [nn.Embedding(cat, size) for cat, size in self.emb_dims]
        )
        n_emb = sum(e.embedding_dim for e in self.embedding_layers)

        # Linear layers
        self.linear = nn.Sequential(
            nn.Linear(in_features=n_emb + continuous_size, out_features=hidden),
            nn.ReLU(),
            nn.BatchNorm1d(hidden),
            nn.Linear(in_features=hidden, out_features=int(hidden/2)),
            nn.ReLU(),
            nn.BatchNorm1d(int(hidden/2)),
            nn.Linear(in_features=int(hidden/2), out_features=output_size)
        )

        # Initialize the layers weight
        self.embedding_layers.apply(self.init_layers)
        self.linear.apply(self.init_layers)


    @staticmethod
    def init_layers(m):
        if type(m) == nn.Linear or type(m) == nn.Embedding:
            nn.init.kaiming_normal_(m.weight)

    def forward(self, x_cont, x_cat):
        embeddings = [
            self.dropout(f(x_cat[:, i])) for i, f in enumerate(self.embedding_layers)
        ]
        embeddings = torch.cat(embeddings, 1)

        x_cont = self.bn(x_cont)
        x = torch.cat((embeddings, x_cont), 1)
        x = self.linear(x)
        return x

In [36]:
from torch.utils.data import RandomSampler

train_dataset = CustomDataset(
    emb_cols=emb_cols,
    x=X_train,
    y=y_train
)
train_dataloader = DataLoader(
    dataset=train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=5
)

In [54]:
model = Classifier(
    hidden=32,
    output_size=1,
    continuous_size=11,
    embedding_sizes=emb_size[0],
    dropout=0.5
)
model.train()

Classifier(
  (dropout): Dropout(p=0.5, inplace=False)
  (bn): BatchNorm1d(11, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (embedding_layers): ModuleList(
    (0): Embedding(13, 7)
    (1): Embedding(5, 3)
    (2): Embedding(9, 5)
    (3): Embedding(4, 2)
    (4): Embedding(4, 2)
    (5): Embedding(4, 2)
    (6): Embedding(11, 6)
    (7): Embedding(6, 3)
    (8): Embedding(4, 2)
  )
  (linear): Sequential(
    (0): Linear(in_features=43, out_features=32, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Linear(in_features=32, out_features=16, bias=True)
    (4): ReLU()
    (5): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Linear(in_features=16, out_features=1, bias=True)
  )
)

In [55]:
from torch.optim import lr_scheduler
from torch import optim
from torch.nn import BCEWithLogitsLoss

optimizer = optim.Adam(model.parameters(), lr=0.05)
pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
criterion = BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight))

scheduler = lr_scheduler.ReduceLROnPlateau(
        optimizer,
        "min",
        patience=10,
    )

In [56]:
from bank_telemarketing.train.train_model import train

performance = train(
    model=model,
    device="cpu",
    dataloader=train_dataloader,
    pos_weight=pos_weight,
    criterion=criterion,
    optimizer=optimizer,
    cut_point=0.5
)
print(performance)

{'loss': 1.144256016923517, 'f1': 0.3214959145191703, 'precision': 0.21507410911384423}
