In [None]:
# URL of the Statlog German credit data file on the UCI repository; it is read
# below as a header-less, whitespace-separated table.
data_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.optim import Adam, RMSprop
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn

In [None]:
# The file has no header row and separates fields with runs of whitespace.
# The separator is a regex, so it is written as a raw string: '\s' inside a
# normal string literal is an invalid escape sequence and triggers a warning
# on recent Python versions.
data = pd.read_csv(data_path, header=None, sep=r'\s+')
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [None]:
# Boolean mask over the first 20 columns: True where the column dtype is object.
cat_columns = np.array([column_dtype == 'O' for column_dtype in data.dtypes.iloc[:20]])

In [None]:
# Columns 0-19 are the explanatory variables; column 20 is the target.
data_exog, data_target = data.iloc[:, :20], data.iloc[:, 20]

In [None]:
# Split the feature frame with the boolean dtype mask: object columns go to
# data_cat, every other column goes to data_num.
data_cat, data_num = data_exog.iloc[:, cat_columns], data_exog.iloc[:, ~cat_columns]

In [None]:
# Column 20 of the raw frame; this is the same selection that data_target holds.
target = data.iloc[:,20]

In [None]:
# (rows, columns) of the non-object feature frame.
data_num.shape

(1000, 7)

In [None]:
# Small demonstration of the encoding idea used below: known keys are mapped to
# their code and anything missing from the mapping ('z' here) falls back to a
# default value. The mapping gets its own name so the builtin `dict` is not
# shadowed for the rest of the notebook.
demo_mapping = {'x': 0, 'y': 1}
pd.Series(['x', 'y', 'z']).map(demo_mapping).fillna(9)

0    0.0
1    1.0
2    9.0
dtype: float64

In [None]:
# Number of distinct values per categorical column, plus one extra slot
# (index 0 is used for the 'UNK' code when the columns are encoded).
dict_cat_n = {}
for column_name in data_cat.columns:
  dict_cat_n[column_name] = data_cat[column_name].nunique() + 1
dict_cat_n

{0: 5,
 2: 6,
 3: 11,
 5: 6,
 6: 6,
 8: 5,
 9: 4,
 11: 5,
 13: 4,
 14: 4,
 16: 5,
 18: 3,
 19: 3}

In [None]:
# Shape the categorical frame would have after one-hot encoding.
pd.get_dummies(data_cat).shape

(1000, 54)

In [None]:
# Columns that get an embedding, and the embedding width for each one:
# half the number of categories plus one, capped at 5.
emb_col = dict_cat_n.keys()
emb_size = {}
for column_name, n_categories in dict_cat_n.items():
  emb_size[column_name] = min(5, n_categories // 2 + 1)
emb_size

{0: 3,
 2: 4,
 3: 5,
 5: 4,
 6: 4,
 8: 3,
 9: 3,
 11: 3,
 13: 3,
 14: 3,
 16: 3,
 18: 2,
 19: 2}

In [None]:
def create_dict(pd_series):
  """Map each distinct value of ``pd_series`` to an integer code.

  The distinct values are sorted and numbered from 1; the key 'UNK' is
  assigned 0. The returned dict is ordered by ascending code.
  """
  codes = {}
  for position, category in enumerate(sorted(pd_series.unique()), start=1):
    codes[category] = position
  codes['UNK'] = 0
  pairs_by_code = sorted(codes.items(), key=lambda pair: pair[1])
  return {category: code for category, code in pairs_by_code}

In [None]:
# One value -> code mapping per categorical column, keyed by column label.
dict_cat = {}
for column_name in data_cat.columns:
  dict_cat[column_name] = create_dict(data_cat[column_name])
dict_cat

{0: {'UNK': 0, 'A11': 1, 'A12': 2, 'A13': 3, 'A14': 4},
 2: {'UNK': 0, 'A30': 1, 'A31': 2, 'A32': 3, 'A33': 4, 'A34': 5},
 3: {'UNK': 0,
  'A40': 1,
  'A41': 2,
  'A410': 3,
  'A42': 4,
  'A43': 5,
  'A44': 6,
  'A45': 7,
  'A46': 8,
  'A48': 9,
  'A49': 10},
 5: {'UNK': 0, 'A61': 1, 'A62': 2, 'A63': 3, 'A64': 4, 'A65': 5},
 6: {'UNK': 0, 'A71': 1, 'A72': 2, 'A73': 3, 'A74': 4, 'A75': 5},
 8: {'UNK': 0, 'A91': 1, 'A92': 2, 'A93': 3, 'A94': 4},
 9: {'UNK': 0, 'A101': 1, 'A102': 2, 'A103': 3},
 11: {'UNK': 0, 'A121': 1, 'A122': 2, 'A123': 3, 'A124': 4},
 13: {'UNK': 0, 'A141': 1, 'A142': 2, 'A143': 3},
 14: {'UNK': 0, 'A151': 1, 'A152': 2, 'A153': 3},
 16: {'UNK': 0, 'A171': 1, 'A172': 2, 'A173': 3, 'A174': 4},
 18: {'UNK': 0, 'A191': 1, 'A192': 2},
 19: {'UNK': 0, 'A201': 1, 'A202': 2}}

In [None]:
# Seeded random permutation of the row positions 0..len(data)-1, so that the
# train/validation split derived from it is the same on every fresh run.
shuffled_index_ = np.random.default_rng(42).permutation(len(data))

In [None]:
# First 800 shuffled positions are used for training, the remainder for validation.
train_index, val_index = np.split(shuffled_index_, [800])

In [None]:
# Features are columns 0-19 and the target is column 20, selected by position
# for the training rows and the validation rows respectively.
x_train = data.iloc[train_index, :20]
x_test = data.iloc[val_index, :20]
y_train = data.iloc[train_index, 20]
y_test = data.iloc[val_index, 20]

In [None]:
def _encode_categoricals(frame, columns, mappings):
  """Return a copy of ``frame`` whose ``columns`` hold integer codes.

  Args:
    frame: DataFrame containing the columns to encode.
    columns: iterable of column labels to encode.
    mappings: dict of column label -> {value: code}.

  Values that are absent from a column's mapping are encoded as 0, which is
  the code the mappings in ``dict_cat`` reserve for 'UNK'.
  """
  encoded = frame.copy()
  for col in columns:
    # Whole-column assignment replaces the column, so it ends up as int64
    # instead of keeping the object dtype of the original string column.
    encoded[col] = frame[col].map(mappings[col]).fillna(0).astype(np.int64)
  return encoded


x_train_encoded = _encode_categoricals(x_train, emb_col, dict_cat)
x_test_encoded = _encode_categoricals(x_test, emb_col, dict_cat)

In [None]:
# Display the encoded training frame (pandas elides the middle rows).
x_train_encoded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
825,1,21,5,1,1602,1,5,4,4,1,3,3,30,3,2,2,3,1,2,1
514,4,24,3,1,2255,5,2,4,3,1,1,2,54,3,2,1,3,1,1,1
904,4,24,3,5,1278,1,5,4,3,1,1,1,36,3,2,1,4,1,2,1
854,4,36,4,1,10875,1,5,2,3,1,2,3,45,3,2,2,3,2,2,1
652,1,24,3,1,2303,1,5,4,3,2,1,1,45,3,2,1,3,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,4,24,3,5,3181,1,2,4,2,1,4,2,26,3,2,1,3,1,2,1
385,4,18,5,5,2238,1,3,2,2,1,1,3,25,3,2,2,3,1,1,1
486,4,12,3,5,3077,1,3,2,3,1,4,3,52,3,2,1,3,1,2,1
954,1,12,3,1,1893,1,3,4,2,3,4,2,29,3,2,1,3,1,2,1


In [None]:
class _get_data():
  def __init__(self, X, y, emb_col):
    X = X.copy()
    self.X_cat = X.loc[:,emb_col].copy().values.astype(np.int64)
    self.X_num = X.drop(columns = emb_col).copy().values.astype(np.float64)
    self.y = y.values

  def __len__(self):
    return len(self.y)

  def __getitem__(self, idx):
    return self.X_cat[idx], self.X_num[idx], self.y[idx]

In [None]:
# Mini-batches of 32 samples; only the training loader reshuffles its samples.
train_loader = DataLoader(
  dataset=_get_data(x_train_encoded, y_train, emb_col),
  batch_size=32,
  shuffle=True,
)
test_loader = DataLoader(
  dataset=_get_data(x_test_encoded, y_test, emb_col),
  batch_size=32,
  shuffle=False,
)

In [None]:
# Show the DataLoader object's repr; no batch is drawn here.
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7f825c1f9a60>

In [None]:
# Print the first batch produced by train_loader, then stop iterating.
# Each batch is a list of three tensors: categorical codes, numeric values, targets.
for xx in train_loader:
  print(xx)
  break



[tensor([[ 4,  5, 10,  5,  5,  3,  1,  1,  3,  2,  3,  1,  1],
        [ 1,  3,  5,  1,  3,  2,  1,  1,  3,  2,  3,  1,  1],
        [ 1,  3,  2,  1,  4,  3,  1,  4,  3,  3,  4,  2,  1],
        [ 2,  2, 10,  1,  5,  2,  1,  4,  3,  1,  3,  1,  1],
        [ 1,  2,  4,  3,  2,  3,  1,  4,  3,  3,  3,  2,  1],
        [ 4,  3,  1,  2,  3,  3,  1,  1,  3,  1,  2,  1,  2],
        [ 1,  2,  5,  1,  3,  2,  1,  1,  1,  2,  2,  1,  1],
        [ 4,  3,  1,  1,  5,  3,  1,  2,  3,  2,  4,  2,  1],
        [ 2,  1, 10,  2,  3,  2,  1,  3,  3,  1,  2,  1,  1],
        [ 4,  3,  5,  1,  1,  2,  2,  1,  3,  1,  3,  1,  1],
        [ 3,  3,  5,  1,  4,  3,  1,  3,  1,  2,  3,  1,  1],
        [ 1,  3,  5,  1,  4,  3,  1,  1,  3,  2,  3,  1,  1],
        [ 4,  5,  2,  5,  4,  3,  1,  3,  3,  2,  3,  2,  1],
        [ 2,  3,  5,  4,  4,  3,  2,  4,  1,  2,  3,  2,  1],
        [ 1,  3,  1,  1,  2,  2,  1,  3,  3,  1,  3,  2,  1],
        [ 1,  5,  7,  2,  5,  3,  3,  1,  3,  2,  3,  1,  2],
       

In [None]:
# For every categorical column: (number of embedding rows, embedding width).
cat_embed_size = {
  column_name: (dict_cat_n[column_name], emb_size[column_name])
  for column_name in dict_cat
}

cat_embed_size

{0: (5, 3),
 2: (6, 4),
 3: (11, 5),
 5: (6, 4),
 6: (6, 4),
 8: (5, 3),
 9: (4, 3),
 11: (5, 3),
 13: (4, 3),
 14: (4, 3),
 16: (5, 3),
 18: (3, 2),
 19: (3, 2)}

In [None]:
class MLP_Embedding(nn.Module):
  """MLP over concatenated categorical embeddings and numeric features.

  Args:
    cat_embed_size: dict of column key -> (num_categories, embedding_dim).
      One ``nn.Embedding`` is created per entry, in the dict's iteration
      order, so column ``i`` of ``x_cat`` must correspond to the i-th entry.
    n_num: number of numeric feature columns concatenated to the embeddings.
  """

  def __init__(self, cat_embed_size, n_num):
    super().__init__()
    # Stored on self as a ModuleList so the embedding weights are registered
    # as parameters and forward() can iterate over them.
    self.embedding = nn.ModuleList(
      [nn.Embedding(cat, size) for cat, size in cat_embed_size.values()]
    )
    n_emb = sum(e.embedding_dim for e in self.embedding)
    self.n_emb, self.n_num = n_emb, n_num
    self.lin1 = nn.Linear(self.n_emb + self.n_num, 256)
    self.bn1 = nn.BatchNorm1d(256)
    self.lin2 = nn.Linear(256, 128)
    self.out = nn.Linear(128,2)

  def forward(self, x_cat, x_num):
    """Run the network on one batch.

    Args:
      x_cat: integer tensor indexed as ``x_cat[:, i]``; column ``i`` is looked
        up in ``self.embedding[i]``.
      x_num: numeric tensor with ``n_num`` columns and the same batch size.

    Returns:
      Output of the final linear layer, with 2 values per sample.
    """
    embedded = [e(x_cat[:,i]) for i, e in enumerate(self.embedding)]
    x = torch.cat(embedded, dim=1)
    # Numeric inputs may arrive as float64 (e.g. built from a NumPy float64
    # array); cast them to the embedding dtype so the concatenated tensor
    # matches the dtype of the Linear weights.
    x = torch.cat([x, x_num.to(x.dtype)], dim=1)
    x = F.relu(self.lin1(x))
    x = self.bn1(x)
    x = F.relu(self.lin2(x))
    x = self.out(x)
    return x

In [None]:
# Both arguments are passed by keyword: a positional argument is not allowed
# after a keyword argument. n_num is taken from the numeric feature frame so it
# matches the number of numeric columns the model receives.
model = MLP_Embedding(cat_embed_size=cat_embed_size, n_num=data_num.shape[1])

SyntaxError: ignored