In [1]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [2]:
# Training split: the `Id` column becomes the row index.
train_df = pd.read_csv(filepath_or_buffer=r'./data/train.csv', index_col='Id')
# Test split: read as-is, so `Id` stays an ordinary column.
test_df = pd.read_csv(filepath_or_buffer=r'./data/test.csv')
# The sample submission (./data/sample_submission.csv) is intentionally not loaded.

In [3]:
# Discard every column that has at least one missing value, then store the
# numeric MSSubClass codes as `object` so they are picked up as a categorical
# feature by the dtype-based selection that follows.
train_df = (
    train_df
    .dropna(axis='columns')
    .assign(MSSubClass=lambda frame: frame['MSSubClass'].astype('object'))
)

# Separate the regression target from the feature columns.
train_y = train_df['SalePrice']
train_df = train_df.drop(columns='SalePrice')

In [4]:
# Partition the remaining columns by dtype: integer columns are treated as
# continuous inputs, object columns as categorical inputs.
continuous_features = train_df.select_dtypes(include=['int']).columns.tolist()
categorical_features = train_df.select_dtypes(include=['object']).columns.tolist()

In [5]:
# One dtype per column: int32 for the continuous features, pandas `category`
# for the categorical ones (which gives each of them integer codes via `.cat`).
dtype_by_column = {name: 'int32' for name in continuous_features}
dtype_by_column.update({name: 'category' for name in categorical_features})
train_df = train_df.astype(dtype_by_column)

In [6]:
# Build (rows, features) arrays: raw values for the continuous columns and the
# integer category codes for the categorical columns.
continuous_columns = [train_df[name].to_numpy() for name in continuous_features]
category_codes = [train_df[name].cat.codes.to_numpy() for name in categorical_features]

continuous = np.stack(continuous_columns, axis=1)
categorical = np.stack(category_codes, axis=1)

In [7]:
# Convert the feature arrays to tensors: float32 for the continuous inputs,
# int32 indices for the embedding lookups.
categorical = torch.tensor(categorical, dtype=torch.int32)
continuous = torch.tensor(continuous, dtype=torch.float32)
# Target as a float32 column vector so it lines up with the model's (rows, 1) output.
y = torch.tensor(train_y.to_numpy(), dtype=torch.float32).unsqueeze(1)

In [8]:
# Number of distinct categories in each categorical column.
categorical_sizes = [
    len(train_df[name].cat.categories) for name in categorical_features
]
# Pair each cardinality with its embedding width: half the cardinality,
# rounded up, and never more than 50.
embedding_sizes = list(zip(
    categorical_sizes,
    (min(50, (n_levels + 1) // 2) for n_levels in categorical_sizes),
))

In [9]:
class TabularModel(nn.Module):
    """Feed-forward network for mixed categorical / continuous tabular input.

    Every categorical column is looked up in its own embedding table. The
    embeddings are concatenated and passed through dropout, the continuous
    columns are batch-normalised, and the two parts are joined before a stack
    of ``Linear -> ReLU -> BatchNorm1d -> Dropout`` blocks followed by a final
    ``Linear`` output layer.

    Args:
        emb_szs: One ``(num_categories, embedding_dim)`` pair per categorical
            column, in the same order as the columns of ``x_cat``.
        n_cont: Number of continuous input columns.
        out_sz: Width of the output layer.
        layers: Hidden layer widths, in order. May be empty, in which case the
            network is a single ``Linear`` layer on the concatenated inputs.
        p: Dropout probability used after the embeddings and after every
            hidden block.
    """

    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # Input width of the first dense layer: all embedding widths plus the
        # continuous columns.
        n_emb = sum(nf for _, nf in emb_szs)
        n_in = n_emb + n_cont

        layerlist = []
        for width in layers:
            layerlist.append(nn.Linear(n_in, width))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(width))
            layerlist.append(nn.Dropout(p))
            n_in = width
        # Size the output layer from the running `n_in` rather than
        # `layers[-1]`, so an empty `layers` sequence is valid instead of
        # raising IndexError.
        layerlist.append(nn.Linear(n_in, out_sz))

        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):
        """Run a forward pass.

        Args:
            x_cat: Integer tensor whose column ``i`` holds the category
                indices for the ``i``-th embedding table.
            x_cont: Float tensor with ``n_cont`` columns.

        Returns:
            The output of the final linear layer (``out_sz`` columns).
        """
        embedded = [emb(x_cat[:, idx]) for idx, emb in enumerate(self.embeds)]
        x = self.emb_drop(torch.cat(embedded, 1))

        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        return self.layers(x)

In [10]:
# Fix the RNG so the randomly initialised weights are reproducible.
torch.manual_seed(33)
# Two hidden layers (256 then 128 units), one regression output, 40% dropout.
model = TabularModel(
    emb_szs=embedding_sizes,
    n_cont=continuous.shape[1],
    out_sz=1,
    layers=[256, 128],
    p=0.4,
)

In [11]:
# Display the module summary to check the embedding tables and layer sizes.
model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(15, 8)
    (1): Embedding(5, 3)
    (2): Embedding(2, 1)
    (3): Embedding(4, 2)
    (4): Embedding(4, 2)
    (5): Embedding(2, 1)
    (6): Embedding(5, 3)
    (7): Embedding(3, 2)
    (8): Embedding(25, 13)
    (9): Embedding(9, 5)
    (10): Embedding(8, 4)
    (11): Embedding(5, 3)
    (12): Embedding(8, 4)
    (13): Embedding(6, 3)
    (14): Embedding(8, 4)
    (15): Embedding(15, 8)
    (16): Embedding(16, 8)
    (17): Embedding(4, 2)
    (18): Embedding(5, 3)
    (19): Embedding(6, 3)
    (20): Embedding(6, 3)
    (21): Embedding(5, 3)
    (22): Embedding(2, 1)
    (23): Embedding(4, 2)
    (24): Embedding(7, 4)
    (25): Embedding(3, 2)
    (26): Embedding(9, 5)
    (27): Embedding(6, 3)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=137, out_features=256, bias=True)
    (1): ReL

In [12]:
# Mean squared error; the training loop takes its square root to get RMSE.
criterion = nn.MSELoss(reduction='mean')
# Adam over every trainable parameter of the model.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.005)

In [13]:
# Work with the first `batch_size` rows only and hold back the last 20% of
# them for evaluation.
batch_size = 200
test_size = int(batch_size * .2)

train_rows = slice(None, batch_size - test_size)
test_rows = slice(batch_size - test_size, batch_size)

cat_train, cat_test = categorical[train_rows], categorical[test_rows]
con_train, con_test = continuous[train_rows], continuous[test_rows]
y_train, y_test = y[train_rows], y[test_rows]

In [14]:
# Full-batch training on the training slice, minimising RMSE.
# (`time` is imported in the notebook's import cell.)
start_time = time.perf_counter()

epochs = 30000
log_every = 25  # progress is printed on epochs 1, 26, 51, ...
losses = []

# Dropout and batch-norm behave differently in eval mode. Switch to training
# mode explicitly so re-running this cell after an evaluation step still
# trains with both enabled.
model.train()

# NOTE(review): the logged loss falls very slowly from ~189k; the target is the
# unscaled SalePrice. Consider training on a scaled or log-transformed target.
for i in range(1, epochs + 1):
    y_pred = model(cat_train, con_train)
    loss = torch.sqrt(criterion(y_pred, y_train))  # RMSE from the MSE criterion
    losses.append(loss.item())  # plain float, detached from the autograd graph

    if i % log_every == 1:
        print(f'epoch: {i:3}  loss: {loss.item():10.8f}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'epoch: {i:3}  loss: {loss.item():10.8f}')
print(f'\nDuration: {time.perf_counter() - start_time:.0f} seconds')

epoch:   1  loss: 189345.03125000
epoch:  26  loss: 189342.70312500
epoch:  51  loss: 189340.92187500
epoch:  76  loss: 189338.23437500
epoch: 101  loss: 189335.92187500
epoch: 126  loss: 189332.57812500
epoch: 151  loss: 189329.25000000
epoch: 176  loss: 189324.75000000
epoch: 201  loss: 189320.64062500
epoch: 226  loss: 189314.95312500
epoch: 251  loss: 189309.76562500
epoch: 276  loss: 189301.89062500
epoch: 301  loss: 189296.25000000
epoch: 326  loss: 189288.59375000
epoch: 351  loss: 189282.31250000
epoch: 376  loss: 189272.65625000
epoch: 401  loss: 189266.93750000
epoch: 426  loss: 189256.26562500
epoch: 451  loss: 189244.64062500
epoch: 476  loss: 189235.32812500
epoch: 501  loss: 189223.56250000
epoch: 526  loss: 189215.31250000
epoch: 551  loss: 189199.89062500
epoch: 576  loss: 189193.43750000
epoch: 601  loss: 189177.96875000
epoch: 626  loss: 189165.85937500
epoch: 651  loss: 189156.20312500
epoch: 676  loss: 189138.25000000
epoch: 701  loss: 189126.28125000
epoch: 726  lo

In [None]:
# Plot the recorded training loss per epoch. The x-axis is sized from the
# number of recorded losses rather than `epochs`, so the cell also works when
# training was interrupted before all epochs ran.
fig, ax = plt.subplots()
ax.plot(range(len(losses)), losses)
ax.set_ylabel('RMSE Loss')
ax.set_xlabel('epoch');