In [76]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, ConcatDataset
import optuna

In [77]:
def define_model(trial, in_features=1):
    """Build a feed-forward regression network whose shape is sampled by Optuna.

    The trial chooses the number of hidden layers (1-3) and, for each hidden
    layer, its width (4-128 units) and dropout probability (0.2-0.5). Every
    hidden layer is ``Linear -> ReLU -> Dropout``; the head is a single linear
    unit that emits one unbounded value per sample.

    Args:
        trial: Object exposing ``suggest_int(name, low, high)`` and
            ``suggest_float(name, low, high)`` (an ``optuna`` trial).
        in_features: Width of the input feature vector. Defaults to 1, the
            value that was previously hard-coded.

    Returns:
        nn.Sequential: The assembled network mapping ``(batch, in_features)``
        to ``(batch, 1)``.
    """
    n_layers = trial.suggest_int("n_layers", 1, 3)
    layers = []

    for i in range(n_layers):
        out_features = trial.suggest_int('n_units_l{}'.format(i), 4, 128)
        layers.append(nn.Linear(in_features, out_features))
        layers.append(nn.ReLU())
        # The parameter name keeps its existing spelling so that trials
        # recorded under this name remain comparable.
        p = trial.suggest_float('droppout_l{}'.format(i), 0.2, 0.5)
        layers.append(nn.Dropout(p))

        in_features = out_features

    # No LogSoftmax after the head: log-softmax over a single output unit is
    # identically 0, which would make the prediction constant and block all
    # gradients from reaching the weights.
    layers.append(nn.Linear(in_features, 1))

    return nn.Sequential(*layers)

In [78]:
def _encode_split(frame, continuous_features, categorical_features, embedders):
    """Convert one split of the feature frame into a single float32 tensor.

    Continuous columns are copied as-is; each categorical column is mapped to
    its integer category code and passed through the matching embedding table.
    """
    continuous = torch.tensor(
        frame[continuous_features].to_numpy(dtype=np.float32), dtype=torch.float32
    )
    if not categorical_features:
        return continuous

    codes = np.stack(
        [frame[col].cat.codes.to_numpy() for col in categorical_features], 1
    )
    codes = torch.tensor(codes, dtype=torch.long)

    # The embedding tables are not trained here, so the lookup is done without
    # autograd; otherwise every dataset sample would drag a graph along with it.
    with torch.no_grad():
        embedded = torch.cat(
            [embed(codes[:, i]) for i, embed in enumerate(embedders)], 1
        )
    return torch.cat((continuous, embedded), 1)


def get_data(csv_path=r'./data/train.csv', test_size=0.20, random_state=42, batch_size=20):
    """Load the CSV, split it, and return train/test DataLoaders.

    Steps:
      1. Read ``csv_path`` (indexed by ``Id``), drop every column containing a
         missing value, and treat ``MSSubClass`` as categorical.
      2. Separate the ``SalePrice`` target from the features.
      3. Fix the category levels on the full feature frame so that train and
         test share the same integer codes and the same embedding widths.
      4. Split into train/test, encode both splits with the *same* embedding
         tables, and wrap ``(features, target)`` pairs in DataLoaders.

    The embedding tables are randomly initialised and never trained in this
    function, so the categorical part of the features differs between calls
    unless the torch RNG is seeded beforehand.

    Args:
        csv_path: Location of the CSV file.
        test_size: Fraction of rows held out for the test split.
        random_state: Seed forwarded to ``train_test_split``.
        batch_size: Batch size used by both DataLoaders.

    Returns:
        tuple: ``(train_dataloader, test_dataloader)``. Each batch is a pair
        ``(X, y)`` with ``X`` of shape ``(batch, n_features)`` and ``y`` of
        shape ``(batch, 1)``, both float32. The training loader shuffles; the
        test loader does not.
    """
    # Basic Pre-Processing
    df = pd.read_csv(csv_path, index_col='Id')
    df = df.dropna(axis=1)
    df['MSSubClass'] = df['MSSubClass'].astype('object')
    target = df['SalePrice']
    features = df.drop('SalePrice', axis=1)

    continuous_features = list(features.select_dtypes(include=['int']).columns)
    categorical_features = list(features.select_dtypes(include=['object']).columns)

    # Converting before the split gives both splits identical category levels
    # (hence identical codes) and avoids assigning into slices of the frame.
    features = features.astype({col: 'category' for col in categorical_features})

    category_sizes = [len(features[col].cat.categories) for col in categorical_features]
    embedding_sizes = [(size, min(50, (size + 1) // 2)) for size in category_sizes]
    # One shared set of tables so a given category maps to the same vector in
    # both the training and the test features.
    embedders = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_sizes])

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # Prepare Training DataLoader
    X_train_tensor = _encode_split(X_train, continuous_features, categorical_features, embedders)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Prepare Test DataLoader
    X_test_tensor = _encode_split(X_test, continuous_features, categorical_features, embedders)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).reshape(-1, 1)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
    # Evaluation does not benefit from shuffling; keep the order stable.
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_dataloader, test_dataloader

In [79]:
def objective(trial):
    """Placeholder for the Optuna objective; not implemented yet.

    Args:
        trial: Accepted for signature compatibility and currently ignored.

    Returns:
        None: No score is computed.
    """
    return None

In [81]:
# Build the DataLoaders once; get_data() returns them as a (train, test) pair.
# NOTE(review): this cell's execution count is higher than the next cell's,
# so the notebook was run out of order -- re-check with Restart & Run All.
train_loader, test_loader = get_data()

In [80]:
# NOTE: stale scratch copy of the body of get_data(), kept commented out for reference only.
# # Basic Pre-Processing
# df = pd.read_csv(r'./data/train.csv', index_col='Id')
# df = df.dropna(axis=1)
# df['MSSubClass'] = df['MSSubClass'].astype('object')
# y = df['SalePrice']
# X = df.drop('SalePrice', axis=1)
#
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
#
# # Prepare Training DataLoader
# train_continuous_features = list(X_train.select_dtypes(include=['int']).columns)
# train_categorical_features = list(X_train.select_dtypes(include=['object']).columns)
#
# for col in train_continuous_features:
#     X_train[col] = X_train[col].astype('int32')
# for col in train_categorical_features:
#     X_train[col] = X_train[col].astype('category')
#
# train_continuous = np.stack([X_train[col].values for col in train_continuous_features], 1)
# train_categorical = np.stack([X_train[col].cat.codes.values for col in train_categorical_features], 1)
#
# y = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)
# train_continuous = torch.tensor(train_continuous, dtype=torch.float32)
# train_categorical = torch.tensor(train_categorical, dtype=torch.int32)
#
# train_category_sizes = [len(X_train[col].cat.categories) for col in train_categorical_features]
# train_embedding_sizes = [(size, min(50, (size+1)//2)) for size in train_category_sizes]
# train_self_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in train_embedding_sizes])
#
# embeddings = []
# for i,e in enumerate(train_self_embeddings):
#     embeddings.append(e(train_categorical[:,i]))
#
# embedded_categorical = torch.cat(embeddings, 1)
# X_train_tensor = torch.cat((train_continuous, embedded_categorical), 1)
# train_datset = TensorDataset(X_train_tensor, embedded_categorical)
# train_dataloader = DataLoader(train_datset, batch_size=20, shuffle=True)
#
# # Prepare Test DataLoader
# test_continuous_features = list(X_test.select_dtypes(include=['int']).columns)
# test_categorical_features = list(X_test.select_dtypes(include=['object']).columns)
#
# for col in test_continuous_features:
#     X_test[col] = X_test[col].astype('int32')
# for col in test_categorical_features:
#     X_test[col] = X_test[col].astype('category')
#
# test_continuous = np.stack([X_test[col].values for col in test_continuous_features], 1)
# test_categorical = np.stack([X_test[col].cat.codes.values for col in test_categorical_features], 1)
#
# y = torch.tensor(y_test.values, dtype=torch.float32).reshape(-1, 1)
# test_continuous = torch.tensor(test_continuous, dtype=torch.float32)
# test_categorical = torch.tensor(test_categorical, dtype=torch.int32)
#
# test_category_sizes = [len(X_test[col].cat.categories) for col in test_categorical_features]
# test_embedding_sizes = [(size, min(50, (size+1)//2)) for size in test_category_sizes]
# test_self_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in test_embedding_sizes])
#
# embeddings = []
# for i,e in enumerate(test_self_embeddings):
#     embeddings.append(e(test_categorical[:,i]))
#
# embedded_categorical = torch.cat(embeddings, 1)
# X_test_tensor = torch.cat((test_continuous, embedded_categorical), 1)
# test_datset = TensorDataset(X_test_tensor, embedded_categorical)
# test_dataloader = DataLoader(test_datset, batch_size=20, shuffle=True)