In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.losses import Huber
from time import time
from sklearn.metrics import mean_absolute_error
import wandb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In my previous post, I went through the following process:

Environment: Anaconda, Windows 11.

- Data wrangling (Light exploration, followed by removing and transforming some variables)
- Split the training dataset into training and validation datasets
- Fit a model using a Scikit-Learn pipeline (Data Preprocessing + fitting XGBoost/LightGBM estimators with a Randomized Search across their respective hyperparameters)
- Evaluate and visualize model performance
- Implement an automated approach to selecting hyperparameters (HyperOpt)
- Make predictions

In this post, I will implement the following using the same wrangled/preprocessed data:

Environment: Docker, Windows Subsystem for Linux 2 (WSL 2), Windows 11.

- Build a simple Sequential model in Keras/Tensorflow
- Use the Weights and Biases (WandB) platform to select optimal hyperparameters and record experiments.
  - Experiments are evaluated using K-Fold Cross Validation. Mean RMSE across folds for each experiment are custom logged in WandB. 
- Make predictions.
- Blend predictions from the previous post (decision tree) and this post (neural net).
  - By both taking the mean of predictions, and defining a meta-model trained on a holdout dataset kept completely separate.

Actually setting up the environment (Docker, Windows Subsystem for Linux 2 (WSL 2), Windows 11, VSCode, using CUDA) took a couple of days and is probably worthy of its own blog post. It required the following process:

- Install WSL2, CUDA drivers and Docker
    - Define a Dockerfile that uses a base image compatible with CUDA
    - Get libraries from requirements.txt
    - Set "runArgs" in devcontainer.json to allow GPU usage
- Run in VSCode (the Jupyter extension gave me some trouble)
    - I ended up creating the container directly from a Dockerfile in the same repo as my code

In [2]:
# Function to bring in wrangled/preprocessed data from previous post
def data():
    training = pd.read_csv("../sklearn/training_preprocessed")
    validation = pd.read_csv("../sklearn/validation_preprocessed")
    holdout = pd.read_csv("../sklearn/holdout_preprocessed")
    holdout_predictions_df = pd.read_csv("../sklearn/holdout_preds_preprocessed")
    test = pd.read_csv("../sklearn/test_preprocessed")
    
    X_train = training.drop(columns="SalePrice")
    y_train = training["SalePrice"]
    X_valid = validation.drop(columns="SalePrice")
    y_valid = validation["SalePrice"]
    X_holdout = holdout.drop(columns="Actual_SalePrice")
    y_holdout = holdout["Actual_SalePrice"]
    X_test = test
    holdout_predictions_df = holdout_predictions_df
    return X_train, y_train, X_valid, y_valid, X_holdout, y_holdout, X_test, holdout_predictions_df

# Bring in data
X_train, y_train, X_valid, y_valid, X_holdout, y_holdout, X_test, holdout_predictions_df = data()

# Since this model uses k-fold validation, we don't need separate training and validation datasets
X_train = X_train.append(X_valid).reset_index().drop(columns="index")
y_train = y_train.append(y_valid).reset_index().drop(columns="index").values

  X_train = X_train.append(X_valid).reset_index().drop(columns="index")
  y_train = y_train.append(y_valid).reset_index().drop(columns="index").values


In [3]:
# Log into Weights and Biases
wandb.init(project="house-price-prediction", entity="luiscostigan")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mluiscostigan[0m (use `wandb login --relogin` to force relogin)


In [None]:
# Define simple Sequential model
def create_model():
    model = Sequential()
    model.add(Dense(wandb.config.dense1, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(wandb.config.dropout1))
    model.add(Dense(wandb.config.dense2, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer='rmsprop', loss=Huber(), metrics=[RootMeanSquaredError()])
    
    return model

In [None]:
# Define training function and hyperparameter ranges
from wandb.keras import WandbCallback
from sklearn.model_selection import KFold

sweep_config = {
  "name": "keras-sequential-model-sweep",
  "method": "random",
  "parameters": {
    "dropout1": {
      "min": 0.0,
      "max": 0.4
    },
    "dense1": {
      "values": [32, 128, 512, 2048]
    },
    "dense2": {
      "values": [32, 128, 512, 2048]
    },
    "epochs": {
      "values": [30, 100, 250]
    },
    "batch_size": {
      "values": [16, 32, 64]
    }
  },
  "metric": {
    "name": "Mean Validation RMSE (all folds)",
    "goal": "minimize"
  }
}

config_defaults = {
  "dropout1": 0.1,
  "dense1": 512,
  "dense2": 512,
  "epochs": 100,
  "batch_size": 32
}

# Define number of splits
kf = KFold(n_splits=5)

def train():

  rmse_per_fold = []
  loss_per_fold = []
  fold_no = 1

  # Go through each split, and get the index number for each
  for train, test in kf.split(X_train, y_train):

    # With the current session in WandB
    with wandb.init(config=config_defaults) as run:

      # Recreate the model each time for each new batch
      model = None # Not sure if this step is necessary
      model = create_model()

      # Fit model on new batches
      model.fit(
        np.asarray(X_train), 
        y_train, 
        epochs=wandb.config.epochs, 
        batch_size=wandb.config.batch_size, 
        verbose=0,
        callbacks=[WandbCallback()], 
        validation_data=(np.asarray(X_train),y_train)
        )
      
      # Generate data for each
      scores = model.evaluate(np.asarray(X_train), y_train, callbacks=[WandbCallback()])
      print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]}')
      rmse_per_fold.append(scores[1])
      loss_per_fold.append(scores[0])

      # Increase fold number
      fold_no = fold_no + 1

      # == Provide average scores ==
      print('------------------------------------------------------------------------')
      print('Score per fold')
      for i in range(0, len(rmse_per_fold)):
        print('------------------------------------------------------------------------')
        print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - RMSE: {rmse_per_fold[i]}')
      print('------------------------------------------------------------------------')
      print('Average scores for all folds:')
      print(f'> RMSE: {np.mean(rmse_per_fold)} (+- {np.std(rmse_per_fold)})')
      print(f'> Loss: {np.mean(loss_per_fold)}')
      print('------------------------------------------------------------------------')

      wandb.log({
        "Mean Validation RMSE (all folds)": np.mean(rmse_per_fold),
        "Mean Validation Loss (all folds)": np.mean(loss_per_fold) 
        })

      wandb.join()

keras_sequential_sweep_1 = wandb.sweep(sweep_config, project="house-price-prediction", entity="luiscostigan")

count = 10

wandb.agent(keras_sequential_sweep_1, function=train, count=count)

Hyperparameter optimization and experiment recording all took place within the Weights and Biases platform. The set of hyperparameters resulting in the lowest loss (in terms of RMSE) is noted in the top row of the image below:

<img src="./wandb_rmse.png" width="800">

In [166]:
# Enter best params from sweep
best_params = {
    "dropout": 0.05,
    "dense1": 8192,
    "dense2": 8192,
    "dense3": 64,
    "epochs": 100,
    "batch_size": 2
}

# Build model using the best parameters
def make_predictions(best_params, dataset):
    
    model = Sequential()
    model.add(Dense(best_params.get("dense1"), input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(best_params.get("dropout")))
    model.add(Dense(best_params.get("dense2"), activation='relu'))
    model.add(Dense(best_params.get("dense3"), activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer=Adam(0.001), loss="mean_absolute_error", metrics=[RootMeanSquaredError()])
    
    model.fit(X_train, y_train, epochs=best_params.get("epochs"), batch_size=best_params.get("batch_size"), verbose = 0)
                
    preds  = model.predict(dataset, best_params.get("batch_size"), verbose = 0)
              
    return preds

In [168]:
# Make predictions
test_predictions_log_transformed = make_predictions(best_params, X_test)
holdout_predictions_log_transformed = make_predictions(best_params, X_holdout)

2022-03-09 11:27:34.064532: W tensorflow/core/common_runtime/bfc_allocator.cc:462] Allocator (GPU_0_bfc) ran out of memory trying to allocate 256.00MiB (rounded to 268435456)requested by op RandomUniform
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2022-03-09 11:27:34.065038: I tensorflow/core/common_runtime/bfc_allocator.cc:1010] BFCAllocator dump for GPU_0_bfc
2022-03-09 11:27:34.065105: I tensorflow/core/common_runtime/bfc_allocator.cc:1017] Bin (256): 	Total Chunks: 86, Chunks in use: 86. 21.5KiB allocated for chunks. 21.5KiB in use in bin. 420B client-requested in use in bin.
2022-03-09 11:27:34.065126: I tensorflow/core/common_runtime/bfc_allocator.cc:1017] Bin (512): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2022-03-09 11:27:34.065132: I tensorflow/c

ResourceExhaustedError: OOM when allocating tensor with shape[8192,8192] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:RandomUniform]

sorflow/core/common_runtime/bfc_allocator.cc:1066] InUse at 6107c1a00 of size 256 next 65
2022-03-09 11:27:34.065660: I tensorflow/core/common_runtime/bfc_allocator.cc:1066] InUse at 6107c1b00 of size 256 next 89
2022-03-09 11:27:34.065662: I tensorflow/core/common_runtime/bfc_allocator.cc:1066] InUse at 6107c1c00 of size 256 next 88
2022-03-09 11:27:34.065665: I tensorflow/core/common_runtime/bfc_allocator.cc:1066] InUse at 6107c1d00 of size 256 next 33
2022-03-09 11:27:34.065667: I tensorflow/core/common_runtime/bfc_allocator.cc:1066] InUse at 6107c1e00 of size 256 next 19
2022-03-09 11:27:34.065669: I tensorflow/core/common_runtime/bfc_allocator.cc:1066] InUse at 6107c1f00 of size 256 next 22
2022-03-09 11:27:34.065671: I tensorflow/core/common_runtime/bfc_allocator.cc:1066] InUse at 6107c2000 of size 256 next 80
2022-03-09 11:27:34.065673: I tensorflow/core/common_runtime/bfc_allocator.cc:1066] InUse at 6107c2100 of size 256 next 82
2022-03-09 11:27:34.065676: I tensorflow/core/com

In [155]:
# Undo the log transform
test_predictions = np.exp(test_predictions_log_transformed)
holdout_predictions = np.exp(holdout_predictions_log_transformed)

# Generating submission CSV
d = {"Id":X_test.index,"SalePrice":test_predictions.flatten()}
submission = pd.DataFrame(data=d, index=None)

submission["Id"] = submission["Id"] + 1461

submission.to_csv("submission_nn.csv",index=False)

## Blending Predictions

So far, I have generated predictions using a decision tree-based model and a neural net-based model.
First, I'll try taking the mean of predictions from both to see how it performs.

In [156]:
# Read CSVs
decision_tree_predictions = pd.read_csv("../sklearn/xgb_lgb_test_predictions.csv")
neural_net_predictions = pd.read_csv("../tensorflow/submission_nn.csv")

# Merge CSVs on Id column
decision_tree_predictions["NN_Predictions"] = neural_net_predictions["SalePrice"]

# Log transform NN predictions again (for consistent RMSE value)
decision_tree_predictions["NN_Predictions"] = np.log(decision_tree_predictions["NN_Predictions"])

# Add new column with mean
decision_tree_predictions["SalePrice"] = decision_tree_predictions[["NN_Predictions","XGBoost_Predictions","LightGBM_Predictions"]].mean(axis=1)

In [159]:
# Refresh data
X_train, y_train, X_valid, y_valid, X_holdout, y_holdout, X_test, holdout_predictions_df = data()

# Append holdout set NN predictions to DT predictions
holdout_predictions_df["NN_Predictions"] = holdout_predictions_log_transformed
holdout_predictions_df = holdout_predictions_df[["XGBoost_Predictions", "LightGBM_Predictions", "NN_Predictions", "Actual_SalePrice"]]

# Add mean column
holdout_predictions_df["Mean_SalePrice"] = holdout_predictions_df[["XGBoost_Predictions", "LightGBM_Predictions", "NN_Predictions"]].mean(axis=1)

# Add weighted mean column
weight = [0, 0, 1]
holdout_predictions_df["Weighted_Mean_SalePrice"] = holdout_predictions_df[["XGBoost_Predictions", "LightGBM_Predictions", "NN_Predictions"]].dot(weight)

# Calculate RMSE
print(mean_squared_error(holdout_predictions_df["Actual_SalePrice"], holdout_predictions_df["Mean_SalePrice"], squared=False))
print(mean_squared_error(holdout_predictions_df["Actual_SalePrice"], holdout_predictions_df["Weighted_Mean_SalePrice"], squared=False))


0.17502633791854189
0.3557935296772313


In [158]:
# Check weighted mean on holdout set

holdout_predictions_df.applymap(np.exp)

Unnamed: 0,XGBoost_Predictions,LightGBM_Predictions,NN_Predictions,Actual_SalePrice,Mean_SalePrice,Weighted_Mean_SalePrice
0,193813.764467,208400.892259,157616.975689,208500.0,185337.112998,194026.451664
1,132119.883800,138532.076216,118430.910673,129500.0,129417.954594,132553.902928
2,128478.197544,149772.500875,98025.493775,132000.0,123556.856848,130936.639170
3,112995.017070,116435.661968,97396.407900,90000.0,108616.569688,112335.151209
4,153034.168657,153558.656517,111143.027711,159000.0,137715.126478,148369.132926
...,...,...,...,...,...,...
287,263619.749227,246863.530560,193733.388149,271000.0,232744.691390,250636.486400
288,180573.107057,170542.417567,137372.957888,192140.0,161731.858998,172715.650256
289,132455.762402,139596.575508,106045.326993,143750.0,125163.890023,131599.456900
290,108509.824907,112157.994953,102181.881757,64500.0,107536.954265,108935.115520


In [None]:
# Drop other columns
decision_tree_predictions = decision_tree_predictions[["Id","SalePrice"]]

# Create mean prediction submission CSV
mean_predictions = decision_tree_predictions
mean_predictions.to_csv("submission_mean.csv",index=False)

Taking the mean of predictions did not beat my score from just using LightGBM in the previous post.

<img src="./mean_predictions_kaggle.png" width="600">

Next, I'll try defining a meta-model to best blend predictions from the two models. After developing each model, predictions were also made on a holdout dataset that was kept separate from the training and validation datasets, for the explicit purpose of training this meta-model. The model trained on this dataset was then used to blend predictions on the test dataset to be submitted to Kaggle.

In [None]:
# Get predictions on test set
test_predictions_dt = pd.read_csv("/root/data-science-projects-1/house-price-prediction/Models/sklearn/submission_dt.csv")
test_predictions_nn = pd.read_csv("/root/data-science-projects-1/house-price-prediction/Models/tensorflow/submission_nn.csv")

# Merge CSVs on Id column
test_predictions_dt["NN_predictions"] = test_predictions_nn["SalePrice"]

# Rename decision tree predictions column
test_predictions_dt = test_predictions_dt.rename(columns={"SalePrice":"DT_predictions"})

# Rename df
blended_predictions_df = test_predictions_dt

The meta-model was a simple grid search across different estimators, without attempting to optimize hyperparameters. Since the holdout dataset is very small, I used 10 folds in the cross-validation process.

In [41]:
import warnings
warnings.filterwarnings("ignore", message=".*Int64Index.*")

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_squared_error
import xgboost as xgb
import lightgbm as lgb

# Defining a custom loss function (Root Mean Squared Error)
rmse = make_scorer(mean_squared_error, squared=False)

X = holdout_predictions_df[["XGBoost_Predictions", "LightGBM_Predictions", "NN_Predictions"]]
y = holdout_predictions_df["Actual_SalePrice"]

estimators = [
    {
        "clf": (LinearRegression(),)
    },
    {
        "clf": (Ridge(),)
    },
    {
        "clf": (xgb.XGBRegressor(),)
    },
    {
       "clf": (lgb.LGBMRegressor(),)
    }
]

pipe = Pipeline([("clf", LinearRegression())])

grid_search = GridSearchCV(pipe, estimators, cv=20, scoring=rmse)
grid_search.fit(X,y)
grid_search.score(X,y)

0.1495170648143068

In [42]:
grid_search.cv_results_

{'mean_fit_time': array([0.00156263, 0.00148139, 0.14347031, 0.04534595]),
 'std_fit_time': array([0.00034978, 0.00024029, 0.1271254 , 0.08243149]),
 'mean_score_time': array([0.0009854 , 0.00090621, 0.00396862, 0.00125448]),
 'std_score_time': array([0.0001783 , 0.00012706, 0.00024219, 0.00014019]),
 'param_clf': masked_array(data=[LinearRegression(), Ridge(),
                    XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
                                 colsample_bynode=None, colsample_bytree=None,
                                 enable_categorical=False, gamma=None, gpu_id=None,
                                 importance_type=None, interaction_constraints=None,
                                 learning_rate=None, max_delta_step=None, max_depth=None,
                                 min_child_weight=None, missing=nan, monotone_constraints=None,
                                 n_estimators=100, n_jobs=None, num_parallel_tree=None,
                          

In [None]:
# Make predictions 
grid_search_blended_predictions = grid_search.predict(blended_predictions_df[["DT_predictions", "NN_predictions"]])

In [None]:
# Add predictions to a dataframe
grid_search_blended_predictions_df = blended_predictions_df
grid_search_blended_predictions_df["SalePrice"] = grid_search_blended_predictions
grid_search_blended_predictions_df = grid_search_blended_predictions_df.drop(columns=["DT_predictions","NN_predictions"])

# Create CSV with blended predictions
grid_search_blended_predictions_df.to_csv("submission_gridsearch_blended.csv",index=False)

In [None]:
grid_search_blended_predictions_df

I had high hopes for a meta-model that blended predictions, but it performed worse than I expected.



## Improvements to subsequent models

Below I note some improvements to the models I would implement if I had more time.

- Remove outliers (using something like sklearn's IsolationForest)
- 