In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from time import time
from tensorflow.python.keras.callbacks import TensorBoard
from sklearn.metrics import mean_absolute_error
import wandb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
def data():
    training = pd.read_csv("./training_preprocessed")
    validation = pd.read_csv("./validation_preprocessed")
    test = pd.read_csv("./test_preprocessed")
    X_train = training.drop(columns="y")
    y_train = training["y"]
    X_valid = validation.drop(columns="y")
    y_valid = validation["y"]
    X_test = test
    return X_train, y_train, X_valid, y_valid, X_test

In [4]:
X_train, y_train, X_valid, y_valid, X_test = data()

In [6]:
wandb.init(project="house-price-prediction", entity="luiscostigan")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mluiscostigan[0m (use `wandb login --relogin` to force relogin)


In [7]:
def create_model():
    model = Sequential()
    model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(config.dropout1))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer='rmsprop', loss="mse", metrics=["mae"])
    
    return model

In [14]:
from sklearn.model_selection import RepeatedKFold

# Instantiate the validator, have it spit out the indices, generate batches from the indices, and feed those batches to the model.
rkf = RepeatedKFold(n_splits=10, n_repeats=5)
rkf.get_n_splits(X_train, y_train)

50

Currently trying to implement RepeatedKFold cross validation with the Keras Sequential model, while recording model results on WandB.

Sometimes I am getting a failed-1 error from WandB.
    - 

In [15]:
print(rkf.split(X_train, y_train))

<generator object _RepeatedSplits.split at 0x7f2048800120>


In [11]:
print(enumerate(rkf.split(X_train, y_train)))

<enumerate object at 0x7f2048939d40>


In [9]:
from wandb.keras import WandbCallback

sweep_config = {
  "name": "keras-sequential-model-sweep",
  "method": "random",
  "parameters": {
    "dropout1": {
      "min": 0.0,
      "max": 0.8
    },
    "epochs": {
      "values": [30,50,100]
    },
    "batch_size": {
      "values": [16, 64, 128, 512]
    }
  }
}

keras_sequential_sweep_1 = wandb.sweep(sweep_config, project="house-price-prediction", entity="luiscostigan")

def train():

  with wandb.init() as run:
    config = wandb.config

    for index, (train_indices, val_indices) in enumerate(rkf.split(X_train, y_train)):
      print("Training on fold " + str(index+1) + "/10...")

      # Generate batches from indices
      xtrain, xval = X_train[train_indices], X_train[val_indices]
      ytrain, yval = y_train[train_indices], y_train[val_indices]

      model = create_model()

      model.fit(
        np.asarray(xtrain), 
        ytrain, 
        epochs=config.epochs, 
        batch_size=config.batch_size, 
        verbose=0,
        callbacks=[WandbCallback()], 
        validation_data=(np.asarray(xval),yval)
        )
      
      test_mse_score, test_mae_score = model.evaluate(np.asarray(xval), yval, callbacks=[WandbCallback()])

count = 10
wandb.agent(keras_sequential_sweep_1, function=train, count=count)



Create sweep with ID: pvoqgyg4
Sweep URL: https://wandb.ai/luiscostigan/house-price-prediction/sweeps/pvoqgyg4


[34m[1mwandb[0m: Agent Starting Run: 8lhdu1eu with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout1: 0.37488221243751385
[34m[1mwandb[0m: 	epochs: 50


Training on fold 1/10...


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

[34m[1mwandb[0m: [32m[41mERROR[0m Run 8lhdu1eu errored: KeyError("None of [Int64Index([   0,    1,    2,    3,    5,    6,    8,    9,   10,   11,\n            ...\n            1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1166, 1167],\n           dtype='int64', length=1051)] are in the [columns]")
[34m[1mwandb[0m: Agent Starting Run: p1h3i3lv with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout1: 0.5146620120077245
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)


Training on fold 1/10...


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

Train the model with Hyperopt to find best parameters.
Retrain model with new best parameters.
Make predictions.

In [18]:
best_params = {
    "dropout": 0.2,
    "epochs": 100,
    "batch_size": 128
}

In [20]:
best_params = {
    "dropout": 0.2,
    "epochs": 100,
    "batch_size": 128
}

def make_predictions(best_params):
    
    model = Sequential()
    model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(best_params.get("dropout")))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer='rmsprop', loss="mse", metrics=["mae"])
              
    model.fit(X_train, y_train, epochs=best_params.get("epochs"),
              batch_size=best_params.get("batch_size"), verbose = 0)
    
    preds  = model.predict(X_test, best_params.get("batch_size"), verbose = 0)
              
    return preds

In [22]:
predictions = make_predictions(best_params)

## Visualize Model Output

In [26]:
d

{'Id': RangeIndex(start=0, stop=1459, step=1),
 'SalePrice': array([[136474.11],
        [156334.36],
        [188805.05],
        ...,
        [181121.44],
        [106913.19],
        [214941.02]], dtype=float32)}

In [29]:
d = {"Id":X_test.index,"SalePrice":predictions.flatten()}
submission = pd.DataFrame(data=d, index=None)

# Revert log transform
#submission["SalePrice"] = np.exp(submission["SalePrice"])

In [30]:
submission

Unnamed: 0,Id,SalePrice
0,0,136474.109375
1,1,156334.359375
2,2,188805.046875
3,3,195929.828125
4,4,200915.140625
...,...,...
1454,1454,76588.890625
1455,1455,63704.851562
1456,1456,181121.437500
1457,1457,106913.187500


In [31]:
submission.to_csv("submission.csv",index=False)