# Chapter 3.6 - Predicting house prices - The Boston Housing Price dataset

In [None]:
from keras.datasets import boston_housing

In [None]:
# Load the dataset as (features, targets) pairs for the training and test splits.
(train_data, train_targets), (test_data, test_targets) = boston_housing.load_data()

In [None]:
# Shape of the training feature array.
train_data.shape

In [None]:
# Shape of the test feature array.
test_data.shape

In [None]:
# Shape of the training target array.
train_targets.shape

In [None]:
# Show the raw training targets (the values the network will learn to predict).
train_targets

## Preparing the data

### Normalizing the data

In [None]:
# Mean of the training set, reduced along the first axis (axis=0),
# i.e. one mean value per column of train_data.
mean = train_data.mean(axis = 0)

In [None]:
# Subtract that mean from the training data, in place, to center it around 0.
train_data -= mean

In [None]:
# Standard deviation of the training set along the first axis (axis=0),
# i.e. one value per column of train_data.
std = train_data.std(axis = 0)

In [None]:
# Overall minimum and maximum of the training data at this point in the notebook.
print('Min: ', train_data.min())
print('Max: ', train_data.max())

In [None]:
# Divide the training data by its standard deviation, in place.
train_data /= std

In [None]:
# Overall minimum and maximum of the training data, printed again for comparison.
print('Min: ', train_data.min())
print('Max: ', train_data.max())

In [None]:
# Subtract the training mean from the test data and divide by the training std.
# The statistics come from the training set only; nothing is recomputed
# from the test data.
test_data -= mean
test_data /= std

In [None]:
# Overall minimum and maximum of the normalized test data.
print('Min: ', test_data.min())
print('Max: ', test_data.max())

## Building the network

In [None]:
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Model factory: called once per fold and once more for the final model.
def build_model():
    """Build and compile a fresh regression network.

    The input width is read from the module-level ``train_data``
    (``train_data.shape[1]``). The network has two 64-unit ReLU layers
    followed by a single-unit output layer with no activation argument.

    Returns:
        A compiled ``Sequential`` model using the 'rmsprop' optimizer,
        'mse' loss and the 'mae' metric.
    """
    n_features = train_data.shape[1]
    network = Sequential([
        Dense(units=64, activation='relu', input_shape=(n_features,)),
        Dense(units=64, activation='relu'),
        Dense(units=1),
    ])
    network.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return network

## K-fold validation

In [None]:
import numpy as np

In [None]:
# Number of folds used for k-fold validation.
k = 4

In [None]:
# Size of each validation fold (integer division, so any remainder is not used
# as validation data).
num_val_samples = len(train_data) // k
# Number of training epochs per fold.
num_epochs = 100
# Collects the validation MAE of every fold.
all_scores = []

In [None]:
# K-fold validation: each fold in turn is held out for validation while a
# fresh model is trained on the remaining samples.
for i in range(k):
    print('Processing fold #', i)

    # Bounds of the slice held out for validation in this fold.
    fold_start = i * num_val_samples
    fold_stop = fold_start + num_val_samples

    val_data = train_data[fold_start:fold_stop]
    val_targets = train_targets[fold_start:fold_stop]

    # Everything outside the validation slice is used for training.
    partial_train_data = np.concatenate(
        (train_data[:fold_start], train_data[fold_stop:]), axis=0)
    partial_train_targets = np.concatenate(
        (train_targets[:fold_start], train_targets[fold_stop:]), axis=0)

    model = build_model()
    model.fit(x=partial_train_data,
              y=partial_train_targets,
              epochs=num_epochs,
              batch_size=10,
              verbose=1)

    # evaluate() returns the loss (MSE) followed by the compiled metric (MAE).
    val_mse, val_mae = model.evaluate(val_data, val_targets, verbose=0)
    all_scores.append(val_mae)

In [None]:
# Validation MAE of each fold.
all_scores

In [None]:
# Validation MAE averaged over all folds.
np.mean(all_scores)

## Saving the validation logs at each fold

In [None]:
from keras import backend as K

# Memory clean-up: clear the Keras backend session before building new models.
K.clear_session()

In [None]:
# Repeat the k-fold validation with more epochs, this time keeping the
# per-epoch validation MAE of every fold.
num_epochs = 500
all_mae_histories = []
for i in range(k):
    print('Processing fold #', i)
    val_data = train_data[i * num_val_samples: (i + 1) * num_val_samples]
    val_targets = train_targets[i * num_val_samples: (i + 1) * num_val_samples]
    partial_train_data = np.concatenate([train_data[:i * num_val_samples],
                                         train_data[(i + 1) * num_val_samples:]],
                                         axis = 0)

    partial_train_targets = np.concatenate([train_targets[:i * num_val_samples],
                                            train_targets[(i + 1) * num_val_samples:]],
                                            axis = 0)
    model = build_model()
    # Passing validation_data makes fit() record validation metrics per epoch.
    history = model.fit(x = partial_train_data,
                        y = partial_train_targets,
                        validation_data = (val_data, val_targets),
                        epochs = num_epochs,
                        batch_size = 10,
                        verbose = 0)
    # The history key depends on the Keras version: older releases expand the
    # 'mae' metric to 'val_mean_absolute_error', while newer ones log it under
    # the name given to compile(), i.e. 'val_mae'. Accept either so the cell
    # does not fail with a KeyError.
    history_log = history.history
    mae_key = ('val_mean_absolute_error'
               if 'val_mean_absolute_error' in history_log
               else 'val_mae')
    mae_history = history_log[mae_key]
    all_mae_histories.append(mae_history)

## Average Mean Absolute Error

In [None]:
# For every epoch, average the validation MAE recorded by each fold.
average_mae_history = []
for epoch in range(num_epochs):
    maes_this_epoch = [fold_history[epoch] for fold_history in all_mae_histories]
    average_mae_history.append(np.mean(maes_this_epoch))

In [None]:
# Plotting the history (mean value for all folds)
import matplotlib.pyplot as plt
# x axis: 1-based epoch numbers; y axis: fold-averaged validation MAE.
epoch_axis = range(1, len(average_mae_history) + 1)
plt.plot(epoch_axis, average_mae_history)
plt.xlabel('Epochs')
plt.ylabel('Validation MAE')
plt.show()

## Smoothing the curve with exponential moving average

In [None]:
# The first 10 points have much higher values, which limits the plot's readability.
# Introduce a smoothed curve based on an exponential moving average.
def smooth_curve(points, factor=0.9):
    """Return an exponentially smoothed copy of ``points``.

    The first output equals the first input. Every later output is
    ``previous * factor + point * (1 - factor)``, where ``previous`` is the
    preceding smoothed value.

    Args:
        points: Iterable of numeric values to smooth.
        factor: Weight given to the previous smoothed value; the current
            point receives ``1 - factor``.

    Returns:
        A list with one smoothed value per input point; an empty list for
        empty input.
    """
    smoothed_points = []
    for point in points:
        if smoothed_points:
            previous = smoothed_points[-1]
            smoothed_points.append(previous * factor + point * (1 - factor))
        else:
            # Nothing to average with yet: seed the series with the raw point.
            smoothed_points.append(point)
    return smoothed_points

In [None]:
# Skip the first 10 epochs of the averaged history and smooth the remainder.
smooth_mae_history = smooth_curve(average_mae_history[10:])

In [None]:
# Plot the smoothed validation MAE.
# NOTE: the x axis is numbered from 1 over the smoothed series itself.
smoothed_axis = range(1, len(smooth_mae_history) + 1)
plt.plot(smoothed_axis, smooth_mae_history)
plt.xlabel('Epochs')
plt.ylabel('Validation MAE')
plt.show()

## Training the final model

In [None]:
from keras import backend as K

# Memory clean-up: clear the Keras backend session before building the final model.
K.clear_session()

In [None]:
# Train a fresh model on the entire training set, then score it on the test set.
model = build_model()
model.fit(x=train_data, y=train_targets, epochs=80, batch_size=16, verbose=1)

# evaluate() returns the loss (MSE) followed by the compiled metric (MAE).
final_scores = model.evaluate(test_data, test_targets)
test_mse_score, test_mae_score = final_scores

In [None]:
# Mean absolute error of the final model on the test set.
test_mae_score