# Weather forecast - A timeseries example

This notebook was written as an application of Chapter 10 (Deep learning for timeseries) of _Deep learning with Python_ from François Chollet.

In [45]:
import datetime
import os

import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import keras_tuner

## 0. Download and import the data

In [None]:
# Detect if a file exists
if not os.path.isfile('jena_climate_2009_2016.csv.zip'):
    !wget https://s3.amazonaws.com/keras-datasets/jena_climate_2009_2016.csv.zip
    !unzip jena_climate_2009_2016.csv.zip

In [46]:
columns = {
    "datetime": "Date Time",
    "p": "p (mbar)",
    "t": "T (degC)",
    "tpot": "Tpot (K)",
    "tdew": "Tdew (degC)",
    "rh": "rh (%)",
    "vpmax": "VPmax (mbar)",
    "vpact": "VPact (mbar)",
    "vpdef": "VPdef (mbar)",
    "sh": "sh (g/kg)",
    "h2Oc": "H2OC (mmol/mol)",
    "rho": "rho (g/m**3)",
    "wv": "wv (m/s)",
    "max_wv": "max. wv (m/s)",
    "wd": "wd (deg)"
}

jena_climate_2009_2016 = pd.read_csv(
    'jena_climate_2009_2016.csv',
    sep=',',
    names=columns.keys(),
    skiprows=1)
jena_climate_2009_2016.datetime = pd.to_datetime(jena_climate_2009_2016.datetime, format='%d.%m.%Y %H:%M:%S')
nb_rows = len(jena_climate_2009_2016)
nb_cols = len(jena_climate_2009_2016.columns)
jena_climate_2009_2016

Unnamed: 0,datetime,p,t,tpot,tdew,rh,vpmax,vpact,vpdef,sh,h2Oc,rho,wv,max_wv,wd
0,2009-01-01 00:10:00,996.52,-8.02,265.40,-8.90,93.30,3.33,3.11,0.22,1.94,3.12,1307.75,1.03,1.75,152.3
1,2009-01-01 00:20:00,996.57,-8.41,265.01,-9.28,93.40,3.23,3.02,0.21,1.89,3.03,1309.80,0.72,1.50,136.1
2,2009-01-01 00:30:00,996.53,-8.51,264.91,-9.31,93.90,3.21,3.01,0.20,1.88,3.02,1310.24,0.19,0.63,171.6
3,2009-01-01 00:40:00,996.51,-8.31,265.12,-9.07,94.20,3.26,3.07,0.19,1.92,3.08,1309.19,0.34,0.50,198.0
4,2009-01-01 00:50:00,996.51,-8.27,265.15,-9.04,94.10,3.27,3.08,0.19,1.92,3.09,1309.00,0.32,0.63,214.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420446,2016-12-31 23:20:00,1000.07,-4.05,269.10,-8.13,73.10,4.52,3.30,1.22,2.06,3.30,1292.98,0.67,1.52,240.0
420447,2016-12-31 23:30:00,999.93,-3.35,269.81,-8.06,69.71,4.77,3.32,1.44,2.07,3.32,1289.44,1.14,1.92,234.3
420448,2016-12-31 23:40:00,999.82,-3.16,270.01,-8.21,67.91,4.84,3.28,1.55,2.05,3.28,1288.39,1.08,2.00,215.2
420449,2016-12-31 23:50:00,999.81,-4.23,268.94,-8.53,71.80,4.46,3.20,1.26,1.99,3.20,1293.56,1.49,2.16,225.8


## 1. Analyze the data

We want to know what are the main patterns in order to correctly analyze the coherence of our model's predictions

### General purpose plots: get an insight of global tendencies over years

In [None]:
colors = [
    "blue",
    "orange",
    "green",
    "red",
    "purple",
    "brown",
    "pink",
    "gray",
    "olive",
    "cyan",
]

fig, axes = plt.subplots(ncols=3, nrows=nb_cols // 3, sharex=True, figsize=[10 * 3, 6 * nb_cols // 3])

for i, col in enumerate(jena_climate_2009_2016.columns):
    if col == "datetime":
        continue
    ax = axes[i // 3, i % 3]
        
    jena_climate_2009_2016.plot(x='datetime', y=col, ax=ax, label=columns[col], color=colors[i % (len(colors))])
    
    ax.grid(True)
    ax.set_xlabel('Date')
    ax.set_ylabel(columns[col])
    
fig.tight_layout()

### Link between temperature, pressure and humidity over a year

In [None]:
# Observing the variation at some day
reference = 2 * nb_rows / 5
nb_days = 31 * 3

start = int(reference)
end = int(6 * 24 * nb_days + reference) 

fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(8, 6 * 3), sharex=True)

for i, col in enumerate(['t', 'p', 'rh']):
    jena_climate_2009_2016.iloc[start:end].plot(x='datetime', label=columns[col], y=col, ax=axes[i])

fig.tight_layout()

## 2. Prepare data for the pipeline

Be careful to remove the temperature from the dataset, as it's what we want to predict...

We also don't consider the datetime column, as we can totally rely solely on indexes

### Splitting the dataset

In [47]:
training_proportion = 0.5
validation_proportion = 0.25
test_split = 1 - training_proportion - validation_proportion

training_index = int(training_proportion * nb_rows)
validation_index = training_index + int(validation_proportion * nb_rows)

raw_data = jena_climate_2009_2016.copy()
temperature = raw_data.t
raw_data.pop('datetime')

0        2009-01-01 00:10:00
1        2009-01-01 00:20:00
2        2009-01-01 00:30:00
3        2009-01-01 00:40:00
4        2009-01-01 00:50:00
                 ...        
420446   2016-12-31 23:20:00
420447   2016-12-31 23:30:00
420448   2016-12-31 23:40:00
420449   2016-12-31 23:50:00
420450   2017-01-01 00:00:00
Name: datetime, Length: 420451, dtype: datetime64[ns]

### Normalizing all the data

We normalize all the values (they're all numerical, it's easy) according to the mean and standard deviation of the training data.

In [48]:
mean = raw_data[:training_index].mean()
std = raw_data[:training_index].std()
raw_data = (raw_data - mean) / std
raw_data[:training_index]

Unnamed: 0,p,t,tpot,tdew,rh,vpmax,vpact,vpdef,sh,h2Oc,rho,wv,max_wv,wd
0,0.913649,-1.920636,-1.974488,-1.866254,1.048015,-1.291316,-1.467152,-0.782343,-1.470122,-1.472032,2.124151,-0.730165,-0.779351,-0.281192
1,0.919528,-1.965100,-2.018478,-1.919925,1.054028,-1.304472,-1.488855,-0.784440,-1.489114,-1.493462,2.172914,-0.932305,-0.886968,-0.469893
2,0.914825,-1.976501,-2.029758,-1.924162,1.084097,-1.307103,-1.491266,-0.786537,-1.492912,-1.495843,2.183381,-1.277899,-1.261473,-0.056383
3,0.912474,-1.953699,-2.006071,-1.890265,1.102138,-1.300525,-1.476798,-0.788633,-1.477719,-1.481556,2.158404,-1.180089,-1.317434,0.251128
4,0.912474,-1.949139,-2.002687,-1.886027,1.096124,-1.299210,-1.474386,-0.788633,-1.477719,-1.479175,2.153885,-1.193130,-1.261473,0.440993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210220,-0.199796,-0.171691,-0.155101,-0.427015,-0.628593,-0.380947,-0.599035,-0.086271,-0.596521,-0.598159,0.102950,1.617266,1.398809,0.181239
210221,-0.183336,-0.148889,-0.133670,-0.421365,-0.668884,-0.362529,-0.594212,-0.061112,-0.592722,-0.591016,0.086299,1.930257,1.708745,0.427015
210222,-0.172754,-0.144328,-0.130287,-0.418540,-0.676101,-0.358583,-0.594212,-0.054822,-0.592722,-0.591016,0.084634,1.695513,1.329935,0.258117
210223,-0.163348,-0.153449,-0.140438,-0.412891,-0.643627,-0.366476,-0.589389,-0.071595,-0.585126,-0.586254,0.095100,1.506415,1.054436,0.117174


### Creating a temporal sliding window

As stated in the book, we'll use the past five days to predict a temperature 24 hours in the future.

We only want one point per hour in the future, so we skip 6 timesteps (of 10 minutes each)

The datasets will now contain :
- A samples matrix, of shape (samples, timesteps, features)
- A predictions vector with the temperatures corresponding to each sample

In [49]:
sampling_rate = 6  # Each input and output point will be taken every 6 timesteps (one per hour)
sequence_length = 24 * 5
delay = sampling_rate * (sequence_length + 24 - 1)  # Delay to start target values, in number of timesteps ; TODO: understand the -1
batch_size = 256

timeseries_kwargs = {
    'data': raw_data[:-delay], # Don't include the last sequence
    'targets': temperature[delay:],
    'sampling_rate': sampling_rate,
    'sequence_length': sequence_length,
    'batch_size': batch_size, # Shuffle the samples between them, not inside a sample of course
    'shuffle': True,
}

train_data = tf.keras.utils.timeseries_dataset_from_array(
    **timeseries_kwargs,
    end_index=training_index,
)

val_data = tf.keras.utils.timeseries_dataset_from_array(
    **timeseries_kwargs,
    start_index=training_index,  # Start and end index are also applied to targets
    end_index=validation_index,
)

test_data = tf.keras.utils.timeseries_dataset_from_array(
    **timeseries_kwargs,
    start_index=validation_index,
)

## 3. Test different models

In this part, we will dive into many models, starting from the most naïve one to the most complex, in order to evaluate the gains of each one versus its computation time

### Skeleton

We first define some helper functions to compile, fit and save our models:

In [50]:
def compile_model(
        model,
        learning_rate=1e-3,
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=None):

    if metrics is None:
        metrics = []

    optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)

    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)


def train_model(model,
                training_data,
                epochs=100,
                callbacks=None,
                early_stopping=None,
                validation_split=0.2,
                validation_data=None,
                name='model',
                log_dir='logs/fit'
                ):

    if callbacks is None:
        callbacks = []
        
    if early_stopping is not None:
        callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='loss', patience=early_stopping))

    date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    callbacks.append(tf.keras.callbacks.ModelCheckpoint(f'models/{name}.keras', save_best_only=True))
    callbacks.append(tf.keras.callbacks.BackupAndRestore(backup_dir=f'/tmp/backup/{name}--{date}'))
    callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=f'{log_dir}/{name}--{date}', histogram_freq=1))
        
    # We don't have "batch size" here as the batches were already made during data preparation
    kwargs = {
        'epochs': epochs,
        'callbacks': callbacks,
    }
    if validation_data is not None:
        kwargs['validation_data'] = validation_data
    else:
        kwargs['validation_split'] = validation_split

    model.fit(
        training_data,
        **kwargs
    )
    return model
    

def create_and_process_model(name, build_function, training_data=train_data, learning_rate=1e-3):
    model = build_function()
    # For the loss, we use MSE as it's more stable around 0 (for gradient calculation)
    # But for evaluation, we prefer using MAE which is clearly interpretable as an error on average
    compile_model(
        model, 
        learning_rate=learning_rate,
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=['mae']
    )
    print(model.summary())
    train_model(
        model,
        training_data,
        epochs=100,
        validation_data=val_data,
        name=name
    )
    
    
def test_best_model(name):
    best_model = tf.keras.models.load_model(f'models/{name}.keras')
    print(f'MAE for {name}: {best_model.evaluate(test_data)[1]:.2f}')

### Baseline

We must then create a more robust model that performs better on the same metrics.

The baseline approach consists of translating the temperature 24 hours to the future and use them as predictions. So in our samples, we use the last temperature measures (to be exactly 24 hours apart from the prediction).

We use the Mean Absolute Error (MSE) metric, i.e. sum(|prediction - target|) / nb_samples

In [None]:
def evaluate_baseline_method(dataset):
    total_absolute_error = 0.0
    nb_samples = 0
    for samples, targets in dataset:
        # We want all the samples of the batch, the last timestep, and the temperature column
        # Be careful: we didn't normalize the targets (temperature array), so we have to de-normalize the prediction
        predictions = samples[:, -1, 1] * std.t + mean.t
        total_absolute_error += np.sum(np.abs(predictions - targets))
        nb_samples += samples.shape[0]  # Reflex: use the shape instead of far variables or hard-coded values
    return total_absolute_error / nb_samples
        
print(f"Validation MSE (in °C): {evaluate_baseline_method(val_data):.3f}")
print(f"Test MSE (in °C): {evaluate_baseline_method(test_data):.3f}")

We can see that by using our baseline method, our predictions would be off by around two and a half degrees in average. This is not bad, but it can definitely be improved.

### Dense

Note: we don't use an activation function for the last layer, as we are facing a regression problem.

In [None]:
def create_dense():
    # raw_data.shape[1:] skips the number of features to just get the shape of the input
    # In this special case, we could replace it by "nb_columns"
    inputs = tf.keras.Input(shape=(sequence_length, raw_data.shape[-1]))
    # Our network must deal with vectors, i.e. one-dimensional tensors ; so we flatten the input data
    x = tf.keras.layers.Flatten()(inputs)
    x = tf.keras.layers.Dense(16, activation='relu')(x)
    x = tf.keras.layers.Dense(16, activation='relu')(x)
    outputs = tf.keras.layers.Dense(1)(x)
    
    return tf.keras.Model(inputs, outputs)

create_and_process_model('dense', create_dense)

In [None]:
test_best_model('dense')

We can see that this model didn't really achieve a good performance compared to the baseline method. We have to seek for a more complex model.

One of the reasons is the flattening : we remove the temporal information from the data, whereas this is crucial for our forecasting task.

### 1D-convolutional network

The 1D convolution is well suited for timeseries (temporal convolution).

In [None]:
def create_convolutional():
    inputs = tf.keras.Input(shape=(sequence_length, raw_data.shape[-1]))
    # Our network must deal with vectors, i.e. one-dimensional tensors ; so we flatten the input data
    x = tf.keras.layers.Flatten()(inputs)
    # We use convolution windows of 24 hours, as most of the patterns would be detected there
    # in accordance to the relative continuity and periodicity of the parameters
    # TODO: understand the consequence of "padding='causal'"
    x = tf.keras.layers.Conv1D(8, 24, padding='causal', activation='relu')(x)
    x = tf.keras.layers.MaxPool1D(2)(x)
    # Don't forget to adapt the window size as we downsample
    # = the model learns to detect more precise patterns
    x = tf.keras.layers.Conv1D(8, 12, padding='causal', activation='relu')(x)
    x = tf.keras.layers.MaxPool1D(2)(x)
    x = tf.keras.layers.Conv1D(8, 6, padding='causal', activation='relu')(x)
    # TODO: understand why we finish with a global average pooling
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    outputs = tf.keras.layers.Dense(1)(x)
    
    return tf.keras.Model(inputs, outputs)

create_and_process_model('convolutional', create_convolutional)

In [None]:
test_best_model('convolutional')

The model performs worse than the densely-connected one. This is simply caused by the convolution itself, that treats segments of data the same way, but the weather contains variations over days and months.

### First look at RNNs

RNNs are better suited to forecast some data, as they intrinsically work with timesteps / a notion of order, which is crucial here.

In [None]:
def create_simplernn():
    # The SimpleRNN layer can process sequences of arbitrary length
    inputs = tf.keras.Input(shape=(None, raw_data.shape[-1]))
    x = tf.keras.layers.SimpleRNN(16)(inputs)
    outputs = tf.keras.layers.Dense(1)(x)
    
    return tf.keras.Model(inputs, outputs)
    
create_and_process_model('simple_rnn', create_simplernn)


In [None]:
test_best_model('simple_rnn')

That's better than the baseline approach, meaning that machine learning has a real added value

### Long Short-Term Memory (LSTM)

In [None]:
def create_lstm():
    inputs = tf.keras.Input(shape=(sequence_length, raw_data.shape[-1]))
    x = tf.keras.layers.LSTM(16)(inputs)
    outputs = tf.keras.layers.Dense(1)(x)
    
    return tf.keras.Model(inputs, outputs)
    
create_and_process_model('lstm', create_lstm)

In [None]:
test_best_model('lstm')

It's quite a good model, but there is a big problem: if we look at the loss and MAE graphs, we see that training metrics always decrease, whereas validation ones start to increase. Both are diverging rapidly: the model is overfitting!

### Adding recurrent dropout

We can now increase the number of units in the LSTM layer, as the dropout will prevent a quick overfitting in the other case.

We also add a Dropout layer as usual to finally regularize the output of the LSTM layer.

In [None]:
def create_lstm_dropout():
    inputs = tf.keras.Input(shape=(sequence_length, raw_data.shape[-1]))
    x = tf.keras.layers.LSTM(32, recurrent_dropout=0.25)(inputs)
    x = tf.keras.layers.Dropout(0.5)(x)
    outputs = tf.keras.layers.Dense(1)(x)
    
    return tf.keras.Model(inputs, outputs)
    
create_and_process_model('lstm_dropout', create_lstm_dropout)

In [None]:
test_best_model('lstm_dropout')

### Scaling our model

In [None]:
def create_gru():
    inputs = tf.keras.Input(shape=(sequence_length, raw_data.shape[-1]))
    x = tf.keras.layers.GRU(32, recurrent_dropout=0.5)(inputs)
    #x = tf.keras.layers.GRU(32, recurrent_dropout=0.5)(x)
    x = tf.keras.layers.Dropout(0.5)(x)
    outputs = tf.keras.layers.Dense(1)(x)
    
    return tf.keras.Model(inputs, outputs)
    
create_and_process_model('gru', create_gru)

In [None]:
test_best_model('gru')

### Reversed-order RNN

Before moving on, let's test the influence of reversing the input sequence, to see if the chronological order really matters.

In [None]:
reversed_timeseries_kwargs = {
    'data': raw_data[:-delay][::-1], # Don't include the last sequence
    'targets': temperature[delay:][::-1],
    'sampling_rate': sampling_rate,
    'sequence_length': sequence_length,
    'batch_size': batch_size, # Shuffle the samples between them, not inside a sample of course
    'shuffle': True,
}

reversed_train_data = tf.keras.utils.timeseries_dataset_from_array(
    **reversed_timeseries_kwargs,
    end_index=training_index,
)

In [None]:
create_and_process_model('reversed_lstm', create_lstm, training_data=reversed_train_data)

In [None]:
test_best_model('reversed_lstm')

It's not really convincing as-is: indeed, the predictions are more tied to most recent weather conditions than past ones.

The anti-chronological order could still provide valuable information if we combine it to the chronological order.

### Bidirectional RNN

In [None]:
def create_bidirectional():
    inputs = tf.keras.Input(shape=(sequence_length, raw_data.shape[-1]))
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, recurrent_dropout=0.5))(inputs)
    x = tf.keras.layers.Dropout(0.5)(x)
    outputs = tf.keras.layers.Dense(1)(x)
    
    return tf.keras.Model(inputs, outputs)
    
create_and_process_model('bidirectional', create_bidirectional)

In [None]:
test_best_model('bidirectional')

- This model is more complex, thus overfitting earlier
- The anti-chronological information is poluting the chronological one here: the past is way less important than the recent for this task

## Conclusion

We can't really achieve better performance than the baseline: we only have weather data at one point, however conditions are clearly influenced by other places.

Machine learning still is relevant to predict a future where the past contains information and can influence the rest.

## The final model

We will first seek for the best hyperparameters, by using KerasTuner.

In [56]:
def create_final(
        recurrent_dropout=0.6,
        units=32,
        l2_units=None,
        end_neural=True
    ):
    inputs = tf.keras.Input(shape=(sequence_length, raw_data.shape[-1]))
    x = tf.keras.layers.LSTM(units=units, recurrent_dropout=recurrent_dropout)(inputs)

    if l2_units is not None:
       x = tf.keras.layers.LSTM(units=l2_units, recurrent_dropout=recurrent_dropout)(inputs)

    x = tf.keras.layers.Dropout(0.5)(x)
    
    if end_neural:
        x = tf.keras.layers.Dense(16)(x)
        x = tf.keras.layers.Dropout(0.2)(x)

    outputs = tf.keras.layers.Dense(1)(x)
    
    return tf.keras.Model(inputs, outputs)


def search_final(hp):
    model = create_final(
        recurrent_dropout=hp.Float('recurrent_dropout', min_value=0.2, max_value=0.7, default=0.5, step=0.1),
        units=hp.Int('units', min_value=32, max_value=64, step=32),
        l2_units=hp.Int('units', min_value=32, max_value=64, step=32),
        end_neural=hp.Boolean('end_neural'),
    )
    learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')
    model.compile(
        optimizer=tf.keras.optimizers.RMSprop(learning_rate=learning_rate),
        loss='mse',
        metrics=['mae']
    )
    return model

In [54]:
tuner = keras_tuner.Hyperband(
    hypermodel=create_final,
    objective='val_mae',
    max_epochs=15,
    overwrite=True,
    directory='model_search',
    project_name='weather_forecasting'
)

tuner.search_space_summary()

date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2),
    tf.keras.callbacks.TensorBoard(log_dir=f'logs/fit/final--{date}', histogram_freq=1)
]

tuner.search(train_data, validation_data=val_data, callbacks=callbacks)
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

Trial 47 Complete [00h 06m 20s]
val_mae: 2.3841261863708496

Best val_mae So Far: 2.2665553092956543
Total elapsed time: 04h 27m 13s

Search: Running Trial #48

Value             |Best Value So Far |Hyperparameter
0.4               |0.6               |recurrent_dropout
64                |32                |units
True              |False             |l2
True              |True              |end_neural
0.00049269        |0.0045213         |learning_rate
6                 |6                 |tuner/epochs
0                 |2                 |tuner/initial_epoch
2                 |3                 |tuner/bracket
0                 |1                 |tuner/round

Epoch 1/6
[1m819/819[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 168ms/step - loss: 38.0526 - mae: 4.5436 - val_loss: 9.2990 - val_mae: 2.3805
Epoch 2/6
[1m819/819[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 166ms/step - loss: 15.3070 - mae: 3.0477 - val_loss: 9.1808 - val_mae: 2.3557
Epoch 3/6
[1m819/81

KeyboardInterrupt: 

Trial 47 Complete [00h 06m 20s]

val_mae: 2.3841261863708496


Best val_mae So Far: 2.2665553092956543

Total elapsed time: 04h 27m 13s


Search: Running Trial #48


| Value      | Best Value So Far | Hyperparameter      |
|------------|-------------------|---------------------|
| 0.4        | 0.6               | recurrent_dropout   |
| 64         | 32                | units               |
| True       | False             | l2                  |
| True       | True              | end_neural          |
| 0.00049269 | 0.0045213         | learning_rate       |
| 6          | 6                 | tuner/epochs        |
| 0          | 2                 | tuner/initial_epoch |
| 2          | 3                 | tuner/bracket       |
| 0          | 1                 | tuner/round         |

We can now use these parameters to create our model.

In [57]:
final_model = create_final(
    recurrent_dropout=0.6,
    units=32,
    l2_units=None,
    end_neural=True
)

compile_model(
    final_model, 
    learning_rate=45e-4,
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=['mae']
)
print(final_model.summary())
train_model(
    final_model,
    train_data,
    epochs=100,
    validation_data=val_data,
    name='final'
)

None
Epoch 1/100
[1m819/819[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 114ms/step - loss: 23.2959 - mae: 3.6314 - val_loss: 9.2714 - val_mae: 2.3586
Epoch 2/100
[1m819/819[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 124ms/step - loss: 14.9055 - mae: 2.9945 - val_loss: 10.0066 - val_mae: 2.4470
Epoch 3/100
[1m819/819[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 103ms/step - loss: 14.1885 - mae: 2.9214 - val_loss: 8.9559 - val_mae: 2.3192
Epoch 4/100
[1m819/819[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 79ms/step - loss: 13.7758 - mae: 2.8852 - val_loss: 9.6568 - val_mae: 2.4123
Epoch 5/100
[1m819/819[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 154ms/step - loss: 13.4369 - mae: 2.8447 - val_loss: 9.0105 - val_mae: 2.3274
Epoch 6/100
[1m819/819[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 107ms/step - loss: 13.2315 - mae: 2.8274 - val_loss: 8.7945 - val_mae: 2.2894
Epoch 7/100
[1m819/819[0m [32m━━━━━━━━━━━━━━━━━

KeyboardInterrupt: 

In [58]:
test_best_model('final')

[1m405/405[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 59ms/step - loss: 9.7388 - mae: 2.4380
MAE for final: 2.44
