How can I regularly save Keras models during training?

To ensure the ability to recover from an interrupted training run at any time (fault tolerance),
you should use a callback that regularly saves your model to disk.

You should also set up your code to optionally reload that model at startup.

Here's a simple example.

In [None]:
import os
from tensorflow import keras

# Prepare a directory to store all the checkpoints.
# exist_ok=True makes this a no-op when the directory is already present and
# avoids the check-then-create race of testing os.path.exists() first.
checkpoint_dir = './ckpt'
os.makedirs(checkpoint_dir, exist_ok=True)


def make_model():
    """Build and compile a brand-new linear regression model.

    The model is a Sequential stack holding a single Dense layer with one
    unit, compiled with the 'adam' optimizer and the 'mse' loss.

    Returns:
        The compiled keras.Sequential model.
    """
    layers = [keras.layers.Dense(1)]
    regressor = keras.Sequential(layers)
    regressor.compile(loss='mse', optimizer='adam')
    return regressor


def make_or_restore_model():
    """Return the most recent checkpointed model, or a new one if none exists.

    Every entry found in ``checkpoint_dir`` is treated as a checkpoint. When
    at least one is present, the entry with the greatest ``os.path.getctime``
    value is loaded with ``keras.models.load_model``; otherwise a fresh model
    is built with ``make_model``.

    Returns:
        The restored model, or a newly created one.
    """
    saved = [f'{checkpoint_dir}/{entry}' for entry in os.listdir(checkpoint_dir)]

    # Nothing saved yet: start from scratch.
    if not saved:
        print('Creating a new model')
        return make_model()

    # NOTE: getctime is creation time on Windows but the last metadata change
    # time on Unix, so "latest" follows whichever the platform reports.
    newest = max(saved, key=os.path.getctime)
    print('Restoring from', newest)
    return keras.models.load_model(newest)


# Pick up from the newest checkpoint when there is one; otherwise start with
# a freshly built model.
model = make_or_restore_model()

# A single checkpointing callback:
#   * save_freq=100 -> a checkpoint is written every 100 batches.
#   * the {loss:.2f} placeholder puts the training loss in the folder name.
callbacks = [
    keras.callbacks.ModelCheckpoint(
        save_freq=100,
        filepath=checkpoint_dir + '/ckpt-loss={loss:.2f}',
    ),
]

# NOTE: train_data is not defined in this snippet; it must be supplied by the
# surrounding code before this line runs.
model.fit(train_data, callbacks=callbacks, epochs=10)

Reference:
https://keras.io/getting_started/faq/#how-can-i-regularly-save-keras-models-during-training