In [None]:
# -*- coding: utf-8 -*-
""" 
Usage:
    python trainval.py -h
"""
from __future__ import print_function
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import numpy as np
import tensorflow as tf
try:
    import cPickle as pickle
except ImportError:
    import pickle
import pandas as pd
import math
from datetime import datetime
from keras.callbacks import EarlyStopping, ModelCheckpoint,TensorBoard,LearningRateScheduler,Callback
from utils.dataset import load_data
import models
from keras.optimizers import Adam, SGD
from keras.utils import plot_model as plot
from keras.utils import multi_gpu_model

import utils
import utils.metrics as Metrics

from keras import backend as K

# Fix the NumPy random seed so runs are reproducible (comment out the next line for non-deterministic runs).
np.random.seed(1337)

import argparse

def _str2bool(value):
    """Convert a command-line token such as 'true'/'false' into a real bool.

    ``type=bool`` must not be used with argparse: ``bool('False')`` is ``True``,
    so every non-empty value would switch the option on.

    Args:
        value: the raw command-line string (a bool is passed through untouched).

    Returns:
        True or False.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays False, as it was with ``type=bool``.
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))


# Command-line options of the training script.
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, help='model to train and eval')
parser.add_argument('--lr', type=float, default=0.001, help='learing rate')
parser.add_argument('--batch_size', type=int, default=32, help='batch size')
parser.add_argument('--seq_len', type=int, default=5, help='length of import sequence')
parser.add_argument('--pre_train', type=_str2bool, default=False, help='whether to load weights file or not')
parser.add_argument('--weights', type=str, help='weights file to load')
parser.add_argument('--gpus', type=str, help='gpus to use, auto parallelize')


def get_tensorboard(path):
    """Return a TensorBoard callback that writes its logs under ``path``."""
    return TensorBoard(log_dir=path)

def save_file(file, path):
    """Copy a source file to ``path``, mapping a ``.pyc`` name back to ``.py``.

    The copy is done with ``shutil.copy`` instead of a shell ``cp`` command
    assembled by string joining, so file names containing spaces or shell
    metacharacters are handled correctly and no shell is involved.

    Args:
        file: path of the file to copy; every ``.pyc`` occurrence in it is
            replaced by ``.py`` before copying.
        path: destination file or directory.

    Raises:
        OSError / IOError: if the copy fails (the previous implementation
            signalled failure through an ``assert`` on the ``cp`` exit code,
            which is skipped entirely when Python runs with ``-O``).
    """
    import shutil  # local import keeps this helper self-contained

    source = file.replace(".pyc", ".py")
    shutil.copy(source, path)

def get_decay(base_lr):
    """Build a step-decay schedule around ``base_lr``.

    The returned callable maps an epoch index to a learning rate:
    ``base_lr`` for epochs below 200 and ``base_lr * 0.1`` from epoch 200 on.
    """
    switch_epoch = 200

    def step_decay_lr(epoch):
        return base_lr if epoch < switch_epoch else base_lr * 0.1

    return step_decay_lr


def show_score(odmax, score, stage):
    """Print the first five entries of an evaluation result on two lines.

    Args:
        odmax: accepted for signature compatibility; not used here.
        score: indexable with at least five numbers. Presumably
            [loss, rmse, mape, o_rmse, o_mape] as returned by ``evaluate``
            with the metrics configured in this file.
        stage: label placed in front of the first line (e.g. "train").
    """
    first_line = stage + ' score: %.6f rmse (real): %.6f mape: %.6f' % (
        score[0], score[1], score[2])
    print(first_line)

    second_line = 'origin rmse (real): %.6f mape: %.6f' % (score[3], score[4])
    print(second_line)

class SGDLearningRateTracker(Callback):
    """Keras callback that prints the optimizer's learning rate after every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Read ``optimizer.lr`` from the attached model and print it.

        Args:
            epoch: index of the epoch that just finished (unused).
            logs: metrics dict supplied by Keras (unused). Defaults to ``None``
                rather than a shared mutable ``{}``.
        """
        optimizer = self.model.optimizer
        lr = float(K.get_value(optimizer.lr))
        print('LR: {:.6f}'.format(lr))


def rmse(a, b):
    """``Metrics.rmse(a, b)`` multiplied by 241.0 and then divided by 2.0."""
    scaled = Metrics.rmse(a, b) * 241.0
    return scaled / 2.0
def o_rmse(a, b):
    """``Metrics.o_rmse(a, b)`` multiplied by 241.0 and then divided by 2.0."""
    scaled = Metrics.o_rmse(a, b) * 241.0
    return scaled / 2.0

def train(model, lr, batch_size, seq_len, pre_train, weights, DEMODEL):
    """Train one network with a fixed learning rate and evaluate the best checkpoint.

    Args:
        model: not used; the name is immediately rebound to the network built
            by ``DEMODEL.build_model``. Kept so existing callers keep working.
        lr: learning rate passed to the Adam optimizer.
        batch_size: batch size for fitting and evaluation.
        seq_len: number of time steps per sample; forwarded to ``load_data``
            and to ``build_model`` as ``timestep``.
        pre_train: when it compares equal to ``True``, weights are loaded from
            ``weights`` after the model has been compiled.
        weights: path of the weights file used when ``pre_train`` is set.
        DEMODEL: object exposing ``build_model(**kwargs)`` and ``get_loss()``.

    Side effects:
        Creates ``TRAIN/<month_day_hour_min_sec>/`` and writes the network
        plot, the final and best weights, the training history pickle and
        (optionally) TensorBoard logs into it. Progress is printed to stdout.

    Raises:
        KeyError: if the ``CUDA_VISIBLE_DEVICES`` environment variable is unset.
    """
    odmax = 241
    use_tensorboard = True
    gpu_count = len(os.environ["CUDA_VISIBLE_DEVICES"].split(','))
    parallel = gpu_count != 1

    nb_epoch = 200       # number of epoch at training stage
    nb_epoch_cont = 500  # number of epoch at continued training stage

    T = 48  # number of time intervals in one day
    m_patience = 20  # early-stopping patience, in epochs
    timestep = seq_len #5
    map_height, map_width = 15, 5  # grid size

    days_test = 60

    pt = datetime.now().strftime('%m_%d_%H_%M_%S')
    path_model = 'TRAIN/' + pt
    if os.path.isdir(path_model) is False:
        os.makedirs(path_model)
    print("Exp: " + path_model)

    # load data
    print("loading data...")
    # Shapes expected by the original author:
    #   X       = (sample, timestep, map_height * map_width, map_height, map_width)
    #   Y       = (sample, map_height * map_width, map_height, map_width)
    #   weather = (sample, timestep, ?)
    #   meta    = (sample, timestep, ?)
    # The meta data is not used in this work, but its effect can be explored in future work.
    X, Y, weather, meta = load_data(odmax, timestep)
    len_test = T * days_test

    print("nb_epoch: " + str(nb_epoch) + " nb_epoch_cont: " + str(nb_epoch_cont) + " batch_size: " + str(batch_size))
    print("patience: " + str(m_patience) + " lr: " + str(lr) + " seq_len: " + str(timestep))# + '-' + str(len_period) + '-' + str(len_trend))
    print("odmax: " + str(odmax))
    print("{} sample totally. {} for train, {} for test".format(X.shape[0], X.shape[0] - len_test, len_test))

    # The last len_test samples are held out; they also serve as validation data below.
    X_train, X_test = X[:-len_test], X[-len_test:]
    Y_train, Y_test = Y[:-len_test], Y[-len_test:]
    weather_train, weather_test = weather[:-len_test], weather[-len_test:]
    meta_train, meta_test = meta[:-len_test], meta[-len_test:]

    X_train = [X_train, weather_train, meta_train]
    X_test = [X_test, weather_test, meta_test]

    # ------------------------------------------------------------------
    # First, we train our model with a fixed learning rate
    # ------------------------------------------------------------------

    model_para = {
        "timestep": timestep,
        "map_height": map_height,
        "map_width": map_width,
        "weather_dim": weather.shape[2],
        "meta_dim": meta.shape[2],
    }
    # Build the base model once; wrap it for multi-GPU training only when
    # more than one device is visible.
    model = DEMODEL.build_model(**model_para)
    plot(model, to_file=os.path.join(path_model,'networks.png'), show_shapes=True)
    model.summary()
    train_model = multi_gpu_model(model, gpu_count) if parallel else model

    # use the loss define in the model
    loss = DEMODEL.get_loss()
    optimizer = Adam(lr=lr)

    metrics = [ rmse, Metrics.mape,  \
                o_rmse, Metrics.o_mape,
                ]
    train_model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

    # load weights to the pre_train model after model compiled
    if pre_train == True:
        model.load_weights(weights, by_name=True, skip_mismatch=True)

    # define callbacks on training
    callbacks = []

    hyperparams_name = 'timestep{}.lr{}'.format(timestep, lr)
    fname_param = os.path.join(path_model, hyperparams_name + '.best.h5')
    lr_logger = SGDLearningRateTracker() # log out the learning rate after a epoch trained
    callbacks.append(lr_logger)
    callbacks.append(EarlyStopping(monitor='val_rmse', patience=m_patience, mode='min'))
    callbacks.append(ModelCheckpoint(
        fname_param, monitor='val_mape', verbose=0, save_best_only=True, mode='min'))

    if use_tensorboard:
        callbacks.append(get_tensorboard(path_model+"/tensorboard-1/"))

    print('=' * 10)
    print("training model...")
    # `epochs` is the Keras 2 keyword (the legacy `nb_epoch` only works via a
    # deprecation shim); this matches the script version later in this file.
    history = train_model.fit(X_train, Y_train,
                        epochs=nb_epoch,
                        batch_size=batch_size,
                        validation_data=(X_test, Y_test),
                        callbacks=callbacks,
                        verbose=1)

    model.save_weights(os.path.join(
        path_model, '{}.h5'.format(hyperparams_name)), overwrite=True)
    # Reload the best checkpoint and re-save it through the base model so the
    # file holds the base model's weights rather than the parallel wrapper's.
    train_model.load_weights(fname_param)
    model.save_weights(fname_param, overwrite=True)
    # Use a context manager so the history file is flushed and closed.
    history_path = os.path.join(
        path_model, '{}.history.pkl'.format(hyperparams_name))
    with open(history_path, 'wb') as history_file:
        pickle.dump((history.history), history_file)

    print('evaluating using the model that has the best model on the valid set')

    model.load_weights(fname_param)
    
    score = train_model.evaluate(X_train, Y_train, batch_size=batch_size, verbose=0)
    show_score(odmax, score, "train")
    score = train_model.evaluate(
        X_test, Y_test, batch_size=batch_size, verbose=0)
    show_score(odmax, score, "Test")

    print('=' * 10)

    # ------------------------------------------------------------------
    # Second, we train our model with step_decay learning rate
    # (currently disabled: the code below is kept inside a string literal)
    # ------------------------------------------------------------------

    """
    # clear session to rebuild the model, in order to switch optimizor
    K.clear_session()
    DEMODEL.clear_graph()

    # rebuild the model
    if parallel:
        model = DEMODEL.build_model(**model_para)
        train_model = multi_gpu_model(model, gpu_count)
    else:
        model = DEMODEL.build_model(**model_para)
        train_model = model

    loss = DEMODEL.get_loss()
    optimizer = Adam(lr=lr)
    metrics = [ rmse, Metrics.mape, \
                o_rmse, Metrics.o_mape,
                ]
    train_model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    model.load_weights(fname_param)

    fname_param_step =  os.path.join(
        path_model, \
        hyperparams_name + '.cont.best.h5.{epoch:03d}-{val_mape:.4f}-{val_rmse:.4f}-{val_o_mape:.4f}-{val_o_rmse:.4f}')
    callbacks_cont = []
    #lr_logger = SGDLearningRateTracker()


    # callbacks_cont.append(lr_logger)
    callbacks_cont.append(LearningRateScheduler(get_decay(lr)))
    callbacks_cont.append(ModelCheckpoint(
        fname_param_step, monitor='val_mape', verbose=0, save_best_only=False, period=1, save_weights_only=True, mode='min'))
    if use_tensorboard:
        callbacks_cont.append(get_tensorboard(path_model+"/tensorboard-2/"))

    history = train_model.fit(X_train, Y_train,
                        nb_epoch=nb_epoch_cont, 
                        batch_size=batch_size,
                        callbacks=callbacks_cont, 
                        validation_data=(X_test, Y_test),
                        verbose=1)

    pickle.dump((history.history), open(os.path.join(
        path_model, '{}.cont.history.pkl'.format(hyperparams_name)), 'wb'))
    model.save_weights(os.path.join(
        path_model, '{}_cont.h5'.format(hyperparams_name)), overwrite=True)
    model.load_weights(fname_param)
    model.save_weights(fname_param, overwrite=True) # save the origin model weights instead of the paralleled one

    print('=' * 10)
    print('evaluating using the final model')
    score = train_model.evaluate(X_train, Y_train, batch_size=32, verbose=0)
    show_score(odmax, score, "train")
    score = train_model.evaluate(
        X_test, Y_test, batch_size=32, verbose=0)
    show_score(odmax, score, "test")
    """
    
# Experiment settings (notebook stand-in for the command-line arguments).
model = "CSTN"      # name of the sub-module of `models` that defines the network
lr = 0.001
batch_size = 32
gpus = '0'          # comma-separated device ids exported as CUDA_VISIBLE_DEVICES
seq_len=5
pre_train=False
weights=None

os.environ["HDF5_DISABLE_VERSION_CHECK"]='1'
os.environ["CUDA_VISIBLE_DEVICES"] = gpus

# Import `models.<model>` by name. importlib replaces the former
# exec("import models.<model> as DEMODEL"), which executed a string built from
# a variable; the resulting DEMODEL binding is the same sub-module object.
import importlib
DEMODEL = importlib.import_module("models.%s" % (model))

# train(model, lr, batch_size, seq_len, pre_train, weights, DEMODEL)

# ---- experiment constants ----
odmax = 241
use_tensorboard = True
gpu_count = len(os.environ["CUDA_VISIBLE_DEVICES"].split(','))
parallel = gpu_count != 1   # anything other than exactly one listed device

nb_epoch = 200       # number of epoch at training stage
nb_epoch_cont = 500  # number of epoch at continued training stage

T = 48               # number of time intervals in one day
m_patience = 20      # patience value handed to EarlyStopping
timestep = seq_len   # 5
map_height, map_width = 15, 5  # grid size

days_test = 60

# ---- output directory, named after the current time ----
pt = datetime.now().strftime('%m_%d_%H_%M_%S')
path_model = 'TRAIN/' + pt
if not os.path.isdir(path_model):
    os.makedirs(path_model)
print("Exp: " + path_model)

# ---- load data ----
print("loading data...")
# Shapes expected by the original author:
#   X       = (sample, timestep, map_height * map_width, map_height, map_width)
#   Y       = (sample, map_height * map_width, map_height, map_width)
#   weather = (sample, timestep, ?)
#   meta    = (sample, timestep, ?)
# The meta data is not used in this work, but its effect can be explored in future work.
X, Y, weather, meta = load_data(odmax, timestep)
len_test = T * days_test

print("nb_epoch: {} nb_epoch_cont: {} batch_size: {}".format(nb_epoch, nb_epoch_cont, batch_size))
print("patience: {} lr: {} seq_len: {}".format(m_patience, lr, timestep))
print("odmax: {}".format(odmax))
print("{} sample totally. {} for train, {} for test".format(X.shape[0], X.shape[0] - len_test, len_test))

# ---- hold out the last len_test samples ----
head_part = slice(None, -len_test)
tail_part = slice(-len_test, None)

X_train, X_test = X[head_part], X[tail_part]
Y_train, Y_test = Y[head_part], Y[tail_part]
weather_train, weather_test = weather[head_part], weather[tail_part]
meta_train, meta_test = meta[head_part], meta[tail_part]

# The network takes three inputs: the sequence, the weather and the meta features.
X_train = [X_train, weather_train, meta_train]
X_test = [X_test, weather_test, meta_test]



# ----------------------------------------------------------------------
# First, we train our model with a fixed learning rate
# ----------------------------------------------------------------------

model_para = dict(
    timestep=timestep,
    map_height=map_height,
    map_width=map_width,
    weather_dim=weather.shape[2],
    meta_dim=meta.shape[2],
)

# Build the base model, then wrap it for multi-GPU training only when needed.
model = DEMODEL.build_model(**model_para)
plot(model, to_file=os.path.join(path_model,'networks.png'), show_shapes=True)
model.summary()
train_model = multi_gpu_model(model, gpu_count) if parallel else model

# Use the loss defined alongside the model.
loss = DEMODEL.get_loss()
optimizer = Adam(lr=lr)

metrics = [rmse, Metrics.mape, o_rmse, Metrics.o_mape]
train_model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

# Pre-trained weights are loaded into the base model after compilation.
if pre_train == True:
    model.load_weights(weights, by_name=True, skip_mismatch=True)

# Callbacks for the fixed-learning-rate stage.
hyperparams_name = 'timestep{}.lr{}'.format(timestep, lr)
fname_param = os.path.join(path_model, hyperparams_name + '.best.h5')
lr_logger = SGDLearningRateTracker()  # prints the learning rate after each epoch
callbacks = [
    lr_logger,
    # Early stopping watches val_rmse while the checkpoint keeps the best val_mape.
    EarlyStopping(monitor='val_rmse', patience=m_patience, mode='min'),
    ModelCheckpoint(
        fname_param, monitor='val_mape', verbose=0, save_best_only=True, mode='min'),
]

if use_tensorboard:
    callbacks.append(get_tensorboard(path_model+"/tensorboard-1/"))

print('=' * 10)
print("training model...")
# The held-out tail of the data is used as validation data during fitting.
history = train_model.fit(X_train, Y_train,
                    epochs=nb_epoch,
                    batch_size=batch_size,
                    validation_data=(X_test, Y_test),
                    callbacks=callbacks,
                    verbose=1)

# Weights at the end of training.
model.save_weights(os.path.join(
    path_model, '{}.h5'.format(hyperparams_name)), overwrite=True)
# Reload the best checkpoint and re-save it through the base model so the file
# holds the base model's weights rather than the parallel wrapper's.
train_model.load_weights(fname_param)
model.save_weights(fname_param, overwrite=True)
# A context manager guarantees the history file is flushed and closed
# (the handle was previously left open).
history_path = os.path.join(
    path_model, '{}.history.pkl'.format(hyperparams_name))
with open(history_path, 'wb') as history_file:
    pickle.dump((history.history), history_file)

print('evaluating using the model that has the best model on the valid set')

model.load_weights(fname_param)

score = train_model.evaluate(X_train, Y_train, batch_size=batch_size, verbose=0)
show_score(odmax, score, "train")
score = train_model.evaluate(
    X_test, Y_test, batch_size=batch_size, verbose=0)
show_score(odmax, score, "Test")

print('=' * 10)


In [29]:
def static_var(varname, value):
    """Decorator factory that stores ``value`` on the decorated function as attribute ``varname``.

    The decorated function itself is returned unchanged.
    """
    def attach(target):
        setattr(target, varname, value)
        return target

    return attach

@static_var("gcc_layers", {1})
def test():
    """Return a function that registers ``b + 1`` in ``test.gcc_layers``.

    The set lives on the function object itself. A value that is not yet
    present is added by rebinding the attribute to a new, larger set, and the
    updated set is printed; a known value is ignored silently.
    """
    def register(layer):
        if layer in test.gcc_layers:
            return
        test.gcc_layers = test.gcc_layers.union({layer})
        print(test.gcc_layers)

    def register_next(b):
        return register(b + 1)

    return register_next



a=test()

In [5]:
!nvidia-smi

Thu Sep 17 03:38:22 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.31       Driver Version: 440.31       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P40           Off  | 00000000:06:00.0 Off |                  Off |
| N/A   44C    P0    50W / 250W |  23191MiB / 24451MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P40           Off  | 00000000:2F:00.0 Off |                    0 |
| N/A   25C    P0    48W / 250W |      0MiB / 22919MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla P40           Off  | 00000000:86:00.0 Off |                    0 |
| N/A   

# CST-GCN

### 1. 邻接矩阵

In [2]:
class GraphConstruct():
    """Hop distances and normalised adjacency matrices of an undirected graph.

    Attributes:
        num_nodes, edge, max_hop, strategy: the constructor arguments, stored as given.
        hop_distance: (num_nodes, num_nodes) float array; entry [i, j] is the
            smallest number of steps (0..max_hop) in which j can be reached
            from i, or ``inf`` if it is not reachable within ``max_hop`` steps.
        A: the stack returned by ``get_adjacency`` with its first slice removed.
    """

    def __init__(self,
                 num_nodes,
                 edge,
                 max_hop=40,
                 strategy='distance'):
        self.num_nodes = num_nodes
        self.edge = edge
        self.max_hop = max_hop
        self.strategy = strategy
        self.hop_distance = self.get_hop_distance(num_nodes, edge, max_hop=max_hop)
        # NOTE(review): max_hop is not forwarded, so get_adjacency runs with its
        # own default (max_hop=1) and, after the slice below, A holds a single
        # matrix for strategy='distance' and nothing for strategy='uniform'.
        # This looks unintended but is kept as-is because changing it alters
        # the shape of A - confirm with the author.
        self.A = self.get_adjacency(self.hop_distance, strategy=self.strategy)
        # Self-loops are very rare in a metro network, so the 0-hop slice is dropped.
        self.A = self.A[1:, ...]

    def get_hop_distance(self, num_node, edge, max_hop=1):
        """Compute the hop-distance matrix from an edge list.

        Args:
            num_node: number of nodes.
            edge: iterable of ``(i, j)`` index pairs; each pair is treated as
                an undirected connection.
            max_hop: largest distance that is resolved.

        Returns:
            (num_node, num_node) float array of shortest hop counts, with
            ``inf`` where no walk of at most ``max_hop`` steps exists.
        """
        A = np.zeros((num_node, num_node))
        for i, j in edge:
            A[j, i] = 1
            A[i, j] = 1

        hop_dis = np.full((num_node, num_node), np.inf)
        # reach[i, j] == 1 iff a walk of exactly d steps leads from i to j.
        # Re-binarising after every step keeps the entries bounded by num_node,
        # instead of forming A**d (whose walk counts grow exponentially and can
        # wrap around in integer arithmetic), and needs one product per hop.
        reach = np.eye(num_node)
        for d in range(max_hop + 1):
            first_seen = (reach > 0) & np.isinf(hop_dis)
            hop_dis[first_seen] = d
            reach = (np.dot(reach, A) > 0).astype(float)
        return hop_dis

    def normalize_digraph(self, A):
        """Return ``Dn . A`` where ``Dn`` is diagonal with the inverse column sums of ``A``.

        Columns whose sum is not positive get a zero on the diagonal.
        """
        Dl = np.sum(A, 0)
        num_node = A.shape[0]
        Dn = np.zeros((num_node, num_node))
        for i in range(num_node):
            if Dl[i] > 0:
                Dn[i, i] = Dl[i] ** (-1)

        DA = np.dot(Dn, A)
        return DA

    def get_adjacency(self, hop_dis, strategy, max_hop=1):
        """Split the normalised adjacency according to ``strategy``.

        Args:
            hop_dis: square hop-distance array as returned by ``get_hop_distance``.
            strategy: ``'uniform'`` for one shared matrix, or ``'distance'``
                for one matrix per hop value in ``0..max_hop``.
            max_hop: largest hop value treated as connected.

        Returns:
            Array of shape (1, n, n) for ``'uniform'`` or (max_hop + 1, n, n)
            for ``'distance'``; slice ``i`` of the latter holds the normalised
            weights of the node pairs that are exactly ``i`` hops apart.

        Raises:
            ValueError: if ``strategy`` is not one of the two supported values.
        """
        # Take the size from the input itself; this used to read a notebook-level
        # global named `num_nodes`, which fails outside the notebook and is wrong
        # whenever that global differs from this graph.
        num_nodes = np.shape(hop_dis)[0]

        valid_hop = range(0, max_hop + 1)
        adjacency = np.zeros((num_nodes, num_nodes))
        for hop in valid_hop:
            adjacency[hop_dis == hop] = 1

        normalize_adjacency = self.normalize_digraph(adjacency)

        # 1. one adjacency matrix shared by all hops
        if strategy == 'uniform':
            A = np.zeros((1, num_nodes, num_nodes))
            A[0] = normalize_adjacency

        # 2. one matrix per distance: A[i][p, q] is non-zero only when
        #    stations p and q are exactly i hops apart
        elif strategy == 'distance':
            A = np.zeros((len(valid_hop), num_nodes, num_nodes))
            for i, hop in enumerate(valid_hop):
                A[i][hop_dis == hop] = normalize_adjacency[hop_dis == hop]

        else:
            raise ValueError("unknown strategy: %r" % (strategy,))

        return A
        

import pandas as pd
import numpy as np

#reality
# num_nodes=20
# edge=[(0,18),(1,2),(2,3),(3,4),(4,5),(6,7),(7,8),(8,9),(10,11),(11,12),(12,13),(13,14),(11,15),(12,16),(16,17),
#      (18,3),(3,7),(7,11),(11,15),(19,4),(4,8),(8,12)]


# Station graph: 370 nodes, edge list read from edge.npy.
num_nodes = 370
edge = np.load("edge.npy")

Graph = GraphConstruct(num_nodes=num_nodes, edge=edge, max_hop=40, strategy='distance')
A = Graph.A

In [7]:
Graph.hop_distance

array([[ 0.,  1.,  2., ..., 22., 23., 23.],
       [ 1.,  0.,  1., ..., 21., 22., 22.],
       [ 2.,  1.,  0., ..., 20., 21., 21.],
       ...,
       [22., 21., 20., ...,  0.,  1.,  1.],
       [23., 22., 21., ...,  1.,  0.,  1.],
       [23., 22., 21., ...,  1.,  1.,  0.]])

In [3]:
A.shape

(1, 370, 370)

### 2. 网络构造

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, datasets, models


import numpy as np


class CstGcn(tf.keras.Model):
    """Graph-convolution + ConvLSTM + correlation-block network.

    ``call`` runs a graph-convolution block on the input and on its transpose
    over the two node axes, sums both, then applies a ConvLSTM block and a
    correlation ("gcc") block.

    NOTE(review): every layer is instantiated inside the forward pass (see
    ``conv2d`` / ``tcn_block``) rather than in ``__init__``. Layers created
    this way may not be tracked as weights of the model - verify that
    ``trainable_weights`` is non-empty before relying on training results.

    Args:
        A: adjacency stack; ``gcn_res_block`` unpacks it as (k, h, w).
        gcn_fliters: filters per adjacency slice in the graph-convolution block
            (parameter name kept as spelled for compatibility).
        tcn_filters: number of ConvLSTM filters.
        gcc_filters: number of filters in the correlation block.
    """

    def __init__(self,A,gcn_fliters=6,tcn_filters=4,gcc_filters=2):
        super(CstGcn, self).__init__()
        self.A=A
        self.gcn_fliters=gcn_fliters
        self.tcn_filters=tcn_filters
        self.gcc_filters=gcc_filters
        
    
    def conv2d(self,x,num_filter,kernel_size=(1,1),activation=tf.nn.relu,use_bn=False):
        """Apply a fresh 'same'-padded, stride-1, channels-last Conv2D to ``x``.

        Args:
            x: input tensor.
            num_filter: number of output filters.
            kernel_size: convolution kernel size.
            activation: activation of the convolution; pass ``None`` for a
                linear layer. (This argument used to be ignored and ReLU was
                always applied.)
            use_bn: when True, a BatchNormalization layer is applied first.
                NOTE(review): it normalises axis=1 although the data format is
                channels_last - confirm this is intended.

        Returns:
            The convolved tensor.
        """
        if use_bn:
            x=layers.BatchNormalization(axis=1)(x)
        x=layers.Conv2D(
                        filters=num_filter,    
                        kernel_size=kernel_size,         
                        strides=(1, 1),
                        padding='same',
                        activation=activation,
                        data_format='channels_last'
                              )(x)

        return x


    def gcn_res_block(self,x,conv_filter,A):
        """Graph-convolution block.

        Args:
            x : (n,t,h,w,c) according to the original author
            conv_filter : filters per adjacency slice
            A : (k,h,w)
        return
            x : output of a final 1-filter convolution, so its last axis has
                size 1. The addition of the two branches below assumes
                h == w - TODO confirm.
        """   
        n,t,h,w,c=x.shape

        k,h,w = A.shape

        # forward feature when x, backwards when x=transpose(x)
        # Expand to k groups of conv_filter channels, then weight each group
        # with its adjacency slice and sum over the k groups.
        x=self.conv2d(x,conv_filter*k)
        x=tf.reshape(x,(-1,t,h,w,k,conv_filter))
        x=tf.einsum('nthwkf, khw -> nthwf',x,A)

        # Two single-filter branches with (1,h) and (h,1) kernels.
        x_o = self.conv2d(x,1,(1,h))
        x_d  = self.conv2d(x,1,(h,1))

        # Linear 1x1 projections to h channels, followed by axis permutations.
        x_o=self.conv2d(x_o, h,(1,1),activation=None)
        x_o=tf.transpose(x_o,(0,1,2,4,3))

        x_d=self.conv2d(x_d, h,(1,1),activation=None)
        x_d=tf.transpose(x_d,(0,1,3,4,2))

        x=tf.add(x_o,x_d)
        x=self.conv2d(x,1)


        return x


    def tcn_block(self,x,num_filter):
        """ConvLSTM2D (3x3 kernel, 'same' padding) followed by a 1-filter convolution.

        Args:
            x : (n,t,h,w,c) according to the original author
        return
            x : (n,h,w,1) according to the original author
        """

        x=layers.ConvLSTM2D(num_filter,kernel_size=(3,3), padding='same',data_format='channels_last',)(x)
        x=self.conv2d(x,1)

        return x



    def gcc_block(self,x,num_filter):
        """Correlation block with a residual connection.

        Two convolutions of the input are contracted against each other with
        an einsum, reduced to one filter and added to the input.

        Args:
            x : (n,h,w,1) according to the original author
        return
            x : (n,h,w,1) according to the original author
        """
        res=x
        x1=self.conv2d(x,num_filter)
        x2=self.conv2d(x,num_filter)
        x2=tf.transpose(x2,(0,2,1,3))

        x=tf.einsum('nhwf,nwqf -> nhqf',x1,x2)
        x=tf.add(self.conv2d(x,1),res)

        return x
    
    def call(self,x):
        """Forward pass: graph convolution (both orientations), ConvLSTM, correlation block."""
        
        #gcn
        x=self.gcn_res_block(x,self.gcn_fliters,self.A)

        # Same block on the input with its two node axes swapped.
        x_T=tf.transpose(x,(0,1,3,2,4))
        x_T=self.gcn_res_block(x_T,self.gcn_fliters,self.A)

        x=tf.add(x,x_T)
        x=tf.nn.relu(x)

        #tcn
        x=self.tcn_block(x,self.tcn_filters)

        #gcc
        x=self.gcc_block(x,self.gcc_filters)
        
        return x
    
# Eager mode costs performance but is convenient for live debugging; it is switched off here.
tf.config.run_functions_eagerly(False) 
tf.compat.v1.disable_eager_execution()
# tf.compat.v1.enable_eager_execution()
# tf.executing_eagerly()

In [10]:
timestep, nodes_num, adjacency_num, weather_dim = 5, 400, 40, 19

# Random stand-in data used to smoke-test the network.
# NOTE(review): the shapes below are hard-coded and do not use the constants
# above (370 nodes here vs nodes_num = 400); A also replaces any adjacency
# computed earlier in the notebook.
input_shape = (3, 5, 370, 370, 1)
adjacency_shape = (40, 370, 370)
target_shape = (3, 370, 370, 1)

x = np.random.rand(*input_shape)
A = np.random.rand(*adjacency_shape)

y = np.random.rand(*target_shape)

cstgcn = CstGcn(A)
cstgcn.compile(optimizer='rmsprop', loss=tf.keras.losses.MAE)

In [11]:
from tensorflow.keras.callbacks import TensorBoard
# Model name used to label the log directory.
model_name = "cstgcn"
# Each model logs to its own path below logs/.
log_dir = 'logs/{}'.format(model_name)
tensorboard = TensorBoard(log_dir=log_dir)
# Train with the TensorBoard callback attached.
cstgcn.fit(x, y, batch_size=3, epochs=10, validation_split=0.1, callbacks=[tensorboard])

Train on 2 samples, validate on 1 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f9868cf0b00>

In [12]:
cstgcn.evaluate(x,y)

0.49549251794815063

In [22]:
y_predict = cstgcn.predict(x[0:1])

In [27]:
y[0].squeeze()

array([[0.43937906, 0.82145971, 0.18346909, ..., 0.53482477, 0.7519584 ,
        0.21810749],
       [0.3060722 , 0.15277989, 0.98317237, ..., 0.24283907, 0.97829012,
        0.36325624],
       [0.46616538, 0.65773883, 0.21093615, ..., 0.44083631, 0.54602933,
        0.75992579],
       ...,
       [0.78135787, 0.52461333, 0.71817084, ..., 0.29784379, 0.2285722 ,
        0.25069173],
       [0.15512229, 0.27955503, 0.89679743, ..., 0.37042135, 0.10133943,
        0.30115823],
       [0.09663379, 0.14840225, 0.0379395 , ..., 0.09407211, 0.63912017,
        0.60298632]])

In [28]:
np.abs(y[0]-y_predict).mean()

0.4945583811595566

In [25]:
y_predict.squeeze()

array([[0.00550423, 0.        , 0.00303542, ..., 0.        , 0.        ,
        0.        ],
       [0.01350642, 0.        , 0.00223933, ..., 0.        , 0.        ,
        0.        ],
       [0.0032169 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.0216705 , 0.        , 0.00062624, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00140092],
       [0.01608675, 0.        , 0.00087253, ..., 0.        , 0.        ,
        0.0035792 ]], dtype=float32)

### TimeDistributed

In [74]:
# Strided 1x1 convolution applied to every time step through TimeDistributed.
# NOTE(review): the name `test` is also used for a function defined in an
# earlier cell of this notebook; this assignment replaces it.
test = layers.Conv2D(filters=1, kernel_size=(1,1), strides=(3, 3),
                     activation=tf.nn.relu, data_format='channels_last')

layers.TimeDistributed(test)(x)

<tf.Tensor 'time_distributed_1/Reshape_1:0' shape=(3, 5, 14, 14, 1) dtype=float32>