In [59]:
import gc
from copy import deepcopy
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, TensorBoard
from keras import backend as K

from libs.util import random_mask
# from libs.pconv_model_first_resid import PConvUnet
# from libs.pconv_model_UNet import PConvUnet
from libs.pconv_model_first_resid_bak import PConvUnet

# from libs.pconv_model_UNet import PConvUnet
from libs.properties import properties
from keras.models import load_model  

# Settings
# Batch size shared by the training/validation generators and used to derive
# the number of steps per epoch further down.
MAX_BATCH_SIZE = 32

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [60]:
# Load project settings; "length" is the side length used below when each
# row of the traffic matrix is reshaped into a (length, length) grid and when
# the model is built (img_rows / img_cols).
properties_dict = properties()
length = properties_dict["length"]

In [61]:
# Read the traffic matrix; the first CSV column becomes the index and is
# parsed as dates so rows can be looked up by time offsets later on.
matrix_df = pd.read_csv('./data/trafficV_M.csv', index_col=0, parse_dates=True)

In [62]:
# matrix_df = np.array(matrix_df)

# print(matrix_df.shape)
# print("%d bytes" % (matrix_df.size * matrix_df.itemsize))
# print(np.isnan(matrix_df).sum())

In [63]:
# # Standalone test of week_ago and minute_ago
# week_delta = pd.Timedelta(1, unit='W')
# minute_delta = pd.Timedelta(15, unit='m')


# channel_num = 3
# smooth_time = channel_num-1
# set_up_time = week_delta

# train_df = matrix_df.truncate(before=matrix_df.index.min() + set_up_time)
# train_week_ago_df = matrix_df.loc[train_df.index - week_delta]
# train_minute_ago_df = matrix_df.loc[train_df.index - minute_delta]

# train_df = np.array(train_df).reshape(-1, length, length, 1)
# train_week_ago_df = np.array(train_week_ago_df).reshape(-1, length, length, 1)
# train_minute_ago_df = np.array(train_minute_ago_df).reshape(-1, length, length, 1)


# train_array = np.concatenate((train_df, train_minute_ago_df, train_week_ago_df), axis=3)
# X_train, X_val = train_test_split(train_array, test_size = 0.1, random_state=42)

In [64]:
def createTrainArray(week_history_num=0, minute_history_num=0):
    """Build the multi-channel training array from the global ``matrix_df``.

    Channel 0 holds the frame to be repaired; the following channels hold the
    reference history frames, first the weekly lags (1..week_history_num weeks
    back), then the 15-minute lags (15, 30, ... minutes back).

    Reads the module-level ``matrix_df`` (time-indexed frame) and ``length``
    (grid side length).

    Args:
        week_history_num: number of weekly-lag reference frames.
        minute_history_num: number of 15-minute-lag reference frames.

    Returns:
        numpy array of shape
        ``(n_samples, length, length, 1 + week_history_num + minute_history_num)``.
    """
    week_delta_list = [pd.Timedelta(i + 1, unit='W') for i in range(week_history_num)]
    minute_delta_list = [pd.Timedelta((i + 1) * 15, unit='m') for i in range(minute_history_num)]
    # Time offsets of the reference history frames.
    delta_list = week_delta_list + minute_delta_list
    print(delta_list)

    # Skip enough leading rows that every lagged timestamp still falls inside
    # the data. Using the largest requested lag (instead of only the weekly
    # count) keeps minute-only histories from looking up timestamps that
    # precede the first row.
    set_up_time = max(delta_list) if delta_list else pd.Timedelta(0)
    # Rebuild the data set according to how much history is required.
    train_df = matrix_df.truncate(before=matrix_df.index.min() + set_up_time)
    target_index = train_df.index

    train_ago_array_tuple = tuple(
        np.array(matrix_df.loc[target_index - delta]).reshape(-1, length, length, 1)
        for delta in delta_list
    )
    train_df = np.array(train_df).reshape(-1, length, length, 1)
    # Keep the frame to repair first and the history frames after it, matching
    # the channel layout of the masks produced by random_mask.
    train_array = np.concatenate((train_df,) + train_ago_array_tuple, axis=3)
    print(train_array.shape)
    return train_array

In [65]:
# How many weekly-lag and 15-minute-lag reference frames accompany each target frame.
week_history_num = 1
minute_history_num = 1

# One channel per history frame, plus one channel for the frame being repaired.
smooth_time = week_history_num + minute_history_num
channel_num = smooth_time + 1

train_array = createTrainArray(week_history_num, minute_history_num)
# Hold out 10% of the samples for validation (fixed seed for a repeatable split).
X_train, X_val = train_test_split(train_array, test_size=0.1, random_state=42)

[Timedelta('7 days 00:00:00'), Timedelta('0 days 00:15:00')]
(16704, 32, 32, 3)


In [66]:
# Number of full batches in the training and validation splits.
epoch_steps = len(X_train) // MAX_BATCH_SIZE
val_steps = len(X_val) // MAX_BATCH_SIZE
(epoch_steps, val_steps)

(469, 52)

In [67]:
# Mask settings forwarded to random_mask (defined in libs.util, not shown here).
# NOTE(review): exact meaning of each value depends on random_mask — confirm there.
rand_size = 25  # passed as `size=`
block_size = (30, 30)  # passed as `block_size=`
mask_type = 'rand'  # passed as `type=`
class DataGenerator(ImageDataGenerator):
    """ImageDataGenerator whose ``flow`` yields masked batches for inpainting.

    Mask parameters are read from the module-level ``rand_size``,
    ``channel_num``, ``smooth_time``, ``mask_type`` and ``block_size``.
    """

    def flow(self, X, *args, **kwargs):
        """Yield ``([masked, mask], target)`` batches indefinitely.

        Args:
            X: sample array handed to ``ImageDataGenerator.flow``.
            *args, **kwargs: forwarded unchanged to ``ImageDataGenerator.flow``
                (e.g. ``batch_size``).

        Yields:
            ``[masked, mask]`` as model inputs and the first channel of the
            original batch as the target.
        """
        # Build the underlying iterator once. Re-creating it for every batch
        # (as before) re-ran its setup on each step and only ever returned the
        # first batch of a freshly shuffled pass, so the data was never
        # traversed epoch by epoch. Note the last batch of a pass may now be
        # smaller than batch_size; the mask below is sized from the batch.
        batches = super().flow(X, *args, **kwargs)

        while True:
            # Get augmented image samples
            ori = next(batches)

            # Get one mask per image sample
            mask = np.stack(
                [
                    random_mask(
                        ori.shape[1],
                        ori.shape[2],
                        size=rand_size,
                        channels=channel_num,
                        smooth_time=smooth_time,
                        type=mask_type,
                        block_size=block_size,
                    )
                    for _ in range(ori.shape[0])
                ],
                axis=0,
            )

            # Apply masks: positions where mask == 0 are filled with the mean
            # of the positions where mask == 1.
            masked = ori.copy()
            masked_mean = masked[mask == 1].mean()
            masked[mask == 0] = masked_mean

            # Yield ([masked, mask], ori) training batches; only the first
            # channel of the original batch is the target.
            gc.collect()
            yield [masked, mask], ori[:, :, :, :1]
            
# Create training generator
train_datagen = DataGenerator()
train_generator = train_datagen.flow(
    X_train, batch_size=MAX_BATCH_SIZE
)

# Create validation generator
val_datagen = DataGenerator()
val_generator = val_datagen.flow(
    X_val, batch_size=MAX_BATCH_SIZE
)


In [68]:
# np.array(next(train_generator)[1])[:,:,:,:1].shape
# reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=1, mode='auto')

# model = PConvUnet(img_rows=length, img_cols=length, channels=channel_num)
# optimizer = model.get_optimizer()
# def scheduler(epoch):
#     if epoch>1 and epoch % 1 == 0 and epoch != 0:
#         lr = K.get_value(optimizer.lr)
#         if lr>0.00005:
#             if epoch%4==0:
#                 K.set_value(optimizer.lr, lr * 0.6)
#             else:
#                 K.set_value(optimizer.lr, lr * 0.8)
#     print(K.get_value(optimizer.lr))
#     return K.get_value(optimizer.lr)

# reduce_lr = LearningRateScheduler(scheduler)

In [69]:
# Manual training: build the model and keep a handle on its optimizer so the
# learning-rate scheduler below can read and update the rate.
model = PConvUnet(img_rows=length, img_cols=length, channels=channel_num)
optimizer = model.get_optimizer()
def scheduler(epoch):
    """Decay the optimizer's learning rate in place and return the current rate.

    Epochs 0-2 leave the rate untouched. From epoch 3 on, while the rate is
    still above 1e-4, it is multiplied by 0.6 on epochs divisible by 3 and by
    0.8 on all other epochs. The resulting rate is printed and returned.
    Reads the module-level ``optimizer`` and the Keras backend ``K``.
    """
    # epoch is an integer index, so "epoch > 2" alone decides whether decay applies.
    if epoch > 2:
        current_lr = K.get_value(optimizer.lr)
        if current_lr > 0.0001:
            factor = 0.6 if epoch % 3 == 0 else 0.8
            K.set_value(optimizer.lr, current_lr * factor)

    new_lr = K.get_value(optimizer.lr)
    print(new_lr)
    return new_lr

# Wrap the schedule in a Keras callback so it is invoked with the epoch index during training.
reduce_lr = LearningRateScheduler(scheduler)

In [70]:
# Manual training: 20 epochs with the learning-rate schedule attached.
# NOTE(review): `fit` is PConvUnet's own method (defined in libs, not shown) —
# confirm its argument semantics there.
model.fit(
    train_generator, 
    validation_data=val_generator,
    steps_per_epoch = epoch_steps,
    validation_steps = val_steps,
    epochs = 20,
    callbacks=[reduce_lr]
)

Epoch 1/1
0.004
Epoch 2/2
0.004
Epoch 3/3
0.004
Epoch 4/4
0.0024
Epoch 5/5
0.0019200001
Epoch 6/6
0.001536
Epoch 7/7
0.00092160003
Epoch 8/8
0.00073728
Epoch 9/9
0.000589824
Epoch 10/10
0.0003538944
Epoch 11/11
0.0002831155
Epoch 12/12
0.00022649241
Epoch 13/13
0.00013589545
Epoch 14/14
0.000108716355
Epoch 15/15
8.697309e-05
Epoch 16/16
8.697309e-05
Epoch 17/17
8.697309e-05
Epoch 18/18
8.697309e-05
Epoch 19/19
8.697309e-05
Epoch 20/20
8.697309e-05


In [71]:
# Persist the trained weights; the commented line below is the matching reload call.
model.save_weights('./model/bmodel_weights_random_25_2_20e.h5')
# model.load_weights('.model//bmodel_weights_random_25_20e.h5')

In [72]:
def l2(y_true, y_pred):
    """Squared error averaged over axis 0, then summed over all remaining positions."""
    residual = y_true - y_pred
    mean_sq_per_position = np.mean(np.square(residual), axis=0)
    return np.sum(mean_sq_per_position)

def l1(y_true, y_pred):
    """Absolute error averaged over axis 0, then summed over all remaining positions."""
    residual = y_true - y_pred
    mean_abs_per_position = np.mean(np.abs(residual), axis=0)
    return np.sum(mean_abs_per_position)

In [84]:
# Draw one random mask per validation sample, using the same settings as the generators.
mask_shape = X_val.shape
mask = np.stack(
    [
        random_mask(
            mask_shape[1],
            mask_shape[2],
            size=rand_size,
            channels=channel_num,
            smooth_time=smooth_time,
            type=mask_type,
            block_size=block_size,
        )
        for _ in range(mask_shape[0])
    ],
    axis=0,
)

# Fill positions where mask == 0 with the mean of the positions where mask == 1.
masked = deepcopy(X_val)
masked_mean = masked[mask == 1].mean()
masked[mask == 0] = masked_mean

y_pred = model.predict([masked, mask])
y_true = X_val[:, :, :, :1]

# Evaluate L2 only on the missing entries: zero out every position of the
# first channel where the mask is 1.
hole = 1 - mask[:, :, :, :1]
y_true = hole * y_true
y_pred = hole * y_pred

l2(y_true, y_pred)

246009.44414879373

In [74]:
# Smallest per-sample absolute error: reduce channel, then width, then height, then take the minimum.
abs_err = np.abs(y_true - y_pred)
per_sample_l1 = np.sum(np.sum(np.sum(abs_err, axis=3), axis=2), axis=1)
np.min(per_sample_l1)

1131.7269773626324

In [75]:
# model.summary()

In [76]:
# list = []
# for _ in range(20):
#     list.append(model.evaluate_generator(val_generator, 1))
# list
# # model.evaluate_generator(val_generator, 100)

In [77]:
# import random
# test_num = random.randint(0, 200)
# test = deepcopy(X_train[test_num,np.newaxis,:])

# test_mask = random_mask(test.shape[1], test.shape[2], size=0.1, channels=channel_num, smooth_time=smooth_time)
# test_mask = test_mask[np.newaxis,:]

# test_mask[0,:,:,0].shape
# test[test_mask==0] = test.mean()

# # test_mask.shape
# # plt.imshow(test[0,:,:,0]*255)
# test_res = model.predict([test, test_mask])
# # np.sum((test-test_res)**2)
# np.sum((test[0,:,:,0][test_mask[0,:,:,0] == 0] - test_res[0,:,:,0][test_mask[0,:,:,0] == 0])**2)