In [None]:
import numpy
import matplotlib.pyplot as plt
import pandas
import math
import os
import time
import glob
from tqdm import tqdm
import re
from keras.models import Sequential
from keras.optimizers import adam_v2
from keras.layers import Dense,Conv1D,Dropout,MaxPooling1D
from keras.layers import LSTM,Flatten,Reshape
from attention import Attention
from tcn import TCN, tcn_full_summary
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error,mean_squared_log_error
from tensorflow import compat,config
from keras.models import Model, load_model
from keras.callbacks import ModelCheckpoint,EarlyStopping

In [None]:
# Fix the random seeds for reproducibility.
seed = 7
numpy.random.seed(seed)
# Seeding NumPy alone does not touch TensorFlow's own global seed (the one the
# Keras/TensorFlow random ops draw from), so set that one as well.
compat.v1.set_random_seed(seed)

# define basic function
# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
    """Slice a 2-D series into (window, next value) pairs taken from column 0.

    Args:
        dataset: 2-D array; only column 0 is read.
        look_back: number of consecutive rows that form one input window.

    Returns:
        A tuple ``(X, Y)`` of numpy arrays. ``X[k]`` is ``dataset[k:k+look_back, 0]``
        and ``Y[k]`` is ``dataset[k+look_back, 0]``. The number of pairs is
        ``len(dataset) - look_back - 1``, i.e. the last possible window is not
        emitted; code that places predictions back on the time axis relies on
        exactly this count.
    """
    n_windows = len(dataset) - look_back - 1
    starts = range(n_windows)
    windows = [dataset[start:start + look_back, 0] for start in starts]
    targets = [dataset[start + look_back, 0] for start in starts]
    return numpy.array(windows), numpy.array(targets)

# RMSLE loss function
from keras import backend as K
def root_mean_squared_logarithmic_error(y_true, y_pred):
    """Keras loss: root mean squared logarithmic error over the last axis.

    Both tensors are clipped from below at ``K.epsilon()`` before ``log(x + 1)``
    is taken, so values at or below zero are treated as epsilon.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of predicted values.

    Returns:
        ``sqrt(mean((log(pred + 1) - log(true + 1)) ** 2, axis=-1))``.
    """
    def _log_plus_one(tensor):
        # Clip first so the logarithm never sees a non-positive argument.
        return K.log(K.clip(tensor, K.epsilon(), None) + 1.)

    squared_log_diff = K.square(_log_plus_one(y_pred) - _log_plus_one(y_true))
    return K.sqrt(K.mean(squared_log_diff, axis=-1))

In [None]:
# Load the first dataset; only the CSV column with index 1 is read.
dataframe = pandas.read_csv('Dataset/real_1.csv', usecols=[1], engine='python')
dataset = dataframe.values
# dataset = dataset.astype('float32')

# Scale the values into [0, 1]. The scaler is fitted on the whole file,
# i.e. before the train/test split below.
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)

# Chronological split: first 70 % of the rows for training, the rest for testing.
train_size = int(len(dataset) * 0.7)
test_size = len(dataset) - train_size
train = dataset[:train_size, :]
test = dataset[train_size:, :]
print(len(train), len(test))

'''set predict hyperparams'''
look_back = 20    # number of past time steps fed to the model
predict_step = 1  # single-step prediction; not referenced by the code

# Build supervised samples: X = look_back past values, Y = the value that follows.
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)
# Reshape the inputs to [samples, time steps, features].
trainX = trainX.reshape((trainX.shape[0], trainX.shape[1], 1))
testX = testX.reshape((testX.shape[0], testX.shape[1], 1))

# Show the first five training windows, one window per output line.
for sample_idx in range(5):
    for step_value in trainX[sample_idx]:
        print(step_value, end=' ')
    print('\n')
print(len(trainX), len(trainY))

# Plot the normalized series.
plt.plot(dataset)

# SimHei is selected so the Chinese labels render; unicode_minus is turned off
# alongside it.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
plt.xlabel("时间戳")
plt.ylabel("归一化流量值")
plt.title(" 预处理后的网络流量数据集")
plt.savefig('example', dpi=300)
plt.show()

In [None]:
# Load the remaining CSV files and append their samples to the train / test arrays.
# NOTE: this cell extends trainX/trainY/testX/testY in place, so it is not idempotent;
# re-run the previous cell before running this one a second time.
file_path = r'Dataset'  # set the path accordingly
# Read the CSV files in numeric order. The number is taken from the file's base name
# only, so digits in a directory name cannot disturb the ordering.
all_files = sorted(
    glob.glob(file_path + "/real_*.csv"),
    key=lambda x: int(re.findall("[0-9]+", os.path.basename(x))[0]),
)

'''load params'''
plot_len = []   # [train_size, test_size] of every file that is plotted later
len_ds = 0      # running total of rows read inside this loop
file_num = 10   # how many of the CSV files (the first one included) form the dataset
plot_num = 3    # how many per-file result figures are drawn

print(len(trainX), len(trainY))
for i, filename in enumerate(all_files):
    if i == 0:
        # The first file is already loaded (as `dataset`) and windowed by the previous
        # cell. Its split sizes are recomputed from `dataset` because this loop
        # overwrites train_size / test_size further down.
        first_train_size = int(len(dataset) * 0.7)
        plot_len.append([first_train_size, len(dataset) - first_train_size])
        continue
    if i >= file_num:
        break
    df = pandas.read_csv(filename, usecols=[1], engine='python')
    ds = df.values
    # Re-fits the shared scaler on this file: every file is normalized with its own
    # min/max, and after the loop `scaler` holds the parameters of the last file read.
    ds = scaler.fit_transform(ds)

    len_ds = len_ds + len(ds)
    print('len_ds:', len_ds, end='  ')

    train_size = int(len(ds) * 0.7)
    test_size = len(ds) - train_size
    train, test = ds[0:train_size, :], ds[train_size:len(ds), :]
    # Remember the split sizes of the files that will be plotted (indices below plot_num).
    if i < plot_num:
        plot_len.append([train_size, test_size])

    trainXt, trainYt = create_dataset(train, look_back)
    testXt, testYt = create_dataset(test, look_back)
    trainXt = numpy.reshape(trainXt, (trainXt.shape[0], trainXt.shape[1], 1))
    testXt = numpy.reshape(testXt, (testXt.shape[0], testXt.shape[1], 1))

    trainX = numpy.concatenate((trainX, trainXt))
    trainY = numpy.concatenate((trainY, trainYt))
    testX = numpy.concatenate((testX, testXt))
    testY = numpy.concatenate((testY, testYt))
    print(len(trainXt), len(trainYt), len(trainX), len(trainY))

print(plot_len)
print(look_back)


In [None]:
# Define the model: Conv1D front-end -> stack of dilated causal Conv1D layers ->
# Dense -> TCN -> Attention -> small regression head with a single output.
model = Sequential()
model.add(Conv1D(16, 5, padding='same', activation='relu',
                 input_shape=(look_back, 1), kernel_initializer='he_normal'))
dilation_rates = [2**i for i in range(12)]  # 1, 2, 4, ..., 2048
n_filters = 10
kernel_size = 5
for dilation_rate in dilation_rates:
    model.add(Conv1D(filters=n_filters,
                     kernel_size=kernel_size,
                     padding='causal',
                     dilation_rate=dilation_rate))
model.add(Dense(n_filters, activation='relu', kernel_initializer='he_normal'))
# 'same' / 'causal' padding keep the time axis at look_back steps, so the feature map
# holds look_back * n_filters values; fold them into a single time step for the TCN.
# The size is derived from look_back so that changing the window length does not
# break the model.
model.add(Reshape((1, look_back * n_filters)))
# return_sequences must be a real boolean (a non-empty string is merely truthy).
model.add(TCN(128, activation='tanh', return_sequences=True))
model.add(Attention(units=32))
model.add(Flatten())
model.add(Dense(32))
model.add(Dropout(0.5))
model.add(Dense(1))
model.compile(loss=root_mean_squared_logarithmic_error, optimizer='adam')
model.summary()
print(look_back)

In [None]:
# Training parameters and the output folder of this run.
epochs = 300
batch_size = 128
model_name = 'paper_bad'

t_start = time.time()
Time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(t_start)))
sTime = time.strftime("%m%d//%H%M_", time.localtime(int(t_start)))

# One folder per run: predict/<month+day>/<hour+minute>_<model_name>
path = 'predict/' + sTime + model_name
# exist_ok=True creates the folder if needed and is a no-op when it already exists,
# without a separate check-then-create step.
os.makedirs(path, exist_ok=True)
print('folder made')

# EarlyStopping arguments: monitor = watched quantity, min_delta = smaller changes count
# as no improvement, mode='min' = lower is better, patience = epochs without improvement
# before stopping, restore_best_weights = roll back to the best weights seen.
earlyStop = EarlyStopping(monitor='val_loss', min_delta=0, patience=50, mode='min', verbose=1, restore_best_weights = True)
# Hold out 10 % of the training samples for validation, stop early on val_loss and
# shuffle the training samples.
history = model.fit(
    trainX, trainY,
    callbacks=[earlyStop],
    epochs=epochs, batch_size=batch_size,
    validation_split=0.1,
    shuffle = True,
    verbose=1)
print("train finished")

# verbose: 0 = no log output, 1 = progress bar, 2 = one line per epoch

In [None]:
# Store the trained model inside this run's output folder.
model_file = path + '/model.h5'
model.save(model_file)

print(history.history.keys())

# One figure per recorded curve: the training loss first, then the validation loss.
# Each figure is saved into the run folder and then displayed.
for metric_name, figure_file in (('loss', '/loss.jpg'), ('val_loss', '/val_loss.jpg')):
    plt.plot(history.history[metric_name])
    plt.xlabel('epoch')
    plt.ylabel(metric_name)
    plt.savefig(path + figure_file)
    plt.show()

In [None]:
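# Optionally predict with a previously trained model instead of the one trained above.
# NOTE(review): loading presumably also needs Attention and the custom RMSLE loss in custom_objects (or compile=False) - verify before use.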
# model_path = r'D:\Gdesign\demo\predict\0510\2225_C+C+TCN_valtest\model.h5'
# model=load_model(model_path,custom_objects={'TCN':TCN})

In [None]:
# Predict on the concatenated training and test windows.
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)
# Map predictions and targets back through the scaler.
# NOTE(review): `scaler` is re-fitted for every CSV file that gets loaded, so this inverse
# transform applies the min/max of the most recently fitted file to all samples.
trainPredict = scaler.inverse_transform(trainPredict)
trainY_s = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY_s = scaler.inverse_transform([testY])

plt.plot(testPredict[:,0])
plt.plot(testY_s[0])
# Evaluate performance.
trainScore_rmse = math.sqrt(mean_squared_error(trainY_s[0], trainPredict[:,0]))
print('Train RMSE: %.4f' % (trainScore_rmse))
testScore_rmse = math.sqrt(mean_squared_error(testY_s[0], testPredict[:,0]))
print('Test RMSE: %.4f' % (testScore_rmse))

# Start from NaN so testScore_rmsle is always defined for the cells that report it,
# even when the metric cannot be computed.
testScore_rmsle = float('nan')
try:
    testScore_rmsle = math.sqrt(mean_squared_log_error(testY_s[0], testPredict[:,0]))
    print('RMSLE: %.4f' % (testScore_rmsle))
except ValueError:
    # mean_squared_log_error rejects negative inputs with a ValueError.
    print('RMSLE: Nan (negative prediction exists)')
except Exception:
    # Deliberately best-effort, but KeyboardInterrupt / SystemExit are no longer swallowed.
    print('RMSLE: Nan (unknown error)')

testScore_r2 = r2_score(testY_s[0], testPredict[:,0])
print('R2: %.4f' % (testScore_r2))
testScore_MAE = mean_absolute_error(testY_s[0], testPredict[:,0])
print('MAE: %.4f' % (testScore_MAE))

In [None]:
# Result figures, one per CSV file in plot_len: ground truth, training predictions and
# test predictions drawn on the same axes.
train_start = 0
test_start = 0
for i, (raw_train_size, raw_test_size) in enumerate(plot_len):
    # plot_len stores the raw split lengths, but create_dataset() yields
    # len(split) - look_back - 1 samples per split; that is how many rows each file
    # occupies in the concatenated trainY / testY / prediction arrays.
    n_train = max(raw_train_size - look_back - 1, 0)
    n_test = max(raw_test_size - look_back - 1, 0)
    train_end = train_start + n_train
    test_end = test_start + n_test

    train_j = trainY[train_start:train_end]
    test_j = testY[test_start:test_end]
    trainPredict_j = trainPredict[train_start:train_end, :]
    testPredict_j = testPredict[test_start:test_end, :]

    # Advance both running offsets so the next file starts where this one ended.
    train_start, test_start = train_end, test_end

    ds_j = numpy.concatenate((train_j, test_j))
    ds_j = numpy.reshape(ds_j, (len(ds_j), 1))

    # NaN-filled canvases so each prediction curve only shows up over its own range.
    trainPredictPlot = numpy.empty((len(ds_j), 1))
    trainPredictPlot[:, :] = numpy.nan
    trainPredictPlot[:len(trainPredict_j), :] = trainPredict_j
    testPredictPlot = numpy.empty((len(ds_j), 1))
    testPredictPlot[:, :] = numpy.nan
    testPredictPlot[len(trainPredict_j):len(ds_j), :] = testPredict_j

    ds_js = scaler.inverse_transform(ds_j)

    x = numpy.arange(0, len(ds_j), 100)
    plt.xticks(x)
    plt.plot(ds_js, label = "Ground Truth")
    plt.plot(trainPredictPlot, label = "Train Prediction")
    plt.plot(testPredictPlot, label = "Test Prediction")
    plt.legend(loc="upper left", fontsize=14)
    plt.title("Predict Output of real_" + str(i+1) + ".csv")
    # Widen the figure along x, otherwise the curve is too crowded to read.
    plt.gca().margins(x=0)
    plt.gcf().canvas.draw()
    # Figure width grows with the number of x ticks (maxsize pixels per tick).
    maxsize = 100
    m = 0.2
    N = len(x)
    s = maxsize / plt.gcf().dpi * N + 2 * m
    margin = m / plt.gcf().get_size_inches()[0]
    plt.gcf().subplots_adjust(left=margin, right=1. - margin)
    plt.gcf().set_size_inches(s, plt.gcf().get_size_inches()[1])

    # Single ".jpg" extension in the output name.
    plt.savefig("%s%s%d.jpg" % (path, "/predict_a", i+1), bbox_inches='tight')
    plt.show()


In [None]:
# Result figures, one per CSV file in plot_len, showing the test segment only.
train_start = 0
test_start = 0
for i, (raw_train_size, raw_test_size) in enumerate(plot_len):
    # plot_len stores the raw split lengths, but create_dataset() yields
    # len(split) - look_back - 1 samples per split; that is how many rows each file
    # occupies in the concatenated trainY / testY / prediction arrays.
    n_train = max(raw_train_size - look_back - 1, 0)
    n_test = max(raw_test_size - look_back - 1, 0)
    train_end = train_start + n_train
    test_end = test_start + n_test

    train_j = trainY[train_start:train_end]
    test_j = testY[test_start:test_end]
    testPredict_j = testPredict[test_start:test_end, :]

    # Advance both running offsets so the next file starts where this one ended.
    train_start, test_start = train_end, test_end

    # Only the combined length is needed here: it sets the tick range below.
    ds_j = numpy.concatenate((train_j, test_j))
    ds_j = numpy.reshape(ds_j, (len(ds_j), 1))

    test_j_s = scaler.inverse_transform([test_j])

    x = numpy.arange(0, len(ds_j), 25)
    plt.figure(dpi=100)
    plt.xticks(x, rotation = 50)
    plt.plot(test_j_s[0], label = "Ground Truth", linestyle = (0, (1, 2)), color = 'blue', alpha=0.5)
    plt.plot(testPredict_j, label = "Prediction", linestyle = (0, (1, 2)), color= 'red', alpha=0.7)
    plt.xlabel('Timestep')
    plt.ylabel('Web traffic Value')
    plt.legend(loc="upper right", fontsize=8)
    # Single ".jpg" extension in the output name.
    plt.savefig("%s%s%d.jpg" % (path, "/predict_b", i+1), dpi =1000, bbox_inches='tight')
    plt.show()


In [None]:
# Generalisation check: predict on a CSV file read separately from the dataset built above.
test_file = 'real_60.csv'
df_p = pandas.read_csv('Dataset/' + test_file, usecols=[1], engine='python')
# Re-fits the shared scaler on this file, so the inverse transforms below use its min/max.
ds_p = scaler.fit_transform(df_p.values)
plt.plot(ds_p)
plt.xlabel("Timestamp")
plt.ylabel("NormalizedValue")
plt.title(" Example plot of web traffic after preprocessing ")
plt.show()

# Convert the series into sliding windows.
testX_p, testY_p = create_dataset(ds_p, look_back)
# Reshape the input to [samples, time steps, features].
testX_p = numpy.reshape(testX_p, (testX_p.shape[0], testX_p.shape[1], 1))

testPredict_p = model.predict(testX_p)

# Invert the scaling of predictions and targets.
testPredict_p = scaler.inverse_transform(testPredict_p)
testY_p = scaler.inverse_transform([testY_p])
# Root mean squared error in the original units.
testScore_p = math.sqrt(mean_squared_error(testY_p[0], testPredict_p[:,0]))
print('RMSE: %.4f' % (testScore_p))

# Put the predictions at the time steps they refer to: the first target sits at index
# look_back, and create_dataset() leaves out the final row.
testPredictPlot_p = numpy.empty_like(ds_p)
testPredictPlot_p[:, :] = numpy.nan
testPredictPlot_p[look_back:len(ds_p)-1, :] = testPredict_p

# Plot ground truth and predictions.
x = numpy.arange(0, len(ds_p), 50)
# Fixed y ticks 0..12 - assumes the values of this file fall inside that range; TODO confirm.
y = numpy.arange(0, 13, 1)
plt.yticks(y, fontsize=15)
plt.xticks(x, rotation = 30, fontsize= 15)
plt.plot(scaler.inverse_transform(ds_p), label = "Ground Truth") #linestyle = (0, (1, 2)),color = 'blue',alpha=0.5
plt.plot(testPredictPlot_p, label = "Prediction")
plt.xlabel("时间戳", fontsize=15)
plt.ylabel("网络流量值", fontsize=15)
plt.legend(loc="upper left", fontsize=15)
plt.title("CNN+TCN+A模型预测")

plt.gca().margins(x=0)
plt.gcf().canvas.draw()

# Figure width grows with the number of x ticks (maxsize pixels per tick).
maxsize = 30
m = 0.2
N = len(x)
s = maxsize / plt.gcf().dpi * N + 2 * m
margin = m / plt.gcf().get_size_inches()[0]

plt.gcf().subplots_adjust(left=margin, right=1. - margin)
plt.gcf().set_size_inches(s, plt.gcf().get_size_inches()[1])

plt.savefig("%s%s.jpg"%(path, "/predict_c"), bbox_inches='tight', dpi = 1000)
plt.show()

# Append the score to the run's result file. The message names the file that was
# actually evaluated (test_file) and separates the label from the number.
# NOTE(review): the summary cell that follows opens result.txt with 'w+', which truncates
# the file, so this line only survives if that cell is not run afterwards.
with open(path + '/result.txt','a+') as f:
    f.write('\n')
    f.write("test score of %s:    RMSE %s" % (test_file, testScore_p))
    

In [None]:
# Collect the run configuration and the scores in two tables, print them and save them.
import prettytable as pt
from contextlib import redirect_stdout

# testScore_rmsle is only assigned when the RMSLE computation in the evaluation cell
# succeeded; fall back to NaN instead of failing here with a NameError.
try:
    rmsle_for_table = testScore_rmsle
except NameError:
    rmsle_for_table = float('nan')

tb = pt.PrettyTable()
tb.field_names = ["Structure", "epochs", "batch_size", "seed","look back","dataset num", "test file" ]
tb.add_row([model_name,epochs,batch_size,seed,look_back,file_num,test_file])
tb1 = pt.PrettyTable()
# testScore_p is a root mean squared error (sqrt of the MSE), so the column is labelled rmse.
tb1.field_names = ["train rmse", "test rmse","test rmsle","test r2","test mae","practice rmse"]
tb1.add_row([trainScore_rmse,testScore_rmse,rmsle_for_table,testScore_r2,testScore_MAE,testScore_p])

print(tb)
print(tb1)
# 'w+' truncates result.txt, so the file ends up holding exactly this summary
# followed by the model summary.
with open(path + '/result.txt','w+') as f:
    f.write(str(tb))
    f.write('\n')
    f.write(str(tb1))
    f.write('\n')
    # model.summary() prints to stdout; redirect it into the file.
    with redirect_stdout(f):
        model.summary()