# 导入部分

In [1]:
import torch
import torch.nn as nn
import torch.cuda
import new_model
import dgl
import pandas as pd
import time
import random
import numpy as np

In [2]:
import os
# Allow more than one OpenMP runtime to be loaded in the same process (works around
# "OMP: Error #15" when several packages each ship their own libiomp).
# The variable name was previously misspelled ('KMP_DUPIPLICATE_LIB_OK'), so it had no effect.
# NOTE(review): ideally this should run before torch/numpy are imported — confirm cell order.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# 辅助动图类

In [3]:
%matplotlib inline
from IPython import display
from matplotlib import pyplot as plt
from matplotlib_inline import backend_inline

class Animator:
    """Plot several curves incrementally and redraw the figure in the notebook output."""

    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
                 figsize=(7, 5)):
        """Create the figure and remember how its first axis has to be configured."""
        legend = [] if legend is None else legend
        backend_inline.set_matplotlib_formats('svg')
        figure, axes = plt.subplots(nrows, ncols, figsize=figsize)
        self.fig = figure
        # A single subplot comes back as a bare Axes object; wrap it so axes[0] always works.
        self.axes = [axes, ] if nrows * ncols == 1 else axes

        def _configure():
            # Closure over the constructor arguments; re-applied after every redraw.
            return self.set_axes(xlabel, ylabel, xlim, ylim, xscale, yscale, legend)

        self.config_axes = _configure
        self.X = None
        self.Y = None
        self.fmts = fmts

    def set_axes(self, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
        """Apply labels, scales, limits, legend and grid to the first axis."""
        axis = self.axes[0]
        axis.set_xlabel(xlabel)
        axis.set_ylabel(ylabel)
        axis.set_xscale(xscale)
        axis.set_yscale(yscale)
        axis.set_xlim(xlim)
        axis.set_ylim(ylim)
        if legend:
            axis.legend(legend)
        axis.grid()

    def add(self, x, y):
        """Append one point per curve and redraw.

        ``y`` may be a scalar or a sequence with one value per curve; a scalar ``x``
        is shared by all curves. Pairs containing ``None`` are skipped.
        """
        ys = y if hasattr(y, "__len__") else [y]
        curve_count = len(ys)
        xs = x if hasattr(x, "__len__") else [x] * curve_count
        # Storage is created lazily on the first call, one list per curve.
        if not self.X:
            self.X = [[] for _ in range(curve_count)]
        if not self.Y:
            self.Y = [[] for _ in range(curve_count)]
        for idx, (x_val, y_val) in enumerate(zip(xs, ys)):
            if x_val is None or y_val is None:
                continue
            self.X[idx].append(x_val)
            self.Y[idx].append(y_val)
        axis = self.axes[0]
        axis.cla()
        for series_x, series_y, fmt in zip(self.X, self.Y, self.fmts):
            axis.plot(series_x, series_y, fmt)
        self.config_axes()
        plt.pause(0.001)
        display.display(self.fig)
        display.clear_output(wait=True)

    def show(self):
        """Display the current figure."""
        display.display(self.fig)

# 训练和评估函数

In [4]:
def new_eval(train_graph_list, val_graph_list, model, loss_func, num_steps):
    """Return the mean validation loss, scoring only the last graph of each window.

    For the i-th validation graph (1-based) a window of up to ``num_steps`` graphs
    ending at that graph is batched with ``dgl.batch``; while fewer than
    ``num_steps`` validation graphs are available the window is padded on the left
    with the tail of ``train_graph_list``. The model returns ``(active, consume)``,
    which are compared with label columns 0 and 1 of the window's last graph.

    Args:
        train_graph_list: list of training graphs (used only for left padding).
        val_graph_list: list of validation graphs; each needs ``ndata['label']``.
        model: callable taking the batched graph and returning ``(active, consume)``.
        loss_func: loss applied to each output separately; the two losses are summed.
        num_steps: window length.

    Returns:
        float: average of the per-window losses.

    Raises:
        ValueError: if ``val_graph_list`` is empty (previously an opaque ZeroDivisionError).

    Note:
        The model is switched to eval mode for the computation and to train mode afterwards.
    """
    if len(val_graph_list) == 0:
        raise ValueError("val_graph_list is empty; cannot compute a validation loss")

    eval_loss = []

    model.eval()
    with torch.no_grad():
        for i in range(1, len(val_graph_list) + 1):
            if i < num_steps:
                # Not enough validation history yet: borrow the newest training graphs.
                window = train_graph_list[-(num_steps - i):] + val_graph_list[:i]
            else:
                window = val_graph_list[i - num_steps:i]
            batched_graph = dgl.batch(window)

            active, consume = model(batched_graph)
            # Cast like the training loop does so both sides of the loss share a dtype.
            label = val_graph_list[i - 1].ndata['label'].float()

            loss = loss_func(active, label[:, 0].view_as(
                active)) + loss_func(consume, label[:, 1].view_as(consume))

            eval_loss.append(loss.item())

    model.train()
    return sum(eval_loss) / len(eval_loss)

In [5]:
def eval(train_graph_list, val_graph_list, model, loss_func, num_steps):
    """Return the mean validation loss, scoring every graph of each window.

    Same windowing as ``new_eval`` (a window of up to ``num_steps`` graphs ending at
    each validation graph, left-padded with the tail of ``train_graph_list``), but
    the loss is computed against the labels of the whole batched graph rather than
    only the last graph's labels.

    NOTE(review): this function shadows Python's builtin ``eval`` for the rest of
    the notebook; the name is kept because ``time_step_train`` calls it.

    Args:
        train_graph_list: list of training graphs (used only for left padding).
        val_graph_list: list of validation graphs with ``ndata['label']``.
        model: callable taking the batched graph and returning ``(active, consume)``.
        loss_func: loss applied to each output separately; the two losses are summed.
        num_steps: window length.

    Returns:
        float: average of the per-window losses.

    Raises:
        ValueError: if ``val_graph_list`` is empty (previously an opaque ZeroDivisionError).

    Note:
        The model is switched to eval mode for the computation and to train mode afterwards.
    """
    if len(val_graph_list) == 0:
        raise ValueError("val_graph_list is empty; cannot compute a validation loss")

    eval_loss = []

    model.eval()
    with torch.no_grad():
        for i in range(1, len(val_graph_list) + 1):
            if i < num_steps:
                # Not enough validation history yet: borrow the newest training graphs.
                window = train_graph_list[-(num_steps - i):] + val_graph_list[:i]
            else:
                window = val_graph_list[i - num_steps:i]
            batched_graph = dgl.batch(window)

            active, consume = model(batched_graph)
            # Cast like the training loop does so both sides of the loss share a dtype.
            label = batched_graph.ndata['label'].float()

            loss = loss_func(active, label[:, 0].view_as(
                active)) + loss_func(consume, label[:, 1].view_as(consume))

            eval_loss.append(loss.item())

    model.train()
    return sum(eval_loss) / len(eval_loss)

In [6]:
def cnn_eval(val_data, model, loss_func):
    """Return the mean loss of ``model`` over the windows in ``val_data``.

    Each element of ``val_data`` is fed to the model, which returns
    ``(active, consume)``. The targets are the last two entries along the final
    axis of the window's last step (``window[-1, :, -2:]``): column 0 for
    ``active`` and column 1 for ``consume``. The model is put in eval mode for the
    loop and back in train mode before returning.
    """
    model.eval()
    losses = []
    with torch.no_grad():
        for window in val_data:
            active, consume = model(window)
            target = window[-1, :, -2:]
            active_loss = loss_func(active, target[:, 0].view_as(active))
            consume_loss = loss_func(consume, target[:, 1].view_as(consume))
            losses.append((active_loss + consume_loss).item())
    model.train()
    return sum(losses) / len(losses)

In [7]:
def new_time_step_train(graph_list, model, loss_func, optimizer, num_epochs, device, train_ratio, num_steps, threshold):
    """Train ``model`` on windows of ``num_steps`` graphs, supervised by each window's last graph.

    The leading ``train_ratio`` share of ``graph_list`` is the training set and the
    remainder the validation set (scored with ``new_eval`` after every epoch).
    Every epoch draws a random start offset in ``[0, num_steps - 1]``, cuts the
    training list from that offset into non-overlapping windows, shuffles the
    windows and performs one optimizer step per window.

    The score is ``1 / (1 + sqrt(4 * val_loss))``.

    Side effects: creates ``model/<DayDD-HourHH-MinutesMM>/``; every 30 epochs saves
    the current weights there if the epoch's score exceeds ``threshold``; every 5
    epochs prints the metrics and updates an ``Animator`` plot.

    Args:
        graph_list: chronologically ordered list of graphs with ``ndata['label']``.
        model: module taking a batched graph and returning ``(active, consume)``.
        loss_func: loss module; applied to both outputs and summed.
        optimizer: optimizer already bound to ``model``'s parameters.
        num_epochs: number of epochs.
        device: device the graphs and the loss are moved to.
        train_ratio: fraction of ``graph_list`` used for training.
        num_steps: window length.
        threshold: minimum score for a periodic checkpoint to be written.

    Returns:
        tuple: ``(best_model_state, best_score, best_val_loss, best_epoch, dictionary_name)``
        where ``best_model_state`` is a snapshot of the weights at the epoch with
        the lowest validation loss and ``dictionary_name`` is the run folder name.

    Raises:
        ZeroDivisionError: if an epoch produces no window (training list too short).
    """
    import copy  # local import: only needed to snapshot the best weights

    # Chronological train/validation split.
    split = int(len(graph_list) * train_ratio)
    train_graph_list = graph_list[:split]
    val_graph_list = graph_list[split:]

    # Move the loss and all graphs to the target device.
    loss_func = loss_func.to(device)
    train_graph_list = [graph.to(device) for graph in train_graph_list]
    val_graph_list = [graph.to(device) for graph in val_graph_list]

    print('train_length:', len(train_graph_list), 'val_length:', len(val_graph_list))

    animator = Animator(xlabel='epoch', ylabel='loss & score', xlim=[0, num_epochs], ylim=[
                        0, 50], legend=['train loss', 'val loss', 'score', 'best score'])

    # Run folder named after the current day, hour and minute.
    dictionary_name = time.strftime("Day%d-Hour%H-Minutes%M", time.localtime())

    # makedirs also creates the parent 'model/' folder and tolerates an existing run folder.
    os.makedirs('model/' + dictionary_name, exist_ok=True)

    model.train()

    best_val_loss = 9999999999
    best_score = 0
    best_epoch = 0
    best_model_state = None

    for epoch in range(num_epochs):
        train_loss = []

        # Partition from a random offset so window boundaries differ between epochs.
        offset = random.randint(0, num_steps - 1)
        sequences = train_graph_list[offset:]

        num_subseqs = len(sequences) // num_steps

        # Start index of every window, relative to `sequences`.
        initial_indices = list(range(0, num_subseqs * num_steps, num_steps))

        # Visit the windows in random order.
        random.shuffle(initial_indices)

        # Slice from `sequences` (not the un-offset list) so the random offset is actually applied.
        X = [sequences[j: j + num_steps] for j in initial_indices]

        for batch_graph in X:

            batched_graph = dgl.batch(batch_graph)

            # Forward pass.
            active, consume = model(batched_graph)

            # Only the last graph of the window provides the targets.
            label = batch_graph[-1].ndata['label'].float()

            loss = loss_func(active, label[:, 0].view_as(
                active)) + loss_func(consume, label[:, 1].view_as(consume))

            train_loss.append(loss.item())

            # Backward pass and parameter update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Mean training loss of the epoch.
        train_loss = sum(train_loss) / len(train_loss)

        # Validation loss of the epoch.
        val_loss = new_eval(train_graph_list, val_graph_list, model, loss_func, num_steps)

        sqrt_val_loss = torch.sqrt(torch.tensor(val_loss * 4))

        score = 1 / (1 + sqrt_val_loss)

        # Remember the best epoch so far.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_score = score
            best_epoch = epoch + 1
            # state_dict() returns references to the live parameters; deep-copy so later
            # optimizer steps do not overwrite the remembered best weights.
            best_model_state = copy.deepcopy(model.state_dict())

        # Report every 5 epochs.
        if (epoch + 1) % 5 == 0:

            print('Epoch: {0}\n Train loss: {1}\n Val loss: {2}\n Score: {3}\n Best score: {4}\n Best val loss: {5}\n Best epoch: {6}'.format(
                epoch + 1, train_loss, val_loss, score, best_score, best_val_loss, best_epoch))

            # Scores are scaled by 100 to share the plot's y-range with the losses.
            animator.add(epoch + 1, (train_loss, val_loss, score * 100, best_score * 100))

        # Every 30 epochs keep a checkpoint named "<epoch>-<score>" if the score is good enough.
        if (epoch + 1) % 30 == 0 and score > threshold:
            torch.save(model.state_dict(), 'model/' + dictionary_name + '/' + str(epoch + 1) + '-' + "{:.3f}".format(score))

    return best_model_state, best_score, best_val_loss, best_epoch, dictionary_name

In [8]:
def new_time_step_train_without_eval(graph_list, model, loss_func, optimizer, num_epochs, device, train_ratio, num_steps, threshold):
    """Train ``model`` on windows of ``num_steps`` graphs without any validation pass.

    Only the leading ``train_ratio`` share of ``graph_list`` is used. Every epoch
    draws a random start offset in ``[0, num_steps - 1]``, cuts the training list
    from that offset into non-overlapping windows, shuffles them and performs one
    optimizer step per window, supervised by the labels of the window's last graph.
    The mean training loss is printed every 5 epochs.

    Args:
        graph_list: chronologically ordered list of graphs with ``ndata['label']``.
        model: module taking a batched graph and returning ``(active, consume)``.
        loss_func: loss module; applied to both outputs and summed.
        optimizer: optimizer already bound to ``model``'s parameters.
        num_epochs: number of epochs.
        device: device the graphs and the loss are moved to.
        train_ratio: fraction of ``graph_list`` used for training.
        num_steps: window length.
        threshold: unused here; kept so the signature matches the other training functions.

    Returns:
        None. The model is updated in place.

    Raises:
        ZeroDivisionError: if an epoch produces no window (training list too short).
    """
    # Keep only the training share of the data.
    train_graph_list = graph_list[:int(len(graph_list) * train_ratio)]

    # Move the loss and the graphs to the target device.
    loss_func = loss_func.to(device)
    train_graph_list = [graph.to(device) for graph in train_graph_list]

    print('train_length:', len(train_graph_list))

    model.train()

    for epoch in range(num_epochs):
        train_loss = []

        # Partition from a random offset so window boundaries differ between epochs.
        offset = random.randint(0, num_steps - 1)
        sequences = train_graph_list[offset:]

        num_subseqs = len(sequences) // num_steps

        # Start index of every window, relative to `sequences`.
        initial_indices = list(range(0, num_subseqs * num_steps, num_steps))

        # Visit the windows in random order.
        random.shuffle(initial_indices)

        # Slice from `sequences` (not the un-offset list) so the random offset is actually applied.
        X = [sequences[j: j + num_steps] for j in initial_indices]

        for batch_graph in X:

            batched_graph = dgl.batch(batch_graph)

            # Forward pass.
            active, consume = model(batched_graph)

            # Only the last graph of the window provides the targets.
            label = batch_graph[-1].ndata['label'].float()

            loss = loss_func(active, label[:, 0].view_as(
                active)) + loss_func(consume, label[:, 1].view_as(consume))

            train_loss.append(loss.item())

            # Backward pass and parameter update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Mean training loss of the epoch.
        train_loss = sum(train_loss) / len(train_loss)

        # Report every 5 epochs.
        if (epoch + 1) % 5 == 0:

            print('Epoch: {0}\n Train loss: {1}'.format(epoch + 1, train_loss))


In [9]:
def time_step_train(graph_list, model, loss_func, optimizer, num_epochs, device, train_ratio, num_steps, threshold):
    """Train ``model`` on windows of ``num_steps`` graphs, supervised by every graph in the window.

    The leading ``train_ratio`` share of ``graph_list`` is the training set and the
    remainder the validation set (scored after every epoch with this notebook's
    ``eval`` function, which shadows the builtin of the same name). Every epoch
    draws a random start offset in ``[0, num_steps - 1]``, cuts the training list
    from that offset into non-overlapping windows, shuffles the windows and
    performs one optimizer step per window. Targets are the labels of the whole
    batched graph.

    The score is ``1 / (1 + sqrt(4 * val_loss))``.

    Side effects: creates ``model/<DayDD-HourHH-MinutesMM>/``; every 30 epochs saves
    the current weights there if the epoch's score exceeds ``threshold``; every 5
    epochs prints the metrics and updates an ``Animator`` plot.

    Args:
        graph_list: chronologically ordered list of graphs with ``ndata['label']``.
        model: module taking a batched graph and returning ``(active, consume)``.
        loss_func: loss module; applied to both outputs and summed.
        optimizer: optimizer already bound to ``model``'s parameters.
        num_epochs: number of epochs.
        device: device the graphs and the loss are moved to.
        train_ratio: fraction of ``graph_list`` used for training.
        num_steps: window length.
        threshold: minimum score for a periodic checkpoint to be written.

    Returns:
        tuple: ``(best_model_state, best_score, best_val_loss, best_epoch, dictionary_name)``
        where ``best_model_state`` is a snapshot of the weights at the epoch with
        the lowest validation loss and ``dictionary_name`` is the run folder name.

    Raises:
        ZeroDivisionError: if an epoch produces no window (training list too short).
    """
    import copy  # local import: only needed to snapshot the best weights

    # Chronological train/validation split.
    split = int(len(graph_list) * train_ratio)
    train_graph_list = graph_list[:split]
    val_graph_list = graph_list[split:]

    # Move the loss and all graphs to the target device.
    loss_func = loss_func.to(device)
    train_graph_list = [graph.to(device) for graph in train_graph_list]
    val_graph_list = [graph.to(device) for graph in val_graph_list]

    print('train_length:', len(train_graph_list), 'val_length:', len(val_graph_list))

    animator = Animator(xlabel='epoch', ylabel='loss & score', xlim=[0, num_epochs], ylim=[
                        0, 50], legend=['train loss', 'val loss', 'score', 'best score'])

    # Run folder named after the current day, hour and minute.
    dictionary_name = time.strftime("Day%d-Hour%H-Minutes%M", time.localtime())

    # makedirs also creates the parent 'model/' folder and tolerates an existing run folder.
    os.makedirs('model/' + dictionary_name, exist_ok=True)

    model.train()

    best_val_loss = 9999999999
    best_score = 0
    best_epoch = 0
    best_model_state = None

    for epoch in range(num_epochs):
        train_loss = []

        # Partition from a random offset so window boundaries differ between epochs.
        offset = random.randint(0, num_steps - 1)
        sequences = train_graph_list[offset:]

        num_subseqs = len(sequences) // num_steps

        # Start index of every window, relative to `sequences`.
        initial_indices = list(range(0, num_subseqs * num_steps, num_steps))

        # Visit the windows in random order.
        random.shuffle(initial_indices)

        # Slice from `sequences` (not the un-offset list) so the random offset is actually applied.
        X = [sequences[j: j + num_steps] for j in initial_indices]

        for batch_graph in X:

            batched_graph = dgl.batch(batch_graph)

            # Forward pass.
            active, consume = model(batched_graph)

            # Targets come from every graph in the window.
            label = batched_graph.ndata['label'].float()
            loss = loss_func(active, label[:, 0].view_as(
                active)) + loss_func(consume, label[:, 1].view_as(consume))

            train_loss.append(loss.item())

            # Backward pass and parameter update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Mean training loss of the epoch.
        train_loss = sum(train_loss) / len(train_loss)

        # Validation loss of the epoch (notebook-level `eval`, not the builtin).
        val_loss = eval(train_graph_list, val_graph_list, model, loss_func, num_steps)

        sqrt_val_loss = torch.sqrt(torch.tensor(val_loss * 4))

        score = 1 / (1 + sqrt_val_loss)

        # Remember the best epoch so far.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_score = score
            best_epoch = epoch + 1
            # state_dict() returns references to the live parameters; deep-copy so later
            # optimizer steps do not overwrite the remembered best weights.
            best_model_state = copy.deepcopy(model.state_dict())

        # Report every 5 epochs.
        if (epoch + 1) % 5 == 0:

            print('Epoch: {0}\n Train loss: {1}\n Val loss: {2}\n Score: {3}\n Best score: {4}\n Best val loss: {5}\n Best epoch: {6}'.format(
                epoch + 1, train_loss, val_loss, score, best_score, best_val_loss, best_epoch))

            # Losses are scaled by 10 and scores by 100 so all curves fit the same y-range.
            animator.add(epoch + 1, (train_loss * 10, val_loss * 10, score * 100, best_score * 100))

        # Every 30 epochs keep a checkpoint named "<epoch>-<score>" if the score is good enough.
        if (epoch + 1) % 30 == 0 and score > threshold:
            torch.save(model.state_dict(), 'model/' + dictionary_name + '/' + str(epoch + 1) + '-' + "{:.3f}".format(score))

    return best_model_state, best_score, best_val_loss, best_epoch, dictionary_name

In [10]:
def cnn_time_step_train(input_data, model, loss_func, optimizer, num_epochs, device, train_ratio, num_steps, batch_size, threshold):
    """Train ``model`` on dense tensor windows of ``num_steps`` consecutive steps.

    ``input_data`` is split along its first axis into a training part (leading
    ``train_ratio`` share) and a validation part. One validation window of
    ``num_steps`` steps is built per validation step, left-padded with the tail of
    the training data while fewer than ``num_steps`` validation steps exist.
    Every epoch draws a random start offset in ``[0, num_steps - 1]``, cuts the
    training data from that offset into non-overlapping windows, shuffles them and
    trains on mini-batches of ``batch_size`` entries taken along axis 1 of each
    window. Targets are the last two features of the window's final step.

    The score is ``1 / (1 + sqrt(4 * val_loss))``.

    Side effects: creates ``model/<DayDD-HourHH-MinutesMM>/``; every 30 epochs saves
    the current weights there if the epoch's score exceeds ``threshold``; every 5
    epochs prints the metrics and updates an ``Animator`` plot.

    Args:
        input_data: array-like accepted by ``torch.tensor``; indexed as
            ``[step, item, feature]`` (axis 1 is presumably the node axis — TODO confirm).
        model: module taking a window tensor and returning ``(active, consume)``.
        loss_func: loss module; applied to both outputs and summed.
        optimizer: optimizer already bound to ``model``'s parameters.
        num_epochs: number of epochs.
        device: device the data and the loss are moved to.
        train_ratio: fraction of the steps used for training.
        num_steps: window length.
        batch_size: mini-batch size along axis 1.
        threshold: minimum score for a periodic checkpoint to be written.

    Returns:
        tuple: ``(best_model_state, best_score, best_val_loss, best_epoch, dictionary_name)``.

    Raises:
        ZeroDivisionError: if an epoch produces no window (training data too short).
    """
    import copy  # local import: only needed to snapshot the best weights

    # Chronological train/validation split, converted to float32 tensors.
    split = int(len(input_data) * train_ratio)
    train_data = torch.tensor(input_data[:split], dtype=torch.float32)
    val_data = torch.tensor(input_data[split:], dtype=torch.float32)

    # One validation window per validation step.
    new_val_data = []
    for i in range(1, len(val_data) + 1):
        if i < num_steps:
            # Not enough validation history yet: borrow the newest training steps.
            new_val_data.append(torch.cat((train_data[-(num_steps - i):], val_data[:i]), dim=0))
        else:
            new_val_data.append(val_data[i - num_steps:i])

    val_data = torch.stack(new_val_data)

    # Move the loss and the data to the target device.
    loss_func = loss_func.to(device)
    train_data = train_data.to(device)
    val_data = val_data.to(device)

    print('train_length:', len(train_data), 'val_length:', len(val_data))

    animator = Animator(xlabel='epoch', ylabel='loss & score', xlim=[0, num_epochs], ylim=[
                        0, 50], legend=['train loss', 'val loss', 'score', 'best score'])

    # Run folder named after the current day, hour and minute.
    dictionary_name = time.strftime("Day%d-Hour%H-Minutes%M", time.localtime())

    # makedirs also creates the parent 'model/' folder and tolerates an existing run folder.
    os.makedirs('model/' + dictionary_name, exist_ok=True)

    model.train()

    best_val_loss = 9999999999
    best_score = 0
    best_epoch = 0
    best_model_state = None

    for epoch in range(num_epochs):
        train_loss = []

        # Partition from a random offset so window boundaries differ between epochs.
        offset = random.randint(0, num_steps - 1)
        sequences = train_data[offset:]

        num_subseqs = len(sequences) // num_steps

        # Start index of every window, relative to `sequences`.
        initial_indices = list(range(0, num_subseqs * num_steps, num_steps))

        # Visit the windows in random order.
        random.shuffle(initial_indices)

        # Slice from `sequences` (not the un-offset data) so the random offset is actually applied.
        X = [sequences[j: j + num_steps] for j in initial_indices]

        for sequences_data in X:

            # Mini-batches are cut along axis 1, so the loop must be bounded by that axis.
            # (len(sequences_data) is the window length and would skip most of axis 1.)
            for i in range(0, sequences_data.shape[1], batch_size):

                batched_data = sequences_data[:, i:i+batch_size, :]

                # Targets: last two features of the window's final step.
                label = batched_data[-1, :, -2:]

                # Forward pass.
                active, consume = model(batched_data)

                loss = loss_func(active, label[:, 0].view_as(
                    active)) + loss_func(consume, label[:, 1].view_as(consume))

                train_loss.append(loss.item())

                # Backward pass and parameter update.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # Mean training loss of the epoch.
        train_loss = sum(train_loss) / len(train_loss)

        # Validation loss of the epoch.
        val_loss = cnn_eval(val_data, model, loss_func)

        sqrt_val_loss = torch.sqrt(torch.tensor(val_loss * 4))

        score = 1 / (1 + sqrt_val_loss)

        # Remember the best epoch so far.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_score = score
            best_epoch = epoch + 1
            # state_dict() returns references to the live parameters; deep-copy so later
            # optimizer steps do not overwrite the remembered best weights.
            best_model_state = copy.deepcopy(model.state_dict())

        # Report every 5 epochs.
        if (epoch + 1) % 5 == 0:

            print('Epoch: {0}\n Train loss: {1}\n Val loss: {2}\n Score: {3}\n Best score: {4}\n Best val loss: {5}\n Best epoch: {6}'.format(
                epoch + 1, train_loss, val_loss, score, best_score, best_val_loss, best_epoch))

            # Losses are scaled by 10 and scores by 100 so all curves fit the same y-range.
            animator.add(epoch + 1, (train_loss * 10, val_loss * 10, score * 100, best_score * 100))

        # Every 30 epochs keep a checkpoint named "<epoch>-<score>" if the score is good enough.
        if (epoch + 1) % 30 == 0 and score > threshold:
            torch.save(model.state_dict(), 'model/' + dictionary_name + '/' + str(epoch + 1) + '-' + "{:.3f}".format(score))

    return best_model_state, best_score, best_val_loss, best_epoch, dictionary_name

In [11]:
def predict(graph_list, model, device, predict_length, num_nodes=1140):
    """Run ``model`` on all graphs at once and write the predictions to CSV files.

    The graphs are batched with ``dgl.batch`` and the two model outputs are
    concatenated as columns (column 0 = first output, column 1 = second output).
    The resulting rows are treated as time-major blocks of ``num_nodes`` rows per
    graph. The last ``predict_length`` blocks are the forecast; everything before
    is the "pre-prediction" part.

    Files written (the ``prediction/`` folder is created if missing):
        * ``prediction/pre_predict_unordered.csv`` - pre-prediction rows, time-major.
        * ``prediction/pre_predict_ordered.csv`` - same rows regrouped node-major.
        * ``prediction/predict.csv`` - forecast rows, node-major.

    Args:
        graph_list: list of graphs, oldest first.
        model: module taking the batched graph and returning two column tensors.
        device: device the graphs are moved to (the model must already be there).
        predict_length: number of trailing blocks that form the forecast.
        num_nodes: rows per graph in the model output (defaults to the previously
            hard-coded 1140).

    Returns:
        torch.Tensor: CPU tensor of shape ``(predict_length * num_nodes, 2)`` holding
        the forecast in node-major order (all steps of node 0, then node 1, ...).

    Raises:
        ValueError: if the model output has fewer than ``predict_length * num_nodes`` rows.
    """
    def _node_major(rows):
        # Regroup time-major rows (block j holds step j for every node) so that all
        # steps of one node are contiguous; incomplete trailing blocks are ignored.
        num_blocks = len(rows) // num_nodes
        return (rows[:num_blocks * num_nodes, :2]
                .reshape(num_blocks, num_nodes, 2)
                .transpose(0, 1)
                .reshape(-1, 2)
                .detach()
                .cpu())

    # Move the graphs to the target device.
    graph_list = [graph.to(device) for graph in graph_list]

    os.makedirs("prediction", exist_ok=True)

    model.eval()
    with torch.no_grad():

        batched_graph = dgl.batch(graph_list)

        # Forward pass.
        act_pre, con_pre = model(batched_graph)

        # Column 0: first output, column 1: second output.
        all_rows = torch.cat((act_pre, con_pre), dim=1)

        # Explicit split index: a negative-zero slice would mis-handle predict_length == 0.
        split = len(all_rows) - predict_length * num_nodes
        if split < 0:
            raise ValueError("model produced fewer rows than predict_length * num_nodes")

        # Everything before the forecast.
        pre_predict = all_rows[:split]

        pre_output_unordered = pd.DataFrame(pre_predict.cpu().numpy())
        pre_output_unordered.to_csv("prediction/pre_predict_unordered.csv", index=False, header=False)

        pre_output = pd.DataFrame(_node_major(pre_predict).numpy())
        pre_output.to_csv("prediction/pre_predict_ordered.csv", index=False, header=False)

        # The forecast itself, regrouped per node.
        predictions_tensor = _node_major(all_rows[split:])

        output = pd.DataFrame(predictions_tensor.numpy())
        output.to_csv("prediction/predict.csv", index=False, header=False)
    return predictions_tensor

In [12]:
def multiple_predict(graph_list, model_list, device, predict_length):
    """Average the forecasts of several models and write them to ``prediction/predict.csv``.

    Every model in ``model_list`` is moved to ``device`` and passed to ``predict``;
    the returned tensors are stacked and averaged element-wise over the models.
    The CSV is written without index or header. Nothing is returned.
    """
    per_model = []
    for net in model_list:
        net.to(device)
        per_model.append(predict(graph_list, net, device, predict_length))

    # Element-wise mean over the model axis.
    averaged = torch.stack(per_model).mean(dim=0)
    averaged = averaged.detach().cpu().numpy()

    # Overwrites the file written by the individual predict() calls.
    pd.DataFrame(averaged).to_csv("prediction/predict.csv", index=False, header=False)

In [13]:
def ensemble_learning(graph_list, dictionary_name, min_threshold, max_threshold, device, predict_length):
    """Average the forecasts of all checkpoints in a folder whose score lies in a given range.

    Each regular file in ``dictionary_name`` is expected to end in a five-character
    score such as ``0.381`` (the training functions save checkpoints as
    ``<epoch>-<score>``). Files whose score is within
    ``[min_threshold, max_threshold]`` are loaded into a fresh
    ``new_model.GCN_LSTM`` and passed to ``multiple_predict``.

    Args:
        graph_list: graphs to predict on.
        dictionary_name: path of the folder containing the checkpoints.
        min_threshold: lowest accepted score (inclusive).
        max_threshold: highest accepted score (inclusive).
        device: device used for loading and prediction.
        predict_length: number of trailing graphs that form the forecast.

    Raises:
        ValueError: if no checkpoint falls inside the score range
            (previously a ZeroDivisionError while printing the average score).
    """
    model_list = []
    score_list = []
    for file in os.listdir(dictionary_name):
        path = os.path.join(dictionary_name, file)
        # Sub-folders are not checkpoints.
        if os.path.isdir(path):
            continue
        # The last five characters of the file name encode the score; skip files
        # that do not follow that naming scheme instead of crashing on them.
        try:
            score = float(file[-5:])
        except ValueError:
            continue
        if min_threshold <= score <= max_threshold:
            print('model_name:', file, 'score:', score)
            model = new_model.GCN_LSTM()
            # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
            model.load_state_dict(torch.load(path, map_location=device))
            model_list.append(model)
            score_list.append(score)

    if not model_list:
        raise ValueError(
            "no checkpoint in '{}' has a score between {} and {}".format(
                dictionary_name, min_threshold, max_threshold))

    print('model_num:', len(model_list))
    # Number of graphs that precede the forecast window.
    print('pre_predict_length:', len(graph_list) - predict_length)
    print('avg_score:', sum(score_list)/len(score_list))
    multiple_predict(graph_list, model_list, device, predict_length)

# 执行部分

In [None]:
# Load the training graphs saved in DGL's binary format; the second value
# returned by dgl.load_graphs is not needed here and is discarded.
train_graph_list, _ = dgl.load_graphs('dgl_data/dgl_graphs_train.bin')

In [None]:
# 读取numpy数组训练集
# train_data = np.load('numpy_data/train.npy')
# print('train_data:', train_data.shape)

In [None]:
# Hyperparameters
num_epochs = 2000  # number of training epochs
train_ratio = 0.8  # leading share of the graph list used for training; the rest is validation
learning_rate = 0.001  # learning rate handed to the Adam optimizer below
loss_func = nn.MSELoss()  # applied to each of the two model outputs; the training functions sum both
threshold = 0.35  # minimum score for the every-30-epochs checkpoint to be written
time_step = 7  # window length; passed to the training functions as num_steps
batch_size = 128  # only used by cnn_time_step_train

In [None]:
# Use the first CUDA GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Instantiate the model to train and move it to the selected device.
# The commented lines below are alternative architectures from new_model.
model = new_model.GCN_LSTM().to(device)
# model = new_model.GCN_LSTM_test(time_step=time_step).to(device)
# model = new_model.CNN(time_step=time_step).to(device)

In [None]:
# 读取模型
# model = new_model.MLP_test()
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# model.load_state_dict(torch.load('model/new_model.MLP_test(1)_bs_0.486_bvl_0.279_be_4787_time_11_13_18_43.pth'))
# model.to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# optimizer.load_state_dict(torch.load('model/new_model.MLP_test(1)_bs_0.486_bvl_0.279_be_4787_time_11_13_18_43.pth_optim'))

In [None]:
# Define the optimizer over the model's parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# new_time_step_train_without_eval(train_graph_list, model, loss_func, optimizer, num_epochs, device, train_ratio, time_step, threshold)

In [None]:
# Run training; time_step is passed as the num_steps (window length) argument.
# Returns the best weights, best score, best validation loss, best epoch and the run folder name.
best_model_state, best_score, best_val_loss, best_epoch, dictionary_name = time_step_train(train_graph_list, model, loss_func, optimizer, num_epochs, device, train_ratio, time_step, threshold)

In [None]:
# best_model_state, best_score, best_val_loss, best_epoch, dictionary_name = new_time_step_train(train_graph_list, model, loss_func, optimizer, num_epochs, device, train_ratio, time_step, threshold)

In [None]:
# best_model_state, best_score, best_val_loss, best_epoch, dictionary_name = cnn_time_step_train(train_data, model, loss_func, optimizer, num_epochs, device, train_ratio, time_step, batch_size, threshold)

In [None]:
# After training, rebuild a model from the best parameters found.
# NOTE(review): the class must match the architecture that was trained above.
best_model = new_model.GCN_LSTM()
best_model.load_state_dict(best_model_state)

In [None]:
# Class name of the trained model, with the "<class '" prefix and "'>" suffix stripped.
model_name = str(type(model))
model_name = model_name[8:-2]

# Format into separate names instead of overwriting best_score / best_epoch, so the
# cell can be re-run (formatting an already formatted string with '.3f' raises).
best_score_str = "{:.3f}".format(best_score)
best_epoch_str = str(best_epoch)

# Checkpoint name: best-<epoch>-<score>. The optimizer file ends in "0.000" so that
# ensemble_learning (which parses the last five characters as a score) filters it
# out for any positive min_threshold.
torch.save(best_model.state_dict(), 'model/' + dictionary_name + '/best-' + best_epoch_str + '-' + best_score_str)
torch.save(optimizer.state_dict(), 'model/' + dictionary_name + '/optimizer-0.000')

## 预测部分

In [14]:
# Load the training graphs again for the prediction part (the second return value is discarded).
train_graph_list, _ = dgl.load_graphs('dgl_data/dgl_graphs_train.bin')

In [15]:
# Load the two test sets (A and B) of DGL graphs.
test_graph_list_A, _ = dgl.load_graphs('dgl_data/dgl_graphs_test_A.bin')
test_graph_list_B, _ = dgl.load_graphs('dgl_data/dgl_graphs_test_B.bin')

In [16]:
# Remove the 'label' node feature from the training graphs before they are
# concatenated with the test graphs.
# NOTE(review): this mutates train_graph_list in place; re-running the cell without
# reloading the graphs will presumably fail because 'label' is already gone.
for graph in train_graph_list:
    graph.ndata.pop('label')

In [17]:
# Concatenate the last 5 training graphs with both test sets, oldest first.
# NOTE(review): the 5 is hard-coded; confirm it matches the history the model needs.
graph_list = train_graph_list[-5:] + test_graph_list_A + test_graph_list_B
print(len(graph_list))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

12


In [18]:
# Single-model prediction: load one checkpoint and forecast the last 3 graphs.
model = new_model.GCN_LSTM()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# map_location lets a checkpoint saved on a GPU load when only the CPU is available.
model.load_state_dict(torch.load('model/best_score_0.351_time_11_12_11_37.pth', map_location=device))
model.to(device)
# Run the prediction (writes the CSV files under prediction/).
predict(graph_list, model, device, 3)
print("Done!")

Done!


In [None]:
# Ensemble prediction: average every checkpoint in the run folder whose score is
# between min_threshold and max_threshold, forecasting the last 4 graphs.
ensemble_learning(graph_list, 'model/Day24-Hour13-Minutes46', min_threshold=0.375, max_threshold=0.4, device=device,predict_length=4)
print("Done!")

In [None]:
# Read back the time-major pre-prediction rows written by predict().
pre_predict = pd.read_csv('prediction/pre_predict_unordered.csv', header=None)
pre_predict_tensor = torch.tensor(pre_predict.values)
# pre_predict_tensor = pre_predict_tensor.view(-1, 1140, 2)
print(pre_predict_tensor.shape)

# Reload the training graphs because an earlier cell removed their labels.
train_graph_list, _ = dgl.load_graphs('dgl_data/dgl_graphs_train.bin')

# Ground truth: labels of the newest training graphs.
# NOTE(review): the 4 is hard-coded and has to equal the predict_length used when
# the CSV was produced — confirm before trusting the score.
batched_graph = dgl.batch(train_graph_list[-(len(graph_list)-4):])
ture_label = batched_graph.ndata['label'].float()
# ture_label = ture_label.view(-1, 1140, 2)
print(ture_label.shape)

# Loss: MSE of column 0 plus MSE of column 1, as in training.
loss_func = nn.MSELoss()
loss = loss_func(pre_predict_tensor[:, 0], ture_label[:, 0].view_as(
                pre_predict_tensor[:, 0])) + loss_func(pre_predict_tensor[:, 1], ture_label[:, 1].view_as(pre_predict_tensor[:, 1]))
print('Loss:', loss.item())

# Score: 1 / (1 + sqrt(4 * loss)), the same formula the training functions use.
sqrt_loss = torch.sqrt(loss * 4)
score = 1 / (1 + sqrt_loss)
print('Score:', score.item())

In [19]:
# Build the submission file: take geohash_id and date_id from the test CSV and
# attach the two prediction columns from prediction/predict.csv.
import datetime

# Alternative input for test set A:
# test_df = pd.read_csv("data/node_test_4_A.csv", encoding='utf-8')
# predict_df = pd.read_csv("prediction/predict.csv", encoding='utf-8', header=None)

test_df = pd.read_csv("data/node_test_3_B.csv", encoding='utf-8')
predict_df = pd.read_csv("prediction/predict.csv", encoding='utf-8', header=None)

print("test_df.shape: ", test_df.shape)
print("predict_df.shape: ", predict_df.shape)

# The frames are joined positionally; a length mismatch would silently yield NaN rows.
if len(test_df) != len(predict_df):
    raise ValueError("test_df and predict_df have different numbers of rows")

# predict.csv holds the model's first output in column 0 and its second output in
# column 1; the submission wants consumption_level before activity_level, so the
# prediction columns are swapped.
predict_df = pd.concat([test_df.iloc[:, 0], predict_df.iloc[:, 1], predict_df.iloc[:, 0], test_df.iloc[:, 1]], axis=1)

predict_df.columns = ["geohash_id", "consumption_level", "activity_level", "date_id"] 

# Number of today's submission; bump manually to avoid overwriting an earlier file.
count = 1

# Current date, used in the file name.
now = datetime.datetime.now()
now = now.strftime("%Y-%m-%d")

# Make sure the output folder exists, then save.
os.makedirs("submitCSV", exist_ok=True)
predict_df.to_csv("submitCSV/submit_" + now + "_" + str(count) +".csv", index=False, header=True)

print("Done!")

test_df.shape:  (3420, 37)
predict_df.shape:  (3420, 2)
Done!


In [20]:
# Convert every file in submitCSV/ from comma-separated to tab-separated, in place.
import os
import re  # no longer used below; kept so the notebook namespace is unchanged

path = "submitCSV/"
files = os.listdir(path)

for file in files:
    # Skip sub-folders (e.g. .ipynb_checkpoints); open() on a directory would raise.
    if not os.path.isfile(path + file):
        continue
    with open(path + file, "r", encoding="utf-8") as f:
        data = f.read()
    # Plain string replacement is enough for a literal comma. Files that were
    # converted earlier contain no commas and are rewritten unchanged.
    data = data.replace(",", "\t")
    with open(path + file, "w", encoding="utf-8") as f:
        f.write(data)

print("Done!")

Done!
