In [None]:
from torch.nn import LSTM
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn as nn
import pandas as pd
import numpy as np
import random
import linecache
from tqdm import tqdm
import time
import pickle as pkl
import math
import matplotlib.pyplot as plt
from pathlib import Path
%matplotlib inline

In [None]:
# Root directory of the MathorCup dataset; every CSV in this notebook is read
# from or written to this folder (e.g. train_full.csv, train_demo.csv, test_demo.csv).
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
root_path = Path(r"D:\Dataset\MathorCup")

In [None]:
def sample_data():
    """Draw the rows of a random subset of cells from the full training CSV.

    Reads ``root_path / "train_full.csv"`` (GBK-encoded) in chunks of 500,000
    rows and keeps every row whose ``小区编号`` (cell ID) is one of 128 IDs
    drawn at random from the inclusive range ``[0, 118042]``. The IDs are drawn
    with replacement, so fewer than 128 distinct cells may end up selected.

    Returns:
        pandas.DataFrame: the selected rows with the original column names.
        An empty DataFrame is returned when the reader yields no chunks.
    """
    datapath = root_path / "train_full.csv"
    train_df_iter = pd.read_csv(datapath, chunksize=500000, encoding="gbk")
    # random.randint is inclusive on both ends.
    rand_list = [random.randint(0, 118042) for _ in range(128)]

    # Collect the matching slice of every chunk and concatenate once at the
    # end; concatenating inside the loop re-copies all accumulated rows on
    # every iteration.
    selected_chunks = [
        train_df.loc[train_df["小区编号"].isin(rand_list)]
        for train_df in tqdm(train_df_iter)
    ]
    if not selected_chunks:
        # pd.concat refuses an empty list; keep the "empty frame" result.
        return pd.DataFrame()
    return pd.concat(selected_chunks)

In [None]:
def data_filter(demo_df):
    """Normalise the raw sampled frame: English column names and a DateTime index.

    Steps:
      1. Rename the Chinese columns to ``Date``, ``Time``, ``ID``,
         ``Upload(GB)`` and ``Download(GB)``.
      2. Repair the date strings that appear truncated in the raw data
         (``'018-04-01'`` style values missing the leading ``2``).
      3. Combine ``Date`` and ``Time`` into a parsed ``DateTime`` column and
         use it as the index.

    Args:
        demo_df: DataFrame with the columns ``日期``, ``时间``, ``小区编号``,
            ``上行业务量GB`` and ``下行业务量GB``. It is not modified.

    Returns:
        pandas.DataFrame: a new frame indexed by ``DateTime``.
    """
    demo_df = demo_df.rename(columns={"日期": "Date", "时间":"Time", "小区编号":"ID", "上行业务量GB":"Upload(GB)", "下行业务量GB":"Download(GB)"})
    # Truncated date -> canonical date. '018-04-04' previously mapped to
    # "2018/4/3", which silently merged April 4 into April 3 and produced
    # duplicated timestamps for that day.
    mapping = {
        '018-04-01': "2018/4/1",
        '018-04-02': "2018/4/2",
        '018-04-03': "2018/4/3",
        '018-04-04': "2018/4/4",
        '018-04-08': "2018/4/8",
        '018-04-09': "2018/4/9",
        '018-04-10': "2018/4/10",
    }
    # Values that are not in the mapping are kept as they are.
    demo_df["Date"] = demo_df["Date"].map(lambda raw_date: mapping.get(raw_date, raw_date))
    demo_df["DateTime"] = pd.to_datetime(demo_df["Date"] + ' ' + demo_df["Time"])
    return demo_df.set_index("DateTime")
# t_demo_df = data_filter(temp_demo_df)
# t_demo_df.head()

In [None]:
def save_df(df, name):
    """Write ``df`` as a CSV file called ``name`` inside ``root_path``.

    Uses the pandas ``to_csv`` defaults, so the index is written as well.
    """
    destination = root_path / name
    df.to_csv(destination)

In [None]:
def read_test_data():
    """Load the cached test sample ``root_path / "test_demo.csv"``.

    Returns:
        pandas.DataFrame: the CSV contents with the ``DateTime`` column as index.
    """
    return pd.read_csv(root_path / "test_demo.csv", index_col="DateTime")

In [None]:
def read_train_data():
    """Load the cached training sample ``root_path / "train_demo.csv"``.

    Returns:
        pandas.DataFrame: the CSV contents with the ``DateTime`` column as index.
    """
    # No parse_dates/reindexing is applied here; the frame is returned as read.
    return pd.read_csv(root_path / "train_demo.csv", index_col="DateTime")
# demo_df = read_train_data()
# demo_df.head()

In [None]:
def make_data(df):
    """Build padded per-cell training tensors directly from a filtered DataFrame.

    Rows with a missing ``Upload(GB)`` or ``Download(GB)`` value are discarded,
    the remaining rows are ordered by ``ID`` then ``DateTime``, and each cell's
    series is zero-padded at the end to the length of the longest series.

    Args:
        df: DataFrame with ``ID``, ``Upload(GB)`` and ``Download(GB)`` columns
            and ``DateTime`` available as a column or index level name.

    Returns:
        tuple: ``(index, data_pad, data_lengths)`` where
            * ``index`` is the list of cell IDs, one per row of ``data_pad``;
            * ``data_pad`` is a float32 tensor of shape
              ``(num_ids, max_length, 2)`` holding (upload, download) pairs;
            * ``data_lengths`` is the list of unpadded series lengths.

    Raises:
        ValueError: if no rows remain after discarding the missing values.
    """
    value_columns = ["Upload(GB)", "Download(GB)"]

    def padding(data):
        """Zero-pad a list of (steps, features) arrays to a common step count."""
        lengths = [item.shape[0] for item in data]
        if not lengths:
            raise ValueError("make_data: no rows left after dropping missing traffic values")
        max_length = max(lengths)
        data_new = np.zeros((len(data), max_length, data[0].shape[1]))
        for row, item in enumerate(data):
            data_new[row, :item.shape[0], :] = item
        return data_new, lengths

    # dropna removes exactly the offending rows. Dropping by index label
    # (df.drop(bad_rows.index)) would also delete every other cell's row that
    # shares the same DateTime label, because the index is not unique.
    df = df.dropna(subset=value_columns)
    data_df = df.sort_values(by=["ID", "DateTime"])

    index = []
    data = []
    # sort=False keeps the order of first appearance, i.e. the sorted ID order.
    for cell_id, cell_df in data_df.groupby("ID", sort=False):
        index.append(cell_id)
        data.append(cell_df[value_columns].values)

    data_pad, data_lengths = padding(data)
    return index, torch.tensor(data_pad, dtype=torch.float32), data_lengths
# index, data_pad, data_lengths = make_data(demo_df)
# data_pad.shape, len(data_lengths), len(index)

In [None]:
# pkl.dump(["index, data_pad_T, data_lengths", index, data_pad_T, data_lengths], open(r"D:\Dataset\MathorCup\train_data.pkl", "wb"))

In [None]:
# _, index, data_pad_T, data_lengths = pkl.load(open(r"D:\Dataset\MathorCup\train_data.pkl", "rb"))
# _

# 模型处理

## LSTM模型的输入必须是三维张量
- 当 `batch_first=True` 时，形状为 (batch_size, time_sequence, features)

In [None]:
class MathorCup(torch.nn.Module):
    """LSTM followed by a linear layer applied to every time step.

    Args:
        input_size: number of features per time step fed to the LSTM.
        hidden_size: LSTM hidden state size.
        output_size: number of values produced per time step.
        batch_first: forwarded to ``nn.LSTM``; with ``True`` the input is
            ``(batch, time, features)``.
        dropout: dropout between stacked LSTM layers; only forwarded to
            ``nn.LSTM`` when ``num_layers > 1``.
        cuda: request the GPU. When CUDA is unavailable a warning is issued
            and the model stays on the CPU.
        num_layers: number of stacked LSTM layers.

    Attributes:
        use_cuda: whether the model was actually placed on the GPU.
        device: the ``torch.device`` chosen at construction time.
    """

    def __init__(self, input_size, hidden_size, output_size, batch_first=True, dropout=0.5, cuda=False, num_layers=2):
        super(MathorCup, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_first = batch_first
        self.dropout = dropout
        self.num_layers = num_layers

        # The flag is stored as `use_cuda`: assigning to `self.cuda` would
        # replace nn.Module.cuda() with a bool and break `model.cuda()`.
        self.use_cuda = bool(cuda) and torch.cuda.is_available()
        if cuda and not self.use_cuda:
            import warnings
            warnings.warn("CUDA was requested but is not available; MathorCup will run on the CPU.")
        self.device = torch.device("cuda" if self.use_cuda else "cpu")

        # nn.LSTM only applies dropout between layers and warns when a
        # non-zero value is given for a single-layer network.
        lstm_dropout = self.dropout if self.num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, batch_first=self.batch_first, dropout=lstm_dropout, num_layers=self.num_layers)
        self.output = nn.Linear(in_features=self.hidden_size, out_features=self.output_size)
        self.to(self.device)

    def forward(self, x):
        """Run the LSTM and the linear layer; the result is returned on the CPU."""
        # Follow the parameters' current device so the model keeps working
        # after being moved with .to()/.cuda()/.cpu().
        x = x.to(self.output.weight.device)
        hidden_seq, _ = self.lstm(x)
        return self.output(hidden_seq).cpu()

In [None]:
def data_sample(data, lengths, index, offset=None):
    """Gather the lagged inputs and their next-step targets for one time index.

    By default the lags are the previous 5 hours, 4 hours around one day
    before, 2 hours around two days before and 2 hours around one week before
    (oldest lag first).

    Args:
        data: tensor indexed as ``data[:, time_step, :]``.
        lengths: unpadded series lengths; accepted for interface
            compatibility and not used here.
        index: the reference time step.
        offset: optional iterable of integer lags added to ``index``. Used in
            the given order.

    Returns:
        tuple: ``(data_samp, data_pred)``, both of shape
        ``(data.shape[0], len(offset), data.shape[2])``. ``data_samp[:, k]`` is
        ``data[:, index + offset[k]]`` and ``data_pred[:, k]`` is the step
        right after it. Whenever ``index + offset[k]`` is negative, both the
        input slot and its target slot are filled with zeros.
    """
    # `is None` instead of `== None`: the latter is evaluated element-wise for
    # array-like offsets and makes the `if` fail.
    if offset is None:
        offset = [-169, -168, -49, -48, -27, -26, -25, -24, -5, -4, -3, -2, -1]

    positions = [int(index + lag) for lag in offset]
    blank = data.new_zeros((data.shape[0], data.shape[2]))

    def gather(shift):
        """Stack ``data[:, pos + shift]`` per lag, zeros where ``pos`` is negative."""
        return torch.stack(
            [data[:, pos + shift, :] if pos >= 0 else blank for pos in positions],
            dim=1,
        )

    data_samp = gather(0)
    data_pred = gather(1)
    return data_samp, data_pred
# data_samp, data_pred = data_sample(data_pad_T, data_lengths, 0)
# data_samp, data_pred = data_sample(data_pad, data_lengths, 16)
# data_samp.shape, data_pred.shape

In [None]:
def train(df, cuda=True, epoch=5, lr=0.0001, clip_value=0.001, hidden_num=32, log_per=1000, num_layers=1):
    """Train a ``MathorCup`` model on the padded series built from ``df``.

    One optimisation step is taken per time index per epoch: the lagged
    inputs and next-step targets come from ``data_sample`` and the loss is the
    MSE between the model output and the targets.

    Args:
        df: training DataFrame accepted by ``make_data``.
        cuda: forwarded to ``MathorCup``.
        epoch: number of passes over all time indices.
        lr: Adam learning rate.
        clip_value: gradients are clipped element-wise to ``[-clip_value, clip_value]``.
        hidden_num: LSTM hidden size.
        log_per: print the mean loss every ``log_per`` steps.
        num_layers: number of LSTM layers.

    Returns:
        MathorCup: the trained model. Training stops early, and the model is
        returned as it was before the offending step, if the loss becomes NaN.
    """
    _, data_pad, lengths = make_data(df)
    mc = MathorCup(2, hidden_num, 2, cuda=cuda, num_layers=num_layers)
    optim = torch.optim.Adam(mc.parameters(), lr=lr)
    loss_func = nn.MSELoss()
    counter = 0
    local_loss = []
    for epo in range(epoch):
        for step in range(data_pad.shape[1]):
            counter += 1
            data_samp, data_pred = data_sample(data_pad, lengths, step)
            output = mc(data_samp)
            loss = loss_func(output, data_pred)
            loss_value = loss.item()
            # Check before backward/step: a NaN loss would otherwise write NaN
            # into the weights. Return instead of `break`, which only left the
            # inner loop and let the remaining epochs run on a broken model.
            if math.isnan(loss_value):
                print(f"nan Counter: {counter}, index: {step}")
                return mc
            optim.zero_grad()
            loss.backward()
            # Element-wise gradient clipping.
            torch.nn.utils.clip_grad_value_(mc.parameters(), clip_value)
            optim.step()
            local_loss.append(loss_value)
            # `counter` is already 1-based here, so no further +1 is needed.
            if counter % log_per == 0:
                print(f"Epoch:{epo}, Counter:{counter}, Local Loss:{np.mean(local_loss)}")
                local_loss = []
    return mc
# mc = train(demo_df)

In [None]:
def test(model, df):
    """Run the model over every time index of ``df`` and collect predictions.

    Args:
        model: trained ``torch.nn.Module`` taking the samples produced by
            ``data_sample``.
        df: test DataFrame accepted by ``make_data``.

    Returns:
        list: one ``(prediction, target)`` tuple per time index, each element
        being the last lag slot of the model output / target tensor.
    """
    _, data_pad, lengths = make_data(df)
    plot_list = []
    # Evaluate in eval mode without autograd, then restore the previous mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Iterate exactly over the available time steps. The previous
            # range(max(steps, 100)) indexed past the end of the series
            # whenever fewer than 100 steps were available.
            for step in tqdm(range(data_pad.shape[1])):
                data_samp, data_pred = data_sample(data_pad, lengths, step)
                output = model(data_samp)
                plot_list.append((output[:, -1, :], data_pred[:, -1, :]))
    finally:
        model.train(was_training)
    return plot_list

# plot_list = test(mc, t_demo_df)
# plot_list[0]

In [None]:
def sample_plot(plot_list, sample_id=None, plot_range=None):
    """Plot raw vs. predicted upload/download traffic for one sample.

    Args:
        plot_list: list of ``(prediction, target)`` tensor pairs, one per time
            step, as returned by ``test``.
        sample_id: row of the batch dimension to plot; a random valid one is
            picked when omitted.
        plot_range: ``[start, stop]`` window of time steps; a random
            100-step window inside the available steps is picked when omitted.

    Returns:
        matplotlib.figure.Figure: figure with the upload plot on top and the
        download plot below.
    """
    # Indexed below as (time_step, prediction-or-target, sample, feature).
    new_plot = torch.stack([torch.stack(item) for item in plot_list])
    n_steps, n_samples = new_plot.shape[0], new_plot.shape[2]

    # Derive the random bounds from the data instead of assuming 128 samples
    # and roughly 1000 steps.
    if sample_id is None:
        sample_id = random.randint(0, n_samples - 1)
    if plot_range is None:
        start = random.randint(0, max(n_steps - 100, 0))
        plot_range = [start, start + 100]
    print(sample_id)
    print(plot_range)

    # Element 0 of every pair is the model output and element 1 the target;
    # the legend previously labelled them the other way round.
    predicted = new_plot[:, 0, sample_id, :].detach().numpy()
    actual = new_plot[:, 1, sample_id, :].detach().numpy()
    window = slice(plot_range[0], plot_range[1])

    fig, ax = plt.subplots(2, 1, squeeze=False)
    for row, label in enumerate(["Upload(GB)", "Download(GB)"]):
        axis = ax[row][0]
        axis.plot(actual[window, row])
        axis.plot(predicted[window, row], color='red', linestyle="--")
        axis.legend(["Raw", "Pred"])
        axis.set_title(f"{label} Prediction Plot From {plot_range[0]} To {plot_range[1]} In Sample {sample_id}")
    plt.tight_layout()
    plt.show()
    return fig
# new_plot_x.detach().numpy()
# fig = sample_plot(plot_list)


In [None]:
def _load_or_sample(reader, cache_name):
    """Return ``reader()``; if the cache cannot be read, resample and save it as ``cache_name``."""
    try:
        return reader()
    except (OSError, ValueError) as err:
        # OSError covers a missing/unreadable file, ValueError covers pandas
        # parsing problems. Other exceptions (e.g. KeyboardInterrupt) are no
        # longer swallowed.
        print(f"Start Sampling...(About 3 mins) [could not read {cache_name}: {err}]")
        sampled_df = data_filter(sample_data())
        save_df(sampled_df, cache_name)
        return sampled_df


def main():
    """Load (or sample) the test and train sets, train the model and evaluate it.

    Returns:
        list: the ``(prediction, target)`` pairs produced by ``test``.
    """
    print("Reading Test Data...")
    test_df = _load_or_sample(read_test_data, "test_demo.csv")
    print("Reading Train Data...")
    train_df = _load_or_sample(read_train_data, "train_demo.csv")
    print("Training...")
    # Use the GPU only when one is actually available.
    mc = train(train_df, cuda=torch.cuda.is_available(), epoch=5, lr=0.0001, clip_value=0.001, hidden_num=32, log_per=100, num_layers=1)
    print("Testing")
    plot_list = test(mc, test_df)
    return plot_list
# Run the full pipeline (load or sample data, train, test) and keep the test results for plotting.
plot_list = main()

In [None]:
# Plot one sample; with no sample_id/plot_range given, both are chosen at random.
fig = sample_plot(plot_list)