In [None]:
import os

import dgl
import pandas as pd
import torch

In [None]:
def process_data(file_path, edge_path):
    """Load the training node/edge CSVs and re-index geohash and date ids.

    Every distinct ``geohash_id`` and ``date_id`` in the node file is numbered
    0, 1, 2, ... in order of first appearance. The same numbering is applied
    to the edge file's ``geohash6_point1``, ``geohash6_point2`` and ``date_id``
    columns. Rows containing any NaN (including edge rows whose endpoint or
    date is absent from the node file) are dropped.

    Args:
        file_path: Path of the node CSV (needs ``geohash_id`` and ``date_id``).
        edge_path: Path of the edge CSV (needs ``geohash6_point1``,
            ``geohash6_point2`` and ``date_id``).

    Returns:
        Tuple ``(node_df, edge_df, geohash_df_dict, number_date)`` where
        ``geohash_df_dict`` maps raw geohash -> integer id and ``number_date``
        is the number of distinct dates in the node file.
    """
    node_df = pd.read_csv(file_path, encoding='utf-8')
    edge_df = pd.read_csv(edge_path, encoding='utf-8')

    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # ids are the same ones an incremental counter would hand out.
    geohash_df_dict = {raw: idx for idx, raw in enumerate(dict.fromkeys(node_df["geohash_id"]))}
    date_df_dict = {raw: idx for idx, raw in enumerate(dict.fromkeys(node_df["date_id"]))}
    number_date = len(date_df_dict)

    # Replace raw ids with their integer codes; unknown values become NaN
    node_df["geohash_id"] = node_df["geohash_id"].map(geohash_df_dict)
    node_df["date_id"] = node_df["date_id"].map(date_df_dict)
    edge_df["geohash6_point1"] = edge_df["geohash6_point1"].map(geohash_df_dict)
    edge_df["geohash6_point2"] = edge_df["geohash6_point2"].map(geohash_df_dict)
    edge_df["date_id"] = edge_df["date_id"].map(date_df_dict)

    # Drop rows with any NaN (unmapped ids or missing features)
    node_df = node_df.dropna()
    edge_df = edge_df.dropna()

    # A column that held NaN before dropna is float64; cast every id column
    # (date_id included) back to int64 so it can be used as an index.
    node_df = node_df.astype({"geohash_id": "int64", "date_id": "int64"})
    edge_df = edge_df.astype(
        {"geohash6_point1": "int64", "geohash6_point2": "int64", "date_id": "int64"}
    )

    return node_df, edge_df, geohash_df_dict, number_date

In [None]:
def process_test_data(file_path, edge_path, pre_geohash_df_dict, pre_number_date):
    """Load the test node/edge CSVs using the id numbering from training.

    Geohashes are encoded with ``pre_geohash_df_dict`` (not modified here).
    Distinct ``date_id`` values of the test node file are numbered in order of
    first appearance, starting at ``pre_number_date``. Rows containing any NaN
    (including geohashes or dates that could not be mapped) are dropped.

    Args:
        file_path: Path of the test node CSV.
        edge_path: Path of the test edge CSV.
        pre_geohash_df_dict: Mapping raw geohash -> integer id.
        pre_number_date: First integer id to hand out to a test date.

    Returns:
        Tuple ``(node_df, edge_df)`` with integer-encoded id columns.
    """
    node_df = pd.read_csv(file_path, encoding='utf-8')
    edge_df = pd.read_csv(edge_path, encoding='utf-8')

    geohash_df_dict = pre_geohash_df_dict
    # Continue the date numbering after the ids already in use
    date_df_dict = {
        raw: pre_number_date + offset
        for offset, raw in enumerate(dict.fromkeys(node_df["date_id"]))
    }

    # Replace raw ids with their integer codes; unknown values become NaN
    node_df["geohash_id"] = node_df["geohash_id"].map(geohash_df_dict)
    node_df["date_id"] = node_df["date_id"].map(date_df_dict)
    edge_df["geohash6_point1"] = edge_df["geohash6_point1"].map(geohash_df_dict)
    edge_df["geohash6_point2"] = edge_df["geohash6_point2"].map(geohash_df_dict)
    edge_df["date_id"] = edge_df["date_id"].map(date_df_dict)

    # Drop rows with any NaN (unmapped ids or missing features)
    node_df = node_df.dropna()
    edge_df = edge_df.dropna()

    # Columns that held NaN before dropna are float64; cast all id columns
    # back to int64 so they can be used as indices downstream.
    node_df = node_df.astype({"geohash_id": "int64", "date_id": "int64"})
    edge_df = edge_df.astype(
        {"geohash6_point1": "int64", "geohash6_point2": "int64", "date_id": "int64"}
    )

    return node_df, edge_df

In [None]:
# Load the training nodes and edges; keep the geohash -> id mapping and the
# number of distinct training dates for encoding the test set later.
(
    train_node_data,
    train_edge_data,
    geohash_df_dict,
    number_date,
) = process_data(file_path='data/train_90.csv', edge_path='data/edge_90.csv')

In [None]:
# Load the test nodes and edges, encoded with the training geohash mapping and
# with date ids continuing after the training dates.
(
    test_node_data,
    test_edge_data,
) = process_test_data(
    file_path='data/node_test_4_A.csv',
    edge_path='data/edge_test_4_A.csv',
    pre_geohash_df_dict=geohash_df_dict,
    pre_number_date=number_date,
)

In [None]:
# Keep only the columns of train_node_data that contain at least one non-zero value
train_node_data = train_node_data.loc[:, train_node_data.ne(0).any(axis=0)]

In [None]:
# Keep only the columns of test_node_data that contain at least one non-zero value
test_node_data = test_node_data.loc[:, test_node_data.ne(0).any(axis=0)]

In [None]:
# Node ids 0..1139 as float32 (stored on every training graph as 'node_id')
train_node_id = torch.arange(0, 1140, dtype=torch.float32)

# Per-date, per-node accumulator for the two edge features: (dates, nodes, 2)
train_temp = torch.zeros((90, 1140, 2), dtype=torch.float32)

# Add each edge's (F_1, F_2) onto both of its endpoints for the edge's date.
# Done with index_put_(accumulate=True) instead of a row-by-row iterrows loop:
# iterrows upcasts mixed int/float rows to float64, so the endpoint and date
# values would reach the tensor index as floats. The explicit int64 conversion
# keeps the indices valid, and accumulate=True sums repeated (date, node) pairs.
edge_dates = torch.as_tensor(train_edge_data['date_id'].to_numpy(dtype='int64'))
edge_feats = torch.as_tensor(train_edge_data[['F_1', 'F_2']].to_numpy(dtype='float32'))
for endpoint_col in ('geohash6_point1', 'geohash6_point2'):
    endpoint_ids = torch.as_tensor(train_edge_data[endpoint_col].to_numpy(dtype='int64'))
    train_temp.index_put_((edge_dates, endpoint_ids), edge_feats, accumulate=True)

In [None]:
# Node ids 0..1139 as float32 (stored on every test graph as 'node_id')
test_node_id = torch.arange(0, 1140, dtype=torch.float32)

# Per-date, per-node accumulator for the two edge features: (dates, nodes, 2)
test_temp = torch.zeros((4, 1140, 2), dtype=torch.float32)

# Test date ids continue after the training ones, so subtracting number_date
# (the count of training dates returned by process_data) gives a 0-based index.
# This replaces the hard-coded offset 90.
# Accumulation uses index_put_(accumulate=True) with explicit int64 indices
# rather than iterrows, which would hand float64 values to the tensor index.
edge_dates = torch.as_tensor(test_edge_data['date_id'].to_numpy(dtype='int64')) - number_date
edge_feats = torch.as_tensor(test_edge_data[['F_1', 'F_2']].to_numpy(dtype='float32'))
for endpoint_col in ('geohash6_point1', 'geohash6_point2'):
    endpoint_ids = torch.as_tensor(test_edge_data[endpoint_col].to_numpy(dtype='int64'))
    test_temp.index_put_((edge_dates, endpoint_ids), edge_feats, accumulate=True)

In [None]:
# Z-score the node columns at positions 2..34, each column with its own mean/std
train_node_data.iloc[:, 2:35] = train_node_data.iloc[:, 2:35].pipe(
    lambda block: (block - block.mean()) / block.std()
)

# Z-score the edge columns at positions 2..3
# (presumably F_1 and F_2; verify against the edge CSV column order)
train_edge_data.iloc[:, 2:4] = train_edge_data.iloc[:, 2:4].pipe(
    lambda block: (block - block.mean()) / block.std()
)

# Z-score the accumulated edge-feature tensor with one global mean/std
train_temp = train_temp.sub(train_temp.mean()).div(train_temp.std())

# Preview the first five rows/slices of each standardised object
for preview in (train_node_data.head(), train_edge_data.head(), train_temp[0:5]):
    print(preview)

In [None]:
# Z-score the node columns at positions 2..34, each column with its own mean/std
# NOTE(review): the statistics come from the test set itself, not from training — confirm this is intended
test_node_data.iloc[:, 2:35] = test_node_data.iloc[:, 2:35].pipe(
    lambda block: (block - block.mean()) / block.std()
)

# Z-score the edge columns at positions 2..3
# (presumably F_1 and F_2; verify against the edge CSV column order)
test_edge_data.iloc[:, 2:4] = test_edge_data.iloc[:, 2:4].pipe(
    lambda block: (block - block.mean()) / block.std()
)

# Z-score the accumulated edge-feature tensor with one global mean/std
test_temp = test_temp.sub(test_temp.mean()).div(test_temp.std())

# Preview the first five rows/slices of each standardised object
for preview in (test_node_data.head(), test_edge_data.head(), test_temp[0:5]):
    print(preview)

In [None]:
# Disabled experiment: add Gaussian noise (scale 0.01) to the training data.
# The whole cell is commented out and has no effect when run.

# # Add noise to the 3rd through 35th columns of node_data
# train_node_data.iloc[:, 2:35] = train_node_data.iloc[:, 2:35].values + torch.randn(train_node_data.shape[0], 33).numpy() * 0.01

# # Add noise to the 3rd and 4th columns of edge_data
# train_edge_data.iloc[:, 2:4] = train_edge_data.iloc[:, 2:4].values + torch.randn(train_edge_data.shape[0], 2).numpy() * 0.01

# # Add noise to temp
# train_temp = train_temp + torch.randn(train_temp.shape[0], 1140, 2).numpy() * 0.01

# # Print the first 5 rows of node_data
# print(train_node_data.head())

# # Print the first 5 rows of edge_data
# print(train_edge_data.head())

# # Print the first 5 slices of temp
# print(train_temp[0:5])

In [None]:
# Append seven one-hot weekday columns (Monday..Sunday) to train_node_data.
# The weekday is derived from date_id % 7; the remainder -> weekday table is
# the one this notebook uses (remainder 0 is Wednesday).
# Vectorised comparisons replace the per-row iterrows/.loc loop. Column order
# and the int64 0/1 values are unchanged.
weekday_by_remainder = (
    ('Monday', 5),
    ('Tuesday', 6),
    ('Wednesday', 0),
    ('Thursday', 1),
    ('Friday', 2),
    ('Saturday', 3),
    ('Sunday', 4),
)
date_mod7 = train_node_data['date_id'] % 7
for day_name, remainder in weekday_by_remainder:
    train_node_data[day_name] = (date_mod7 == remainder).astype('int64')

# Append a 0/1 'Holiday' flag: 1 when the row's date_id is in holiday_list
holiday_list = [3, 4, 10, 11, 17, 18, 19, 20, 21, 22, 23, 31, 32, 38, 39, 45, 46, 52, 53, 59, 60, 66, 67, 73, 74, 80, 81, 87, 88, 91, 94, 95]
train_node_data['Holiday'] = train_node_data['date_id'].isin(holiday_list).astype('int64')

# Preview the first five rows of node_data
print(train_node_data.head())

In [None]:
# Append seven one-hot weekday columns (Monday..Sunday) to test_node_data.
# The weekday is derived from date_id % 7; the remainder -> weekday table is
# the one this notebook uses (remainder 0 is Wednesday).
# Vectorised comparisons replace the per-row iterrows/.loc loop. Column order
# and the int64 0/1 values are unchanged.
weekday_by_remainder = (
    ('Monday', 5),
    ('Tuesday', 6),
    ('Wednesday', 0),
    ('Thursday', 1),
    ('Friday', 2),
    ('Saturday', 3),
    ('Sunday', 4),
)
date_mod7 = test_node_data['date_id'] % 7
for day_name, remainder in weekday_by_remainder:
    test_node_data[day_name] = (date_mod7 == remainder).astype('int64')

# Append a 0/1 'Holiday' flag: 1 when the row's date_id is in holiday_list
holiday_list = [3, 4, 10, 11, 17, 18, 19, 20, 21, 22, 23, 31, 32, 38, 39, 45, 46, 52, 53, 59, 60, 66, 67, 73, 74, 80, 81, 87, 88, 91, 94, 95]
test_node_data['Holiday'] = test_node_data['date_id'].isin(holiday_list).astype('int64')

# Preview the first five rows of node_data
print(test_node_data.head())

In [None]:
# Split the training nodes and edges into one group per date_id
train_date_groups = train_node_data.groupby(by='date_id')
train_date_edge_groups = train_edge_data.groupby(by='date_id')

In [None]:
# Split the test nodes and edges into one group per date_id
test_date_groups = test_node_data.groupby(by='date_id')
test_date_edge_groups = test_edge_data.groupby(by='date_id')

In [None]:
# List that collects one DGL graph per training date
train_dgl_graphs = []

# Build one graph for each date group of the training nodes
for date, group in train_date_groups:
    # Slice the node frame by column position: 2:35 -> node features,
    # 35:37 -> labels, 37: -> everything after the labels.
    # NOTE(review): presumably 37: holds the weekday/holiday flags appended earlier — confirm against the CSV schema
    nodes_feature = group.iloc[:, 2:35].values
    date_feature = group.iloc[:, 37:].values
    nodes_labels = group.iloc[:, 35:37].values
    # All columns of the edges recorded for this date (requires the date to exist in the edge groups)
    edges_feature = train_date_edge_groups.get_group(date).values
    
    # Start from an empty DGL graph
    g = dgl.DGLGraph()
    
    # Convert the NumPy arrays to PyTorch tensors on the same device as the graph
    nodes_feature = torch.tensor(nodes_feature, device=g.device)
    date_feature = torch.tensor(date_feature, device=g.device)
    nodes_labels = torch.tensor(nodes_labels, device=g.device)
    edges_feature = torch.tensor(edges_feature, device=g.device)
    
    # Final node features = [accumulated edge features for this date | node features | date features].
    # The concatenation needs the group to have as many rows as train_temp[date] (1140).
    nodes_feature = torch.cat([train_temp[date], nodes_feature, date_feature], dim=1)
    
    # Add one node per feature row
    g.add_nodes(nodes_feature.shape[0])
    
    # Add every edge in both directions; columns 0 and 1 of edges_feature are used as endpoints
    g.add_edges(edges_feature[:, 0].long(), edges_feature[:, 1].long())
    g.add_edges(edges_feature[:, 1].long(), edges_feature[:, 0].long())
    
    # Attach node features, labels and node ids
    g.ndata['feat'] = nodes_feature
    g.ndata['label'] = nodes_labels
    g.ndata['node_id'] = train_node_id
    
    # Edge features (columns 2:4) are stacked twice so forward and reverse edges share the same values
    g.edata['feat'] = torch.cat([edges_feature[:,2:4], edges_feature[:,2:4]], dim=0)
    
    # Collect the finished graph
    train_dgl_graphs.append(g)

In [None]:
# List that collects one DGL graph per test date
test_dgl_graphs = []

# Build one graph for each date group of the test nodes
for date, group in test_date_groups:
    # Node features are every column from position 2 onward (no label columns are split off here)
    nodes_feature = group.iloc[:, 2:].values
    # All columns of the edges recorded for this date (requires the date to exist in the edge groups)
    edges_feature = test_date_edge_groups.get_group(date).values
    
    # Start from an empty DGL graph
    g = dgl.DGLGraph()
    
    # Convert the NumPy arrays to PyTorch tensors on the same device as the graph
    nodes_feature = torch.tensor(nodes_feature, device=g.device)
    edges_feature = torch.tensor(edges_feature, device=g.device)
    
    # Prepend the accumulated edge features for this date; date - 90 turns the date id into an index into test_temp.
    # NOTE(review): 90 is presumably the number of training dates (number_date) — confirm
    nodes_feature = torch.cat([test_temp[date - 90], nodes_feature], dim=1)
    
    # Add one node per feature row
    g.add_nodes(nodes_feature.shape[0])
    
    # Add every edge in both directions; columns 0 and 1 of edges_feature are used as endpoints
    g.add_edges(edges_feature[:, 0].long(), edges_feature[:, 1].long())
    g.add_edges(edges_feature[:, 1].long(), edges_feature[:, 0].long())
    
    # Attach node features and node ids (the test graphs carry no labels)
    g.ndata['feat'] = nodes_feature
    g.ndata['node_id'] = test_node_id
    
    # Edge features (columns 2:4) are stacked twice so forward and reverse edges share the same values
    g.edata['feat'] = torch.cat([edges_feature[:,2:4], edges_feature[:,2:4]], dim=0)
    
    # Collect the finished graph
    test_dgl_graphs.append(g)

In [None]:
# Display the list of training DGL graphs as the cell output
train_dgl_graphs

In [None]:
# Display the list of test DGL graphs as the cell output
test_dgl_graphs

In [None]:
# Save the training graphs to disk. The output folder is created first so the
# save does not fail on a fresh checkout where 'dgl_data/' does not exist yet.
os.makedirs('dgl_data', exist_ok=True)
dgl.save_graphs('dgl_data/dgl_graphs_train.bin', train_dgl_graphs)

In [None]:
# Save the test graphs to disk. The output folder is created first so the
# save does not fail on a fresh checkout where 'dgl_data/' does not exist yet.
os.makedirs('dgl_data', exist_ok=True)
dgl.save_graphs('dgl_data/dgl_graphs_test.bin', test_dgl_graphs)

In [None]:
# Reload the training graphs from disk; the second return value is discarded.
# NOTE(review): binding it to `_` shadows IPython's last-output variable in this kernel — consider another throwaway name
train_dgl_graphs, _ = dgl.load_graphs('dgl_data/dgl_graphs_train.bin')

In [None]:
# Reload the test graphs from disk; the second return value is discarded.
# NOTE(review): binding it to `_` shadows IPython's last-output variable in this kernel — consider another throwaway name
test_dgl_graphs, _ = dgl.load_graphs('dgl_data/dgl_graphs_test.bin')

In [None]:
# Inspect the first training graph: its summary, then the first node's features and label
first_train_graph = train_dgl_graphs[0]
print(first_train_graph)
print(first_train_graph.ndata['feat'][0])
print(first_train_graph.ndata['label'][0])

# Inspect the first test graph: its summary, then the first node's features
first_test_graph = test_dgl_graphs[0]
print(first_test_graph)
print(first_test_graph.ndata['feat'][0])