In [1]:
import pandas as pd
import dgl
import torch

In [2]:
def process_data(file_path, edge_path):
    """Load the training node/edge CSVs and re-index geohashes and dates as integers.

    Every distinct ``geohash_id`` and ``date_id`` found in the node file is given a
    consecutive integer id (starting at 0) in order of first appearance. The same
    two mappings are then applied to the edge file. Rows that contain NaN after
    mapping (e.g. an edge whose endpoint or date does not occur in the node file)
    are dropped.

    Args:
        file_path: Path of the node CSV; needs ``geohash_id`` and ``date_id`` columns.
        edge_path: Path of the edge CSV; needs ``geohash6_point1``,
            ``geohash6_point2`` and ``date_id`` columns.

    Returns:
        tuple: ``(df, edge_df, geohash_df_dict, number_date)`` where ``df`` and
        ``edge_df`` are the re-indexed frames, ``geohash_df_dict`` maps each raw
        geohash to its integer id, and ``number_date`` is the number of distinct
        dates that were indexed.
    """
    df = pd.read_csv(file_path, encoding='utf-8')
    edge_df = pd.read_csv(edge_path, encoding='utf-8')

    # unique() keeps first-appearance order, so the ids match a manual
    # "assign the next counter value to every unseen key" loop. Missing keys are
    # skipped: they map to NaN below and their rows are removed by dropna().
    geohash_df_dict = {
        key: idx
        for idx, key in enumerate(df["geohash_id"].dropna().unique().tolist())
    }
    date_df_dict = {
        key: idx
        for idx, key in enumerate(df["date_id"].dropna().unique().tolist())
    }
    number_date = len(date_df_dict)

    # Replace the raw identifiers by their integer ids.
    df["geohash_id"] = df["geohash_id"].map(geohash_df_dict)
    df["date_id"] = df["date_id"].map(date_df_dict)
    edge_df["geohash6_point1"] = edge_df["geohash6_point1"].map(geohash_df_dict)
    edge_df["geohash6_point2"] = edge_df["geohash6_point2"].map(geohash_df_dict)
    edge_df["date_id"] = edge_df["date_id"].map(date_df_dict)

    # Unmapped keys became NaN, which turns the whole column into float64.
    # Drop those rows and cast every index column back to int64 so the values
    # can be used directly as tensor / array indices. Doing it in a single
    # astype() call also avoids assigning into the result of dropna().
    df = df.dropna().astype({"geohash_id": "int64", "date_id": "int64"})
    edge_df = edge_df.dropna().astype(
        {"geohash6_point1": "int64", "geohash6_point2": "int64", "date_id": "int64"}
    )

    return df, edge_df, geohash_df_dict, number_date

In [3]:
def process_test_data(file_path, edge_path, pre_geohash_df_dict, pre_number_date):
    """Load a test node/edge CSV pair and re-index it with an existing geohash mapping.

    Geohashes are translated with ``pre_geohash_df_dict`` (which is only read,
    never modified). The distinct ``date_id`` values of the node file receive
    consecutive integers starting at ``pre_number_date`` in order of first
    appearance, and the same date mapping is applied to the edge file. Rows that
    contain NaN after mapping (unknown geohash or date) are dropped.

    Args:
        file_path: Path of the node CSV; needs ``geohash_id`` and ``date_id`` columns.
        edge_path: Path of the edge CSV; needs ``geohash6_point1``,
            ``geohash6_point2`` and ``date_id`` columns.
        pre_geohash_df_dict: Mapping from raw geohash to integer id.
        pre_number_date: Integer id given to the first date of this file.

    Returns:
        tuple: ``(df, edge_df)``, the re-indexed node and edge frames.
    """
    df = pd.read_csv(file_path, encoding='utf-8')
    edge_df = pd.read_csv(edge_path, encoding='utf-8')

    geohash_df_dict = pre_geohash_df_dict
    # unique() keeps first-appearance order; missing dates are skipped so they
    # map to NaN below and their rows are removed by dropna().
    date_df_dict = {
        key: pre_number_date + offset
        for offset, key in enumerate(df["date_id"].dropna().unique().tolist())
    }

    # Replace the raw identifiers by their integer ids.
    df["geohash_id"] = df["geohash_id"].map(geohash_df_dict)
    df["date_id"] = df["date_id"].map(date_df_dict)
    edge_df["geohash6_point1"] = edge_df["geohash6_point1"].map(geohash_df_dict)
    edge_df["geohash6_point2"] = edge_df["geohash6_point2"].map(geohash_df_dict)
    edge_df["date_id"] = edge_df["date_id"].map(date_df_dict)

    # A geohash or date missing from the mappings yields NaN and makes the
    # column float64. Drop those rows and restore int64 on every index column
    # so the values stay usable as tensor / array indices and group keys.
    df = df.dropna().astype({"geohash_id": "int64", "date_id": "int64"})
    edge_df = edge_df.dropna().astype(
        {"geohash6_point1": "int64", "geohash6_point2": "int64", "date_id": "int64"}
    )

    return df, edge_df

In [4]:
# Load the training nodes and edges; keep the geohash mapping and the number of
# indexed dates so the test sets can be encoded consistently afterwards.
train_node_data, train_edge_data, geohash_df_dict, number_date = process_data(
    file_path='data/train_90.csv',
    edge_path='data/edge_90.csv',
)

In [5]:
# Load both test sets with the training geohash mapping.
# Set A's dates are numbered from number_date onwards; set B's numbering starts
# 4 ids later (presumably the number of dates in set A; verify if the files change).
test_node_data_A, test_edge_data_A = process_test_data(
    file_path='data/node_test_4_A.csv',
    edge_path='data/edge_test_4_A.csv',
    pre_geohash_df_dict=geohash_df_dict,
    pre_number_date=number_date,
)
test_node_data_B, test_edge_data_B = process_test_data(
    file_path='data/node_test_3_B.csv',
    edge_path='data/edge_test_3_B.csv',
    pre_geohash_df_dict=geohash_df_dict,
    pre_number_date=number_date+4,
)

In [6]:
# Keep only the columns of train_node_data that hold at least one non-zero value.
train_node_data = train_node_data.loc[:, train_node_data.ne(0).any(axis=0)]

In [7]:
# Keep only the columns of each test node table that hold at least one non-zero value.
# NOTE(review): the filter is computed per table, so the surviving columns are only
# the same as in the training table if the same columns happen to be all-zero.
test_node_data_A = test_node_data_A.loc[:, test_node_data_A.ne(0).any(axis=0)]
test_node_data_B = test_node_data_B.loc[:, test_node_data_B.ne(0).any(axis=0)]

In [8]:
# Node ids 0..1139 as float32 (stored on every graph as 'node_id' for the later LSTM input).
train_node_id = torch.arange(0, 1140, dtype=torch.float32)
# Per-date, per-node accumulator for the two edge features: (dates, nodes, 2).
train_temp = torch.zeros((90, 1140, 2), dtype=torch.float32)

# Sum the F_1 / F_2 values of every edge onto both of its endpoints for the
# edge's date. The indices are converted to long explicitly so the result does
# not depend on the dtype pandas happens to give the columns, and
# index_put_(accumulate=True) adds up repeated (date, node) pairs in one call
# instead of a Python loop over every edge row.
train_edge_dates = torch.tensor(train_edge_data['date_id'].to_numpy(), dtype=torch.long)
train_edge_values = torch.tensor(train_edge_data[['F_1', 'F_2']].to_numpy(), dtype=torch.float32)
for endpoint_col in ('geohash6_point1', 'geohash6_point2'):
    endpoint_nodes = torch.tensor(train_edge_data[endpoint_col].to_numpy(), dtype=torch.long)
    train_temp.index_put_((train_edge_dates, endpoint_nodes), train_edge_values, accumulate=True)

In [9]:
# Node ids 0..1139 as float32 (stored on every graph as 'node_id' for the later LSTM input).
test_node_id_A = torch.arange(0, 1140, dtype=torch.float32)
# Per-date, per-node accumulator for the two edge features: (dates, nodes, 2).
test_temp_A = torch.zeros((4, 1140, 2), dtype=torch.float32)

# Sum the F_1 / F_2 values of every edge onto both of its endpoints for the
# edge's date. Set A's date ids start at 90, hence the shift to a 0-based slot.
# The indices are converted to long explicitly so the result does not depend on
# the dtype pandas happens to give the columns, and index_put_(accumulate=True)
# adds up repeated (date, node) pairs in one call instead of a Python loop.
edge_dates_A = torch.tensor(test_edge_data_A['date_id'].to_numpy(), dtype=torch.long) - 90
edge_values_A = torch.tensor(test_edge_data_A[['F_1', 'F_2']].to_numpy(), dtype=torch.float32)
for endpoint_col in ('geohash6_point1', 'geohash6_point2'):
    endpoint_nodes = torch.tensor(test_edge_data_A[endpoint_col].to_numpy(), dtype=torch.long)
    test_temp_A.index_put_((edge_dates_A, endpoint_nodes), edge_values_A, accumulate=True)

In [10]:
# Node ids 0..1139 as float32 (stored on every graph as 'node_id' for the later LSTM input).
test_node_id_B = torch.arange(0, 1140, dtype=torch.float32)
# Per-date, per-node accumulator for the two edge features: (dates, nodes, 2).
test_temp_B = torch.zeros((3, 1140, 2), dtype=torch.float32)

# Sum the F_1 / F_2 values of every edge onto both of its endpoints for the
# edge's date. Set B's date ids start at 94, hence the shift to a 0-based slot.
# The indices are converted to long explicitly so the result does not depend on
# the dtype pandas happens to give the columns, and index_put_(accumulate=True)
# adds up repeated (date, node) pairs in one call instead of a Python loop.
edge_dates_B = torch.tensor(test_edge_data_B['date_id'].to_numpy(), dtype=torch.long) - 94
edge_values_B = torch.tensor(test_edge_data_B[['F_1', 'F_2']].to_numpy(), dtype=torch.float32)
for endpoint_col in ('geohash6_point1', 'geohash6_point2'):
    endpoint_nodes = torch.tensor(test_edge_data_B[endpoint_col].to_numpy(), dtype=torch.long)
    test_temp_B.index_put_((edge_dates_B, endpoint_nodes), edge_values_B, accumulate=True)

In [11]:
# Z-score the node columns at positions 2..34 (third to thirty-fifth column).
train_node_block = train_node_data.iloc[:, 2:35]
train_node_data.iloc[:, 2:35] = (train_node_block - train_node_block.mean()) / train_node_block.std()

# Z-score the edge columns at positions 2..3 (third and fourth column).
train_edge_block = train_edge_data.iloc[:, 2:4]
train_edge_data.iloc[:, 2:4] = (train_edge_block - train_edge_block.mean()) / train_edge_block.std()

# Z-score the aggregated edge-feature tensor with its global mean and std.
train_temp = train_temp.sub(train_temp.mean()).div(train_temp.std())

# Preview the first five rows of the node table, the edge table and the tensor.
for preview in (train_node_data.head(), train_edge_data.head(), train_temp[0:5]):
    print(preview)

   geohash_id  date_id       F_1       F_2       F_3       F_4       F_5  \
0           0        0 -0.704497 -0.689070 -0.786935 -0.721294 -0.741659   
1           0        1 -0.906495 -0.900487 -0.946404 -0.841177 -0.853436   
2           0        2 -0.917717 -0.922956 -0.921389 -0.849374 -0.850360   
3           0        3 -0.923838 -0.929084 -0.942235 -0.834004 -0.847284   
4           0        4 -0.744285 -0.758521 -0.811949 -0.743836 -0.759092   

        F_6       F_7       F_8  ...      F_28      F_29      F_30      F_31  \
0 -0.788201  1.572960  2.471032  ... -0.087031  0.754380 -0.805618 -0.742004   
1 -0.905471 -0.350864  1.017639  ... -0.617727 -0.122875 -0.689629 -0.828154   
2 -0.912548 -0.313596  0.818213  ... -0.263930  0.372965 -0.805618 -0.841580   
3 -0.904460 -0.977366  0.014466  ...  0.001418 -0.542432 -0.341665 -0.840462   
4 -0.812464  1.146898  1.477930  ...  0.089867  0.449248 -0.573641 -0.766618   

       F_32      F_33      F_34      F_35  active_index  consu

In [12]:
# Z-score the node columns at positions 2..34 (third to thirty-fifth column).
node_block_A = test_node_data_A.iloc[:, 2:35]
test_node_data_A.iloc[:, 2:35] = (node_block_A - node_block_A.mean()) / node_block_A.std()

# Z-score the edge columns at positions 2..3 (third and fourth column).
edge_block_A = test_edge_data_A.iloc[:, 2:4]
test_edge_data_A.iloc[:, 2:4] = (edge_block_A - edge_block_A.mean()) / edge_block_A.std()

# Z-score the aggregated edge-feature tensor with its global mean and std.
test_temp_A = test_temp_A.sub(test_temp_A.mean()).div(test_temp_A.std())

# Preview the first five rows of the node table, the edge table and the tensor.
for preview in (test_node_data_A.head(), test_edge_data_A.head(), test_temp_A[0:5]):
    print(preview)

   geohash_id  date_id       F_1       F_2       F_3       F_4       F_5  \
0           0       90 -0.950089 -0.947251 -0.992331 -0.902908 -0.903096   
1           0       91 -0.901400 -0.884535 -0.916322 -0.853772 -0.848743   
2           0       92 -0.861985 -0.838852 -0.918473 -0.842256 -0.824246   
3           0       93 -0.875124 -0.851241 -0.926361 -0.837650 -0.843384   
4           1       90 -1.126298 -1.148563 -1.186656 -1.034960 -1.049312   

        F_6       F_7       F_8  ...      F_25      F_26      F_28      F_29  \
0 -0.889930 -0.051065  0.655327  ... -0.690347 -1.335713 -0.939599  0.434156   
1 -0.839360 -0.691284  0.375309  ... -0.587756 -0.848956 -0.259178 -0.296070   
2 -0.810703  0.063665  1.019138  ... -0.570658 -1.370481 -0.372582 -0.496882   
3 -0.822503 -0.148797  0.915192  ... -0.382575  0.680849 -0.145775 -0.460371   
4 -1.058496 -1.624702 -1.002504  ... -1.528173 -2.761213  8.964301 -1.902567   

       F_30      F_31      F_32      F_33      F_34      F_35 

In [13]:
# Z-score the node columns at positions 2..34 (third to thirty-fifth column).
test_node_data_B.iloc[:, 2:35] = (test_node_data_B.iloc[:, 2:35] - test_node_data_B.iloc[:, 2:35].mean()) / test_node_data_B.iloc[:, 2:35].std()

# Z-score the edge columns at positions 2..3 (third and fourth column).
# The values being centred must come from set B itself: taking them from
# test_edge_data_A would align two unrelated tables on their row index and
# overwrite B's edge features with A's (already standardized) values or NaN.
test_edge_data_B.iloc[:, 2:4] = (test_edge_data_B.iloc[:, 2:4] - test_edge_data_B.iloc[:, 2:4].mean()) / test_edge_data_B.iloc[:, 2:4].std()

# Z-score the aggregated edge-feature tensor with its global mean and std.
test_temp_B = (test_temp_B - test_temp_B.mean()) / test_temp_B.std()

# Print the first five rows of the node table.
print(test_node_data_B.head())

# Print the first five rows of the edge table.
print(test_edge_data_B.head())

# Print the first (up to five) date slices of the aggregated tensor.
print(test_temp_B[0:5])

   geohash_id  date_id       F_1       F_2       F_3       F_4       F_5  \
0           0       94 -0.955704 -0.966840 -1.016919 -0.950525 -0.944877   
1           0       95 -0.972266 -0.994995 -1.047283 -0.963832 -0.962476   
2           0       96 -0.811515 -0.810112 -0.908593 -0.888421 -0.892958   
3           1       94 -1.109635 -1.160170 -1.198283 -1.061423 -1.086552   
4           1       95 -1.128145 -1.182694 -1.218800 -1.086264 -1.096231   

        F_6       F_7       F_8  ...      F_25      F_26      F_28      F_29  \
0 -0.946294 -0.354462  0.536914  ... -0.526737  0.116775 -0.627529 -0.671148   
1 -0.971863 -0.200447  0.491582  ... -0.355253 -0.793989 -0.308267  0.491408   
2 -0.861719  0.985563  1.803434  ... -0.074643 -0.702913  0.117415 -0.054641   
3 -1.097743 -1.334811 -1.028427  ... -0.511148 -0.490401  1.855620 -0.424545   
4 -1.110527 -1.469459 -1.280065  ... -1.680359 -0.551119  1.536358 -1.816090   

       F_30      F_31      F_32      F_33      F_34      F_35 

In [14]:
# Split the training node and edge tables into one group per date id.
train_date_groups = train_node_data.groupby(by='date_id')
train_date_edge_groups = train_edge_data.groupby(by='date_id')

In [15]:
# Split test set A's node and edge tables into one group per date id.
test_date_groups_A = test_node_data_A.groupby(by='date_id')
test_date_edge_groups_A = test_edge_data_A.groupby(by='date_id')

In [16]:
# Split test set B's node and edge tables into one group per date id.
test_date_groups_B = test_node_data_B.groupby(by='date_id')
test_date_edge_groups_B = test_edge_data_B.groupby(by='date_id')

In [17]:
# One DGL graph per training date, in the order the date groups are iterated.
train_dgl_graphs = []

for date, group in train_date_groups:
    # Row i of the feature matrix becomes node i of the graph, while the edge
    # table refers to nodes by their integer geohash id. Sort explicitly so the
    # two always agree instead of relying on the row order of the CSV.
    group = group.sort_values('geohash_id', kind='stable')

    # Columns 2..34 are the node features, columns 35..36 the two labels.
    nodes_feature = torch.tensor(group.iloc[:, 2:35].values)
    nodes_labels = torch.tensor(group.iloc[:, 35:37].values)
    # Edge rows: columns 0/1 are the endpoints, columns 2/3 the edge features.
    edges_feature = torch.tensor(train_date_edge_groups.get_group(date).values)

    # Append this date's aggregated edge features to every node's features.
    nodes_feature = torch.cat([nodes_feature, train_temp[date]], dim=1)

    # Build the graph with every edge in both directions: all forward edges
    # first, then all reversed edges (the edge features below follow that order).
    src = edges_feature[:, 0].long()
    dst = edges_feature[:, 1].long()
    g = dgl.graph(
        (torch.cat([src, dst]), torch.cat([dst, src])),
        num_nodes=nodes_feature.shape[0],
    )

    # Node features, labels and ids.
    g.ndata['feat'] = nodes_feature
    g.ndata['label'] = nodes_labels
    g.ndata['node_id'] = train_node_id

    # Edge features, repeated once for the reversed copies of the edges.
    g.edata['feat'] = torch.cat([edges_feature[:, 2:4], edges_feature[:, 2:4]], dim=0)

    train_dgl_graphs.append(g)



In [18]:
# One DGL graph per date of test set A, in the order the date groups are iterated.
test_dgl_graphs_A = []

for date, group in test_date_groups_A:
    # Row i of the feature matrix becomes node i of the graph, while the edge
    # table refers to nodes by their integer geohash id. Sort explicitly so the
    # two always agree instead of relying on the row order of the CSV.
    group = group.sort_values('geohash_id', kind='stable')

    # Every column from position 2 onwards is a node feature (no label columns are read here).
    nodes_feature = torch.tensor(group.iloc[:, 2:].values)
    # Edge rows: columns 0/1 are the endpoints, columns 2/3 the edge features.
    edges_feature = torch.tensor(test_date_edge_groups_A.get_group(date).values)

    # Append this date's aggregated edge features (set A's date ids start at 90).
    nodes_feature = torch.cat([nodes_feature, test_temp_A[date - 90]], dim=1)

    # Build the graph with every edge in both directions: all forward edges
    # first, then all reversed edges (the edge features below follow that order).
    src = edges_feature[:, 0].long()
    dst = edges_feature[:, 1].long()
    g = dgl.graph(
        (torch.cat([src, dst]), torch.cat([dst, src])),
        num_nodes=nodes_feature.shape[0],
    )

    # Node features and ids.
    g.ndata['feat'] = nodes_feature
    g.ndata['node_id'] = test_node_id_A

    # Edge features, repeated once for the reversed copies of the edges.
    g.edata['feat'] = torch.cat([edges_feature[:, 2:4], edges_feature[:, 2:4]], dim=0)

    test_dgl_graphs_A.append(g)

In [19]:
# One DGL graph per date of test set B, in the order the date groups are iterated.
test_dgl_graphs_B = []

for date, group in test_date_groups_B:
    # Row i of the feature matrix becomes node i of the graph, while the edge
    # table refers to nodes by their integer geohash id. Sort explicitly so the
    # two always agree instead of relying on the row order of the CSV.
    group = group.sort_values('geohash_id', kind='stable')

    # Every column from position 2 onwards is a node feature (no label columns are read here).
    nodes_feature = torch.tensor(group.iloc[:, 2:].values)
    # Edge rows: columns 0/1 are the endpoints, columns 2/3 the edge features.
    edges_feature = torch.tensor(test_date_edge_groups_B.get_group(date).values)

    # Append this date's aggregated edge features (set B's date ids start at 94).
    nodes_feature = torch.cat([nodes_feature, test_temp_B[date - 94]], dim=1)

    # Build the graph with every edge in both directions: all forward edges
    # first, then all reversed edges (the edge features below follow that order).
    src = edges_feature[:, 0].long()
    dst = edges_feature[:, 1].long()
    g = dgl.graph(
        (torch.cat([src, dst]), torch.cat([dst, src])),
        num_nodes=nodes_feature.shape[0],
    )

    # Node features and ids.
    g.ndata['feat'] = nodes_feature
    g.ndata['node_id'] = test_node_id_B

    # Edge features, repeated once for the reversed copies of the edges.
    g.edata['feat'] = torch.cat([edges_feature[:, 2:4], edges_feature[:, 2:4]], dim=0)

    test_dgl_graphs_B.append(g)

In [20]:
# Display the list of DGL graphs built for the training set.
train_dgl_graphs

[Graph(num_nodes=1140, num_edges=23418,
       ndata_schemes={'feat': Scheme(shape=(35,), dtype=torch.float64), 'label': Scheme(shape=(2,), dtype=torch.float64), 'node_id': Scheme(shape=(), dtype=torch.float32)}
       edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float64)}),
 Graph(num_nodes=1140, num_edges=23348,
       ndata_schemes={'feat': Scheme(shape=(35,), dtype=torch.float64), 'label': Scheme(shape=(2,), dtype=torch.float64), 'node_id': Scheme(shape=(), dtype=torch.float32)}
       edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float64)}),
 Graph(num_nodes=1140, num_edges=21622,
       ndata_schemes={'feat': Scheme(shape=(35,), dtype=torch.float64), 'label': Scheme(shape=(2,), dtype=torch.float64), 'node_id': Scheme(shape=(), dtype=torch.float32)}
       edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float64)}),
 Graph(num_nodes=1140, num_edges=21490,
       ndata_schemes={'feat': Scheme(shape=(35,), dtype=torch.float64), 'label': Scheme(shape=(2,), dtype

In [21]:
# Display the list of DGL graphs built for test set A.
test_dgl_graphs_A

[Graph(num_nodes=1140, num_edges=42036,
       ndata_schemes={'feat': Scheme(shape=(35,), dtype=torch.float64), 'node_id': Scheme(shape=(), dtype=torch.float32)}
       edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float64)}),
 Graph(num_nodes=1140, num_edges=42094,
       ndata_schemes={'feat': Scheme(shape=(35,), dtype=torch.float64), 'node_id': Scheme(shape=(), dtype=torch.float32)}
       edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float64)}),
 Graph(num_nodes=1140, num_edges=41834,
       ndata_schemes={'feat': Scheme(shape=(35,), dtype=torch.float64), 'node_id': Scheme(shape=(), dtype=torch.float32)}
       edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float64)}),
 Graph(num_nodes=1140, num_edges=42096,
       ndata_schemes={'feat': Scheme(shape=(35,), dtype=torch.float64), 'node_id': Scheme(shape=(), dtype=torch.float32)}
       edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float64)})]

In [22]:
# Display the list of DGL graphs built for test set B.
test_dgl_graphs_B

[Graph(num_nodes=1140, num_edges=42218,
       ndata_schemes={'feat': Scheme(shape=(35,), dtype=torch.float64), 'node_id': Scheme(shape=(), dtype=torch.float32)}
       edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float64)}),
 Graph(num_nodes=1140, num_edges=41074,
       ndata_schemes={'feat': Scheme(shape=(35,), dtype=torch.float64), 'node_id': Scheme(shape=(), dtype=torch.float32)}
       edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float64)}),
 Graph(num_nodes=1140, num_edges=42164,
       ndata_schemes={'feat': Scheme(shape=(35,), dtype=torch.float64), 'node_id': Scheme(shape=(), dtype=torch.float32)}
       edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float64)})]

In [23]:
# Write the training graphs to a single DGL binary file.
dgl.save_graphs(
    filename='dgl_data/dgl_graphs_train.bin',
    g_list=train_dgl_graphs,
)

In [24]:
# Write test set A's graphs to a single DGL binary file.
dgl.save_graphs(
    filename='dgl_data/dgl_graphs_test_A.bin',
    g_list=test_dgl_graphs_A,
)

In [25]:
# Write test set B's graphs to a single DGL binary file.
dgl.save_graphs(
    filename='dgl_data/dgl_graphs_test_B.bin',
    g_list=test_dgl_graphs_B,
)

In [26]:
# Read the training graphs back from disk; the second return value is not used.
train_dgl_graphs, _ = dgl.load_graphs(
    filename='dgl_data/dgl_graphs_train.bin',
)

In [27]:
# Read test set A's graphs back from disk; the second return value is not used.
test_dgl_graphs_A, _ = dgl.load_graphs(
    filename='dgl_data/dgl_graphs_test_A.bin',
)

In [28]:
# Read test set B's graphs back from disk; the second return value is not used.
test_dgl_graphs_B, _ = dgl.load_graphs(
    filename='dgl_data/dgl_graphs_test_B.bin',
)

In [29]:
# Inspect the first training graph: its summary, then node 0's features and labels.
first_train_graph = train_dgl_graphs[0]
print(first_train_graph)
print(first_train_graph.ndata['feat'][0])
print(first_train_graph.ndata['label'][0])

# Inspect the first graph of each test set: its summary and node 0's features.
for first_test_graph in (test_dgl_graphs_A[0], test_dgl_graphs_B[0]):
    print(first_test_graph)
    print(first_test_graph.ndata['feat'][0])

Graph(num_nodes=1140, num_edges=23418,
      ndata_schemes={'feat': Scheme(shape=(35,), dtype=torch.float64), 'label': Scheme(shape=(2,), dtype=torch.float64), 'node_id': Scheme(shape=(), dtype=torch.float32)}
      edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float64)})
tensor([-0.7045, -0.6891, -0.7869, -0.7213, -0.7417, -0.7882,  1.5730,  2.4710,
        -0.0699,  1.3873,  1.1090,  0.5086,  2.1474,  2.6670, -0.0710,  0.6079,
         0.2834,  0.1083,  0.4303, -0.7378, -0.7070, -0.7680,  0.4595,  1.1765,
        -1.3085, -0.0870,  0.7544, -0.8056, -0.7420, -0.8272, -0.6853, -0.7076,
        -0.9382,  0.8628, -0.2868], dtype=torch.float64)
tensor([69.3060, 63.7800], dtype=torch.float64)
Graph(num_nodes=1140, num_edges=42036,
      ndata_schemes={'feat': Scheme(shape=(35,), dtype=torch.float64), 'node_id': Scheme(shape=(), dtype=torch.float32)}
      edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float64)})
tensor([-0.9501, -0.9473, -0.9923, -0.9029, -0.9031, -0.8899, -