In [1]:
import pandas as pd
import dgl

In [2]:
def process_data(file_path, edge_path):
    # 读取数据
    df = pd.read_csv(file_path, encoding='utf-8')
    edge_df = pd.read_csv(edge_path, encoding='utf-8')

    # 初始化字典和计数器
    geohash_df_dict = {}
    date_df_dict = {}
    number_hash = 0
    number_date = 0

    # 为geohash_id创建映射
    for i in df["geohash_id"]:
        if i not in geohash_df_dict.keys():
            geohash_df_dict[i] = number_hash
            number_hash += 1

    # 为date_id创建映射
    for i in df["date_id"]:
        if i not in date_df_dict.keys():
            date_df_dict[i] = number_date
            number_date += 1
        
    # 将geohash_id和date_id替换为映射后的值
    df["geohash_id"] = df["geohash_id"].map(geohash_df_dict)
    df["date_id"] = df["date_id"].map(date_df_dict)
    edge_df["geohash6_point1"] = edge_df["geohash6_point1"].map(geohash_df_dict)
    edge_df["geohash6_point2"] = edge_df["geohash6_point2"].map(geohash_df_dict)
    edge_df["date_id"] = edge_df["date_id"].map(date_df_dict)
    
    # 去除为NaN的行
    df = df.dropna()
    edge_df = edge_df.dropna()
    
    # 将edge_df中的geohash6_point1和geohash6_point2列的数据转换为int64
    edge_df["geohash6_point1"] = edge_df["geohash6_point1"].astype("int64")
    edge_df["geohash6_point2"] = edge_df["geohash6_point2"].astype("int64")
    
    return df, edge_df, geohash_df_dict, date_df_dict

In [3]:
def process_test_data(file_path, edge_path, pre_geohash_df_dict, pre_number_date):
    # 读取数据
    df = pd.read_csv(file_path, encoding='utf-8')
    edge_df = pd.read_csv(edge_path, encoding='utf-8')

    # 初始化字典和计数器
    geohash_df_dict = pre_geohash_df_dict
    date_df_dict = {}
    number_date = pre_number_date

    # 为date_id创建映射
    for i in df["date_id"]:
        if i not in date_df_dict.keys():
            date_df_dict[i] = number_date
            number_date += 1
        
    # 将geohash_id和date_id替换为映射后的值
    df["geohash_id"] = df["geohash_id"].map(geohash_df_dict)
    df["date_id"] = df["date_id"].map(date_df_dict)
    edge_df["geohash6_point1"] = edge_df["geohash6_point1"].map(geohash_df_dict)
    edge_df["geohash6_point2"] = edge_df["geohash6_point2"].map(geohash_df_dict)
    edge_df["date_id"] = edge_df["date_id"].map(date_df_dict)
    
    # 将edge_df中的geohash6_point1和geohash6_point2列的数据转换为int64
    edge_df["geohash6_point1"] = edge_df["geohash6_point1"].astype("int64")
    edge_df["geohash6_point2"] = edge_df["geohash6_point2"].astype("int64")
    
    return df, edge_df

In [4]:
# 读取数据
node_data, edge_data , geohash_df_dict, number_date= process_data('data/train_90.csv', 'data/edge_90.csv')

# 修改边数据，使得小的geohash_id在前，大的在后
# for i in range(len(edge_data)):
#     if edge_data.iloc[i, 0] > edge_data.iloc[i, 1]:
#         edge_data.iloc[i, 0], edge_data.iloc[i, 1] = edge_data.iloc[i, 1], edge_data.iloc[i, 0]
        
# 打印edge_data的前5行
# print(edge_data.head())

In [None]:
# 读取测试数据
node_data, edge_data = process_test_data('data/node_test_4_A.csv', 'data/edge_test_4_A.csv', geohash_df_dict, number_date)

In [None]:
# 将每一行的数据处理为一个tuple
# edges = list(zip(edge_data['geohash6_point1'], edge_data['geohash6_point2']))

# 假设edges是无向边，去除重复和反向的边
# edges = list(set([(a, b) if a < b else (b, a) for a, b in edges]))

# 打印所有边
# print(len(edges))

In [5]:
# 打印前5行
print(node_data.head())
print(edge_data.head())

# 打印每一列的数据类型
print(node_data.dtypes)
print(edge_data.dtypes)

   geohash_id  date_id    F_1    F_2    F_3    F_4    F_5    F_6    F_7  \
0           0        0 -0.711 -0.696 -0.794 -0.727 -0.747 -0.792  1.539   
1           0        1 -0.909 -0.903 -0.947 -0.844 -0.856 -0.908 -0.371   
2           0        2 -0.920 -0.925 -0.923 -0.852 -0.853 -0.915 -0.334   
3           0        3 -0.926 -0.931 -0.943 -0.837 -0.850 -0.907 -0.993   
4           0        4 -0.750 -0.764 -0.818 -0.749 -0.764 -0.816  1.116   

     F_8  ...   F_28   F_29   F_30   F_31   F_32   F_33   F_34   F_35  \
0  2.433  ...  0.073  0.344  0.006 -0.446 -0.502 -0.456 -0.457 -0.830   
1  0.990  ...  0.055  0.298  0.007 -0.523 -0.558 -0.533  0.113 -0.887   
2  0.792  ...  0.067  0.324  0.006 -0.535 -0.564 -0.540  0.367 -1.021   
3 -0.006  ...  0.076  0.276  0.010 -0.534 -0.554 -0.521  0.550 -0.211   
4  1.447  ...  0.079  0.328  0.008 -0.468 -0.500 -0.419 -0.236  0.644   

   active_index  consume_index  
0        69.306          63.78  
1        68.881          61.62  
2        69

In [6]:
# 删除node_data中全为0的列
node_data = node_data.loc[:, (node_data != 0).any(axis=0)]

# 打印node_data的前5行
print(node_data.head())

   geohash_id  date_id    F_1    F_2    F_3    F_4    F_5    F_6    F_7  \
0           0        0 -0.711 -0.696 -0.794 -0.727 -0.747 -0.792  1.539   
1           0        1 -0.909 -0.903 -0.947 -0.844 -0.856 -0.908 -0.371   
2           0        2 -0.920 -0.925 -0.923 -0.852 -0.853 -0.915 -0.334   
3           0        3 -0.926 -0.931 -0.943 -0.837 -0.850 -0.907 -0.993   
4           0        4 -0.750 -0.764 -0.818 -0.749 -0.764 -0.816  1.116   

     F_8  ...   F_28   F_29   F_30   F_31   F_32   F_33   F_34   F_35  \
0  2.433  ...  0.073  0.344  0.006 -0.446 -0.502 -0.456 -0.457 -0.830   
1  0.990  ...  0.055  0.298  0.007 -0.523 -0.558 -0.533  0.113 -0.887   
2  0.792  ...  0.067  0.324  0.006 -0.535 -0.564 -0.540  0.367 -1.021   
3 -0.006  ...  0.076  0.276  0.010 -0.534 -0.554 -0.521  0.550 -0.211   
4  1.447  ...  0.079  0.328  0.008 -0.468 -0.500 -0.419 -0.236  0.644   

   active_index  consume_index  
0        69.306          63.78  
1        68.881          61.62  
2        69

In [7]:
# 对node_data的第三列到第三十五列进行标准化
node_data.iloc[:, 2:35] = (node_data.iloc[:, 2:35] - node_data.iloc[:, 2:35].mean()) / node_data.iloc[:, 2:35].std()

# 对edge_data的第三列到第四列进行标准化
edge_data.iloc[:, 2:4] = (edge_data.iloc[:, 2:4] - edge_data.iloc[:, 2:4].mean()) / edge_data.iloc[:, 2:4].std()

# 打印node_data的前5行
print(node_data.head())

# 打印edge_data的前5行
print(edge_data.head())

   geohash_id  date_id       F_1       F_2       F_3       F_4       F_5  \
0           0        0 -0.704497 -0.689070 -0.786935 -0.721294 -0.741659   
1           0        1 -0.906495 -0.900487 -0.946404 -0.841177 -0.853436   
2           0        2 -0.917717 -0.922956 -0.921389 -0.849374 -0.850360   
3           0        3 -0.923838 -0.929084 -0.942235 -0.834004 -0.847284   
4           0        4 -0.744285 -0.758521 -0.811949 -0.743836 -0.759092   

        F_6       F_7       F_8  ...      F_28      F_29      F_30      F_31  \
0 -0.788201  1.572960  2.471032  ... -0.087031  0.754380 -0.805618 -0.742004   
1 -0.905471 -0.350864  1.017639  ... -0.617727 -0.122875 -0.689629 -0.828154   
2 -0.912548 -0.313596  0.818213  ... -0.263930  0.372965 -0.805618 -0.841580   
3 -0.904460 -0.977366  0.014466  ...  0.001418 -0.542432 -0.341665 -0.840462   
4 -0.812464  1.146898  1.477930  ...  0.089867  0.449248 -0.573641 -0.766618   

       F_32      F_33      F_34      F_35  active_index  consu

In [8]:
# 根据日期对节点和边进行分组
date_groups = node_data.groupby('date_id')
date_edge_groups = edge_data.groupby('date_id')

In [9]:
# 从data_groups中取出第一天的数据
day_1 = date_groups.get_group(0)
day_1_edge = date_edge_groups.get_group(0)

# 打印第一天的数据
print(day_1.iloc[:, 2:-2].values.shape)
print(day_1_edge.iloc[:, 2:4].values.shape)

(1140, 33)
(11709, 2)


In [10]:
import torch

In [20]:
# 初始化一个列表来存储DGL图
dgl_graphs = []

# 对于每个日期，创建一个DGL图
for date, group in date_groups:
    # 获取节点和边的数据
    nodes_feature = group.iloc[:, 2:-2].values
    nodes_labels = group.iloc[:, -2:].values
    # nodes_feature = group.iloc[:, 2:].values
    # edges_feature = date_edge_groups.get_group(date).values
    
    # 创建DGL图
    g = dgl.DGLGraph()
    
    # 将NumPy数组转换为PyTorch张量并放在与DGL图相同的设备上
    nodes_feature = torch.tensor(nodes_feature, device=g.device)
    nodes_labels = torch.tensor(nodes_labels, device=g.device)
    edges_feature = torch.tensor(edges_feature, device=g.device)
    
    # 添加节点
    g.add_nodes(nodes_feature.shape[0])
    
    # 添加边
    g.add_edges(edges_feature[:, 0].long(), edges_feature[:, 1].long())
    
    # 添加节点的特征和标签
    g.ndata['feat'] = nodes_feature
    g.ndata['label'] = nodes_labels
    
    # 添加边的特征
    g.edata['feat'] = edges_feature[:, 2:4]
    
    # 将图添加到列表中
    dgl_graphs.append(g)

  edges_feature = torch.tensor(edges_feature, device=g.device)


In [21]:
# 返回所有创建的DGL图
dgl_graphs

[Graph(num_nodes=1140, num_edges=11895,
       ndata_schemes={'feat': Scheme(shape=(33,), dtype=torch.float64), 'label': Scheme(shape=(2,), dtype=torch.float64)}
       edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float64)}),
 Graph(num_nodes=1140, num_edges=11895,
       ndata_schemes={'feat': Scheme(shape=(33,), dtype=torch.float64), 'label': Scheme(shape=(2,), dtype=torch.float64)}
       edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float64)}),
 Graph(num_nodes=1140, num_edges=11895,
       ndata_schemes={'feat': Scheme(shape=(33,), dtype=torch.float64), 'label': Scheme(shape=(2,), dtype=torch.float64)}
       edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float64)}),
 Graph(num_nodes=1140, num_edges=11895,
       ndata_schemes={'feat': Scheme(shape=(33,), dtype=torch.float64), 'label': Scheme(shape=(2,), dtype=torch.float64)}
       edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float64)}),
 Graph(num_nodes=1140, num_edges=11895,
       ndata_schemes

In [22]:
# 输出dgl_graphs的大小
print(len(dgl_graphs))

# 输出第一个DGL图的节点和边的数量
print(dgl_graphs[0].number_of_nodes())
print(dgl_graphs[0].number_of_edges())

# 输出第一个DGL图的节点和边的特征
print(dgl_graphs[0].ndata['feat'])
# print(dgl_graphs[0].ndata['label'])
print(dgl_graphs[0].edata['feat'])

print(len(dgl_graphs[0].nodes()))
print(len(dgl_graphs[0].edges()))

print(dgl_graphs[0].nodes())
print(dgl_graphs[0].edges())

90
1140
11895
tensor([[-0.7045, -0.6891, -0.7869,  ..., -0.6853, -0.7076, -0.9382],
        [-0.9973, -1.0373, -0.9923,  ..., -0.8487, -1.1982,  0.1718],
        [-1.0799, -1.1303, -1.1236,  ..., -0.8974, -2.0462, -1.4434],
        ...,
        [-0.8473, -0.8882, -0.9297,  ..., -0.7321, -0.9858, -1.0479],
        [ 0.6687,  0.7500,  0.2637,  ...,  0.7403, -1.7126,  1.6360],
        [ 0.9982,  0.8746,  0.7046,  ...,  0.9085, -0.7194,  0.0148]],
       dtype=torch.float64)
tensor([[-0.1245,  1.1427],
        [-0.1245,  0.6578],
        [-0.1245,  0.8656],
        ...,
        [-0.1226,  1.0041],
        [-0.1188,  0.7271],
        [-0.1054, -1.6975]], dtype=torch.float64)
1140
2
tensor([   0,    1,    2,  ..., 1137, 1138, 1139])
(tensor([166, 129,  48,  ..., 282, 456,  22]), tensor([422, 614, 602,  ..., 568, 650, 254]))


In [23]:
# 保存DGL图
dgl.save_graphs('dgl_data/dgl_graphs_train.bin', dgl_graphs)

In [None]:
# 保存DGL图
dgl.save_graphs('dgl_data/dgl_graphs_test.bin', dgl_graphs)

In [None]:
# 读取DGL图
dgl_graphs, _ = dgl.load_graphs('dgl_data/dgl_graphs_train.bin')

# 输出dgl_graphs的大小
print(len(dgl_graphs))

# 输出第一个DGL图的节点和边的数量
print(dgl_graphs[0].number_of_nodes())
print(dgl_graphs[0].number_of_edges())

# 输出第一个DGL图的节点和边的特征
print(dgl_graphs[0].ndata['feat'])
print(dgl_graphs[0].ndata['label'])
print(dgl_graphs[0].edata['feat'])