In [16]:
import os
import pickle
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm
import math

In [17]:
def build_global_POI_checkin_graph(df, exclude_user=None):
    """Build a directed, weighted POI transition graph from check-in records.

    Every distinct ``POI_id`` becomes a node whose attributes are
    ``checkin_cnt`` (number of check-in rows seen for that POI) and the
    ``latitude`` / ``longitude`` taken from the first row in which the POI
    appears. For each pair of consecutive rows of one user that share the same
    ``trajectory_id``, the edge ``previous POI -> current POI`` is created
    with ``weight=1`` or has its ``weight`` incremented, so the weight counts
    how often that transition occurs.

    Args:
        df: pandas DataFrame with the columns ``user_id``, ``POI_id``,
            ``trajectory_id``, ``latitude`` and ``longitude``. Rows are
            consumed in their existing order; they are presumably already
            sorted chronologically within each trajectory (TODO confirm
            against the data preparation step).
        exclude_user: optional user id whose check-ins are left out.

    Returns:
        networkx.DiGraph: the global check-in graph.
    """
    G = nx.DiGraph()
    users = list(set(df['user_id'].to_list()))
    if exclude_user in users:
        users.remove(exclude_user)

    for user_id in tqdm(users):
        user_df = df[df['user_id'] == user_id]

        # Iterate column-wise instead of with iterrows(): it is much faster and
        # every value keeps the dtype of its own column (iterrows() builds one
        # Series per row and may upcast, e.g. int ids to float).
        check_ins = zip(
            user_df['POI_id'].tolist(),
            user_df['trajectory_id'].tolist(),
            user_df['latitude'].tolist(),
            user_df['longitude'].tolist(),
        )

        # None marks "no previous check-in yet". A numeric sentinel such as 0
        # would collide with a legitimate POI id of 0 and silently drop every
        # edge that leaves that POI.
        previous_poi_id = None
        previous_traj_id = None
        for poi_id, traj_id, latitude, longitude in check_ins:
            # Node bookkeeping: create on first sight, otherwise count the visit.
            if G.has_node(poi_id):
                G.nodes[poi_id]['checkin_cnt'] += 1
            else:
                G.add_node(poi_id, checkin_cnt=1, latitude=latitude, longitude=longitude)

            # Edge bookkeeping: only link consecutive check-ins that belong to
            # the same trajectory; the first check-in of a sequence has no
            # predecessor.
            if previous_poi_id is not None and previous_traj_id == traj_id:
                if G.has_edge(previous_poi_id, poi_id):
                    G.edges[previous_poi_id, poi_id]['weight'] += 1
                else:
                    G.add_edge(previous_poi_id, poi_id, weight=1)

            previous_poi_id = poi_id
            previous_traj_id = traj_id

    return G

In [18]:
def save_graph_to_csv(G, dst_dir):
    """Write graph ``G`` to two CSV files inside ``dst_dir``.

    * ``graph_A.csv`` - dense adjacency matrix, rows/columns ordered like
      ``G.nodes()``, comma separated, no header.
    * ``graph_X.csv`` - one line per node with the header
      ``node_name/poi_id,checkin_cnt,latitude,longitude``.

    Args:
        G: graph whose nodes carry ``checkin_cnt``, ``latitude`` and
            ``longitude`` attributes.
        dst_dir: existing directory that receives both files.
    """
    # Adjacency matrix -> graph_A.csv
    adjacency_path = os.path.join(dst_dir, 'graph_A.csv')
    adjacency = nx.adjacency_matrix(G, nodelist=G.nodes())
    np.savetxt(adjacency_path, adjacency.todense(), delimiter=',')

    # Node attribute table -> graph_X.csv
    node_table_path = os.path.join(dst_dir, 'graph_X.csv')
    with open(node_table_path, 'w') as out:
        print('node_name/poi_id,checkin_cnt,latitude,longitude', file=out)
        # G.nodes.data() yields (node_name, attribute_dict) pairs.
        for poi_id, attrs in list(G.nodes.data()):
            visits = attrs['checkin_cnt']
            lat = attrs['latitude']
            lon = attrs['longitude']
            print(f'{poi_id},{visits},{lat},{lon}', file=out)

In [19]:
def print_graph_statisics(G):
    """Print basic statistics of graph ``G`` to stdout.

    Reports the node and edge counts, then the mean and the
    0/20/40/60/80/100th percentiles of the node degrees and of the edge
    ``weight`` attribute (labelled "Edge frequency").

    Args:
        G: graph exposing ``number_of_nodes()``, ``number_of_edges()``,
            ``degree`` (iterable of ``(node, degree)`` pairs) and ``adj``
            (mapping node -> neighbour -> attribute dict with ``'weight'``).
    """
    # Size of the graph
    print(f"Num of nodes: {G.number_of_nodes()}")
    print(f"Num of edges: {G.number_of_edges()}")

    # Degree distribution
    degrees = [degree for _, degree in G.degree]
    print(f"Node degree (mean): {np.mean(degrees):.2f}")
    for pct in range(0, 101, 20):
        print(f"Node degree ({pct} percentile): {np.percentile(degrees, pct)}")

    # Edge weight distribution, gathered neighbour by neighbour
    frequencies = [
        attrs['weight']
        for neighbours in G.adj.values()
        for attrs in neighbours.values()
    ]
    print(f"Edge frequency (mean): {np.mean(frequencies):.2f}")
    for pct in range(0, 101, 20):
        print(f"Edge frequency ({pct} percentile): {np.percentile(frequencies, pct)}")

In [20]:
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points in kilometres.

    Uses the haversine formula on a sphere of radius 6371 km.

    Args:
        lat1: latitude of the first point, in decimal degrees.
        lon1: longitude of the first point, in decimal degrees.
        lat2: latitude of the second point, in decimal degrees.
        lon2: longitude of the second point, in decimal degrees.

    Returns:
        float: distance along the sphere's surface, in kilometres.
    """
    R = 6371  # mean Earth radius in kilometres

    # Degree -> radian conversions are written as `x * pi / 180` on purpose so
    # that results stay bit-identical to previously generated distance files.
    d_lat = (lat2 - lat1) * math.pi / 180.0
    d_lon = (lon2 - lon1) * math.pi / 180.0
    sin_half_d_lat = math.sin(d_lat / 2)
    sin_half_d_lon = math.sin(d_lon / 2)

    a = sin_half_d_lat * sin_half_d_lat + math.cos(lat1 * math.pi / 180.0) * math.cos(lat2 * math.pi / 180.0) * sin_half_d_lon * sin_half_d_lon

    # Mathematically a <= 1, but for (near-)antipodal points rounding can push
    # it marginally above 1, which would make sqrt(1 - a) raise
    # "ValueError: math domain error". Cap it at 1.
    a = min(a, 1.0)

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

In [21]:
def save_distance_graph(G, dst_dir):
    """Compute all pairwise node distances and save them as ``graph_dist.csv``.

    Entry ``[i, j]`` of the saved matrix is the haversine (great-circle)
    distance in kilometres, on a sphere of radius 6371 km, between the i-th
    and j-th node of ``list(G.nodes())``, using each node's ``latitude`` and
    ``longitude`` attributes (decimal degrees). The file is comma separated
    and has no header.

    The matrix is filled one row at a time with vectorised NumPy operations:
    coordinates are read from the graph once, and each row costs a handful of
    array operations instead of one Python-level function call per pair.
    Extra memory beyond the result matrix stays proportional to the number of
    nodes.

    Args:
        G: graph whose nodes carry ``latitude`` and ``longitude`` attributes.
        dst_dir: existing directory that receives ``graph_dist.csv``.
    """
    earth_radius_km = 6371  # mean Earth radius in kilometres

    nodelist = list(G.nodes())
    num_nodes = len(nodelist)

    # Pull the coordinates out of the graph once (degrees).
    lat_deg = np.array([G.nodes[node]['latitude'] for node in nodelist], dtype=float)
    lon_deg = np.array([G.nodes[node]['longitude'] for node in nodelist], dtype=float)
    cos_lat = np.cos(np.radians(lat_deg))

    dist_matrix = np.zeros((num_nodes, num_nodes))
    for i in range(num_nodes):
        # Haversine from node i to every node at once.
        sin_half_d_lat = np.sin(np.radians(lat_deg - lat_deg[i]) / 2)
        sin_half_d_lon = np.sin(np.radians(lon_deg - lon_deg[i]) / 2)
        a = sin_half_d_lat * sin_half_d_lat + cos_lat[i] * cos_lat * sin_half_d_lon * sin_half_d_lon
        # Rounding can push `a` marginally above 1 for (near-)antipodal
        # points; cap it so sqrt(1 - a) stays defined.
        a = np.minimum(a, 1.0)
        dist_matrix[i, :] = earth_radius_km * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    np.savetxt(os.path.join(dst_dir, 'graph_dist.csv'), dist_matrix, delimiter=',')

In [7]:
# Dataset: Gowalla
dst_dir = r'./gowalla'
# Load the training CSV, build the POI check-in trajectory graph and save it
# (save_graph_to_csv writes graph_A.csv and graph_X.csv into dst_dir).
train_df = pd.read_csv(os.path.join(dst_dir, 'gowalla-ca_train.csv'))
print('Build global POI checkin graph -----------------------------------')
G = build_global_POI_checkin_graph(train_df)
save_graph_to_csv(G, dst_dir=dst_dir)
print('Build global POI Distance graph -----------------------------------')
# Compute and save the pairwise distance graph (graph_dist.csv in dst_dir).
save_distance_graph(G, dst_dir=dst_dir)

Build global POI checkin graph -----------------------------------


100%|██████████| 5557/5557 [00:19<00:00, 278.45it/s]


Build global POI Distance graph -----------------------------------


In [8]:
# Read the saved Gowalla adjacency matrix back (no header row) and show its shape.
gowalla_df = pd.read_csv('./gowalla/graph_A.csv', delimiter=',', header=None)  # read the file
gowalla_df.shape

(13508, 13508)

In [19]:
# Dataset: Foursquare NYC
dst_dir = r'./foursquare/nyc'
# Load the training CSV, build the POI check-in trajectory graph and save it
# (save_graph_to_csv writes graph_A.csv and graph_X.csv into dst_dir).
train_df = pd.read_csv(os.path.join(dst_dir, 'nyc_train.csv'))
print('Build global POI checkin graph -----------------------------------')
G = build_global_POI_checkin_graph(train_df)
save_graph_to_csv(G, dst_dir=dst_dir)
print('Build global POI Distance graph -----------------------------------')
# Compute and save the pairwise distance graph (graph_dist.csv in dst_dir).
save_distance_graph(G, dst_dir)

Build global POI checkin graph -----------------------------------


100%|██████████| 1072/1072 [00:07<00:00, 139.30it/s]


Build global POI Distance graph -----------------------------------


In [20]:
# Read the saved Foursquare NYC adjacency matrix back (no header row) and show its shape.
nyc_df = pd.read_csv('./foursquare/nyc/graph_A.csv', delimiter=',', header=None)  # read the file
nyc_df.shape

(5051, 5051)

In [21]:
# Dataset: Foursquare TKY
dst_dir = r'./foursquare/tky'
# Load the training CSV, build the POI check-in trajectory graph and save it
# (save_graph_to_csv writes graph_A.csv and graph_X.csv into dst_dir).
train_df = pd.read_csv(os.path.join(dst_dir, 'tky_train.csv'))
print('Build global POI checkin graph -----------------------------------')
G = build_global_POI_checkin_graph(train_df)
save_graph_to_csv(G, dst_dir=dst_dir)
print('Build global POI Distance graph -----------------------------------')
# Compute and save the pairwise distance graph (graph_dist.csv in dst_dir).
save_distance_graph(G, dst_dir)

Build global POI checkin graph -----------------------------------


100%|██████████| 2281/2281 [00:24<00:00, 92.97it/s] 


Build global POI Distance graph -----------------------------------


In [22]:
# Read the saved Foursquare TKY adjacency matrix back (no header row) and show its shape.
tky_df = pd.read_csv('./foursquare/tky/graph_A.csv', delimiter=',', header=None)  # read the file
tky_df.shape

(7831, 7831)

In [15]:
# Long-tail Foursquare datasets, processed in this order: TKY, then NYC.
# For each one: load the training CSV, build and save the POI check-in
# trajectory graph, then compute and save the pairwise distance graph.
longtail_datasets = [
    (r'./long-tail/foursquare/tky', 'tky_train_longtail.csv'),
    (r'./long-tail/foursquare/nyc', 'nyc_train_longtail.csv'),
]
for dst_dir, train_file in longtail_datasets:
    train_df = pd.read_csv(os.path.join(dst_dir, train_file))
    print('Build global POI checkin graph -----------------------------------')
    G = build_global_POI_checkin_graph(train_df)
    save_graph_to_csv(G, dst_dir=dst_dir)
    print('Build global POI Distance graph -----------------------------------')
    save_distance_graph(G, dst_dir)

Build global POI checkin graph -----------------------------------


100%|██████████| 1898/1898 [00:13<00:00, 139.65it/s]


Build global POI Distance graph -----------------------------------
Build global POI checkin graph -----------------------------------


100%|██████████| 959/959 [00:05<00:00, 171.29it/s]


Build global POI Distance graph -----------------------------------
Build global POI checkin graph -----------------------------------


100%|██████████| 3675/3675 [00:17<00:00, 207.88it/s]


Build global POI Distance graph -----------------------------------


In [22]:
# Dataset: Gowalla (long-tail variant, read from ./long-tail/gowalla)
dst_dir = r'./long-tail/gowalla'
# Load the training CSV, build the POI check-in trajectory graph and save it
# (save_graph_to_csv writes graph_A.csv and graph_X.csv into dst_dir).
train_df = pd.read_csv(os.path.join(dst_dir, 'gowalla-ca_train_longtail.csv'))
print('Build global POI checkin graph -----------------------------------')
G = build_global_POI_checkin_graph(train_df)
save_graph_to_csv(G, dst_dir=dst_dir)
print('Build global POI Distance graph -----------------------------------')
# Compute and save the pairwise distance graph (graph_dist.csv in dst_dir).
save_distance_graph(G, dst_dir=dst_dir)

Build global POI checkin graph -----------------------------------


100%|██████████| 3675/3675 [00:17<00:00, 216.17it/s]


Build global POI Distance graph -----------------------------------
