In [18]:
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict
import csv
from datetime import datetime, timezone, timedelta
from openlocationcode import openlocationcode as olc
import os
import json
import numpy as np

In [11]:
# 数据过滤

# 处理空间位置
def get_pluscode(latitude, longitude):
    if pd.isna(latitude) or pd.isna(longitude):
        return "INVALID"
    try:
        code = olc.encode(float(latitude), float(longitude))
        return code[:6]
    except:
        return "INVALID"


# 处理时间
def format_time(row):
    # 示例输入: "Mon Oct 19 20:06:23 +0800 2025" — 但 tz_offset 已单独给出
    # 忽略字符串中的时区，只用 tz_offset
    time_str = row['time']
    # 拆分出日期部分和年份
    parts = time_str.split()
    # 重建为 "Oct 19 20:06:23 2025"
    clean_time_str = f"{parts[1]} {parts[2]} {parts[3]} {parts[5]}"
    naive_dt = datetime.strptime(clean_time_str, "%b %d %H:%M:%S %Y")
    
    # 使用 tz_offset（分钟）创建时区
    tz = timezone(timedelta(minutes=int(row['tz_offset'])))
    localized_dt = naive_dt.replace(tzinfo=tz)
    return localized_dt.strftime("%Y-%m-%d %H:%M")

# 保持映射关系
def save_mapping(mapping, file_path):
    # 保存uid映射文件
    with open(file_path, "w", newline="") as uidfile:
        writer = csv.writer(uidfile)
        writer.writerow(["original_uid", "new_uid"])
        for original_uid, new_uid in mapping.items():
            writer.writerow([original_uid, new_uid])


def filter_data(datafold=None, min_user_interactions=10, min_poi_interactions=10):

    os.makedirs(f"{datafold}", exist_ok=True)

    input_file = f"{datafold}.txt"
    df = pd.read_csv(input_file, delimiter="\t", header=None,
                     names=['uid', 'pid', '_', 'category', 'latitude', 'longitude', 'tz_offset', 'time'])

    # 转换经纬度为数值
    df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
    df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')
    df = df.dropna(subset=['latitude', 'longitude'])

    # 过滤 POI
    poi_counts = df['pid'].value_counts()
    valid_pids = poi_counts[poi_counts >= min_poi_interactions].index
    df = df[df['pid'].isin(valid_pids)]

    # 过滤用户
    user_counts = df['uid'].value_counts()
    valid_uids = user_counts[user_counts >= min_user_interactions].index
    df = df[df['uid'].isin(valid_uids)].copy()

    # 生成 region
    df["region"] = df.apply(lambda row: get_pluscode(row['latitude'], row['longitude']), axis=1)

    # 映射 ID
    uid_map, pid_map, cid_map, region_map = {}, {}, {}, {}
    df['new_uid'] = df['uid'].apply(lambda x: uid_map.setdefault(x, len(uid_map)))
    df['new_pid'] = df['pid'].apply(lambda x: pid_map.setdefault(x, len(pid_map)))
    df['new_cid'] = df['category'].apply(lambda x: cid_map.setdefault(x, len(cid_map)))
    df['new_region'] = df['region'].apply(lambda x: region_map.setdefault(x, len(region_map)))
    df['formatted_time'] = df.apply(format_time, axis=1)

    # 保存
    output_file = f"{datafold}/{datafold}.csv"
    df[['new_uid', 'new_pid', 'new_cid', 'category', 'new_region', 'latitude', 'longitude', 'formatted_time']].to_csv(
        output_file, index=False, header=['uid', 'pid', 'cid', 'category', 'region', 'latitude', 'longitude', 'time']
    )

    save_mapping(uid_map, f"{datafold}/uidmap.csv")
    save_mapping(pid_map, f"{datafold}/pidmap.csv")
    save_mapping(cid_map, f"{datafold}/cidmap.csv")

    print("处理完成！")

# 测试
datafold = 'NYC'
filter_data(datafold, min_user_interactions=10, min_poi_interactions=10)

处理完成！


In [15]:
# 生成 poi_info.csv 文件
def poi_info(datafold):
    # 读取 data.csv 文件
    file_path = f'{datafold}/{datafold}.csv'
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"错误：{file_path} 文件未找到。")

    df = pd.read_csv(file_path)

    # 确保 time 列可解析
    df['time'] = pd.to_datetime(df['time'], errors='coerce')
    df = df.dropna(subset=['time'])  # 移除无法解析的时间

    poi_info_data = []

    for pid, group in df.groupby('pid'):
        # 取第一条记录的元信息（假设同一 POI 的 category/region/经纬度一致）
        row0 = group.iloc[0]
        category = row0['category']
        region = row0['region']
        latitude = row0['latitude']
        longitude = row0['longitude']

        # 统计每小时访问次数
        hours = group['time'].dt.hour
        hour_counts = hours.value_counts().to_dict()  # {hour: count}

        # 可选：只保留 count > 1 的小时（按你的需求）
        filtered_hour_counts = {int(h): int(c) for h, c in hour_counts.items() if c > 1}

        # 按访问次数降序排序
        sorted_hour_counts = dict(
            sorted(filtered_hour_counts.items(), key=lambda x: x[1], reverse=True)
        )

        poi_info_data.append({
            'pid': pid,
            'category': category,
            'region': region,
            'latitude': latitude,
            'longitude': longitude,
            'visit_time_and_count': sorted_hour_counts
        })

    # 创建 DataFrame 并保存
    poi_info_df = pd.DataFrame(poi_info_data)
    output_path = f'{datafold}/poi_info.csv'
    poi_info_df.to_csv(output_path, index=False)

    print(f"成功创建 {output_path}，共 {len(poi_info_df)} 个 POI")

# 测试
datafold = 'NYC'
poi_info(datafold)

成功创建 NYC/poi_info.csv，共 5135 个 POI


In [12]:
# 数据切分

def remove_users_pois_test(df_train, df_test):
    # 仅保留测试集中在训练集中出现的用户和POI
    valid_users = set(df_train['uid'].unique())
    valid_pois = set(df_train['pid'].unique())
    df_test = df_test[
        df_test['uid'].isin(valid_users) &
        df_test['pid'].isin(valid_pois)
    ]
    return df_test


def split_data(datafold, train_ratio=0.8):
    file_name = f"{datafold}/{datafold}.csv"
    df = pd.read_csv(file_name)

    # 确保包含所需列
    df = df[['uid', 'pid', 'cid', 'category', 'latitude', 'longitude', 'time']]
    
    # 转换时间并排序
    df['time'] = pd.to_datetime(df['time'])
    df = df.sort_values(by='time').reset_index(drop=True)

    # 按比例划分
    train_size = int(train_ratio * len(df))
    train_df = df.iloc[:train_size].copy()
    test_df = df.iloc[train_size:].copy()

    # 过滤测试集
    test_df = remove_users_pois_test(train_df, test_df)

    # 为测试用户保留完整历史（用于序列模型）
    test_uids = test_df['uid'].unique()
    expanded_test_df = df[df['uid'].isin(test_uids)].copy()

    # 保存
    train_df.to_csv(f'{datafold}/train_data.csv', index=False)
    expanded_test_df.to_csv(f'{datafold}/test_data.csv', index=False)

    print(f"Split done: {len(train_df)} train, {len(test_df)} test interactions.")

# 测试
split_data('NYC', train_ratio=0.8)


Split done: 118350 train, 28151 test interactions.


In [None]:
def generate_check_in_sequences(datafold, datafile):
    # 读取CSV文件到pandas DataFrame
    df = pd.read_csv(f"{datafold}/{datafile}.csv")
    # 确保时间列是datetime类型
    df['time'] = pd.to_datetime(df['time'])
    # 按uid分组并聚合数据
    def aggregate(group):
        group = group.sort_values('time')
        pid_list = group['pid'].tolist()
        return pd.Series({
            'pid_sequence': pid_list,
        })
    check_in_sequences = df.groupby('uid').apply(aggregate).reset_index()
    
    # 保存结果
    output_file = f"{datafold}/{datafile.split('_')[0]}_sequences.csv"
    check_in_sequences.to_csv(output_file, index=False)
    print(f"Check-in sequences saved to {output_file}")

# 测试
generate_check_in_sequences('NYC', 'train_data')
generate_check_in_sequences('NYC', 'test_data')

  check_in_sequences = df.groupby('uid').apply(aggregate).reset_index()


Check-in sequences saved to NYC/train_sequences.csv
Check-in sequences saved to NYC/test_sequences.csv


  check_in_sequences = df.groupby('uid').apply(aggregate).reset_index()


In [20]:
def build_poi_transition_matrix(data_path, save_path=None):
    # 1. 读取序列文件
    df = pd.read_csv(data_path)
    # 还原 pid_sequence 为 list
    df['pid_sequence'] = df['pid_sequence'].apply(json.loads)

    # 2. 收集所有 POI，确定 M
    all_pids = set()
    for seq in df['pid_sequence']:
        all_pids.update(seq)
    all_pids = sorted(all_pids)
    pid_to_idx = {pid: idx for idx, pid in enumerate(all_pids)}
    M = len(all_pids)
    print(f"Total POIs: {M}")

    # 3. 统计转移频次
    # 使用 defaultdict(int) 或直接用 numpy zeros
    transition_counts = defaultdict(int)

    for seq in df['pid_sequence']:
        for i in range(len(seq) - 1):
            from_pid = seq[i]
            to_pid = seq[i + 1]
            # 可选：跳过自环（如 0->0）？根据需求决定
            # if from_pid == to_pid: continue
            transition_counts[(from_pid, to_pid)] += 1

    # 4. 构建矩阵（稠密）
    A = np.zeros((M, M), dtype=np.int32)
    for (i, j), count in transition_counts.items():
        if i in pid_to_idx and j in pid_to_idx:
            A[pid_to_idx[i], pid_to_idx[j]] = count

    # 5. （可选）保存矩阵和映射
    if save_path:
        np.save(f"{save_path}_matrix.npy", A)
        print(f"Matrix saved to {save_path}_matrix.npy")

    return A, pid_to_idx, all_pids

# 使用示例
datafold = 'NYC'
A, pid_to_idx, all_pids = build_poi_transition_matrix(
    data_path=f"{datafold}/train_sequences.csv",
    save_path=f"{datafold}/poi_transition"
)
print("Transition matrix shape:", A.shape)
print("Example: POI 0 -> POI 0 count =", A[pid_to_idx[0], pid_to_idx[0]])

Total POIs: 5081
Matrix saved to NYC/poi_transition_matrix.npy
Transition matrix shape: (5081, 5081)
Example: POI 0 -> POI 0 count = 9


Example: POI 0 -> POI 1452 count = 1


In [None]:
import pandas as pd
from geopy.distance import geodesic
from multiprocessing import Pool
from tqdm import tqdm
import numpy as np

datafold = 'TKY'

# 加载 POI 数据
try:
    poi_df = pd.read_csv(f'{datafold}/poi_info.csv')
except FileNotFoundError:
    print("Error: poi_info.csv not found. Please ensure it's in the correct directory.")
    exit()

coords = poi_df[['latitude', 'longitude']].to_numpy()
poi_ids = poi_df['pid'].tolist()
num_pois = len(poi_df)

def calculate_distances(i):
    distances = {}
    for j in range(num_pois):
        if i == j:
            distances[poi_ids[j]] = 0.0
        else:
            distance = geodesic(coords[i], coords[j]).km
            distances[poi_ids[j]] = round(distance, 2)  # 保留两位小数
    return poi_ids[i], distances


num_processes = 128  # 根据你的 CPU 核心数调整
pool = Pool(processes=num_processes)
results = []
with tqdm(total=num_pois, desc="Calculating distances (parallel)", unit="POI") as pbar:
    for pid, distances in pool.imap_unordered(calculate_distances, range(num_pois)):
        results.append((pid, distances))
        pbar.update(1)

pool.close()
pool.join()

# 将结果转换为 DataFrame
distance_matrix_data = {}
for pid, distances in results:
    distance_matrix_data[pid] = distances

distance_matrix_df = pd.DataFrame.from_dict(distance_matrix_data, orient='index', columns=poi_ids)

# 确保列的顺序与索引一致
distance_matrix_df = distance_matrix_df.reindex(columns=poi_ids)

# 保存距离矩阵到 CSV 文件
distance_matrix_df.to_csv(f'{datafold}/distance.csv')

print("Distance matrix calculated and saved to distance.csv (parallel, 2 decimal places)") 

Calculating distances (parallel): 100%|██████████| 5135/5135 [01:12<00:00, 70.72POI/s] 


Distance matrix calculated and saved to distance.csv (parallel, 2 decimal places)


In [None]:
# import pandas as pd

# def find_nearby_pois(distance_file, target_pid, threshold):
#     """
#     查找距离指定 PID 小于阈值的其他 POI 的 PID。

#     Args:
#         distance_file (str): 存储 POI 之间距离的 CSV 文件路径。
#         target_pid (int): 要查找附近 POI 的目标 POI 的 PID。
#         threshold (float): 距离阈值（单位与 distance_file 中的距离单位一致）。

#     Returns:
#         list: 包含所有距离目标 PID 小于阈值的其他 POI 的 PID 列表。
#               如果目标 PID 不存在于文件中，则返回一个空列表。
#     """
#     try:
#         distance_df = pd.read_csv(distance_file, index_col=0)
#     except FileNotFoundError:
#         print(f"错误：文件 {distance_file} 未找到。")
#         return []

#     if target_pid not in distance_df.index:
#         print(f"警告：目标 PID {target_pid} 不存在于距离文件中。")
#         return []

#     # 获取目标 PID 对应的距离Series
#     target_distances = distance_df.loc[target_pid]

#     # 筛选出距离小于阈值的 PID，并排除目标 PID 本身
#     nearby_pois = target_distances[
#         (target_distances < threshold) & (distance_df.index != target_pid)
#     ].index.tolist()

#     return nearby_pois

# if __name__ == '__main__':
#     distance_file = 'NYC/distance.csv'  # 替换为你的 distance.csv 文件路径
#     target_pid = 0  # 指定你要查找附近 POI 的目标 PID
#     distance_threshold = 2.0  # 指定距离阈值，例如 1.0 公里

#     nearby_pids = find_nearby_pois(distance_file, target_pid, distance_threshold)

#     if nearby_pids:
#         print(f"距离 PID {target_pid} 小于 {distance_threshold} 的 POI PID 有{len(nearby_pids)}个:")
#         print(nearby_pids)
#     else:
#         print(f"没有找到距离 PID {target_pid} 小于 {distance_threshold} 的其他 POI。")

距离 PID 0 小于 2.0 的 POI PID 有723个:
['0', '8', '20', '28', '62', '76', '77', '83', '85', '88', '91', '112', '119', '120', '121', '132', '134', '146', '152', '153', '181', '186', '194', '197', '206', '207', '208', '220', '222', '234', '237', '247', '252', '253', '256', '259', '260', '277', '288', '289', '292', '310', '311', '318', '319', '339', '344', '356', '363', '365', '366', '370', '371', '375', '383', '385', '387', '391', '395', '400', '401', '410', '414', '417', '421', '427', '428', '458', '460', '470', '493', '494', '512', '515', '516', '530', '537', '545', '552', '563', '574', '580', '651', '652', '664', '674', '675', '686', '689', '708', '732', '760', '763', '789', '799', '810', '817', '821', '831', '845', '848', '851', '854', '857', '858', '871', '875', '876', '882', '890', '905', '910', '912', '915', '918', '926', '939', '956', '960', '966', '975', '978', '983', '986', '991', '994', '1002', '1004', '1020', '1021', '1022', '1025', '1038', '1048', '1067', '1068', '1069', '1074', '

In [2]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic


datafold = 'TKY'

def process_poi_data(file_path):
    """
    处理POI签到数据，按用户ID分组，并按时间排序POI访问记录。
    计算每个POI与前一个POI的距离。

    参数:
    file_path (str): 包含POI签到数据的文件的路径。

    返回:
    pandas.DataFrame: 包含处理后数据的DataFrame，其中包含uid、pid_list、category_list、time_list和distance_list列。
    """
    # 读取CSV文件到pandas DataFrame
    df = pd.read_csv(file_path)

    # 确保时间列是datetime类型
    df['time'] = pd.to_datetime(df['time'])

    # 按uid分组并聚合数据
    def aggregate_and_calculate_distance(group):
        pid_list = list(group['pid'])
        category_list = list(group['category'])
        region_list = list(group['region'])
        time_list = list(group['time'])
        
        # 按时间排序
        sorted_indices = sorted(range(len(time_list)), key=lambda i: time_list[i])
        pid_list = [pid_list[i] for i in sorted_indices]
        category_list = [category_list[i] for i in sorted_indices]
        region_list = [region_list[i] for i in sorted_indices]
        time_list = [time_list[i] for i in sorted_indices]
        
        # 计算距离
        # distance_list = [0.0]  # 第一个POI的距离为0
        # for i in range(1, len(latitude_list)):
        #     coord1 = (latitude_list[i-1], longitude_list[i-1])
        #     coord2 = (latitude_list[i], longitude_list[i])
        #     distance = geodesic(coord1, coord2).km  # 使用geodesic计算距离，单位为公里
        #     distance_list.append(round(distance, 2))
        
        return pd.Series({
            'pid_list': pid_list,
            'category_list': category_list,
            'region_list': region_list,
            'time_list': [t.strftime('%Y-%m-%d %H:%M') for t in time_list], # 格式化时间
            # 'distance_list': distance_list
        })

    df_grouped = df.groupby('uid').apply(aggregate_and_calculate_distance).reset_index()
    return df_grouped


def save_processed_data(df, output_file_path):
    """
    将处理后的数据保存到CSV文件。

    参数:
    df (pandas.DataFrame): 包含处理后数据的DataFrame。
    output_file_path (str): 输出文件的路径。
    """
    df.to_csv(output_file_path, index=False, header=True)
    print(f"处理后的数据已保存到：{output_file_path}")

# 指定输入和输出文件路径
input_file_path = f'{datafold}/{datafold}.csv'  # 替换为您的输入文件路径
output_file_path = f'{datafold}/poi_checkin.csv'  # 替换为您想要的输出文件路径

# 处理数据
processed_df = process_poi_data(input_file_path)

# 保存处理后的数据
save_processed_data(processed_df, output_file_path)

  df_grouped = df.groupby('uid').apply(aggregate_and_calculate_distance).reset_index()


处理后的数据已保存到：TKY/poi_checkin.csv


In [3]:
import pandas as pd
from collections import defaultdict


datafold = 'TKY'

def build_poi_transition_graph(file_path):
    """
    根据用户访问序列文件构建POI到POI的转移图。

    参数:
    file_path (str): 包含用户访问序列数据的文件路径。

    返回:
    dict: 一个字典，表示POI到POI的转移图。
          键是POI ID，值是潜在的后续POI ID列表。
    """
    # 读取CSV文件到pandas DataFrame
    df = pd.read_csv(file_path)

    # 初始化转移图
    transition_graph = defaultdict(list)

    df['poi_list'] = df['pid_list'].apply(lambda x: eval(x))  # 将字符串转换为列表
    user_checkin = df['poi_list'].tolist()

    for i in range(len(user_checkin)):
        # 获取当前用户的POI列表
        poi_list = user_checkin[i]
        # 遍历当前用户访问的POI列表
        for j in range(len(poi_list) - 1):
            current_poi = poi_list[j]
            next_poi = poi_list[j + 1]
            # 将下一个POI添加到当前POI的潜在后续POI列表中
            if next_poi not in transition_graph[current_poi]:
                transition_graph[current_poi].append(next_poi)

    return transition_graph

def save_transition_graph(graph, output_file_path):
    """
    将POI到POI的转移图保存到CSV文件。

    参数:
    graph (dict): POI到POI的转移图。
    output_file_path (str): 输出文件的路径。
    """
    # 将字典转换为DataFrame
    df = pd.DataFrame(list(graph.items()), columns=['pid', 'potential_poi'])
    # 保存到CSV文件
    df.to_csv(output_file_path, index=False)
    print(f"转移图已保存到：{output_file_path}")

if __name__ == "__main__":
    # 指定输入和输出文件路径
    input_file_path = f'{datafold}/poi_checkin.csv'  # 替换为您的输入文件路径
    output_file_path = f'{datafold}/poi_transition_graph.csv'  # 替换为您想要的输出文件路径

    # 构建转移图
    transition_graph = build_poi_transition_graph(input_file_path)

    # 保存转移图
    save_transition_graph(transition_graph, output_file_path)

转移图已保存到：TKY/poi_transition_graph.csv


In [None]:
# import csv
# import json
# import ast


# datafold = 'NYC'
# def convert_csv_to_json(input_csv_path, output_json_path):
#     data = []

#     with open(input_csv_path, mode='r', encoding='utf-8') as csvfile:
#         reader = csv.DictReader(csvfile)
#         for row in reader:
#             uid = int(row['uid'])
#             pid_list = ast.literal_eval(row['pid_list'])
#             category_list = ast.literal_eval(row['category_list'])
#             region_list = ast.literal_eval(row['region_list'])
#             time_list = ast.literal_eval(row['time_list'])
#             # distance_list = ast.literal_eval(row['distance_list'])
            
#             max_length = 50
#             record = {
#                 "uid": uid,
#                 "pid_list": pid_list[-max_length:] if len(pid_list) > max_length else pid_list,
#                 "category_list": category_list[-max_length:] if len(category_list) > max_length else category_list,
#                 "region_list": region_list[-max_length:] if len(region_list) > max_length else region_list,
#                 "time_list": time_list[-max_length:] if len(time_list) > max_length else time_list,
#                 # "distance_list": distance_list[-max_length:] if len(distance_list) > max_length else distance_list
#             }

#             data.append(record)

#     with open(output_json_path, mode='w', encoding='utf-8') as jsonfile:
#         jsonfile.write('[\n')
#         for i, record in enumerate(data):
#             line = json.dumps(record, ensure_ascii=False, separators=(',', ': '))
#             jsonfile.write('  ' + line)
#             if i < len(data) - 1:
#                 jsonfile.write(',\n')
#             else:
#                 jsonfile.write('\n')
#         jsonfile.write(']\n')

#     print(f"转换完成，结果保存至: {output_json_path}")
# # 示例用法
# input_file_path = f'{datafold}/poi_checkin.csv'  # 替换为您的输入文件路径
# output_file_path = f'{datafold}/data.json'  # 替换为您想要的输出文件路径
# convert_csv_to_json(input_file_path, output_file_path)


转换完成，结果保存至: NYC/data.json


In [None]:
import csv
import json
import ast


datafold = 'TKY'
max_length = 50

def convert_csv_to_json(input_csv_path, output_file_path):
    data = []

    with open(input_csv_path, mode='r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            uid = int(row['uid'])
            pid_list = ast.literal_eval(row['pid_list'])
            # category_list = ast.literal_eval(row['category_list'])
            # region_list = ast.literal_eval(row['region_list'])
            time_list = ast.literal_eval(row['time_list'])
            # distance_list = ast.literal_eval(row['distance_list'])
            record = {
                "uid": uid,
                "history": pid_list[-max_length-1:-1] if len(pid_list) > max_length else pid_list,
                "time": time_list[-max_length-1:-1] if len(time_list) > max_length else time_list,
                "next_time": time_list[-1],
                "target_pid": pid_list[-1]
            }

            data.append(record)

    json_records = []
    for record in data:
        uid = record["uid"]
        history = record["history"]
        time_seq = record["time"]
        next_time = record["next_time"]
        target_pid = record["target_pid"]

            # 构造 input 字符串
        input_text = (
            f"The historical POI check-in records for user {uid} are as follows:\n"
            f"POI list: {history}, with corresponding check-in times: {time_seq}."
            # f"At {next_time}, which POI is the user most likely to check in at?"
        )

        record = {
            "input": input_text,
            "next_time": next_time,
            "target": target_pid
        }
        json_records.append(record)


    with open(output_file_path, mode='w', encoding='utf-8') as file:
        json.dump(json_records, file, ensure_ascii=False, indent=2)

    print(f"转换完成，结果保存至: {output_file_path}")
# 示例用法
input_file_path = f'{datafold}/poi_checkin.csv'  # 替换为您的输入文件路径
output_file_path = f'{datafold}/data{max_length}.json'  # 替换为您想要的输出文件路径
convert_csv_to_json(input_file_path, output_file_path)


转换完成，结果保存至: NYC/data50.json
