In [4]:
import pandas as pd

datafold = 'CA' 
# 读取 data.csv 文件
try:
    df = pd.read_csv(f'{datafold}/{datafold}.csv')
except FileNotFoundError:
    print(f"错误：{datafold}.csv 文件未找到。请确保文件位于正确的目录下。")
    exit()

# 按照 'pid' 进行分组
grouped = df.groupby('pid')

# 创建 poi_info 数据
poi_info_data = []
for pid, group in grouped:
    # pid = int(pid)
    category = group['category'].iloc[0]
    region = group['region'].iloc[0]
    latitude, longitude = group[['latitude', 'longitude']].iloc[0]
    hourly_visits = {}
    for timestamp_str in group['time']:
        try:
            hour = pd.to_datetime(timestamp_str).hour
            hourly_visits[hour] = hourly_visits.get(hour, 0) + 1
        except ValueError:
            print(f"警告：无法解析时间戳：{timestamp_str}。已跳过。")
    
    # 只保留访问频率大于1的时间段
    filtered_hourly_visits = {hour: count for hour, count in hourly_visits.items() if count > 1}
    # 按访问次数（值）降序排序
    sorted_hourly_visits = dict(
        sorted(filtered_hourly_visits.items(), key=lambda item: item[1], reverse=True)
    )
    poi_info_data.append({
        'pid': pid,
        'category': category,
        'region': region,
        'latitude': latitude,
        'longitude': longitude,
        'visit_time_and_count': sorted_hourly_visits
    })

# 创建 poi_info DataFrame
poi_info_df = pd.DataFrame(poi_info_data)

# 将 poi_info DataFrame 保存到 poi_info.csv 文件
poi_info_df.to_csv(f'{datafold}/poi_info.csv', index=False)

print("成功创建 poi_info.csv 文件")

成功创建 poi_info.csv 文件


In [None]:
import pandas as pd
from geopy.distance import geodesic
from multiprocessing import Pool
from tqdm import tqdm
import numpy as np

datafold = 'CA'

# 加载 POI 数据
try:
    poi_df = pd.read_csv(f'{datafold}/poi_info.csv')
except FileNotFoundError:
    print("Error: poi_info.csv not found. Please ensure it's in the correct directory.")
    exit()

coords = poi_df[['latitude', 'longitude']].to_numpy()
poi_ids = poi_df['pid'].tolist()
num_pois = len(poi_df)

def calculate_distances(i):
    distances = {}
    for j in range(num_pois):
        if i == j:
            distances[poi_ids[j]] = 0.0
        else:
            distance = geodesic(coords[i], coords[j]).km
            distances[poi_ids[j]] = round(distance, 2)  # 保留两位小数
    return poi_ids[i], distances


num_processes = 128  # 根据你的 CPU 核心数调整
pool = Pool(processes=num_processes)
results = []
with tqdm(total=num_pois, desc="Calculating distances (parallel)", unit="POI") as pbar:
    for pid, distances in pool.imap_unordered(calculate_distances, range(num_pois)):
        results.append((pid, distances))
        pbar.update(1)

pool.close()
pool.join()

# 将结果转换为 DataFrame
distance_matrix_data = {}
for pid, distances in results:
    distance_matrix_data[pid] = distances

distance_matrix_df = pd.DataFrame.from_dict(distance_matrix_data, orient='index', columns=poi_ids)

# 确保列的顺序与索引一致
distance_matrix_df = distance_matrix_df.reindex(columns=poi_ids)

# 保存距离矩阵到 CSV 文件
distance_matrix_df.to_csv(f'{datafold}/distance.csv')

print("Distance matrix calculated and saved to distance.csv (parallel, 2 decimal places)") 

Calculating distances (parallel): 100%|██████████| 5135/5135 [01:12<00:00, 70.72POI/s] 


Distance matrix calculated and saved to distance.csv (parallel, 2 decimal places)


In [None]:
# import pandas as pd

# def find_nearby_pois(distance_file, target_pid, threshold):
#     """
#     查找距离指定 PID 小于阈值的其他 POI 的 PID。

#     Args:
#         distance_file (str): 存储 POI 之间距离的 CSV 文件路径。
#         target_pid (int): 要查找附近 POI 的目标 POI 的 PID。
#         threshold (float): 距离阈值（单位与 distance_file 中的距离单位一致）。

#     Returns:
#         list: 包含所有距离目标 PID 小于阈值的其他 POI 的 PID 列表。
#               如果目标 PID 不存在于文件中，则返回一个空列表。
#     """
#     try:
#         distance_df = pd.read_csv(distance_file, index_col=0)
#     except FileNotFoundError:
#         print(f"错误：文件 {distance_file} 未找到。")
#         return []

#     if target_pid not in distance_df.index:
#         print(f"警告：目标 PID {target_pid} 不存在于距离文件中。")
#         return []

#     # 获取目标 PID 对应的距离Series
#     target_distances = distance_df.loc[target_pid]

#     # 筛选出距离小于阈值的 PID，并排除目标 PID 本身
#     nearby_pois = target_distances[
#         (target_distances < threshold) & (distance_df.index != target_pid)
#     ].index.tolist()

#     return nearby_pois

# if __name__ == '__main__':
#     distance_file = 'NYC/distance.csv'  # 替换为你的 distance.csv 文件路径
#     target_pid = 0  # 指定你要查找附近 POI 的目标 PID
#     distance_threshold = 2.0  # 指定距离阈值，例如 1.0 公里

#     nearby_pids = find_nearby_pois(distance_file, target_pid, distance_threshold)

#     if nearby_pids:
#         print(f"距离 PID {target_pid} 小于 {distance_threshold} 的 POI PID 有{len(nearby_pids)}个:")
#         print(nearby_pids)
#     else:
#         print(f"没有找到距离 PID {target_pid} 小于 {distance_threshold} 的其他 POI。")

距离 PID 0 小于 2.0 的 POI PID 有723个:
['0', '8', '20', '28', '62', '76', '77', '83', '85', '88', '91', '112', '119', '120', '121', '132', '134', '146', '152', '153', '181', '186', '194', '197', '206', '207', '208', '220', '222', '234', '237', '247', '252', '253', '256', '259', '260', '277', '288', '289', '292', '310', '311', '318', '319', '339', '344', '356', '363', '365', '366', '370', '371', '375', '383', '385', '387', '391', '395', '400', '401', '410', '414', '417', '421', '427', '428', '458', '460', '470', '493', '494', '512', '515', '516', '530', '537', '545', '552', '563', '574', '580', '651', '652', '664', '674', '675', '686', '689', '708', '732', '760', '763', '789', '799', '810', '817', '821', '831', '845', '848', '851', '854', '857', '858', '871', '875', '876', '882', '890', '905', '910', '912', '915', '918', '926', '939', '956', '960', '966', '975', '978', '983', '986', '991', '994', '1002', '1004', '1020', '1021', '1022', '1025', '1038', '1048', '1067', '1068', '1069', '1074', '

In [5]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic


datafold = 'CA'

def process_poi_data(file_path):
    """
    处理POI签到数据，按用户ID分组，并按时间排序POI访问记录。
    计算每个POI与前一个POI的距离。

    参数:
    file_path (str): 包含POI签到数据的文件的路径。

    返回:
    pandas.DataFrame: 包含处理后数据的DataFrame，其中包含uid、pid_list、category_list、time_list和distance_list列。
    """
    # 读取CSV文件到pandas DataFrame
    df = pd.read_csv(file_path)

    # 确保时间列是datetime类型
    df['time'] = pd.to_datetime(df['time'])

    # 按uid分组并聚合数据
    def aggregate_and_calculate_distance(group):
        pid_list = list(group['pid'])
        category_list = list(group['category'])
        region_list = list(group['region'])
        time_list = list(group['time'])
        
        # 按时间排序
        sorted_indices = sorted(range(len(time_list)), key=lambda i: time_list[i])
        pid_list = [pid_list[i] for i in sorted_indices]
        category_list = [category_list[i] for i in sorted_indices]
        region_list = [region_list[i] for i in sorted_indices]
        time_list = [time_list[i] for i in sorted_indices]
        
        # 计算距离
        # distance_list = [0.0]  # 第一个POI的距离为0
        # for i in range(1, len(latitude_list)):
        #     coord1 = (latitude_list[i-1], longitude_list[i-1])
        #     coord2 = (latitude_list[i], longitude_list[i])
        #     distance = geodesic(coord1, coord2).km  # 使用geodesic计算距离，单位为公里
        #     distance_list.append(round(distance, 2))
        
        return pd.Series({
            'pid_list': pid_list,
            'category_list': category_list,
            'region_list': region_list,
            'time_list': [t.strftime('%Y-%m-%d %H:%M') for t in time_list], # 格式化时间
            # 'distance_list': distance_list
        })

    df_grouped = df.groupby('uid').apply(aggregate_and_calculate_distance).reset_index()
    return df_grouped


def save_processed_data(df, output_file_path):
    """
    将处理后的数据保存到CSV文件。

    参数:
    df (pandas.DataFrame): 包含处理后数据的DataFrame。
    output_file_path (str): 输出文件的路径。
    """
    df.to_csv(output_file_path, index=False, header=True)
    print(f"处理后的数据已保存到：{output_file_path}")

# 指定输入和输出文件路径
input_file_path = f'{datafold}/{datafold}.csv'  # 替换为您的输入文件路径
output_file_path = f'{datafold}/poi_checkin.csv'  # 替换为您想要的输出文件路径

# 处理数据
processed_df = process_poi_data(input_file_path)

# 保存处理后的数据
save_processed_data(processed_df, output_file_path)

  df_grouped = df.groupby('uid').apply(aggregate_and_calculate_distance).reset_index()


处理后的数据已保存到：CA/poi_checkin.csv


In [7]:
import pandas as pd
from collections import defaultdict


datafold = 'CA'

def build_poi_transition_graph(file_path):
    """
    根据用户访问序列文件构建POI到POI的转移图。

    参数:
    file_path (str): 包含用户访问序列数据的文件路径。

    返回:
    dict: 一个字典，表示POI到POI的转移图。
          键是POI ID，值是潜在的后续POI ID列表。
    """
    # 读取CSV文件到pandas DataFrame
    df = pd.read_csv(file_path)

    # 初始化转移图
    transition_graph = defaultdict(list)

    df['poi_list'] = df['pid_list'].apply(lambda x: eval(x))  # 将字符串转换为列表
    user_checkin = df['poi_list'].tolist()

    for i in range(len(user_checkin)):
        # 获取当前用户的POI列表
        poi_list = user_checkin[i]
        # 遍历当前用户访问的POI列表
        for j in range(len(poi_list) - 1):
            current_poi = poi_list[j]
            next_poi = poi_list[j + 1]
            # 将下一个POI添加到当前POI的潜在后续POI列表中
            if next_poi not in transition_graph[current_poi]:
                transition_graph[current_poi].append(next_poi)

    return transition_graph

def save_transition_graph(graph, output_file_path):
    """
    将POI到POI的转移图保存到CSV文件。

    参数:
    graph (dict): POI到POI的转移图。
    output_file_path (str): 输出文件的路径。
    """
    # 将字典转换为DataFrame
    df = pd.DataFrame(list(graph.items()), columns=['pid', 'potential_poi'])
    # 保存到CSV文件
    df.to_csv(output_file_path, index=False)
    print(f"转移图已保存到：{output_file_path}")

if __name__ == "__main__":
    # 指定输入和输出文件路径
    input_file_path = f'{datafold}/poi_checkin.csv'  # 替换为您的输入文件路径
    output_file_path = f'{datafold}/poi_transition_graph.csv'  # 替换为您想要的输出文件路径

    # 构建转移图
    transition_graph = build_poi_transition_graph(input_file_path)

    # 保存转移图
    save_transition_graph(transition_graph, output_file_path)

转移图已保存到：CA/poi_transition_graph.csv


In [None]:
# import csv
# import json
# import ast


# datafold = 'NYC'
# def convert_csv_to_json(input_csv_path, output_json_path):
#     data = []

#     with open(input_csv_path, mode='r', encoding='utf-8') as csvfile:
#         reader = csv.DictReader(csvfile)
#         for row in reader:
#             uid = int(row['uid'])
#             pid_list = ast.literal_eval(row['pid_list'])
#             category_list = ast.literal_eval(row['category_list'])
#             region_list = ast.literal_eval(row['region_list'])
#             time_list = ast.literal_eval(row['time_list'])
#             # distance_list = ast.literal_eval(row['distance_list'])
            
#             max_length = 50
#             record = {
#                 "uid": uid,
#                 "pid_list": pid_list[-max_length:] if len(pid_list) > max_length else pid_list,
#                 "category_list": category_list[-max_length:] if len(category_list) > max_length else category_list,
#                 "region_list": region_list[-max_length:] if len(region_list) > max_length else region_list,
#                 "time_list": time_list[-max_length:] if len(time_list) > max_length else time_list,
#                 # "distance_list": distance_list[-max_length:] if len(distance_list) > max_length else distance_list
#             }

#             data.append(record)

#     with open(output_json_path, mode='w', encoding='utf-8') as jsonfile:
#         jsonfile.write('[\n')
#         for i, record in enumerate(data):
#             line = json.dumps(record, ensure_ascii=False, separators=(',', ': '))
#             jsonfile.write('  ' + line)
#             if i < len(data) - 1:
#                 jsonfile.write(',\n')
#             else:
#                 jsonfile.write('\n')
#         jsonfile.write(']\n')

#     print(f"转换完成，结果保存至: {output_json_path}")
# # 示例用法
# input_file_path = f'{datafold}/poi_checkin.csv'  # 替换为您的输入文件路径
# output_file_path = f'{datafold}/data.json'  # 替换为您想要的输出文件路径
# convert_csv_to_json(input_file_path, output_file_path)


转换完成，结果保存至: NYC/data.json


In [8]:
import csv
import json
import ast


datafold = 'CA'
max_length = 50

def convert_csv_to_json(input_csv_path, output_file_path):
    data = []

    with open(input_csv_path, mode='r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            uid = int(row['uid'])
            pid_list = ast.literal_eval(row['pid_list'])
            # category_list = ast.literal_eval(row['category_list'])
            # region_list = ast.literal_eval(row['region_list'])
            time_list = ast.literal_eval(row['time_list'])
            # distance_list = ast.literal_eval(row['distance_list'])
            record = {
                "uid": uid,
                "history": pid_list[-max_length-1:-1] if len(pid_list) > max_length else pid_list,
                "time": time_list[-max_length-1:-1] if len(time_list) > max_length else time_list,
                "next_time": time_list[-1],
                "target_pid": pid_list[-1]
            }

            data.append(record)

    json_records = []
    for record in data:
        uid = record["uid"]
        history = record["history"]
        time_seq = record["time"]
        next_time = record["next_time"]
        target_pid = record["target_pid"]

            # 构造 input 字符串
        input_text = (
            f"The historical POI check-in records for user {uid} are as follows:\n"
            f"POI list: {history}, with corresponding check-in times: {time_seq}."
            # f"At {next_time}, which POI is the user most likely to check in at?"
        )

        record = {
            "input": input_text,
            "next_time": next_time,
            "target": target_pid
        }
        json_records.append(record)


    with open(output_file_path, mode='w', encoding='utf-8') as file:
        json.dump(json_records, file, ensure_ascii=False, indent=2)

    print(f"转换完成，结果保存至: {output_file_path}")
# 示例用法
input_file_path = f'{datafold}/poi_checkin.csv'  # 替换为您的输入文件路径
output_file_path = f'{datafold}/data{max_length}.json'  # 替换为您想要的输出文件路径
convert_csv_to_json(input_file_path, output_file_path)


转换完成，结果保存至: CA/data50.json


In [None]:
import csv
import json
import ast


datafold = 'CA'
max_length = 100

def convert_csv_to_json(input_csv_path, output_file_path):
    data = []

    with open(input_csv_path, mode='r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            uid = int(row['uid'])
            pid_list = ast.literal_eval(row['pid_list'])
            category_list = ast.literal_eval(row['category_list'])
            region_list = ast.literal_eval(row['region_list'])
            time_list = ast.literal_eval(row['time_list'])
            # distance_list = ast.literal_eval(row['distance_list'])
            record = {
                "uid": uid,
                "pid_list": pid_list[-max_length-1:-1] if len(pid_list) > max_length else pid_list[:-1],
                "category": category_list[-max_length-1:-1] if len(category_list) > max_length else category_list[:-1],
                "region": region_list[-max_length-1:-1] if len(region_list) > max_length else region_list[:-1],
                "time": time_list[-max_length-1:-1] if len(time_list) > max_length else time_list[:-1],
                "next_time": time_list[-1],
                "target_pid": pid_list[-1]
            }

            data.append(record)

    json_records = []
    for record in data:
        uid = record["uid"]
        pid_list = record["pid_list"]
        category = record["category"]
        region = record["region"]
        time_seq = record["time"]
        next_time = record["next_time"]
        target_pid = record["target_pid"]

        # 构造 input 字符串
        input_text = (
            f"The historical POI check-in records: {pid_list}." 
            # f"with corresponding categories: {category},\n"
            # f"regions: {region},\n"
            # f"with corresponding check-in times: {time_seq}."
            # f"At {next_time}, which POI is the user most likely to check in at?"
        )

        record = {
            "input": input_text,
            # "next_time": next_time,
            "target": target_pid
        }
        json_records.append(record)


    with open(output_file_path, mode='w', encoding='utf-8') as file:
        json.dump(json_records, file, ensure_ascii=False, indent=2)

    print(f"转换完成，结果保存至: {output_file_path}")
# 示例用法
input_file_path = f'{datafold}/poi_checkin.csv'  # 替换为您的输入文件路径
output_file_path = f'{datafold}/data{max_length}.json'  # 替换为您想要的输出文件路径
convert_csv_to_json(input_file_path, output_file_path)

转换完成，结果保存至: TKY/data100.json


In [2]:
import csv
import json
import ast


datafold = 'TKY'
max_length = 50

def convert_csv_to_json(input_csv_path, output_file_path):
    data = []

    with open(input_csv_path, mode='r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            uid = int(row['uid'])
            pid_list = ast.literal_eval(row['pid_list'])
            category_list = ast.literal_eval(row['category_list'])
            region_list = ast.literal_eval(row['region_list'])
            time_list = ast.literal_eval(row['time_list'])
            # distance_list = ast.literal_eval(row['distance_list'])
            record = {
                "uid": uid,
                "pid_list": pid_list[-max_length-1:-1] if len(pid_list) > max_length else pid_list[:-1],
                "category": category_list[-max_length-1:-1] if len(category_list) > max_length else category_list[:-1],
                "region": region_list[-max_length-1:-1] if len(region_list) > max_length else region_list[:-1],
                "time": time_list[-max_length-1:-1] if len(time_list) > max_length else time_list[:-1],
                "next_time": time_list[-1],
                "target_pid": pid_list[-1]
            }

            data.append(record)

    json_records = []
    for record in data:
        uid = record["uid"]
        pid_list = record["pid_list"]
        category = record["category"]
        region = record["region"]
        time_seq = record["time"]
        next_time = record["next_time"]
        target_pid = record["target_pid"]

        # 构造 input 字符串
        # input_text = (
        #     f"The user has recently visited the following: {pid_list}, with corresponding check-in times: {time_seq}."
        # )

        sequence = [
            f"poi {poi}" + f' (belong to {category[i]}' + f', located in region {region[i]})' + f' at {time_seq[i]}, ' if i < len(pid_list) - 1 else
            f"poi {poi}" + f' (belong to {category[i]}' + f', located in region {region[i]})' + f' at {time_seq[i]}.'
            for i, poi in enumerate(pid_list)
        ]

        # 构造 input 字符串
        input_text = f"User_{uid} visited: " + "".join(sequence) + f" Now is {next_time}, user_{uid} is likely to visit?"

        record = {
            "input": input_text,
            # "next_time": next_time,
            "target": target_pid
        }
        json_records.append(record)


    with open(output_file_path, mode='w', encoding='utf-8') as file:
        json.dump(json_records, file, ensure_ascii=False, indent=2)

    print(f"转换完成，结果保存至: {output_file_path}")
# 示例用法
input_file_path = f'{datafold}/poi_checkin.csv'  # 替换为您的输入文件路径
output_file_path = f'{datafold}/data{max_length}.json'  # 替换为您想要的输出文件路径
convert_csv_to_json(input_file_path, output_file_path)

转换完成，结果保存至: TKY/data50.json


In [1]:

import pandas as pd

datafold = 'NYC'
file_name = f"{datafold}/{datafold}.csv"
df = pd.read_csv(file_name)

df = df[['uid', 'pid', 'category', 'region', 'latitude', 'longitude', 'time']]
# 按照时间排序
df = df.sort_values(by='time')

# 计算80%数据的索引
train_size = int(0.8 * len(df))

# 将前80%作为训练集
train_df = df[:train_size]
# 将后20%作为测试集
test_df = df[train_size:]

def romove_users_pois_test(df_train, df_test):
    users_train = df_train['uid'].unique()
    df_test = df_test[df_test['uid'].isin(users_train)]
    users_test = df_test['uid'].unique()
    df_train = df_train[df_train['uid'].isin(users_test)]

    pois_train = df_train['pid'].unique()
    df_test = df_test[df_test['pid'].isin(pois_train)]
    return df_test

test_df = romove_users_pois_test(train_df, test_df)

# 将训练集和测试集合并
new_df = pd.concat([train_df, test_df], ignore_index=True)

# 获取测试集中所有需要保留的 uid
test_uids = test_df['uid'].unique()

# 过滤原始 df，只保留那些 Uid 出现在 test_df 中的记录
expanded_df = new_df[new_df['uid'].isin(test_uids)]



# 保存训练集和测试集
# train_df.to_csv(f'{datafold}/train_data.csv', index=False)
expanded_df.to_csv(f'{datafold}/my_data.csv', index=False)

In [2]:
import pandas as pd

datafold = 'NYC' 
# 读取 data.csv 文件
try:
    df = pd.read_csv(f'{datafold}/my_data.csv')
except FileNotFoundError:
    print(f"错误：{datafold}.csv 文件未找到。请确保文件位于正确的目录下。")
    exit()

# 按照 'pid' 进行分组
grouped = df.groupby('pid')

# 创建 poi_info 数据
poi_info_data = []
for pid, group in grouped:
    # pid = int(pid)
    category = group['category'].iloc[0]
    region = group['region'].iloc[0]
    latitude, longitude = group[['latitude', 'longitude']].iloc[0]
    hourly_visits = {}
    for timestamp_str in group['time']:
        try:
            hour = pd.to_datetime(timestamp_str).hour
            hourly_visits[hour] = hourly_visits.get(hour, 0) + 1
        except ValueError:
            print(f"警告：无法解析时间戳：{timestamp_str}。已跳过。")
    
    # 只保留访问频率大于1的时间段
    filtered_hourly_visits = {hour: count for hour, count in hourly_visits.items() if count > 1}
    # 按访问次数（值）降序排序
    sorted_hourly_visits = dict(
        sorted(filtered_hourly_visits.items(), key=lambda item: item[1], reverse=True)
    )
    poi_info_data.append({
        'pid': pid,
        'category': category,
        'region': region,
        'latitude': latitude,
        'longitude': longitude,
        'visit_time_and_count': sorted_hourly_visits
    })

# 创建 poi_info DataFrame
poi_info_df = pd.DataFrame(poi_info_data)

# 将 poi_info DataFrame 保存到 poi_info.csv 文件
poi_info_df.to_csv(f'{datafold}/poi_info.csv', index=False)

print("成功创建 poi_info.csv 文件")

成功创建 poi_info.csv 文件


In [3]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic


datafold = 'NYC'

def process_poi_data(file_path, max_length=50):
    """
    处理POI签到数据，按用户ID分组，并按时间排序POI访问记录。
    计算每个POI与前一个POI的距离。

    参数:
    file_path (str): 包含POI签到数据的文件的路径。

    返回:
    pandas.DataFrame: 包含处理后数据的DataFrame，其中包含uid、pid_list、category_list、time_list和distance_list列。
    """
    # 读取CSV文件到pandas DataFrame
    df = pd.read_csv(file_path)

    # 确保时间列是datetime类型
    df['time'] = pd.to_datetime(df['time'])

    # 按uid分组并聚合数据
    def aggregate_and_calculate_distance(group):
        pid_list = list(group['pid'])
        category_list = list(group['category'])
        region_list = list(group['region'])
        time_list = list(group['time'])
        
        # 按时间排序
        sorted_indices = sorted(range(len(time_list)), key=lambda i: time_list[i])
        pid_list = [pid_list[i] for i in sorted_indices]
        category_list = [category_list[i] for i in sorted_indices]
        region_list = [region_list[i] for i in sorted_indices]
        time_list = [time_list[i] for i in sorted_indices]
        
        pid_list = pid_list[:max_length] if len(pid_list) > max_length else pid_list
        category_list = category_list[:max_length] if len(category_list) > max_length else category_list
        region_list = region_list[:max_length] if len(region_list) > max_length else region_list
        time_list = time_list[:max_length] if len(time_list) > max_length else time_list

        return pd.Series({
            'pid_list': pid_list,
            'category_list': category_list,
            'region_list': region_list,
            'time_list': [t.strftime('%Y-%m-%d %H:%M') for t in time_list], # 格式化时间
            # 'distance_list': distance_list
        })

    df_grouped = df.groupby('uid').apply(aggregate_and_calculate_distance).reset_index()
    return df_grouped


def save_processed_data(df, output_file_path):
    """
    将处理后的数据保存到CSV文件。

    参数:
    df (pandas.DataFrame): 包含处理后数据的DataFrame。
    output_file_path (str): 输出文件的路径。
    """
    df.to_csv(output_file_path, index=False, header=True)
    print(f"处理后的数据已保存到：{output_file_path}")

# 指定输入和输出文件路径
input_file_path = f'{datafold}/my_data.csv'  # 替换为您的输入文件路径
output_file_path = f'{datafold}/poi_checkin.csv'  # 替换为您想要的输出文件路径
max_length = 100
# 处理数据
processed_df = process_poi_data(input_file_path, max_length)

# 保存处理后的数据
save_processed_data(processed_df, output_file_path)

处理后的数据已保存到：NYC/poi_checkin.csv


  df_grouped = df.groupby('uid').apply(aggregate_and_calculate_distance).reset_index()


In [20]:
import pandas as pd
from collections import defaultdict


datafold = 'CA' # NYC, TKY, CA

def build_poi_transition_graph(file_path):
    """
    根据用户访问序列文件构建POI到POI的转移图。

    参数:
    file_path (str): 包含用户访问序列数据的文件路径。

    返回:
    dict: 一个字典，表示POI到POI的转移图。
          键是POI ID，值是潜在的后续POI ID列表。
    """
    # 读取CSV文件到pandas DataFrame
    df = pd.read_csv(file_path)

    # 初始化转移图
    transition_graph = defaultdict(list)

    df['poi_list'] = df['pid_list'].apply(lambda x: eval(x))  # 将字符串转换为列表
    user_checkin = df['poi_list'].tolist()

    # 数据-1 防止数据泄漏
    user_checkin = user_checkin[:-1]  # 去掉最后一个用户的签到记录
    for i in range(len(user_checkin)):
        # 获取当前用户的POI列表
        poi_list = user_checkin[i]
        # 遍历当前用户访问的POI列表
        for j in range(len(poi_list) - 1):
            current_poi = poi_list[j]
            next_poi = poi_list[j + 1]
            # 将下一个POI添加到当前POI的潜在后续POI列表中
            if next_poi not in transition_graph[current_poi]:
                transition_graph[current_poi].append(next_poi)

    return transition_graph

def save_transition_graph(graph, output_file_path):
    """
    将POI到POI的转移图保存到CSV文件。

    参数:
    graph (dict): POI到POI的转移图。
    output_file_path (str): 输出文件的路径。
    """
    # 将字典转换为DataFrame
    df = pd.DataFrame(list(graph.items()), columns=['pid', 'potential_poi'])
    # 保存到CSV文件
    df.to_csv(output_file_path, index=False)
    print(f"转移图已保存到：{output_file_path}")

if __name__ == "__main__":
    # 指定输入和输出文件路径
    input_file_path = f'{datafold}/poi_checkin.csv'  # 替换为您的输入文件路径
    output_file_path = f'{datafold}/poi_transition_graph.csv'  # 替换为您想要的输出文件路径

    # 构建转移图
    transition_graph = build_poi_transition_graph(input_file_path)

    # 保存转移图
    save_transition_graph(transition_graph, output_file_path)

转移图已保存到：CA/poi_transition_graph.csv


In [21]:
import csv
import json
import ast


datafold = 'CA'
max_length = 100

def convert_csv_to_json(input_csv_path, output_file_path):
    data = []

    with open(input_csv_path, mode='r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            uid = int(row['uid'])
            pid_list = ast.literal_eval(row['pid_list'])
            category_list = ast.literal_eval(row['category_list'])
            region_list = ast.literal_eval(row['region_list'])
            time_list = ast.literal_eval(row['time_list'])
            # distance_list = ast.literal_eval(row['distance_list'])
            record = {
                "uid": uid,
                "pid_list": pid_list[-max_length-1:-1] if len(pid_list) > max_length else pid_list[:-1],
                "category": category_list[-max_length-1:-1] if len(category_list) > max_length else category_list[:-1],
                "region": region_list[-max_length-1:-1] if len(region_list) > max_length else region_list[:-1],
                "time": time_list[-max_length-1:-1] if len(time_list) > max_length else time_list[:-1],
                "next_time": time_list[-1],
                "target_pid": pid_list[-1]
            }

            data.append(record)

    json_records = []
    for record in data:
        uid = record["uid"]
        pid_list = record["pid_list"]
        category = record["category"]
        region = record["region"]
        time_seq = record["time"]
        next_time = record["next_time"]
        target_pid = record["target_pid"]

        # 构造 input 字符串
        input_text = (
            f"The historical POI check-in records: {pid_list}." 
            # f"with corresponding categories: {category},\n"
            # f"regions: {region},\n"
            # f"with corresponding check-in times: {time_seq}."
            # f"At {next_time}, which POI is the user most likely to check in at?"
        )

        record = {
            "input": input_text,
            # "next_time": next_time,
            "target": target_pid
        }
        json_records.append(record)


    with open(output_file_path, mode='w', encoding='utf-8') as file:
        json.dump(json_records, file, ensure_ascii=False, indent=2)

    print(f"转换完成，结果保存至: {output_file_path}")
# 示例用法
input_file_path = f'{datafold}/poi_checkin.csv'  # 替换为您的输入文件路径
output_file_path = f'{datafold}/history{max_length}.json'  # 替换为您想要的输出文件路径
convert_csv_to_json(input_file_path, output_file_path)

转换完成，结果保存至: CA/history100.json


In [22]:
import csv
import json
import ast


datafold = 'CA'
max_length = 50

def convert_csv_to_json(input_csv_path, output_file_path):
    data = []

    with open(input_csv_path, mode='r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            uid = int(row['uid'])
            pid_list = ast.literal_eval(row['pid_list'])
            category_list = ast.literal_eval(row['category_list'])
            region_list = ast.literal_eval(row['region_list'])
            time_list = ast.literal_eval(row['time_list'])
            # distance_list = ast.literal_eval(row['distance_list'])
            record = {
                "uid": uid,
                "pid_list": pid_list[-max_length-1:-1] if len(pid_list) > max_length else pid_list[:-1],
                "category": category_list[-max_length-1:-1] if len(category_list) > max_length else category_list[:-1],
                "region": region_list[-max_length-1:-1] if len(region_list) > max_length else region_list[:-1],
                "time": time_list[-max_length-1:-1] if len(time_list) > max_length else time_list[:-1],
                "next_time": time_list[-1],
                "target_pid": pid_list[-1]
            }

            data.append(record)

    json_records = []
    for record in data:
        uid = record["uid"]
        pid_list = record["pid_list"]
        category = record["category"]
        region = record["region"]
        time_seq = record["time"]
        next_time = record["next_time"]
        target_pid = record["target_pid"]

        # 构造 input 字符串
        input_text = (
            f"The user{uid} has recently POI check-in records: {pid_list}, with corresponding check-in times: {time_seq}."
        )

        # sequence = [
        #     f"poi {poi}" + f' (belong to {category[i]}' + f', located in region {region[i]})' + f' at {time_seq[i]}, ' if i < len(pid_list) - 1 else
        #     f"poi {poi}" + f' (belong to {category[i]}' + f', located in region {region[i]})' + f' at {time_seq[i]}.'
        #     for i, poi in enumerate(pid_list)
        # ]

        # # 构造 input 字符串
        # input_text = f"User_{uid} visited: " + "".join(sequence) + f" Now is {next_time}, user_{uid} is likely to visit?"

        record = {
            "input": input_text,
            "next_time": next_time,
            "target": target_pid
        }
        json_records.append(record)


    with open(output_file_path, mode='w', encoding='utf-8') as file:
        json.dump(json_records, file, ensure_ascii=False, indent=2)

    print(f"转换完成，结果保存至: {output_file_path}")
# 示例用法
input_file_path = f'{datafold}/poi_checkin.csv'  # 替换为您的输入文件路径
output_file_path = f'{datafold}/recent{max_length}.json'  # 替换为您想要的输出文件路径
convert_csv_to_json(input_file_path, output_file_path)

转换完成，结果保存至: CA/recent50.json


In [23]:
import csv
import json
import ast


datafold = 'CA'
max_length = 20

def convert_csv_to_json(input_csv_path, output_file_path):
    data = []

    with open(input_csv_path, mode='r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            uid = int(row['uid'])
            pid_list = ast.literal_eval(row['pid_list'])
            category_list = ast.literal_eval(row['category_list'])
            region_list = ast.literal_eval(row['region_list'])
            time_list = ast.literal_eval(row['time_list'])
            # distance_list = ast.literal_eval(row['distance_list'])
            record = {
                "uid": uid,
                "pid_list": pid_list[-max_length-1:-1] if len(pid_list) > max_length else pid_list[:-1],
                "category": category_list[-max_length-1:-1] if len(category_list) > max_length else category_list[:-1],
                "region": region_list[-max_length-1:-1] if len(region_list) > max_length else region_list[:-1],
                "time": time_list[-max_length-1:-1] if len(time_list) > max_length else time_list[:-1],
                "next_time": time_list[-1],
                "target_pid": pid_list[-1]
            }

            data.append(record)

    json_records = []
    for record in data:
        uid = record["uid"]
        pid_list = record["pid_list"]
        category = record["category"]
        region = record["region"]
        time_seq = record["time"]
        next_time = record["next_time"]
        target_pid = record["target_pid"]

        # 构造 input 字符串
        input_text = (
            f"The user{uid} has recently POI check-in records: {pid_list}, with corresponding check-in times: {time_seq}."
        )

        # sequence = [
        #     f"poi {poi}" + f' (belong to {category[i]}' + f', located in region {region[i]})' + f' at {time_seq[i]}, ' if i < len(pid_list) - 1 else
        #     f"poi {poi}" + f' (belong to {category[i]}' + f', located in region {region[i]})' + f' at {time_seq[i]}.'
        #     for i, poi in enumerate(pid_list)
        # ]

        # # 构造 input 字符串
        # input_text = f"User_{uid} visited: " + "".join(sequence) + f" Now is {next_time}, user_{uid} is likely to visit?"

        record = {
            "input": input_text,
            "next_time": next_time,
            "target": target_pid
        }
        json_records.append(record)


    with open(output_file_path, mode='w', encoding='utf-8') as file:
        json.dump(json_records, file, ensure_ascii=False, indent=2)

    print(f"转换完成，结果保存至: {output_file_path}")
# 示例用法
input_file_path = f'{datafold}/poi_checkin.csv'  # 替换为您的输入文件路径
output_file_path = f'{datafold}/recent{max_length}.json'  # 替换为您想要的输出文件路径
convert_csv_to_json(input_file_path, output_file_path)

转换完成，结果保存至: CA/recent20.json
