In [None]:

import pandas as pd

# Dataset folder; the raw check-in table is read from <datafold>/<datafold>.csv.
datafold = 'NYC'
file_name = f"{datafold}/{datafold}.csv"

# Load the raw table, keep only the columns used downstream and order the rows
# by the 'time' column.
# NOTE(review): 'time' is sorted exactly as read from the CSV (it is not parsed
# to datetime here) -- confirm the stored format sorts chronologically.
df = (
    pd.read_csv(file_name)
    .loc[:, ['uid', 'pid', 'category', 'region', 'latitude', 'longitude', 'time']]
    .sort_values(by='time')
)

# Split point: the first 80% of the time-ordered rows form the training set,
# the remaining rows form the test set.
train_size = int(0.8 * len(df))
train_df, test_df = df[:train_size], df[train_size:]

def romove_users_pois_test(df_train, df_test):
    """Restrict the test split to users and POIs covered by the training split.

    Args:
        df_train: training rows; must contain 'uid' and 'pid' columns.
        df_test: test rows; must contain 'uid' and 'pid' columns.

    Returns:
        The rows of ``df_test`` (original index preserved) whose user also
        occurs in ``df_train`` and whose POI occurs in the training rows of
        users that remain in the test split.
    """
    # Step 1: drop test rows of users that never occur in training.
    known_users = df_train['uid'].unique()
    test_known = df_test.loc[df_test['uid'].isin(known_users)]

    # Step 2: narrow training to the users that still have test rows, so the
    # POI vocabulary below only comes from those users.
    remaining_users = test_known['uid'].unique()
    train_shared = df_train.loc[df_train['uid'].isin(remaining_users)]

    # Step 3: drop test rows whose POI is absent from the narrowed training set.
    seen_pois = train_shared['pid'].unique()
    return test_known.loc[test_known['pid'].isin(seen_pois)]

# Keep only test rows whose user and POI are covered by the training split.
test_df = romove_users_pois_test(train_df, test_df)

# Users that still have at least one row in the filtered test split.
test_uids = test_df['uid'].unique()

# Stack the training rows and the filtered test rows under a fresh 0..n-1
# index, then keep only the rows belonging to those users.
new_df = pd.concat([train_df, test_df], ignore_index=True)
expanded_df = new_df.loc[new_df['uid'].isin(test_uids)]

# Persist the merged table; the separate training export below is disabled.
# train_df.to_csv(f'{datafold}/train_data.csv', index=False)
expanded_df.to_csv(f'{datafold}/my_data.csv', index=False)

In [2]:
import pandas as pd

datafold = 'NYC'
# Merged check-in table written by the split/merge step.
input_path = f'{datafold}/my_data.csv'
try:
    df = pd.read_csv(input_path)
except FileNotFoundError as err:
    # Name the path that was actually requested (the old message pointed at
    # '<datafold>.csv') and stop the cell by re-raising. exit() is a helper for
    # the interactive interpreter and should not be used to abort a notebook
    # cell.
    raise FileNotFoundError(
        f"错误：{input_path} 文件未找到。请确保文件位于正确的目录下。"
    ) from err

# One group per POI id.
grouped = df.groupby('pid')

# One summary record per POI.
poi_info_data = []
for pid, group in grouped:
    # Static attributes are taken from the POI's first row.
    category = group['category'].iloc[0]
    region = group['region'].iloc[0]
    latitude, longitude = group[['latitude', 'longitude']].iloc[0]

    # Count check-ins per hour of day; timestamps that cannot be parsed are
    # reported and skipped.
    hourly_visits = {}
    for timestamp_str in group['time']:
        try:
            hour = pd.to_datetime(timestamp_str).hour
            hourly_visits[hour] = hourly_visits.get(hour, 0) + 1
        except ValueError:
            print(f"警告：无法解析时间戳：{timestamp_str}。已跳过。")

    # Keep only hours visited more than once.
    filtered_hourly_visits = {hour: count for hour, count in hourly_visits.items() if count > 1}
    # Order the remaining hours by visit count, highest first (ties keep
    # first-seen order because sorted() is stable).
    sorted_hourly_visits = dict(
        sorted(filtered_hourly_visits.items(), key=lambda item: item[1], reverse=True)
    )
    poi_info_data.append({
        'pid': pid,
        'category': category,
        'region': region,
        'latitude': latitude,
        'longitude': longitude,
        'visit_time_and_count': sorted_hourly_visits
    })

# Assemble the per-POI records into a table and write it next to the input.
poi_info_df = pd.DataFrame(poi_info_data)
poi_info_df.to_csv(f'{datafold}/poi_info.csv', index=False)

print("成功创建 poi_info.csv 文件")

成功创建 poi_info.csv 文件


In [None]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic


# Dataset folder used to build the input and output CSV paths further down.
datafold = 'NYC'

def process_poi_data(file_path, max_length=50):
    """Build one time-ordered check-in sequence per user from a check-in CSV.

    Args:
        file_path: CSV file with at least the columns 'uid', 'pid',
            'category', 'region' and 'time' ('time' must be parseable by
            ``pd.to_datetime``).
        max_length: maximum number of check-ins kept per user. After sorting
            by time in ascending order, only the first ``max_length``
            (i.e. earliest) check-ins are retained.

    Returns:
        A DataFrame with one row per user and the columns 'uid', 'pid_list',
        'category_list', 'region_list' and 'time_list'. Each ``*_list`` cell is
        a Python list; times are formatted as ``'%Y-%m-%d %H:%M'`` strings.
    """
    checkins = pd.read_csv(file_path)

    # Parse timestamps so ordering is chronological and strftime is available.
    checkins['time'] = pd.to_datetime(checkins['time'])

    # Columns handed to the per-user aggregation. Selecting them explicitly
    # keeps the grouping column 'uid' out of the applied frame, which avoids
    # the pandas deprecation warning about apply operating on grouping columns.
    sequence_columns = ['pid', 'category', 'region', 'time']

    def _aggregate_user(group):
        # mergesort is stable: check-ins sharing a timestamp keep file order.
        ordered = group.sort_values('time', kind='mergesort').iloc[:max_length]
        return pd.Series({
            'pid_list': ordered['pid'].tolist(),
            'category_list': ordered['category'].tolist(),
            'region_list': ordered['region'].tolist(),
            'time_list': [t.strftime('%Y-%m-%d %H:%M') for t in ordered['time']],
        })

    df_grouped = (
        checkins.groupby('uid')[sequence_columns]
        .apply(_aggregate_user)
        .reset_index()
    )
    return df_grouped


def save_processed_data(df, output_file_path):
    """Write ``df`` to ``output_file_path`` as CSV and print a confirmation.

    The CSV is written with a header row and without the index column.
    """
    df.to_csv(path_or_buf=output_file_path, header=True, index=False)
    print(f"处理后的数据已保存到：{output_file_path}")

# Input and output file paths for this step.
input_file_path = f'{datafold}/my_data.csv'  # replace with your input file path
output_file_path = f'{datafold}/poi_checkin.csv'  # replace with your desired output file path
# Passed explicitly below, so it overrides process_poi_data's default of 50.
max_length = 100
# Build the per-user check-in sequences.
processed_df = process_poi_data(input_file_path, max_length)

# Write the per-user sequences to the output CSV.
save_processed_data(processed_df, output_file_path)

处理后的数据已保存到：NYC/poi_checkin.csv


  df_grouped = df.groupby('uid').apply(aggregate_and_calculate_distance).reset_index()


In [None]:
import pandas as pd
from collections import defaultdict


# Dataset folder used to build the input and output CSV paths further down.
datafold = 'NYC' # NYC, TKY, CA

def build_poi_transition_graph(file_path):
    """Build a POI -> successor-POIs graph from per-user check-in sequences.

    Args:
        file_path: CSV file with a 'pid_list' column whose cells are Python
            list literals (one ordered POI sequence per user).

    Returns:
        A ``defaultdict(list)`` mapping each POI to the distinct POIs that
        directly follow it in some user's sequence, in first-seen order.

    Notes:
        To avoid leaking the prediction target, the final check-in of *every*
        user is held out before transitions are collected. The earlier code
        instead discarded the entire sequence of the last user in the file,
        which left each other user's last transition in the graph.
    """
    import ast  # local import: the surrounding cell only imports pandas/defaultdict

    df = pd.read_csv(file_path)

    # literal_eval only accepts Python literals, so arbitrary code embedded in
    # the CSV cannot be executed (unlike eval()).
    df['poi_list'] = df['pid_list'].apply(ast.literal_eval)

    transition_graph = defaultdict(list)
    for poi_list in df['poi_list']:
        # Hold out this user's last check-in (the prediction target).
        history = poi_list[:-1]
        # Walk consecutive (current, next) pairs of the remaining history.
        for current_poi, next_poi in zip(history, history[1:]):
            successors = transition_graph[current_poi]
            if next_poi not in successors:
                successors.append(next_poi)

    return transition_graph

def save_transition_graph(graph, output_file_path):
    """Write a POI transition mapping to CSV and print a confirmation.

    Each key of ``graph`` becomes one row with the key in column 'pid' and its
    value in column 'potential_poi'; the index is not written.
    """
    # One (pid, successors) row per graph entry.
    rows = [(pid, successors) for pid, successors in graph.items()]
    table = pd.DataFrame(rows, columns=['pid', 'potential_poi'])
    table.to_csv(output_file_path, index=False)
    print(f"转移图已保存到：{output_file_path}")

if __name__ == "__main__":
    # Input and output file paths for this step.
    input_file_path = f'{datafold}/poi_checkin.csv'  # replace with your input file path
    output_file_path = f'{datafold}/poi_transition_graph.csv'  # replace with your desired output file path

    # Build the POI transition graph from the per-user sequences.
    transition_graph = build_poi_transition_graph(input_file_path)

    # Write the transition graph to the output CSV.
    save_transition_graph(transition_graph, output_file_path)

转移图已保存到：CA/poi_transition_graph.csv


In [None]:
import csv
import json
import ast


# Dataset folder used to build the input and output paths further down.
datafold = 'NYC'
# Read as a module-level global by convert_csv_to_json (caps the history
# length) and embedded in the output file name.
max_length = 100

def convert_csv_to_json(input_csv_path, output_file_path):
    """Convert per-user check-in sequences (CSV) into prompt/target JSON records.

    Every CSV row must provide 'uid', 'pid_list', 'category_list',
    'region_list' and 'time_list'; the list columns hold Python list literals.
    For each row the last POI becomes ``target`` and the POIs before it -- at
    most the module-level ``max_length`` most recent ones -- are embedded in
    the ``input`` sentence. The records are written to ``output_file_path`` as
    an indented JSON array and a confirmation is printed.
    """
    list_columns = ('pid_list', 'category_list', 'region_list', 'time_list')

    def history_before_target(sequence):
        # Everything except the final element, capped to the `max_length`
        # entries that directly precede it.
        if len(sequence) > max_length:
            return sequence[-max_length - 1:-1]
        return sequence[:-1]

    # First pass: parse and window every row. Categories, regions and times
    # are parsed as well, although this prompt variant only uses the POI ids.
    parsed_rows = []
    with open(input_csv_path, mode='r', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            uid = int(row['uid'])
            pids, categories, regions, times = (
                ast.literal_eval(row[column]) for column in list_columns
            )
            parsed_rows.append((
                uid,
                history_before_target(pids),
                history_before_target(categories),
                history_before_target(regions),
                history_before_target(times),
                times[-1],
                pids[-1],
            ))

    # Second pass: one prompt/target record per user.
    json_records = [
        {
            "input": f"The historical POI check-in records: {history}.",
            "target": target,
        }
        for _uid, history, _categories, _regions, _times, _next_time, target in parsed_rows
    ]

    with open(output_file_path, mode='w', encoding='utf-8') as file:
        json.dump(json_records, file, ensure_ascii=False, indent=2)

    print(f"转换完成，结果保存至: {output_file_path}")
# Example usage
input_file_path = f'{datafold}/poi_checkin.csv'  # replace with your input file path
output_file_path = f'{datafold}/history{max_length}.json'  # replace with your desired output file path
convert_csv_to_json(input_file_path, output_file_path)

转换完成，结果保存至: CA/history100.json


In [None]:
import csv
import json
import ast


# Dataset folder used to build the input and output paths further down.
datafold = 'NYC'
# Read as a module-level global by convert_csv_to_json (caps the history
# length) and embedded in the output file name.
max_length = 50

def convert_csv_to_json(input_csv_path, output_file_path):
    """Convert per-user check-in sequences (CSV) into prompt/target JSON records.

    Every CSV row must provide 'uid', 'pid_list', 'category_list',
    'region_list' and 'time_list'; the list columns hold Python list literals.
    For each row the last POI becomes ``target`` and its time ``next_time``;
    the POIs and times before it -- at most the module-level ``max_length``
    most recent ones -- are embedded in the ``input`` sentence. The records
    are written to ``output_file_path`` as an indented JSON array and a
    confirmation is printed.
    """
    list_columns = ('pid_list', 'category_list', 'region_list', 'time_list')

    def history_before_target(sequence):
        # Everything except the final element, capped to the `max_length`
        # entries that directly precede it.
        if len(sequence) > max_length:
            return sequence[-max_length - 1:-1]
        return sequence[:-1]

    # First pass: parse and window every row. Categories and regions are
    # parsed too, although the prompt below only uses POI ids and times.
    parsed_rows = []
    with open(input_csv_path, mode='r', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            uid = int(row['uid'])
            pids, categories, regions, times = (
                ast.literal_eval(row[column]) for column in list_columns
            )
            parsed_rows.append((
                uid,
                history_before_target(pids),
                history_before_target(categories),
                history_before_target(regions),
                history_before_target(times),
                times[-1],
                pids[-1],
            ))

    # Second pass: one prompt/target record per user.
    json_records = [
        {
            "input": (
                f"The user{uid} has recently POI check-in records: {recent_pids}, with corresponding check-in times: {recent_times}."
            ),
            "next_time": next_time,
            "target": target,
        }
        for uid, recent_pids, _categories, _regions, recent_times, next_time, target in parsed_rows
    ]

    with open(output_file_path, mode='w', encoding='utf-8') as file:
        json.dump(json_records, file, ensure_ascii=False, indent=2)

    print(f"转换完成，结果保存至: {output_file_path}")
# Example usage
input_file_path = f'{datafold}/poi_checkin.csv'  # replace with your input file path
output_file_path = f'{datafold}/recent{max_length}.json'  # replace with your desired output file path
convert_csv_to_json(input_file_path, output_file_path)

转换完成，结果保存至: CA/recent50.json


In [None]:
import csv
import json
import ast


# Dataset folder used to build the input and output paths further down.
datafold = 'NYC'
# Read as a module-level global by convert_csv_to_json (caps the history
# length) and embedded in the output file name.
max_length = 20

def convert_csv_to_json(input_csv_path, output_file_path):
    """Convert per-user check-in sequences (CSV) into prompt/target JSON records.

    Every CSV row must provide 'uid', 'pid_list', 'category_list',
    'region_list' and 'time_list'; the list columns hold Python list literals.
    For each row the last POI becomes ``target`` and its time ``next_time``;
    the POIs and times before it -- at most the module-level ``max_length``
    most recent ones -- are embedded in the ``input`` sentence. The records
    are written to ``output_file_path`` as an indented JSON array and a
    confirmation is printed.
    """
    list_columns = ('pid_list', 'category_list', 'region_list', 'time_list')

    def clip(sequence):
        # Drop the final element and keep at most the `max_length` entries
        # that directly precede it.
        return sequence[-max_length - 1:-1] if len(sequence) > max_length else sequence[:-1]

    def read_samples():
        # Yield (uid, recent POIs, recent times, next time, target POI) per row.
        with open(input_csv_path, mode='r', encoding='utf-8') as csvfile:
            for row in csv.DictReader(csvfile):
                uid = int(row['uid'])
                pids, categories, regions, times = [
                    ast.literal_eval(row[name]) for name in list_columns
                ]
                # Categories and regions are windowed too, although the prompt
                # below only uses POI ids and times.
                recent_pids, _recent_categories, _recent_regions, recent_times = (
                    clip(pids), clip(categories), clip(regions), clip(times)
                )
                yield uid, recent_pids, recent_times, times[-1], pids[-1]

    # Materialise every record before the output file is opened.
    json_records = []
    for uid, recent_pids, recent_times, next_time, target_pid in read_samples():
        prompt = (
            f"The user{uid} has recently POI check-in records: {recent_pids}, with corresponding check-in times: {recent_times}."
        )
        json_records.append({
            "input": prompt,
            "next_time": next_time,
            "target": target_pid,
        })

    with open(output_file_path, mode='w', encoding='utf-8') as file:
        json.dump(json_records, file, ensure_ascii=False, indent=2)

    print(f"转换完成，结果保存至: {output_file_path}")
# Example usage
input_file_path = f'{datafold}/poi_checkin.csv'  # replace with your input file path
output_file_path = f'{datafold}/recent{max_length}.json'  # replace with your desired output file path
convert_csv_to_json(input_file_path, output_file_path)

转换完成，结果保存至: CA/recent20.json
