In [10]:
import pandas as pd
from datetime import datetime, timedelta
import os

In [11]:
# 划分数据集
def process_checkins(file_path):
    # 1. 读取数据并预处理
    df = pd.read_csv(file_path, sep='\t', encoding='iso-8859-1', header=None, names=[
        'userId', 'venueId', 'category_id', 'category_name',
        'latitude', 'longitude', 'timezone_offset', 'utcTimestamp'
    ])

    def convert_time(row):
        try:
            # 修改时间解析格式
            utc_time = datetime.strptime(row['utcTimestamp'], '%a %b %d %H:%M:%S %z %Y')
            # 处理时区偏移
            utc_time = utc_time - timedelta(minutes=row['timezone_offset'])
            # 新增：计算时间标准化值（0-1）
            total_seconds = utc_time.hour * 3600 + utc_time.minute * 60 + utc_time.second
            norm_in_day = total_seconds / (24 * 3600)
            return pd.Series([utc_time, norm_in_day])
        except ValueError:
            return pd.Series([None, None])

    df[['UTC_time', 'norm_in_day_time']] = df.apply(convert_time, axis=1)

    # 2. 过滤有效用户和POI
    user_checkins = df.groupby('userId').size()
    valid_users = user_checkins[user_checkins >= 10].index
    poi_checkins = df.groupby('venueId').size()
    valid_pois = poi_checkins[poi_checkins >= 10].index
    df_filtered = df[df['userId'].isin(valid_users) & df['venueId'].isin(valid_pois)]

    # 3. 生成轨迹ID（保持24小时间隔）
    df_filtered = df_filtered.sort_values(by=['userId', 'UTC_time'])
    trajectory_counter = 1

    def assign_trajectory_id(group):
        nonlocal trajectory_counter
        start_time = group.iloc[0]['UTC_time']
        traj_id_list = []
        for _, row in group.iterrows():
            if row['UTC_time'] - start_time > timedelta(hours=24):
                trajectory_counter += 1
                start_time = row['UTC_time']
            traj_id_list.append(f"{row['userId']}_{trajectory_counter}")
        return pd.Series(traj_id_list, index=group.index)

    df_filtered['trajectory_id'] = df_filtered.groupby('userId').apply(assign_trajectory_id).reset_index(level=0, drop=True)

    # 4. 调整列名和顺序（保持与原输出一致）
    df_filtered.rename(columns={
        'userId': 'user_id',
        'venueId': 'POI_id',
    }, inplace=True)

    output_columns = [
        'user_id', 'POI_id', 'latitude', 'longitude',
        'UTC_time', 'norm_in_day_time', 'trajectory_id'
    ]
    df_filtered = df_filtered[output_columns]

    # 5. 过滤轨迹长度小于2的记录
    trajectory_counts = df_filtered.groupby('trajectory_id').size()
    valid_trajectories = trajectory_counts[trajectory_counts >= 2].index
    df_filtered = df_filtered[df_filtered['trajectory_id'].isin(valid_trajectories)]

    # 对 POI_id 重新编号
    df_filtered['POI_id'] = pd.factorize(df_filtered['POI_id'])[0]

    # 6.统计指标
    num_users = df_filtered['user_id'].nunique()
    num_pois = df_filtered['POI_id'].nunique()
    num_check_ins = len(df_filtered)
    num_trajectories = df_filtered['trajectory_id'].nunique()

    print(f"#Users: {num_users}")
    print(f"#POIs: {num_pois}")
    print(f"#Check-Ins: {num_check_ins}")
    print(f"#Trajectories: {num_trajectories}")

    # 7. 数据划分8:1:1
    df_filtered = df_filtered.sort_values(by='UTC_time')
    total_rows = len(df_filtered)
    train_end = int(total_rows * 0.8)
    validate_end = int(total_rows * 0.9)
    train = df_filtered.iloc[:train_end]
    validate = df_filtered.iloc[train_end:validate_end]
    test = df_filtered.iloc[validate_end:]

    return df_filtered, train, validate, test

In [12]:
# 预处理数据集 foursquare_nyc
nyc_df_filtered, nyc_train, nyc_validate, nyc_test = process_checkins('./foursquare/nyc/dataset_TSMC2014_NYC.txt')
nyc_df_filtered.to_csv('./foursquare/nyc/nyc_checkins.csv', index=False)
nyc_train.to_csv('./foursquare/nyc/nyc_train.csv', index=False)
nyc_validate.to_csv('./foursquare/nyc/nyc_val.csv', index=False)
nyc_test.to_csv('./foursquare/nyc/nyc_test.csv', index=False)

  df_filtered['trajectory_id'] = df_filtered.groupby('userId').apply(assign_trajectory_id).reset_index(level=0, drop=True)


#Users: 1082
#POIs: 5135
#Check-Ins: 121377
#Trajectories: 34132


In [13]:
# 预处理数据集 foursquare_tky
tky_df_filtered, tky_train, tky_validate, tky_test = process_checkins('./foursquare/tky/dataset_TSMC2014_TKY.txt')
tky_df_filtered.to_csv('./foursquare/tky/tky_checkins.csv', index=False)
tky_train.to_csv('./foursquare/tky/tky_train.csv', index=False)
tky_validate.to_csv('./foursquare/tky/tky_val.csv', index=False)
tky_test.to_csv('./foursquare/tky/tky_test.csv', index=False)

  df_filtered['trajectory_id'] = df_filtered.groupby('userId').apply(assign_trajectory_id).reset_index(level=0, drop=True)


#Users: 2293
#POIs: 7869
#Check-Ins: 398308
#Trajectories: 92589


In [14]:
res_nyc = pd.read_csv('./foursquare/nyc/nyc_train.csv')
res_tky = pd.read_csv('./foursquare/tky/tky_train.csv')

In [15]:
res_nyc.head(5)

Unnamed: 0,user_id,POI_id,latitude,longitude,UTC_time,norm_in_day_time,trajectory_id
0,990,2455,40.758505,-73.989143,2012-04-03 14:56:03+00:00,0.622257,990_54789
1,990,1197,40.75909,-73.98468,2012-04-03 16:07:35+00:00,0.671933,990_54789
2,990,2455,40.758505,-73.989143,2012-04-03 17:05:41+00:00,0.71228,990_54789
3,990,2455,40.758505,-73.989143,2012-04-03 17:19:39+00:00,0.721979,990_54789
4,990,5019,40.749541,-73.977733,2012-04-03 18:17:52+00:00,0.762407,990_54789


In [17]:
res_tky.head(5)

Unnamed: 0,user_id,POI_id,latitude,longitude,UTC_time,norm_in_day_time,trajectory_id
0,114,2765,35.714542,139.480065,2012-04-03 10:12:07+00:00,0.425081,114_7367
1,114,402,35.700253,139.480255,2012-04-03 10:35:36+00:00,0.441389,114_7367
2,1635,7380,35.755759,139.733573,2012-04-03 10:51:50+00:00,0.452662,1635_99122
3,2033,433,35.693121,139.699447,2012-04-03 10:51:59+00:00,0.452766,2033_124507
4,589,159,35.548963,139.784611,2012-04-03 10:59:06+00:00,0.457708,589_36833
