<h1>数据提取</h1>

In [None]:
import os
# Collect every user folder that ships a labels.txt (i.e. has transportation-mode labels).
# The resulting list, data_lable_dir, is consumed by the extraction cell further down.
data_lable_dir = []

# Path of the data directory; replace the **your_dir** placeholder with your own location.
data_dir = r"**your_dir**\Geolife Trajectories 1.3\Data"

# os.listdir returns entries in arbitrary order, so sort them to make the
# processing order (and therefore every downstream output) reproducible.
for folder in sorted(os.listdir(data_dir)):
    folder_path = os.path.join(data_dir, folder)

    # Only sub-folders whose name consists of exactly three digits are considered.
    is_user_folder = os.path.isdir(folder_path) and folder.isdigit() and len(folder) == 3
    if not is_user_folder:
        continue

    # Keep the folder only when it contains a labels.txt file.
    if os.path.exists(os.path.join(folder_path, "labels.txt")):
        print(f"文件夹 {folder} 包含 labels.txt")
        data_lable_dir.append(folder_path)

# Number of labelled user folders found.
count = len(data_lable_dir)

In [None]:
import pandas as pd
import os
from tqdm import tqdm

# Merge helper for consecutive label rows
def merge_trips(group):
    """Collapse a group of label rows into a single combined row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with at least the columns 'Start Time', 'End Time',
        'Transportation Mode' and 'time_diff'.

    Returns
    -------
    pandas.DataFrame
        ``group`` itself, untouched, when it holds at most one row. Otherwise
        a new one-row frame with the earliest start, the latest end, the modes
        joined with '-' in row order, and the last row's 'time_diff'.
    """
    # Nothing to merge: hand the very same object back.
    if len(group) <= 1:
        return group

    # A 'Date' value (group['Date'].iloc[0]) is deliberately not carried over.
    merged_values = {
        'Start Time': group['Start Time'].min(),
        'End Time': group['End Time'].max(),
        'Transportation Mode': '-'.join(group['Transportation Mode']),
        'time_diff': group['time_diff'].iloc[-1],
    }
    # Wrap every scalar in a one-element list so the frame has exactly one row.
    return pd.DataFrame({column: [value] for column, value in merged_values.items()})

import ast  # used by parse_traj (defined after this loop); imported once, not on every iteration

# A gap of at least this many seconds between consecutive GPS fixes starts a new segment.
SEGMENT_GAP_SECONDS = 60
# Slack allowed on each end when comparing a segment's time span with a label's time span.
MATCH_TOLERANCE = pd.Timedelta(minutes=5)
# Sub-folder holding the .plt files. A single spelling is used both for reading the
# .plt files and for writing session_traj.csv so the code also works on
# case-sensitive filesystems.
# NOTE(review): 'Trajectory' is presumably the spelling on disk - confirm against your copy.
TRAJ_SUBDIR = 'Trajectory'


def _extract_segments(plt_path, user_id, gap_seconds=SEGMENT_GAP_SECONDS):
    """Split one .plt file into time-contiguous segments.

    Parameters
    ----------
    plt_path : str
        Path of the .plt file; its first six lines are skipped as header.
    user_id : str
        Prefix used to build each segment's 'user' value.
    gap_seconds : float
        A new segment starts whenever two consecutive points are at least
        this many seconds apart.

    Returns
    -------
    list[dict]
        One dict per segment with the keys 'user' ("<user_id>_<segment no.>"),
        'start_time', 'end_time' and 'traj' (list of [lon, lat, time string]).

    NOTE(review): segment numbers restart at 0 for every file, so 'user'
    values can repeat across files of the same user.
    """
    traj = pd.read_csv(
        plt_path,
        skiprows=6,
        header=None,
        names=['latitude', 'longitude', 'zero', 'altitude', 'date_days', 'date', 'time'],
    )

    # Build a timestamp per point and cut wherever the gap reaches the threshold.
    traj['datetime'] = pd.to_datetime(traj['date'].astype(str) + ' ' + traj['time'].astype(str))
    traj['time_diff'] = traj['datetime'].diff().dt.total_seconds()
    # The first diff is NaN, which compares False, so numbering starts at 0.
    traj['segment'] = (traj['time_diff'] >= gap_seconds).cumsum()

    # Plain iteration over the groups: no dependence on groupby.apply's handling
    # of dict results or on its `include_groups` keyword.
    segments = []
    for segment_id, points in traj.groupby('segment'):
        segments.append({
            'user': f"{user_id}_{segment_id}",
            'start_time': points['datetime'].min(),
            'end_time': points['datetime'].max(),
            'traj': [
                [lon, lat, str(time)]
                for lon, lat, time in zip(points['longitude'], points['latitude'], points['datetime'])
            ],
        })
    return segments


def _match_mode(start_time, end_time, labels, tolerance=MATCH_TOLERANCE):
    """Return the transportation mode of the first label matching a segment.

    A label matches when either
      * its start AND its end both lie within ``tolerance`` of the segment's
        start and end respectively, or
      * it fully covers the segment (label start <= segment start and
        label end >= segment end).

    For trip-chain studies only the first condition should be used. That
    variant matched roughly 2k trajectories with a 2-minute threshold (5 minutes
    is worth trying); the segment-splitting time threshold should then be
    enlarged a little as well.

    Returns None when no label matches.
    """
    label_starts = labels['Start Time']
    label_ends = labels['End Time']

    # Series.between is inclusive on both ends, like the >= / <= pairs it replaces.
    near_both_ends = (
        label_starts.between(start_time - tolerance, start_time + tolerance)
        & label_ends.between(end_time - tolerance, end_time + tolerance)
    )
    covers_segment = (label_starts <= start_time) & (label_ends >= end_time)

    matched = labels[near_both_ends | covers_segment]
    if matched.empty:
        return None
    # Several labels may match; the first one in file order wins.
    return matched.iloc[0]['Transportation Mode']


# All user directories (those that contain a labels.txt)
user_dirs = data_lable_dir

for user_dir in tqdm(user_dirs, total=len(user_dirs)):
    # The directory name is the user id; basename works with either path separator.
    user_id = os.path.basename(os.path.normpath(user_dir))
    traj_dir = os.path.join(user_dir, TRAJ_SUBDIR)

    # ---- trajectory data ----
    # os.listdir order is arbitrary; sort so the files really are processed in order.
    plt_files = sorted(f for f in os.listdir(traj_dir) if f.endswith('.plt'))
    all_results = []
    for plt_file in plt_files:
        all_results.extend(_extract_segments(os.path.join(traj_dir, plt_file), user_id))

    # One row per segment for this user; saved before any mode is attached.
    result_df = pd.DataFrame(all_results)
    result_df.to_csv(os.path.join(traj_dir, 'session_traj.csv'), index=False)

    # ---- label data ----
    label_file = os.path.join(user_dir, 'labels.txt')
    label_data = pd.read_csv(label_file, sep='\t')

    # Variant for the transportation_mode release published separately by Microsoft,
    # in which date and time are stored in separate columns:
    # label_data['Start Time'] = label_data['Date']+' '+label_data['Start Time']
    # label_data['End Time'] = label_data['Date']+' '+label_data['End Time']

    label_data['Start Time'] = pd.to_datetime(label_data['Start Time'], format='mixed', dayfirst=False)
    label_data['End Time'] = pd.to_datetime(label_data['End Time'], format='mixed', dayfirst=False)

    # Optional label merging for studies that need trip chains: a continuous trip whose
    # legs carry separate labels (e.g. walk, bike, car) gets one combined label
    # (walk-bike-car). Uncomment the block below to enable it; it relies on merge_trips.
    ################################################################################################
    # # time gap between the end of one label and the start of the next
    # label_data['time_diff'] = label_data['Start Time'].shift(-1) - label_data['End Time']
    #
    # # flag the rows that have to be merged
    # label_data['to_merge'] = ((label_data['time_diff'].shift(1) > pd.Timedelta(-1)) &
    #                         (label_data['time_diff'].shift(1) < pd.Timedelta('60s'))) | \
    #                         ((label_data['time_diff'] > pd.Timedelta(-1)) &
    #                         (label_data['time_diff'] < pd.Timedelta('60s')))
    #
    # # build the groups
    # label_data['group'] = (~label_data['to_merge'] |
    #                     ((label_data['to_merge'].shift(1)==False) &
    #                     (label_data['to_merge']))).cumsum()
    #
    # # apply the merge
    # merged_labels = label_data.groupby('group', group_keys=False).apply(merge_trips).reset_index(drop=True)
    # if 'to_merge' in merged_labels.columns:
    #     merged_labels = merged_labels.drop(columns=['to_merge', 'group'])
    ################################################################################################
    merged_labels = label_data

    # ---- fuzzy matching of every segment against the labels ----
    # itertuples yields nothing for an empty frame, so a user without segments is safe here.
    modes = [
        _match_mode(segment.start_time, segment.end_time, merged_labels)
        for segment in result_df.itertuples(index=False)
    ]
    # dtype=object keeps unmatched segments as None / missing.
    result_df['mode'] = pd.Series(modes, index=result_df.index, dtype=object)

    data_length = len(result_df)
    corrected_match_length = int(result_df['mode'].notna().sum())
    # Guard against users without any segment (would otherwise divide by zero).
    match_rate = corrected_match_length / data_length * 100 if data_length else 0.0
    print(f'{user_id}成功匹配率: {match_rate:.2f}%')

    # Only segments that received a mode are written out.
    out_df = result_df[result_df['mode'].notna()].copy()
    out_df.to_csv(os.path.join(user_dir, 'match_mode.csv'), index=False)
    print(f'{user_id}匹配保存成功')
def parse_traj(traj_str):
    """Rebuild the Python object stored as text in a ``traj`` CSV cell.

    The text is evaluated with ``ast.literal_eval``, which accepts Python
    literals only and raises (ValueError / SyntaxError) on anything else.
    """
    parsed = ast.literal_eval(traj_str)
    return parsed
# Path of the per-user match_mode.csv inside every user directory.
user_match_modes = [os.path.join(user_dir, 'match_mode.csv') for user_dir in user_dirs]

# Load every file, turning the text of the 'traj' column back into Python
# objects via parse_traj, then stack all frames into one.
per_user_frames = []
for csv_path in user_match_modes:
    per_user_frames.append(pd.read_csv(csv_path, converters={'traj': parse_traj}))
combine_match_modes = pd.concat(per_user_frames)

traj_match_length = len(combine_match_modes)
print(f'成功匹配方式轨迹数据量：{traj_match_length}')

<h1>数据保存</h1>

In [None]:
def _first_lon_lat(points):
    """Return the first two fields of the first point of a trajectory."""
    return points[0][:2]


def _starts_in_beijing(start_point):
    """True when start_point[0] is within [115.5, 117.5] and start_point[1] within [39.4, 41.6]."""
    first, second = start_point[0], start_point[1]
    return (115.5 <= first <= 117.5) and (39.4 <= second <= 41.6)


# Attach each trajectory's starting point, renumber the rows and save everything.
combine_match_modes['start_point'] = combine_match_modes['traj'].apply(_first_lon_lat)
combine_match_modes = combine_match_modes.reset_index(drop=True)
combine_match_modes.to_csv('match_mode_combine.csv', index=False)

# Keep only the trajectories that start inside the Beijing bounding box.
inbeijing = combine_match_modes['start_point'].apply(_starts_in_beijing)
combine_match_modes_bj = combine_match_modes[inbeijing].reset_index(drop=True)
combine_match_modes_bj.to_csv('match_mode_combine_bj.csv', index=False)

print('北京轨迹筛选完成')
print(f'北京轨迹数量：{len(combine_match_modes_bj)}')