# 为单个用户生成上下文的测试函数

In [None]:
import pandas as pd
from io import StringIO
import random
from datetime import timedelta
import numpy as np

# --- Constants ---
# Time-gap thresholds used to pick how coarsely a gap is described
# (month / week / day granularity).
TIME_LIMITS = {
    'month': timedelta(days=30),
    'week': timedelta(days=7),
    'day': timedelta(days=1)
}
N_TOP_FREQUENT = 3 # number of most frequent grids treated as periodic stays and excluded


# # --- Sample input data (kept for reference; shows the expected CSV schema) ---
# data = """,userID,stime,etime,lon,lat,duration,grid
# 0,173,2007-11-30 14:36:34,2007-11-30 18:18:47,116.3300116,39.975275,13333.0,14994
# 47,173,2007-11-30 18:40:25,2007-12-01 10:04:59,116.3132266,39.9703666,55474.0,14616
# 206,173,2007-12-01 10:23:33,2007-12-01 11:02:19,116.410585,39.96704,2326.0,16127
# 213,173,2007-12-01 11:02:19,2007-12-02 13:36:08,116.41666,39.8555383,95629.0,16304
# 617,173,2007-12-02 14:18:55,2007-12-02 18:39:35,116.313185,39.9660799,15640.0,14615
# 650,173,2007-12-02 18:51:50,2007-12-02 19:46:12,116.3198316,39.9465316,3262.0,14802
# 684,173,2007-12-02 20:00:44,2007-12-04 16:43:35,116.31226,39.9692849,160971.0,14616
# 811,173,2007-12-15 14:09:32,2007-12-16 20:01:27,116.3131133,39.9684866,107515.0,14616
# 859,173,2007-12-16 20:01:27,2007-12-16 21:49:33,116.3117599,39.97306,6486.0,14616
# """

# # 1. Load and preprocess the sample data
# df = pd.read_csv(StringIO(data), index_col=0)

# Load the stay records of one user; the first CSV column becomes the index.
userid = '002'
df = pd.read_csv('./Data/Output/Stays/{}.csv'.format(userid), index_col=0)

# Parse start/end times so that they can be subtracted from each other later.
df['stime'] = pd.to_datetime(df['stime'])
df['etime'] = pd.to_datetime(df['etime'])
df['context'] = None # initialise the context column

# 2. Identify periodic stays and build aperiodic_stay_list
# NOTE(review): stay_list is not referenced again in the visible code.
stay_list = list(df.index)

# The N most frequent grids are treated as periodic locations.
top_n_grids = df['grid'].value_counts().nlargest(N_TOP_FREQUENT).index.tolist()

# Row labels of all stays outside the periodic grids; these rows may serve as
# prediction targets (generate_context_stay).
aperiodic_stay_list = df[~df['grid'].isin(top_n_grids)].index.tolist()

# All row labels, collected only for the diagnostic print below.
# NOTE(review): the last stay is not filtered out at this point.
all_indices = df.index.tolist()
print(f"原始数据行号 (Index): {all_indices}")
print(f"非周期目的地行号 (Aperiodic): {aperiodic_stay_list}")
print(f"Top {N_TOP_FREQUENT} 周期 Grid: {top_n_grids}\n")

# --- Helper: fuzzy time phrasing ---
def get_fuzzy_time_expression(time_delta, target_time):
    """Describe how far away ``target_time`` is with a coarse English phrase.

    The granularity is chosen from ``time_delta`` using the thresholds in the
    module-level ``TIME_LIMITS``: months, then weeks, then days, and finally
    the part of the day for gaps of one day or less.

    Args:
        time_delta: Gap between the reference moment and ``target_time``
            (``datetime.timedelta`` or ``pandas.Timedelta``).
        target_time: The moment being described (``datetime.datetime`` or
            ``pandas.Timestamp``).

    Returns:
        str: A phrase such as ``"in about 2 months"``,
        ``"in about 1 week, specifically next Monday"``,
        ``"in about 3 days, on 2007-12-10"`` or
        ``"tomorrow, in the morning"``.
    """
    weekday_names = (
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday",
    )

    # Month granularity.
    if time_delta > TIME_LIMITS['month']:
        months = round(time_delta.days / 30)
        return f"in about {months} months" if months > 1 else "in the next month"

    # Week granularity.
    if time_delta > TIME_LIMITS['week']:
        weeks = round(time_delta.days / 7)
        weekday = weekday_names[target_time.weekday()]
        if weeks <= 1:
            return f"in about 1 week, specifically next {weekday}"
        # "next <weekday>" would contradict a gap of several weeks, so only
        # the weekday itself is mentioned here.
        return f"in about {weeks} weeks, on a {weekday}"

    # Day granularity.
    if time_delta > TIME_LIMITS['day']:
        days = time_delta.days
        unit = "day" if days == 1 else "days"
        return f"in about {days} {unit}, on {target_time.date().strftime('%Y-%m-%d')}"

    # Part-of-day granularity (gap of one day or less).
    hour = target_time.hour
    if 5 <= hour < 12:
        period = "in the morning"
    elif 12 <= hour < 18:
        period = "in the afternoon"
    elif 18 <= hour < 22:
        period = "in the evening"
    else:
        period = "late at night"

    # A gap below one day can still cross midnight; recover the reference
    # moment so the phrase does not claim "today" for a next-day arrival.
    reference_time = target_time - time_delta
    if target_time.date() > reference_time.date():
        day_label = "tomorrow"
    else:
        day_label = "later today"
    return f"{day_label}, {period}"

# --- Main loop (steps 3 - 12) ---
# 3. Visit the aperiodic stays (the future targets) in random order.
random.shuffle(aperiodic_stay_list)

context_generated_count = 0
# Every row except the last one may act as a starting stay.
current_stay_candidates = df.index[:-1].tolist()

while aperiodic_stay_list and current_stay_candidates:
    # 6. Draw the starting stay (current_stay) from the remaining candidates.
    #    Targets come from aperiodic_stay_list; starts come from all
    #    non-final rows.
    current_stay_idx = random.choice(current_stay_candidates)

    # 5. A target must lie after the start and still be an unused aperiodic stay.
    reachable_targets = [idx for idx in aperiodic_stay_list if idx > current_stay_idx]
    if len(reachable_targets) == 0:
        # No usable target for this start: retire it so the loop terminates.
        current_stay_candidates.remove(current_stay_idx)
        continue

    generate_context_stay_idx = random.choice(reachable_targets)

    # Fetch both stay records.
    current_stay = df.loc[current_stay_idx]
    generate_context_stay = df.loc[generate_context_stay_idx]

    start_grid = current_stay['grid']
    end_grid = generate_context_stay['grid']
    user_id = current_stay['userID']

    # Gap between leaving the current stay and arriving at the target stay.
    target_stime = generate_context_stay['stime']
    time_delta = target_stime - current_stay['etime']

    # 7-10. Share of cases that get the exact timestamp; it grows as the gap
    #       shrinks (month -> week -> day -> within a day).
    if time_delta > TIME_LIMITS['month']:
        exact_share = 0.1
    elif time_delta > TIME_LIMITS['week']:
        exact_share = 0.3
    elif time_delta > TIME_LIMITS['day']:
        exact_share = 0.5
    else:
        exact_share = 0.7

    if random.random() >= exact_share:
        # Fuzzy phrasing at the granularity matching the gap.
        time_expression = get_fuzzy_time_expression(time_delta, target_stime)
    else:
        # Exact timestamp.
        time_expression = target_stime.strftime('%Y-%m-%d %H:%M:%S')

    # Build the context sentence.
    context_text = (
        f"User {user_id} will move from grid {start_grid} to grid {end_grid}, "
        f"arriving around {time_expression}."
    )

    # 7.3 / 8.3 / 9.3 / 10.3: store the sentence on the starting stay.
    df.loc[current_stay_idx, 'context'] = context_text
    context_generated_count += 1

    # 11. A target is predicted at most once.
    aperiodic_stay_list.remove(generate_context_stay_idx)

    # A start produces at most one context, otherwise its cell would be overwritten.
    current_stay_candidates.remove(current_stay_idx)

# 12. The loop ends once either the targets or the start candidates run out;
#     the while condition takes care of that.


# --- Show the result ---
print("\n--- 基于详细逻辑的上下文生成结果 ---")
print(f"总共生成了 {context_generated_count} 条上下文。")
print("原始数据 (仅显示关键列和生成的 context):")
summary_columns = ['userID', 'stime', 'etime', 'grid', 'context']
print(df[summary_columns])

df.to_csv('./Data/Output/Context/{}.csv'.format(userid))

## 将文件夹下的多个用户文件合并成为一个文件用于模型训练

In [None]:
import os

import pandas as pd

N_top_frequent = 3
data_dir = './Data/Output/Context/'

# Collects every processed per-user frame so they can be merged afterwards.
# NOTE(review): the visible code never concatenates or writes this list.
all_users_data = []

# NOTE(review): `generate_context_for_df` and `individual_output_dir` are not
# defined anywhere in the visible notebook. They must exist before this cell
# runs; otherwise the NameError is caught by the handler below and reported
# once per file.

# Walk over every file in the input folder.
for filename in os.listdir(data_dir):
    # (disabled) skip previously generated context files and the merged file
    # if filename.startswith(OUTPUT_PREFIX) or filename == os.path.basename(combined_output_filepath):
    #     continue

    if not filename.endswith(".csv"):
        continue

    filepath = os.path.join(data_dir, filename)

    print(f"\n--- 正在处理文件: {filename} ---")
    try:
        # 1. Read the data; the first CSV column is the index.
        df_raw = pd.read_csv(filepath, index_col=0)

        # At least two stays are required to form a (start, target) pair.
        if df_raw.empty or len(df_raw) < 2:
            print("Skipping: 数据为空或行数不足。")
            continue

        # 2. Generate the contexts for this user.
        df_processed = generate_context_for_df(df_raw, N_top_frequent)

        # Make sure a userID column exists.
        if 'userID' not in df_processed.columns:
            print("警告: 缺少 'userID' 列。尝试从文件名推断 UserID。")
            user_id = filename.split('_')[0]
            df_processed.insert(0, 'userID', user_id)

        # Output form 1: one file per user, same file name, in the
        # individual output folder.
        output_filepath_single = os.path.join(individual_output_dir, filename)
        df_processed.to_csv(output_filepath_single, index=True)
        print(f" 保存单个用户数据到: {output_filepath_single}")

        all_users_data.append(df_processed)

    except Exception as e:
        # Best effort: report the failing file and move on to the next one.
        print(f"处理文件 {filename} 时发生错误: {e}")


# 之前试验代码


## （流程有问题）生成的上下文时间是完全准确的

In [None]:
import pandas as pd
from io import StringIO
import numpy as np

# Sample input data (kept for reference; shows the expected CSV schema)
# data = """,userID,stime,etime,lon,lat,duration,grid
# 0,173,2007-11-30 14:36:34,2007-11-30 18:18:47,116.3300116,39.975275,13333.0,14994
# 47,173,2007-11-30 18:40:25,2007-12-01 10:04:59,116.3132266,39.9703666,55474.0,14616
# 206,173,2007-12-01 10:23:33,2007-12-01 11:02:19,116.410585,39.96704,2326.0,16127
# 213,173,2007-12-01 11:02:19,2007-12-02 13:36:08,116.41666,39.8555383,95629.0,16304
# 617,173,2007-12-02 14:18:55,2007-12-02 18:39:35,116.313185,39.9660799,15640.0,14615
# 650,173,2007-12-02 18:51:50,2007-12-02 19:46:12,116.3198316,39.9465316,3262.0,14802
# 684,173,2007-12-02 20:00:44,2007-12-04 16:43:35,116.31226,39.9692849,160971.0,14616
# 811,173,2007-12-15 14:09:32,2007-12-16 20:01:27,116.3131133,39.9684866,107515.0,14616
# 859,173,2007-12-16 20:01:27,2007-12-16 21:49:33,116.3117599,39.97306,6486.0,14616
# """

# Load the data
# df = pd.read_csv(StringIO(data), index_col=0)
userid = '002'
df = pd.read_csv('./Data/Output/Stays/{}.csv'.format(userid), index_col=0)

# 1. Convert the time columns to datetimes
df['stime'] = pd.to_datetime(df['stime'])
df['etime'] = pd.to_datetime(df['etime'])

# 2. Find the 3 most frequent grids
# With few samples several grids may tie in frequency; nlargest(3) still
# keeps only three of them.
top_3_grids = df['grid'].value_counts().nlargest(3).index.tolist()
print(f"出现频率最高的 3 个 Grid: {top_3_grids}\n")

# 3. Attach the next stay's start time and grid to each row
# shift(-1) pulls the following row's value up into the current row, so the
# last row gets NaT / NaN.
df['next_stime'] = df['stime'].shift(-1)
df['next_grid'] = df['grid'].shift(-1)

# 4. Initialise the context column
df['context'] = None
# 5. 基于条件生成 context
def generate_context(row):
    """Build the context sentence announcing the stay that follows ``row``.

    Args:
        row: One stay record providing ``userID``, ``next_stime`` and
            ``next_grid``.

    Returns:
        The sentence with the exact start time of the next stay, or ``None``
        when there is no next stay or its grid is one of ``top_3_grids``.
    """
    upcoming_start = row['next_stime']

    # The final stay has no successor, so nothing can be announced.
    if pd.isna(upcoming_start):
        return None

    # Trips to one of the periodic (top 3) grids get no context.
    grid_number = int(row['next_grid'])
    if grid_number in top_3_grids:
        return None

    user_id = row['userID']
    # The time slot is the exact start time of the next stay.
    time_str = upcoming_start.strftime('%Y-%m-%d %H:%M:%S')

    return f"User {user_id} will go to the area with grid number {grid_number} at {time_str}."

# Apply generate_context row by row to fill the context column
df['context'] = df.apply(generate_context, axis=1)

# 6. Drop the helper columns and show the result
df = df.drop(columns=['next_stime', 'next_grid'])
print("--- 最终结果 (新增 context 列) ---")
print(df[['userID', 'stime', 'grid', 'context']])

# Write the annotated stays of this user to the Context output folder.
df.to_csv('./Data/Output/Context/{}.csv'.format(userid))


In [None]:
# Show only the generated context column.
print(df[['context']])

## （流程有问题）生成的时间是模糊的

In [None]:
import pandas as pd
from io import StringIO
import random
from datetime import timedelta

# Sample input data: stay records of one user (first column is the row label)
data = """,userID,stime,etime,lon,lat,duration,grid
0,173,2007-11-30 14:36:34,2007-11-30 18:18:47,116.3300116,39.975275,13333.0,14994
47,173,2007-11-30 18:40:25,2007-12-01 10:04:59,116.3132266,39.9703666,55474.0,14616
206,173,2007-12-01 10:23:33,2007-12-01 11:02:19,116.410585,39.96704,2326.0,16127
213,173,2007-12-01 11:02:19,2007-12-02 13:36:08,116.41666,39.8555383,95629.0,16304
617,173,2007-12-02 14:18:55,2007-12-02 18:39:35,116.313185,39.9660799,15640.0,14615
650,173,2007-12-02 18:51:50,2007-12-02 19:46:12,116.3198316,39.9465316,3262.0,14802
684,173,2007-12-02 20:00:44,2007-12-04 16:43:35,116.31226,39.9692849,160971.0,14616
811,173,2007-12-15 14:09:32,2007-12-16 20:01:27,116.3131133,39.9684866,107515.0,14616
859,173,2007-12-16 20:01:27,2007-12-16 21:49:33,116.3117599,39.97306,6486.0,14616
"""

# 1. Load and preprocess the data
df = pd.read_csv(StringIO(data), index_col=0)


# (disabled) load a real user's stay file instead of the sample above
# userid = '002'
# df = pd.read_csv('./Data/Output/Stays/{}.csv'.format(userid), index_col=0)

# Parse the time columns so that time differences can be computed.
df['stime'] = pd.to_datetime(df['stime'])
df['etime'] = pd.to_datetime(df['etime'])

# The 3 most frequent grids are treated as periodic destinations.
top_3_grids = df['grid'].value_counts().nlargest(3).index.tolist()

# 2. Attach the next stay's start time and grid to each row
# shift(-1) pulls the following row's value up, so the last row gets NaT / NaN.
df['next_stime'] = df['stime'].shift(-1)
df['next_grid'] = df['grid'].shift(-1)
df['start_grid'] = df['grid'] # the current stay is the origin of the move

# 3. Randomised time phrasing
def get_random_time_expression(target_time, reference_time):
    """Render ``target_time`` as an exact timestamp or as a vague phrase.

    A single random draw selects the strategy, so the shares are exactly:

    * 30%: exact timestamp (``YYYY-MM-DD HH:MM:SS``);
    * 40%: vague date relative to ``reference_time``;
    * 30%: part of the day of ``target_time``.

    Args:
        target_time: Moment to describe (``datetime.datetime`` or
            ``pandas.Timestamp``).
        reference_time: Moment the phrase is relative to; same type as
            ``target_time``.

    Returns:
        str: The exact timestamp or a phrase such as ``"later today"``,
        ``"next Friday"``, ``"in about 2 weeks"`` or ``"in the evening"``.
    """
    # One draw for all three strategies. Drawing a second time for the
    # middle branch would skew the split to 30% / 49% / 21%.
    draw = random.random()

    # Strategy 1: exact time (30%).
    if draw < 0.3:
        return target_time.strftime('%Y-%m-%d %H:%M:%S')

    # Strategy 2: vague date based on the gap to the reference (40%).
    if draw < 0.7:
        time_delta = target_time - reference_time

        if time_delta.total_seconds() < 3600 * 24:  # within 24 hours
            # A gap under 24 hours can still cross midnight.
            if target_time.date() > reference_time.date():
                return "tomorrow"
            return "later today"

        if time_delta.days < 7:  # within a week
            weekday = [
                "Monday", "Tuesday", "Wednesday", "Thursday",
                "Friday", "Saturday", "Sunday",
            ][target_time.weekday()]
            # Gaps of more than two days are phrased "next", shorter ones "this".
            if time_delta.days > 2:
                return f"next {weekday}"
            return f"this {weekday}"

        if time_delta.days < 30:  # within a month
            weeks = round(time_delta.days / 7)
            unit = "week" if weeks == 1 else "weeks"
            return f"in about {weeks} {unit}"

        return "in the near future"  # further away

    # Strategy 3: part of the day of the target (30%).
    hour = target_time.hour
    if 5 <= hour < 12:
        return "in the morning"
    if 12 <= hour < 14:
        return "in the noon"
    if 14 <= hour < 18:
        return "in the afternoon"
    if 18 <= hour < 22:
        return "in the evening"
    return "late at night"


# 4. Build the context sentence
def generate_improved_context(row):
    """Describe the move from the current stay to the next one.

    Args:
        row: One stay record providing ``userID``, ``etime``, ``start_grid``,
            ``next_stime`` and ``next_grid``.

    Returns:
        A sentence naming origin grid, destination grid and a randomly
        phrased arrival time, or ``None`` when there is no next stay or the
        destination is one of ``top_3_grids``.
    """
    arrival = row['next_stime']

    # The final stay has no successor.
    if pd.isna(arrival):
        return None

    # Periodic destinations (top 3 grids) get no context.
    end_grid = int(row['next_grid'])
    if end_grid in top_3_grids:
        return None

    # Exact or fuzzy wording for the arrival, relative to leaving this stay.
    random_time_expression = get_random_time_expression(arrival, row['etime'])

    start_grid = int(row['start_grid'])
    user_id = row['userID']

    return (
        f"User {user_id} will move from grid {start_grid} to grid {end_grid}, "
        f"arriving around {random_time_expression}."
    )

# Apply generate_improved_context row by row, then print the full frame
# (helper columns still included).
df['improved_context'] = df.apply(generate_improved_context, axis=1)
print(df)

# Drop the helper columns before showing the result
df = df.drop(columns=['next_stime', 'next_grid', 'start_grid'])

print("--- 随机时间上下文生成结果 ---")
# For comparison, show only the stay's times and grid next to the new context column
print(df[['userID', 'stime', 'etime', 'grid', 'improved_context']])

# (disabled) save the result
# df.to_csv('./Data/Output/ImprovedContext/{}.csv'.format(userid))

In [None]:
# Bare expression: as the last line of a notebook cell it displays the column.
df[['improved_context']]