In [3]:
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict
import csv
from datetime import datetime, timezone, timedelta
from openlocationcode import openlocationcode as olc
import os
import json
import numpy as np

In [2]:
import pandas as pd
import ast

def keep_first_k_consecutive(seq, k=2):
    """Cap every run of consecutive equal values at ``k`` items.

    Each maximal run of adjacent equal elements (0 is treated like any other
    value) is truncated to its first ``k`` elements. The relative order of
    the surviving elements is unchanged.

    Args:
        seq: Iterable of values to filter. A falsy value (empty list,
            ``None``) yields an empty list.
        k: Maximum number of elements kept per run. ``k <= 0`` keeps nothing.

    Returns:
        A new list; ``seq`` itself is not modified.
    """
    # Guard k <= 0 explicitly: "at most k" items per run means nothing may
    # be kept, not even the first element of each run.
    if not seq or k <= 0:
        return []

    kept = []
    run_value = None
    run_length = 0
    for value in seq:
        if run_length and value == run_value:
            run_length += 1
        else:
            # A different value (or the very first item) starts a new run.
            run_value, run_length = value, 1
        if run_length <= k:
            kept.append(value)
    return kept

def clean_pid_sequence(seq_str, k=2):
    """Clean one serialized POI-id sequence.

    The string is parsed with ``ast.literal_eval`` and passed through
    ``keep_first_k_consecutive``; the result is serialized back with ``str``.

    Args:
        seq_str: String form of a list, e.g. "[0, 1, 1, 1, 2]".
        k: Maximum number of identical consecutive ids to keep per run.

    Returns:
        The cleaned sequence as a string, e.g. "[0, 1, 1, 2]". Returns "[]"
        when the input cannot be parsed or does not parse to a list/tuple.
    """
    try:
        parsed = ast.literal_eval(seq_str)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval may also raise TypeError on some malformed input
        # (e.g. an unhashable dict key), so treat it like the other failures.
        return "[]"

    # A valid literal that is not a sequence (e.g. "5" or "{1, 2}") cannot be
    # cleaned; report it the same way as unparsable input instead of crashing
    # further down.
    if not isinstance(parsed, (list, tuple)):
        return "[]"

    return str(keep_first_k_consecutive(parsed, k))

# ===== Main: clean the pid_sequence column of one data split =====
datafold = "NYC"
model = "test"
# NOTE(review): a later cell reads f"{datafold}/cleaned_train_sequences.csv";
# presumably this cell is also run with model = "train" -- confirm.
input_file = f"{datafold}/{model}_sequences.csv"
output_file = f"{datafold}/cleaned_{model}_sequences.csv"

# Load the sequences, cap runs of repeated ids in pid_sequence, and write
# the frame back out without the index column.
df = pd.read_csv(input_file)
df = df.assign(pid_sequence=df['pid_sequence'].apply(clean_pid_sequence))
df.to_csv(output_file, index=False)

print(f"✅ 清洗完成！结果已保存到: {output_file}")

✅ 清洗完成！结果已保存到: NYC/cleaned_test_sequences.csv


In [None]:
def _parse_pid_sequence(raw):
    """Decode one serialized ``pid_sequence`` cell into a Python list."""
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Sequences serialized with str(list) use Python literal syntax,
        # which is not always valid JSON (e.g. single-quoted strings).
        import ast
        return ast.literal_eval(raw)


def build_poi_transition_matrix(data_path, save_path=None):
    """Count POI-to-POI transitions between consecutive sequence entries.

    Args:
        data_path: CSV file with a ``pid_sequence`` column holding one
            serialized list of POI ids per row (JSON or Python-literal form).
        save_path: Optional path prefix. When truthy, the matrix is written
            to ``f"{save_path}_matrix.npy"``. The id-to-index mapping is NOT
            written to disk; it is only returned.

    Returns:
        A tuple ``(A, pid_to_idx, all_pids)``:
            A: ``(M, M)`` int32 array where ``A[pid_to_idx[a], pid_to_idx[b]]``
                is the number of times ``b`` immediately follows ``a`` in a
                sequence. Self-transitions (``a == b``) are counted.
            pid_to_idx: dict mapping each POI id to its row/column index.
            all_pids: sorted list of distinct POI ids, so ``all_pids[idx]``
                inverts ``pid_to_idx``.
    """
    # 1. Load the sequences and decode each cell back into a list.
    sequences = pd.read_csv(data_path)['pid_sequence'].apply(_parse_pid_sequence)

    # 2. Collect every distinct POI id; its rank in sorted order is its index.
    all_pids = sorted({pid for seq in sequences for pid in seq})
    pid_to_idx = {pid: idx for idx, pid in enumerate(all_pids)}
    M = len(all_pids)
    print(f"Total POIs: {M}")

    # 3. Count each adjacent (from, to) pair, self-loops included.
    transition_counts = defaultdict(int)
    for seq in sequences:
        for from_pid, to_pid in zip(seq, seq[1:]):
            transition_counts[(from_pid, to_pid)] += 1

    # 4. Materialize the counts as a dense matrix. Every id seen in a pair
    #    comes from the same sequences used to build pid_to_idx, so the
    #    lookups cannot miss.
    A = np.zeros((M, M), dtype=np.int32)
    for (from_pid, to_pid), count in transition_counts.items():
        A[pid_to_idx[from_pid], pid_to_idx[to_pid]] = count

    # 5. Optionally persist the matrix (only the matrix).
    if save_path:
        np.save(f"{save_path}_matrix.npy", A)
        print(f"Matrix saved to {save_path}_matrix.npy")

    return A, pid_to_idx, all_pids

# Usage example: build the transition matrix from the cleaned training
# sequences and save it next to the data.
datafold = 'NYC'
A, pid_to_idx, all_pids = build_poi_transition_matrix(
    f"{datafold}/cleaned_train_sequences.csv",
    f"{datafold}/cleaned_poi_transition",
)
print("Transition matrix shape:", A.shape)
# Self-transition count for POI id 0; raises KeyError if id 0 is not present.
print("Example: POI 0 -> POI 0 count =", A[pid_to_idx[0], pid_to_idx[0]])

Total POIs: 5081
Matrix saved to NYC/cleaned_poi_transition_matrix.npy
Transition matrix shape: (5081, 5081)
Example: POI 0 -> POI 0 count = 5


In [12]:
# Reload the saved transition matrix and collect the (row, col) index pairs
# whose count reaches the threshold. The pairs are matrix indices, not POI ids.
Matrix = np.load(f"{datafold}/cleaned_poi_transition_matrix.npy")
k = 2  # minimum transition count for a pair to be treated as positive
pos_pairs = np.transpose(np.nonzero(Matrix >= k))  # one (i, j) row per hit

print("Positive pairs (i, j):", pos_pairs)
print(len(pos_pairs), "positive pairs found with count >=", k)

Positive pairs (i, j): [[   0    0]
 [   0 3872]
 [   1    1]
 ...
 [5078 5079]
 [5079 5078]
 [5079 5079]]
14520 positive pairs found with count >= 2
