# 生成特征代码

In [35]:
import pandas as pd
import numpy as np
from datetime import datetime
import holidays  # 需要安装：pip install holidays

# 初始化中国节假日表（你可以替换成你国家的）
cn_holidays = holidays.CountryHoliday('CN')

def extract_time_features(time_str, slot_minutes=30):
    """
    输入: date_str='2008-10-24', time_str='23:44:05'
    输出: np.array([month, day, weekday, is_holiday])
    """
    dt = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
    month = dt.month      # 1-12
    day = dt.day           # 1-31
    weekday = dt.weekday() # 0=周一 ... 6=周日
    is_holiday = 1 if dt.date() in cn_holidays else 0

    total_minutes = dt.hour * 60 + dt.minute
    slot = total_minutes // slot_minutes

    return np.array([month, day, weekday, slot, is_holiday], dtype=np.int64)

def load_user_trajectory_with_time_feats(file_path):
    """
    读取单个用户轨迹CSV，返回排序后的df和时间特征矩阵
    """
    df = pd.read_csv(file_path, header=0, usecols=["lat", "lon", "stime"])
    df = df.sort_values(by=["stime"]).reset_index(drop=True)
    
    # 计算时间特征矩阵
    time_feats = np.stack([extract_time_features(r['stime']) for _, r in df.iterrows()])
    # time_feats shape: [数据点数, 4]
    
    return df, time_feats

def sliding_window_with_time_feats(df, time_feats, window_size=10, step=5):
    """
    滑动窗口切片，返回 loc, time_feats
    """
    trajs = []
    data_len = len(df)
    for start in range(0, data_len - window_size + 1, step):
        window_df = df.iloc[start:start+window_size]
        window_time_feats = time_feats[start:start+window_size]
        
        loc = window_df[['lat', 'lon']].values.astype(np.float32)           # [window_size, 2]
        time_feat = window_time_feats.astype(np.int64)                      # [window_size, 4]
        
        trajs.append({"loc": loc, "time_feats": time_feat})
    return trajs

def build_dataset_with_time_feats(folder_path, window_size=10, step=5):
    loc_list, time_feats_list, user_list = [], [], []
    user_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    
    for user_file in user_files:
        print(user_file)
        user_id = int(user_file.split('.')[0])
        df, time_feats = load_user_trajectory_with_time_feats(os.path.join(folder_path, user_file))
        traj_slices = sliding_window_with_time_feats(df, time_feats, window_size, step)
        
        for traj in traj_slices:
            loc_list.append(traj['loc'])
            time_feats_list.append(traj['time_feats'])
            user_list.append(user_id)
    
    locs = np.stack(loc_list, axis=0)            # [N, L, 2]
    time_feats_arr = np.stack(time_feats_list, axis=0)  # [N, L, 4]
    user_ids = np.array(user_list, dtype=np.int64)       # [N]
    
    return locs, time_feats_arr, user_ids

# === 使用示例 ===
folder = "./Data/Test2/"
locs, time_feats, user_ids = build_dataset_with_time_feats(folder, window_size=10, step=5)

print("locs shape:", locs.shape)
print("time_feats shape:", time_feats.shape)
print("user_ids shape:", user_ids.shape)


000.csv
001.csv
002.csv
003.csv
004.csv
005.csv
locs shape: (693, 10, 2)
time_feats shape: (693, 10, 5)
user_ids shape: (693,)


# 生成embedding、特征融合并且生成负样本

In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiTimeEmbedding(nn.Module):
    def __init__(self, time_feat_dims, emb_dim):
        """
        time_feat_dims: list，每个时间特征的类别数（词表大小）
        emb_dim: embedding维度
        """
        super().__init__()
        self.embs = nn.ModuleList([nn.Embedding(d, emb_dim) for d in time_feat_dims])
        
    def forward(self, x):
        # x: [B, L, T]  T=4 时间特征数
        print(self.embs)
        # x[..., 0] 就是取最后一维的第0个元素，保持前面所有维度不变
        embs = [emb(x[..., i]) for i, emb in enumerate(self.embs)]  # 每个 [B,L,emb_dim]
        out = sum(embs)  # 加法融合
        return out  # [B, L, emb_dim]

class TrajectoryEmbeddingModel(nn.Module):
    def __init__(self, user_num, time_feat_dims, d_model=128):
        super().__init__()
        self.d_model = d_model
        
        # 经纬度线性映射
        self.loc_proj = nn.Linear(2, d_model)
        
        # 时间多路embedding
        self.time_emb = MultiTimeEmbedding(time_feat_dims, d_model)
        
        # 用户embedding
        self.user_emb = nn.Embedding(user_num, d_model)
        
        # 融合层（这里用加法，也可改concat+Linear）
        self.fusion = nn.Identity()
        
    def forward(self, locs, time_feats, user_ids):
        """
        locs: [B, L, 2]  float
        time_feats: [B, L, T] long (T=4)
        user_ids: [B] long
        
        返回融合embedding [B, L, d_model]
        """
        time_feats = time_feats.long()
        user_ids = user_ids.long()

        loc_emb = self.loc_proj(locs)               # [B, L, d_model]
        time_emb = self.time_emb(time_feats)       # [B, L, d_model]
        user_emb = self.user_emb(user_ids)         # [B, d_model]
        user_emb = user_emb.unsqueeze(1).expand(-1, locs.size(1), -1)  # [B, L, d_model]
        
        fused = self.fusion(loc_emb + time_emb + user_emb)
        return fused

def generate_batch_with_negatives(batch_data, neg_ratio=1):
    """
    简单批内负样本采样示例
    batch_data: list of samples [(locs, time_feats, user_id), ...]
    返回 anchor, positive, negative 三个batch
    
    这里演示用简单随机采样 negatives，实际训练请根据需求改写
    """
    anchors, positives, negatives = [], [], []
    batch_size = len(batch_data)
    
    for i, (locs, time_feats, user_id) in enumerate(batch_data):
        anchors.append((locs, time_feats, user_id))
        
        # Positive：这里简单选同用户的另一个样本（如果有），否则用本样本做正样本（示例简化）
        positive = None
        for j, (locs_p, time_feats_p, user_id_p) in enumerate(batch_data):
            if i != j and user_id_p == user_id:
                positive = (locs_p, time_feats_p, user_id_p)
                break
        if positive is None:
            positive = (locs, time_feats, user_id)
        positives.append(positive)
        
        # Negative：随机选不同用户的样本
        negs_for_sample = []
        neg_candidates = [x for k, x in enumerate(batch_data) if k != i and x[2] != user_id]
        for _ in range(neg_ratio):
            if neg_candidates:
                neg_sample = neg_candidates[torch.randint(0, len(neg_candidates), (1,)).item()]
            else:
                neg_sample = (locs, time_feats, user_id)  # 退化情况
            negs_for_sample.append(neg_sample)
        negatives.append(negs_for_sample)
    
    return anchors, positives, negatives

# # === 简单示例使用 ===
# if __name__ == "__main__":

B, L = 8, 10
user_num = 100
time_feat_dims = [13, 32, 7, 49, 2]  # 月、日、星期、节假日类别数示例

model = TrajectoryEmbeddingModel(user_num, time_feat_dims, d_model=128)

# 假数据
# locs = torch.randn(B, L, 2)
# time_feats = torch.randint(0, 10, (B, L, 4))
# user_ids = torch.randint(0, user_num, (B,))

locs = torch.Tensor(locs)
time_feats = torch.Tensor(time_feats)
user_ids = torch.Tensor(user_ids)

fused_emb = model(locs, time_feats, user_ids)  # [B, L, 128]
print("Fused embedding shape:", fused_emb.shape)


ModuleList(
  (0): Embedding(13, 128)
  (1): Embedding(32, 128)
  (2): Embedding(7, 128)
  (3): Embedding(49, 128)
  (4): Embedding(2, 128)
)
Fused embedding shape: torch.Size([693, 10, 128])
