In [1]:
import torch
import pandas as pd
import os
from pathlib import Path
from timm.models import create_model
import numpy as np
# from csvdata import CSVMAE
import utils
import modeling_pretrain


In [2]:
# 參數設定（手動設定，不需要 argparse）
csv_folder = "/mnt/d/mae/bigclassroom/foundation_dataset/456"   # 輸入 CSV 檔案
save_path = "output_csv"  # 結果輸出的資料夾
model_path = "ckpt_output_dir/checkpoint-1999.pth"  # 模型檔案
device = "cuda" if torch.cuda.is_available() else "cpu"  # 自動選擇 GPU 或 CPU
model_name = "pretrain_videomae_base_patch1_4"  # 你的 VideoMAE 模型
drop_path = 0.0  # Drop Path Rate   
mask_ratio = 0.75  # 設定 Masking 比例 (例如 75%)
num_frames = 16
decoder_depth = 8
patch_size = 1
window_size = (num_frames // 2, 4, 4)  # (T, H, W)
# 確保輸出資料夾存在
Path(save_path).mkdir(parents=True, exist_ok=True)
csv_files = sorted([f for f in os.listdir(csv_folder) if f.endswith(".csv")])


In [3]:
custom_mask = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]

In [4]:
import os
import torch
import numpy as np
from torch.utils.data import Dataset
from masking_generator import TubeMaskingGenerator
import re
from collections import defaultdict

class CSVMAETEST(Dataset):
    def __init__(self, root_dir, num_frames=16, mask_ratio=0.75, window_size=None):
        self.root_dir = root_dir
        self.num_frames = num_frames
        self.window_size = window_size
        self.mask_generator = TubeMaskingGenerator(window_size, mask_ratio=mask_ratio, custom_mask=custom_mask)

        # 遞迴地獲取所有 CSV 檔案
        self.file_paths = []
        for subdir, _, files in os.walk(root_dir):
            for file in files:
                if file.endswith('.csv'):
                    self.file_paths.append(os.path.join(subdir, file))

        # 自訂排序函數: 先按類別名稱排序，再按數字時間戳排序
        def extract_sort_key(path):
            filename = os.path.basename(path)
            match = re.match(r"(.+?)_(\d+)\.csv", filename)  # 擷取類別名稱 & 數字
            if match:
                category = match.group(1)  # 取得類別名稱 (如 1119-1in3out-mv-mc-mid)
                time_value = int(match.group(2))  # 取得數字時間戳
                return (category, time_value)
            return (filename, 0)  # 若無法匹配則放最後

        # 排序後按類別分組
        self.file_paths.sort(key=extract_sort_key)
        self.groups = defaultdict(list)  # 每個類別對應的 CSV 檔案
        for path in self.file_paths:
            category = extract_sort_key(path)[0]  # 取得類別名稱
            self.groups[category].append(path)

        # 預先切割成 num_frames 時間段，確保分組不重複
        self.sequence_groups = []
        for category, files in self.groups.items():
            if len(files) >= num_frames:
                for i in range(0, len(files) - num_frames + 1, num_frames):  # 修改這裡，使每次分組間隔num_frames
                    self.sequence_groups.append(files[i: i + num_frames])

        # 檢查是否有可用數據
        if not self.sequence_groups:
            raise ValueError(f"CSV 檔案數量不足 {num_frames}，請檢查 `{root_dir}` 資料夾！")

        # 顯示前 5 組數據確認順序
        print("= 加載的 CSV 時間序列組: (顯示前 5 組) =")
        for i, seq in enumerate(self.sequence_groups[:5]):
            print(f"組 {i+1}: {[os.path.basename(f) for f in seq]}")

    def __len__(self):
        return len(self.sequence_groups)

    def __getitem__(self, idx):
        file_list = self.sequence_groups[idx]  # 取得對應的一組 num_frames CSV 檔案
        frames = []
        for csv_path in file_list:
            data = np.genfromtxt(csv_path, delimiter=',')
            frames.append(data)

        frames = np.stack(frames)  # (T, H, W)

        # 增加 Channel 維度 -> (C=1, T, H, W)
        frames = np.expand_dims(frames, axis=0)
        frames = torch.tensor(frames, dtype=torch.float32)

        # 生成 mask
        mask = self.mask_generator()

        return (frames, mask)


In [5]:
dataset = CSVMAETEST(root_dir=csv_folder, num_frames=num_frames, mask_ratio=mask_ratio, window_size=window_size)

dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False)

print(f"Loading model: {model_name}")

model = create_model(
        model_name,
        pretrained=False,
        drop_path_rate=0,
        drop_block_rate=None,
        decoder_depth=decoder_depth,
        use_checkpoint=model_path
    )
checkpoint = torch.load(model_path, map_location="cpu")
model.load_state_dict(checkpoint["model"])
model.to(device)
model.eval()

= 加載的 CSV 時間序列組: (顯示前 5 組) =
組 1: ['1119-1in3out-mv-mc-mid_5.csv', '1119-1in3out-mv-mc-mid_10.csv', '1119-1in3out-mv-mc-mid_15.csv', '1119-1in3out-mv-mc-mid_20.csv', '1119-1in3out-mv-mc-mid_25.csv', '1119-1in3out-mv-mc-mid_30.csv', '1119-1in3out-mv-mc-mid_35.csv', '1119-1in3out-mv-mc-mid_40.csv', '1119-1in3out-mv-mc-mid_45.csv', '1119-1in3out-mv-mc-mid_50.csv', '1119-1in3out-mv-mc-mid_55.csv', '1119-1in3out-mv-mc-mid_60.csv', '1119-1in3out-mv-mc-mid_65.csv', '1119-1in3out-mv-mc-mid_70.csv', '1119-1in3out-mv-mc-mid_75.csv', '1119-1in3out-mv-mc-mid_80.csv']
組 2: ['1119-1in3out-mv-mc-mid_85.csv', '1119-1in3out-mv-mc-mid_90.csv', '1119-1in3out-mv-mc-mid_95.csv', '1119-1in3out-mv-mc-mid_100.csv', '1119-1in3out-mv-mc-mid_105.csv', '1119-1in3out-mv-mc-mid_110.csv', '1119-1in3out-mv-mc-mid_115.csv', '1119-1in3out-mv-mc-mid_120.csv', '1119-1in3out-mv-mc-mid_125.csv', '1119-1in3out-mv-mc-mid_130.csv', '1119-1in3out-mv-mc-mid_135.csv', '1119-1in3out-mv-mc-mid_140.csv', '1119-1in3out-mv-mc-mid_145

  checkpoint = torch.load(model_path, map_location="cpu")


PretrainVisionTransformer(
  (encoder): PretrainVisionTransformerEncoder(
    (patch_embed): PatchEmbed(
      (proj): Conv3d(1, 768, kernel_size=(2, 1, 1), stride=(2, 1, 1))
    )
    (blocks): ModuleList(
      (0-11): 12 x Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (norm): LayerNorm((768,), e

In [6]:
def inverse_normalize(log_normalized_values):
    # 將 0~1 的正規化數據反轉回 log 範圍，然後再轉回線性空間的原始範圍
    original_min = 300
    original_max = 1600
    log_values = log_normalized_values * (np.log(original_max) - np.log(original_min)) + np.log(original_min)
    original_values = np.exp(log_values)
    return original_values

In [33]:
from einops import rearrange
import torch.nn as nn
import numpy as np
import pandas as pd

loss_func = nn.MSELoss()
final_df_list = []

for idx, (videos, bool_masked_pos) in enumerate(dataloader):
    videos = videos.to(device, non_blocking=True)
    bool_masked_pos = bool_masked_pos.to(device, non_blocking=True).flatten(1).to(torch.bool)

    # 進行推理
    with torch.no_grad():
        # Rearrange and process the frames
        videos_patch = rearrange(videos, 'b c (t p0) (h p1) (w p2) -> b (t h w) (p0 p1 p2 c)',
                                 p0=2, p1=patch_size, p2=patch_size)
        B, _, C = videos_patch.shape

        labels = videos_patch[bool_masked_pos].reshape(B, -1, C)
        outputs = model(videos, bool_masked_pos)
        loss = loss_func(input=outputs, target=labels)
        print(f"Loss for video {idx}: {loss.item()}")

    # 轉換為 NumPy 格式
    outputs_np = outputs.cpu().detach().numpy().reshape(-1, 2)  # (N_mask, C)
    mask_np = bool_masked_pos.cpu().detach().numpy().reshape(-1).astype(int)  # (N,)

    # ✅ **即時儲存結果**（原本的方式）
    output_list = [outputs_np]
    mask_list = [mask_np]

    df_outputs = pd.DataFrame(np.concatenate(output_list, axis=0))  # (N, C)
    df_masks = pd.DataFrame(np.concatenate(mask_list, axis=0))  # (N,)

    print("🔍 原始 mask 形狀:", df_masks.shape)
    print("🔍 原始 output 形狀:", df_outputs.shape)

    # **還原完整的感測器數據**
    sensor_data = videos.cpu().detach().numpy().squeeze(0)  # (C=1, T, H, W) → (T, H, W)
    sensor_data = sensor_data.reshape(num_frames, -1)  # 轉換為 (T, sensors)
    # 保留一份原始感測器數據（反正規化前）
    sensor_data_ori = sensor_data.copy()
    sensor_data_ori = inverse_normalize(sensor_data_ori)

    # **mask=1 的地方用 `df_outputs` 預測值替換**
    predicted_values = df_outputs.to_numpy().reshape(num_frames, -1)  # (T, 預測的 sensor 值)
    mask_reshaped = df_masks.to_numpy().reshape(-1, num_frames)  # (H × W, T) → (T, H × W)
    mask_reshaped = np.tile(mask_reshaped, (2, 1))

    print(predicted_values.shape)
    print(sensor_data.shape)
    print(mask_reshaped.shape)

    predicted_values_padded = np.zeros_like(sensor_data)
    predicted_values_padded[mask_reshaped == 1] = predicted_values.flatten()
    sensor_data[mask_reshaped == 1] = predicted_values_padded[mask_reshaped == 1]
    sensor_data = inverse_normalize(sensor_data)

    # 先建立 DataFrame，只有 sensor_data（修改後）
    final_df = pd.DataFrame(sensor_data)
    sensor_columns = [f"sensor{i+1}" for i in range(sensor_data.shape[1])]
    final_df.columns = sensor_columns
    # ── 新增：建立原始感測器數據的 DataFrame ──
    ori_sensor_df = pd.DataFrame(sensor_data_ori, columns=[f"origin_{i+1}" for i in range(sensor_data_ori.shape[1])])
    # ── 新增：計算相對誤差（僅針對 mask 掉的感測器） ──
    # 相對誤差 = abs(修改後 - 原始) / abs(原始)
    rel_error = np.abs(sensor_data - sensor_data_ori) / np.abs(sensor_data_ori)
    # 只保留 mask 區域的誤差，其他設定為 NaN
    rel_error[mask_reshaped != 1] = np.nan
    # 建立相對誤差的 DataFrame，欄位命名為 mask_1, mask_2, ..., mask_n
    rel_error_df = pd.DataFrame(rel_error, columns=[f"mask_{i+1}" for i in range(sensor_data.shape[1])])
    # ────────────────────────────────────────────────

    # ── 新增：計算每個時間步（行）的平均相對誤差（忽略 NaN），新增欄位 "average" ──
    average_error = rel_error_df.mean(axis=1, skipna=True)
    average_error_df = pd.DataFrame(average_error, columns=["average"])
    # ────────────────────────────────────────────────

    # ── 新增：計算每個時間步（行）中，mask_{編號} 欄位的最大值，新增欄位 "max" ──
    max_error = rel_error_df.max(axis=1, skipna=True)
    max_error_df = pd.DataFrame(max_error, columns=["max"])
    # ────────────────────────────────────────────────

    # 合併修改後的感測器數據、原始感測器數據、相對誤差、平均相對誤差與最大誤差（橫向合併）
    final_df = pd.concat([final_df, ori_sensor_df, rel_error_df, average_error_df, max_error_df], axis=1)
    final_df_list.append(final_df)

# 合併所有 video 的 DataFrame 成一個 DataFrame，並儲存到同一個 CSV 檔案中
all_final_df = pd.concat(final_df_list, axis=0, ignore_index=True)
all_final_df.to_csv(f"{save_path}/final_output.csv", index=False)

print(f"✅ 所有結果已合併儲存至 {save_path}/final_output.csv")


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Loss for video 0: 0.002899080514907837
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 1: 0.0044289506040513515
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 2: 0.0023416648618876934
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Loss for video 3: 0.0020556622184813023
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 4: 0.0028789681382477283
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 5: 0.002318082842975855
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 6: 0.002188839018344879
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 7: 0.0032433411106467247
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Loss for video 8: 0.0024422649294137955
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 9: 0.0014716566074639559
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 10: 0.0016057101311162114
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 11: 0.0010196678340435028
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 12: 0.0008419518126174808
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Loss for video 13: 0.0004488519625738263
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 14: 0.00038490607403218746
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 15: 0.001671662088483572
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 16: 0.0014508028980344534
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 17: 0.0008252931875176728
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Loss for video 18: 0.0018800904508680105
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 19: 0.0006583036156371236
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 20: 0.004521622322499752
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 21: 0.0026586195454001427
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 22: 0.0014308742247521877
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Loss for video 23: 0.0025491099804639816
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 24: 0.0010373072000220418
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 25: 0.0009260214865207672
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 26: 0.0007329643703997135
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 27: 0.0012101843021810055
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Loss for video 28: 0.0012758761877194047
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 29: 0.0033207666128873825
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 30: 0.0020950494799762964
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 31: 0.00226253317669034
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 32: 0.0007531484588980675
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Loss for video 33: 0.0007104355609044433
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 34: 0.001528094056993723
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 35: 0.003332449123263359
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 36: 0.0029773046262562275
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 37: 0.0004075428587384522
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Loss for video 38: 0.0006841092254035175
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 39: 0.0008136228425428271
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 40: 0.0005859936936758459
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 41: 0.0009556381264701486
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 42: 0.0013937163166701794
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
Loss for video 43: 0.0008542629657313228
🔍 原始 mask 形狀: (128, 1)
🔍 原始 output 形狀: (64, 2)
(16, 8)
(16, 16)
(16, 16)
✅ 所有結果已合併儲存至 output_csv/final_output.csv


  return fn(*args, **kwargs)


In [None]:
import os
import torch
import numpy as np
from torch.utils.data import DataLoader
from timm.models import create_model
from utils import NativeScalerWithGradNormCount as NativeScaler
import utils
from datasets import CSVMAE
from einops import rearrange

def load_model(model_name, weights_path, device):
    print(f"Loading model: {model_name}")
    model = create_model(
        model_name,
        pretrained=False,
        drop_path_rate=0.0,
        drop_block_rate=None,
        decoder_depth=4,
        use_checkpoint=False
    )
    checkpoint = torch.load(weights_path, map_location='cpu')
    model.load_state_dict(checkpoint['model'], strict=False)
    model.to(device)
    model.eval()
    return model

def validate(model, dataloader, device, output_file):
    loss_func = torch.nn.MSELoss()
    results = []
    total_loss = 0.0
    num_samples = 0
    
    with torch.no_grad():
        for step, (videos, bool_masked_pos) in enumerate(dataloader):

            videos = videos.to(device, non_blocking=True)
            bool_masked_pos = bool_masked_pos.to(device, non_blocking=True).flatten(1).to(torch.bool)
            
            videos_patch = rearrange(videos, 'b c (t p0) (h p1) (w p2) -> b (t h w) (p0 p1 p2 c)', p0=2, p1=1, p2=1)
            B, _, C = videos_patch.shape
            labels = videos_patch[bool_masked_pos].reshape(B, -1, C)
            outputs = model(videos, bool_masked_pos)
            # **計算 Loss**
            loss = loss_func(outputs, labels)
            loss_value = loss.item()
            total_loss += loss_value
            num_samples += 1

            print(f"Step {step + 1}/{len(dataloader)} - Loss: {loss_value:.6f}")

    # 計算平均 Loss
    avg_loss = total_loss / num_samples if num_samples > 0 else float('nan')
    print(f"✅ 平均 Loss: {avg_loss:.6f}")

    # **儲存結果**
    with open(output_file, 'w') as f:
        f.write(f"Average Loss: {avg_loss:.6f}\n")
        for res in results:
            f.write(str(res) + '\n')

    print(f"Validation results and loss saved to {output_file}")


def run_validation(model_name, weights_path, data_path, batch_size, device, output_file):
    if not os.path.exists(data_path):
        print("Error: Data path does not exist!")
        return
    
    model = load_model(model_name, weights_path, device)
    patch_size = model.encoder.patch_embed.patch_size
    window_size = (16 // 2, 4 // patch_size[0], 4 // patch_size[1])
    dataset = CSVMAE(root_dir=data_path, num_frames=16, window_size=window_size)
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=4, pin_memory=True)
    validate(model, dataloader, device, output_file)

model_name = 'pretrain_videomae_base_patch1_4'
weights_path = '/mnt/d/haoxiangbiya/VideoMAE/ckpt_output_dir/checkpoint-2499.pth'
data_path = '/mnt/d/mae/bigclassroom/foundation_dataset/train'
batch_size = 8
device = 'cuda'
output_file = 'validation_results.txt'

run_validation(model_name, weights_path, data_path, batch_size, device, output_file)


In [None]:
import os
import torch
import numpy as np
from torch.utils.data import Dataset
from masking_generator import TubeMaskingGenerator  # 假設你有這個mask生成器
from collections import defaultdict
import re

class CustomCSVMAE(Dataset):
    def __init__(self, root_dir, num_frames=16, mask_ratio=0.75, patch_size=16, window_size=None):
        self.root_dir = root_dir
        self.num_frames = num_frames
        self.mask_ratio = mask_ratio
        self.patch_size = patch_size
        self.mask_generator = TubeMaskingGenerator(window_size, mask_ratio=mask_ratio)

        # 遞迴地獲取所有 CSV 檔案
        self.file_paths = []
        for subdir, _, files in os.walk(root_dir):
            for file in files:
                if file.endswith('.csv'):
                    self.file_paths.append(os.path.join(subdir, file))

        # 自訂排序函數：按檔案名中的數字時間戳排序
        def extract_sort_key(path):
            filename = os.path.basename(path)
            match = re.match(r"(.+?)_(\d+)\.csv", filename)  # 擷取類別名稱 & 數字
            if match:
                category = match.group(1)  # 取得類別名稱 (如 1119-1in3out-mv-mc-mid)
                time_value = int(match.group(2))  # 取得數字時間戳
                return (category, time_value)
            return (filename, 0)  # 若無法匹配則放最後

        # 排序後按類別分組
        self.file_paths.sort(key=extract_sort_key)
        self.groups = defaultdict(list)  # 每個類別對應的 CSV 檔案
        for path in self.file_paths:
            category = extract_sort_key(path)[0]  # 取得類別名稱
            self.groups[category].append(path)

        # 按照 num_frames 切割成時間段
        self.sequence_groups = []
        for category, files in self.groups.items():
            if len(files) >= num_frames:
                for i in range(len(files) - num_frames + 1):
                    self.sequence_groups.append(files[i: i + num_frames])

        if not self.sequence_groups:
            raise ValueError(f"CSV 檔案數量不足 {num_frames}，請檢查 `{root_dir}` 資料夾！")

        # 顯示前 5 組數據
        print("= 加載的 CSV 時間序列組: (顯示前 5 組) =")
        for i, seq in enumerate(self.sequence_groups[:5]):
            print(f"組 {i+1}: {[os.path.basename(f) for f in seq]}")

    def __len__(self):
        return len(self.sequence_groups)

    def __getitem__(self, idx):
        file_list = self.sequence_groups[idx]  # 取得對應的一組 num_frames CSV 檔案
        frames = []
        bool_masked_pos = []  # 用來儲存每個 frame 的 mask 訊息

        # 讀取每個影片的 frames 和對應的 mask
        for csv_path in file_list:
            data = np.genfromtxt(csv_path, delimiter=',')
            frames.append(data)

            # 生成 mask，這部分可以使用您先前的 `TubeMaskingGenerator`
            mask = np.random.binomial(1, self.mask_ratio, data.shape)  # 隨機生成的 mask
            bool_masked_pos.append(mask)

        frames = np.stack(frames)  # (T, H, W)
        frames = np.expand_dims(frames, axis=0)  # 增加 Channel 維度 (C=1, T, H, W)
        frames = torch.tensor(frames, dtype=torch.float32)

        bool_masked_pos = np.stack(bool_masked_pos)  # (T, H, W)
        bool_masked_pos = torch.tensor(bool_masked_pos, dtype=torch.bool)

        return (frames, bool_masked_pos)

# 使用示例：
csv_folder = "/path/to/your/csv/folder"
dataset = CustomCSVMAE(root_dir=csv_folder, num_frames=16, mask_ratio=0.75, patch_size=16)

# 打印某一組資料，檢查資料是否正確
sample_frames, sample_mask = dataset[0]  # 顯示第一組資料
print("Sample Frames Shape:", sample_frames.shape)
print("Sample Mask Shape:", sample_mask.shape)
