In [None]:
import csv
import re
from collections import defaultdict


# Annotation CSV to analyse; the code below reads its `id` and `actions` columns.
file_path = "/home/atuin/g102ea/g102ea12/dataset/charades/anotations/Charades/Charades_v1_test.csv"

# Per-action statistics, keyed by action code (whatever the regex `c\d+` matches):
#   number    - how many times the code occurs over all rows
#   ratio     - number / total_actions (filled in after the scan)
#   video_ids - ids of the videos containing the code, in first-seen order, no duplicates
actions_info_dict = defaultdict(lambda: {"number": 0, "ratio": 0, "video_ids": []})

# Companion index giving O(1) "is this video already listed for this action?" checks.
# `video_ids` itself stays a list because later cells call random.choice on it.
_seen_video_ids = defaultdict(set)

total_actions = 0  # total number of action occurrences over all rows

all_videos = set()  # every video id found in the file

# newline='' is what the csv module asks for, so that line breaks inside quoted
# fields are handled by the csv reader instead of the text layer.
with open(file_path, mode='r', encoding='utf-8', newline='') as file:
    reader = csv.DictReader(file)

    for row in reader:
        video_id = row['id']
        all_videos.add(video_id)

        # DictReader yields None for a column missing from a short row; treat
        # that (and an empty cell) as "no actions" instead of crashing in re.
        actions_str = row['actions'] or ''
        action_codes = re.findall(r'c\d+', actions_str)  # extract the action codes

        for action in action_codes:
            info = actions_info_dict[action]
            info["number"] += 1
            total_actions += 1
            # List each video once per action, even if the action repeats in it.
            if video_id not in _seen_video_ids[action]:
                _seen_video_ids[action].add(video_id)
                info["video_ids"].append(video_id)


# Share of each action among all action occurrences (0.0 when nothing was found).
for action, info in actions_info_dict.items():
    info["ratio"] = info["number"] / total_actions if total_actions > 0 else 0.0

# Order actions from rarest to most frequent.
sorted_actions_info_dict = dict(sorted(actions_info_dict.items(), key=lambda x: x[1]["number"]))

for action, info in sorted_actions_info_dict.items():
    print(f"{action}: {info}")

In [None]:
# Sampling configuration.
# TARGET_NUMBER is the sample budget used when sample_by_ratio is False;
# TARGET_RATIO is the fraction of all videos to keep when sample_by_ratio is True.
TARGET_NUMBER = 1_500
TARGET_RATIO = 0.5
sample_by_ratio = False

In [None]:
import random

######################## Subsampling by ratio ##############################
# Builds a video subset of size int(len(all_videos) * TARGET_RATIO) while giving
# every action one video first. Runs only when sample_by_ratio is True.
if sample_by_ratio:

    total_videos = len(all_videos)
    target_sample_size = int(total_videos * TARGET_RATIO)

    print(f"Total Videos: {total_videos}, Target Sample Size: {target_sample_size}")

    # Step 1: give every action one video, preferring videos not picked yet.
    actions_sampled_videos = {}
    selected_videos = set()  # every video picked so far, across all actions
    for action, info in actions_info_dict.items():
        # sorted() fixes the candidate order (set order is arbitrary), so a run
        # with a seeded `random` can be repeated.
        available_videos = sorted(set(info["video_ids"]) - selected_videos)
        # Fall back to an already-picked video when the action has no fresh one.
        chosen_video = random.choice(available_videos) if available_videos else random.choice(info["video_ids"])

        actions_sampled_videos[action] = {
            "number": 1,
            "ratio": 0.0,  # placeholder; recomputed once sampling is finished
            "videos": [chosen_video]
        }
        selected_videos.add(chosen_video)

    remaining_sample_size = target_sample_size - len(selected_videos)
    if remaining_sample_size < 0:
        raise ValueError("TARGET_RATIO is too small, not every action can be sampled")

    # Step 2: spread the remaining budget over the actions in proportion to how
    # often each action occurs in the full annotation file.
    if remaining_sample_size > 0:
        # Despite its name this is the total occurrence count, i.e. the denominator.
        total_ratio = sum(info["number"] for info in actions_info_dict.values())
        actions_extra_samples = {
            action: max(0, int(info["number"] / total_ratio * remaining_sample_size)) for action, info in actions_info_dict.items()
        }

        # Serve the actions with the largest quota first.
        for action, extra_samples in sorted(actions_extra_samples.items(), key=lambda x: -x[1]):
            # Only videos that no action has taken so far are eligible here.
            available_videos = sorted(set(actions_info_dict[action]["video_ids"]) - set(actions_sampled_videos[action]["videos"]) - selected_videos)
            additional_samples = min(extra_samples, len(available_videos))

            if additional_samples > 0:
                sampled_videos = random.sample(available_videos, additional_samples)
                actions_sampled_videos[action]["videos"].extend(sampled_videos)
                actions_sampled_videos[action]["number"] += len(sampled_videos)
                selected_videos.update(sampled_videos)

    # Adjust the selection so that it matches target_sample_size as closely as possible.
    final_video_list = sorted(selected_videos)
    if len(final_video_list) > target_sample_size:
        # Randomly drop the surplus.
        final_video_list = random.sample(final_video_list, target_sample_size)
    elif len(final_video_list) < target_sample_size:
        # Pad with videos that were never selected. These are not attributed to
        # any action in actions_sampled_videos.
        remaining_videos = sorted(all_videos - set(final_video_list))
        # Never ask for more than exists: random.sample raises ValueError when the
        # requested count exceeds the population (possible when TARGET_RATIO > 1).
        shortfall = min(target_sample_size - len(final_video_list), len(remaining_videos))
        if shortfall > 0:
            additional_videos = random.sample(remaining_videos, shortfall)
            final_video_list.extend(additional_videos)

    # Final result as a set.
    final_video_set = set(final_video_list)

    # Recompute `ratio` from the sampled counts, relative to the target size.
    for action in actions_sampled_videos:
        actions_sampled_videos[action]["ratio"] = actions_sampled_videos[action]["number"] / target_sample_size

    # Order actions by ascending `number`.
    sorted_actions_sampled_videos = dict(sorted(actions_sampled_videos.items(), key=lambda x: x[1]["number"]))

    # Report the sampling result.
    print(f"\nTotal Sampled Videos: {len(final_video_set)}\n")
    for action, info in sorted_actions_sampled_videos.items():
        print(f"{action}: {info}")

    print("\nFinal Video Set:", final_video_set)
    print("\nLength of Final Video Set", len(final_video_set))

In [None]:
# Subsampling by number: runs only when sample_by_ratio is False.
# The budget TARGET_NUMBER is measured as the sum of the per-action "videos" list
# lengths, so a video picked for two different actions counts twice.
if not sample_by_ratio:
    # Stage 1: give every action one video, preferring videos not picked yet.
    actions_sampled_videos = {}
    selected_videos = set()  # every video picked so far, across all actions
    for action, info in actions_info_dict.items():
        # sorted() fixes the candidate order (set order is arbitrary), so a run
        # with a seeded `random` can be repeated.
        available_videos = sorted(set(info["video_ids"]) - selected_videos)
        # Fall back to an already-picked video when the action has no fresh one.
        chosen_video = random.choice(available_videos) if available_videos else random.choice(info["video_ids"])

        actions_sampled_videos[action] = {
            "number": 1,
            "ratio": 0.0,  # placeholder; recomputed once sampling is finished
            "videos": [chosen_video]
        }
        selected_videos.add(chosen_video)

    # Budget left after stage 1.
    remaining_sample_size = TARGET_NUMBER - len(selected_videos)
    if remaining_sample_size < 0:
        raise ValueError("TARGET_NUMBER 太小，无法保证所有动作至少有 1 个视频")

    # Stage 2: spread the remaining budget over the actions in proportion to how
    # often each action occurs in the full annotation file.
    if remaining_sample_size > 0:
        # Despite its name this is the total occurrence count, i.e. the denominator.
        total_ratio = sum(info["number"] for info in actions_info_dict.values())
        actions_extra_samples = {
            action: max(0, int(info["number"] / total_ratio * remaining_sample_size)) for action, info in actions_info_dict.items()
        }

        # Serve the actions with the largest quota first.
        for action, extra_samples in sorted(actions_extra_samples.items(), key=lambda x: -x[1]):
            action_video_pool = set(actions_info_dict[action]["video_ids"])
            already_for_action = set(actions_sampled_videos[action]["videos"])

            # 1) Prefer videos that no action has taken yet.
            available_videos = sorted(action_video_pool - already_for_action - selected_videos)
            chosen_videos = random.sample(available_videos, min(extra_samples, len(available_videos)))

            # 2) How much of this action's quota is still unfilled.
            remaining_samples = extra_samples - len(chosen_videos)

            if remaining_samples > 0:
                # 3) Top up from this action's videos even if another action already
                #    uses them. The videos chosen in step 1 must be excluded too,
                #    otherwise the same video could be listed twice for this action.
                remaining_videos = sorted(action_video_pool - already_for_action - set(chosen_videos))
                additional_videos = random.sample(remaining_videos, min(remaining_samples, len(remaining_videos)))

                chosen_videos.extend(additional_videos)

            # 4) Record the picks.
            actions_sampled_videos[action]["videos"].extend(chosen_videos)
            actions_sampled_videos[action]["number"] += len(chosen_videos)
            selected_videos.update(chosen_videos)

    # Stage 3: top up until the per-action entries add up to TARGET_NUMBER.
    current_total = sum(len(info["videos"]) for info in actions_sampled_videos.values())

    needed_samples = TARGET_NUMBER - current_total

    if needed_samples > 0:
        print(f"⚠️ current number of sample: {current_total}, target: {TARGET_NUMBER}，need additional {needed_samples} samples")

        while needed_samples > 0:
            # Visit the actions with the fewest samples first.
            sorted_actions = sorted(actions_sampled_videos.items(), key=lambda x: x[1]["number"])

            added_this_pass = 0
            for action, info in sorted_actions:
                if needed_samples <= 0:
                    break  # target reached

                # Only videos not yet listed for this action are eligible.
                available_videos = sorted(set(actions_info_dict[action]["video_ids"]) - set(info["videos"]))

                if available_videos:
                    sampled_video = random.choice(available_videos)  # one video per action per pass
                    actions_sampled_videos[action]["videos"].append(sampled_video)
                    actions_sampled_videos[action]["number"] += 1
                    needed_samples -= 1
                    # add(), not update(): update() on a str would insert its
                    # individual characters into the set.
                    selected_videos.add(sampled_video)
                    added_this_pass += 1

            if added_this_pass == 0:
                # Every action is exhausted; without this exit the loop would never end.
                print(f"⚠️ no unsampled videos left for any action; still short by {needed_samples} samples")
                break

    # Final set of distinct videos.
    final_video_list = list(selected_videos)
    final_video_set = set(final_video_list)

    # Recompute `ratio` from the sampled counts, relative to TARGET_NUMBER.
    for action in actions_sampled_videos:
        actions_sampled_videos[action]["ratio"] = actions_sampled_videos[action]["number"] / TARGET_NUMBER

    # Order actions by ascending `number`.
    sorted_actions_sampled_videos = dict(sorted(actions_sampled_videos.items(), key=lambda x: x[1]["number"], reverse=False))

    # Report the sampling result in sorted order.
    print(f"\nTotal sampled videos: {len(final_video_set)}\n")

    num = 0  # sum of the per-action counts (a video shared by two actions counts twice)
    for action, info in sorted_actions_sampled_videos.items():
        num += info["number"]
        print(f"{action}: {info}")

    print("\nFinal Video Set:", final_video_set)
    print("\nFinal sample number:", num)
    print("\nLength of Final Video Set", len(final_video_set))

In [None]:
import json

# Output path for the sampling result (relative, so it lands in the current working directory).
output_json_file = "subset_charades.json"

# Save the per-action sampling result. The sorted mapping is written so that the
# file matches the message printed below; it holds the same entries as
# actions_sampled_videos, ordered by ascending "number".
with open(output_json_file, "w", encoding="utf-8") as f:
    json.dump(sorted_actions_sampled_videos, f, indent=4)

print(f"\nSorted actions sampled videos saved to {output_json_file}")