In [33]:
from lutils import openf, writef
from tqdm import tqdm

from pathlib import Path

import sys

sys.path.append(str(Path().cwd().parent.parent))
from src.data.utils_asr import ChaptersASR
from tools.captions.caption_selection import (
    CaptionSelection,
    select_furthest_timestamps,
)

base_dir = Path("../../")
vidc_dir = base_dir / "dataset/"

subset = "sml1k_train"
subset = "sml300_val"
captions_dir = vidc_dir / "captions/HwwwH_MiniCPM-V-2/all"
video_ids = openf(vidc_dir / f"docs/subset_data/{subset}.json")

chapters = ChaptersASR(vidc_dir=vidc_dir, subset=subset)

In [40]:
from tools.captions.caption_selection import (
    ASRPreds,
    get_interval_timestamps,
    get_n_timestamps,
)

model = "Meta-Llama-3.2-11B-Vision-Instruct"
model = "Llama-3.2-1B-Instruct"
model = "Llama-3.2-3B-Instruct"
model = "Meta-Llama-3.1-8B-Instruct"

subset_train = "s8k-2_train"
subset_train = "sml10k-2_train"
subset_train = "s1k-2_train"
subset_train = "s10k-2_train"
asr_preds = ASRPreds(subset_train=subset_train, base_dir=base_dir, model=model)


def cs_asr(vid_id):
    return asr_preds.get_timestamps(vid_id)


def cs_10s(vid_id):
    duration = chapters.get_duration(vid_id)
    return get_interval_timestamps(duration, 10)


def cs_100f(vid_id):
    duration = chapters.get_duration(vid_id)
    return get_n_timestamps(duration, 100)


def cs_10f(vid_id):
    duration = chapters.get_duration(vid_id)
    return get_n_timestamps(duration, 10)


cs = CaptionSelection(
    sampling_methods=("shot-boundary",), base_dir=base_dir, vidc_dir=vidc_dir
)


def cs_sd(vid_id):
    duration = chapters.get_duration(vid_id)
    return cs(vid_id, duration)


model = "gemini-1.5-pro"
model = "gemini-2.0-flash"
model = "gpt-4o-mini"
model = "gpt-4o"
cs_proprietary = ASRPreds(subset_train="zero-shot", base_dir=base_dir, model=model)


cs_captions_preds = ASRPreds(
    base_dir=base_dir,
    prompt="captions",
    data_flags="10s",
    subset_train="s1k-2_no-asr_train",
)


def cs_midpoints(vid_id):
    if vid_id not in asr_preds:
        return None
    timestamps = cs_asr(vid_id)
    if not timestamps:
        return None
    vid_duration = chapters.get_duration(vid_id)
    timestamps = [0] + timestamps[1:]
    timestamps = (
        timestamps + [vid_duration] if timestamps[-1] != vid_duration else timestamps
    )
    timestamps_midpoints = [
        (s1 + s2) / 2 for s1, s2 in zip(timestamps[:-1], timestamps[1:])
    ]
    return timestamps_midpoints

In [41]:
id2timestamps = {}
vid_need_preds = []
for vid_id in tqdm(video_ids):
    vid_duration = chapters.get_duration(vid_id)

    if vid_id in chapters:
        # timestamps_s = cs_sd(vid_id) if vid_id in asr_preds else None
        # timestamps_s = cs_10f(vid_id)
        # timestamps_s = cs_proprietary(vid_id)
        timestamps_s = cs_asr(vid_id) if vid_id in asr_preds else None
        # timestamps_s = cs_midpoints(vid_id)
        if not timestamps_s:
            # assert timestamps_s, f"{vid_id} has ASR preds but no predictions"
            vid_need_preds.append(vid_id)
            continue
    else:
        timestamps_s = cs_10s(vid_id) if vid_duration < 60 * 10 else cs_100f(vid_id)
    timestamps_s = select_furthest_timestamps(timestamps_s)

    timestamps_s = [t for t in timestamps_s if t < vid_duration]

    caption_pth = captions_dir / f"{vid_id[:2]}" / f"{vid_id}.json"

    if not caption_pth.exists():
        id2timestamps[vid_id] = timestamps_s
        continue

    vid_captions = openf(caption_pth)
    n_frames = int(list(vid_captions.keys())[0].split("/")[1])
    vid_timestamps = [
        int(frm.split("/")[0]) * vid_duration / n_frames for frm in vid_captions
    ]

    todo_timestamps = []
    for t in timestamps_s:
        # Check if there's any existing timestamp within 1 second
        if not any(abs(t - vt) < 2.0 for vt in vid_timestamps):
            todo_timestamps.append(t)

    if todo_timestamps:
        id2timestamps[vid_id] = todo_timestamps


print(f"Number of videos needing predictions: {len(vid_need_preds)}")

# Calculate total number of missing captions needed
total_missing = sum(len(timestamps) for timestamps in id2timestamps.values())
print(f"Total number of missing captions needed: {total_missing}")

# Calculate statistics per video
missing_per_video = {
    vid_id: len(timestamps) for vid_id, timestamps in id2timestamps.items()
}
if missing_per_video:
    avg_missing = sum(missing_per_video.values()) / len(missing_per_video)
    max_missing = max(missing_per_video.values())
    min_missing = min(missing_per_video.values())
    print("\nPer video statistics:")
    print(f"Average missing captions per video: {avg_missing:.1f}")
    print(f"Max missing captions for a video: {max_missing}")
    print(f"Min missing captions for a video: {min_missing}")
    print(f"Number of videos needing captions: {len(missing_per_video)}")


not_found = sum([1 for vid_id in video_ids if vid_id not in chapters])
print(f"Percentage of videos not found: {not_found / len(video_ids):.2%}")

100%|██████████| 300/300 [00:00<00:00, 649.97it/s]

Number of videos needing predictions: 0
Total number of missing captions needed: 407

Per video statistics:
Average missing captions per video: 4.0
Max missing captions for a video: 38
Min missing captions for a video: 1
Number of videos needing captions: 101
Percentage of videos not found: 0.00%





In [None]:
out_dir = vidc_dir / "captions/missing_timestamps"
out_dir.mkdir(exist_ok=True)

writef(id2timestamps, out_dir / f"{subset}_0.json")
print(f"python tools/captions/caption_frames_timestamp.py 0 --subset={subset}")

In [42]:
out_dir = vidc_dir / "captions/missing_timestamps"
out_dir.mkdir(exist_ok=True)

# delete existing files
for pth in out_dir.glob("*.json"):
    pth.unlink()

print(f"Writing missing timestamps to {out_dir.resolve()}")

# Split data into n_files shards
num_parts = 8
total_items = len(id2timestamps)
part_size = total_items // num_parts
items = list(id2timestamps.items())

for i in range(num_parts):
    # Determine the start and end index for this part
    start_idx = i * part_size
    # Ensure the last part takes the remaining items in case of rounding issues
    end_idx = (i + 1) * part_size if i < num_parts - 1 else total_items

    partial_dict = dict(items[start_idx:end_idx])

    out_pth = out_dir / f"{chapters.subset}_{i}.json"
    writef(out_pth, partial_dict)

# print length of each part
for i in [0, num_parts - 1]:
    out_pth = out_dir / f"{chapters.subset}_{i}.json"
    print(f"Part {i + 1}/{num_parts}: {len(openf(out_pth))} videos")


print(f"Change subset for {chapters.subset}")
print(f"Change #SBATCH --array={0}-{i}")
print("git push and git pull")

captions_dir = "path/to/datasets/VidChapters/captions/"
print(f"rm -rf {captions_dir}/HwwwH_MiniCPM-V-2/all-missing")
print(f"sbatch tools/captions/run_captions_v100.sh --subset {chapters.subset}")

Writing missing timestamps to /storage/lucas/datasets/VidChapters/captions/missing_timestamps
Part 1/8: 12 videos
Part 8/8: 17 videos
Change subset for sml300_val
Change #SBATCH --array=0-7
git push and git pull
bash /lustre/fswork/projects/rech/cyq/ucp99db/datasets/VidChapters2/captions/HwwwH_MiniCPM-V-2/send_captions.sh 10000
rm -rf /lustre/fswork/projects/rech/cyq/ucp99db/datasets/VidChapters2/captions//missing_timestamps
scp -r athena:/storage/lucas/datasets/VidChapters/captions/missing_timestamps/ /lustre/fswork/projects/rech/cyq/ucp99db/datasets/VidChapters2/captions/
rm -rf /lustre/fswork/projects/rech/cyq/ucp99db/datasets/VidChapters2/captions/HwwwH_MiniCPM-V-2/all-missing
sbatch tools/captions/run_captions_v100.sh --subset sml300_val
