# Video Processing Pipeline

This notebook processes FishEye Collaborative videos to:
1. Extract audio as WAV files
2. Detect red boxes around audio detections in the spectrogram
3. Extract timestamps and save to CSV

## Import Libraries

In [1]:
!pip install -U numpy pandas opencv-python matplotlib

Collecting numpy
  Using cached numpy-2.3.5-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting opencv-python
  Using cached opencv_python-4.12.0.88-cp37-abi3-macosx_13_0_arm64.whl.metadata (19 kB)
Collecting matplotlib
  Using cached matplotlib-3.10.7-cp311-cp311-macosx_11_0_arm64.whl.metadata (11 kB)
Using cached opencv_python-4.12.0.88-cp37-abi3-macosx_13_0_arm64.whl (37.9 MB)
Using cached matplotlib-3.10.7-cp311-cp311-macosx_11_0_arm64.whl (8.1 MB)
Installing collected packages: opencv-python, matplotlib
[2K  Attempting uninstall: opencv-python
[2K    Found existing installation: opencv-python 4.10.0
[1;31merror[0m: [1muninstall-no-record-file[0m

[31m×[0m Cannot uninstall opencv-python 4.10.0
[31m╰─>[0m The package's contents are unknown: no RECORD file was found for opencv-python.

[1;36mhint[0m: The package was installed by conda. You should check if it can uninstall the package.
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/2[0m [opencv-

In [2]:
import cv2
import pandas as pd
import numpy as np
from pathlib import Path
import subprocess
import os

## Function: Process Video and Extract Timestamps

In [3]:
def detect_white_line_and_red_box(frame, spectrogram_roi=None):
    """
    Detect when the white vertical line intersects with red detection boxes in the spectrogram
    
    Args:
        frame: BGR image from video
        spectrogram_roi: (x, y, w, h) tuple for spectrogram region, or None for bottom area
    
    Returns:
        bool: True if white line is intersecting a red box, False otherwise
    """
    # Define spectrogram region (bottom portion of video where waveform is)
    if spectrogram_roi:
        x, y, w, h = spectrogram_roi
        roi = frame[y:y+h, x:x+w]
    else:
        # Default: bottom 20% of frame (where spectrogram typically is)
        height = frame.shape[0]
        roi = frame[int(height * 0.8):, :]
    
    roi_height, roi_width = roi.shape[:2]
    
    # Step 1: Find the white vertical line
    # Convert to grayscale for white line detection
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    
    # Detect bright vertical structures (the white line)
    # Look for very bright pixels (white line should be near 255)
    _, white_mask = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY)
    
    # Sum along vertical axis to find the x-position with most white pixels
    vertical_profile = np.sum(white_mask, axis=0)
    
    # Find the x-position of the white line (peak in the profile)
    if np.max(vertical_profile) < roi_height * 0.3:  # Threshold: line should span at least 30% of height
        return False  # No clear white line detected
    
    white_line_x = np.argmax(vertical_profile)
    
    # Step 2: Detect red boxes at or near the white line position
    # Convert to HSV for red detection
    hsv = cv2.cvtColor(roi, cv2.COLOR_BGR2HSV)
    
    # Define red color range in HSV (for the red boxes)
    lower_red1 = np.array([0, 100, 100])
    upper_red1 = np.array([10, 255, 255])
    lower_red2 = np.array([160, 100, 100])
    upper_red2 = np.array([180, 255, 255])
    
    # Create masks for red color
    mask1 = cv2.inRange(hsv, lower_red1, upper_red1)
    mask2 = cv2.inRange(hsv, lower_red2, upper_red2)
    red_mask = cv2.bitwise_or(mask1, mask2)
    
    # Step 3: Check if there are red pixels near the white line position
    # Define a narrow region around the white line (±10 pixels)
    search_width = 10
    left_bound = max(0, white_line_x - search_width)
    right_bound = min(roi_width, white_line_x + search_width)
    
    # Extract the region around the white line
    line_region = red_mask[:, left_bound:right_bound]
    
    # Count red pixels in this region
    red_pixel_count = cv2.countNonZero(line_region)
    
    # If there are enough red pixels near the white line, it's a detection
    # Adjust threshold as needed (currently 100 pixels)
    return red_pixel_count > 100

## Function: Detect White Line Intersecting Red Boxes

In [4]:
def extract_audio(video_path, output_path):
    """
    Extract audio from video file and save as WAV
    Uses ffmpeg via subprocess
    """
    cmd = [
        'ffmpeg',
        '-i', str(video_path),
        '-vn',  # No video
        '-acodec', 'pcm_s16le',  # PCM 16-bit little-endian
        '-ar', '44100',  # Sample rate 44.1kHz
        '-ac', '2',  # Stereo
        '-y',  # Overwrite output file
        str(output_path)
    ]
    
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode == 0:
            print(f"✓ Extracted audio: {output_path.name}")
            return True
        else:
            print(f"✗ Error extracting {video_path.name}: {result.stderr}")
            return False
    except Exception as e:
        print(f"✗ Exception: {e}")
        return False

## Function: Extract Audio from Video

In [6]:
# Set up paths
video_dir = Path("/Users/wendycao/fish/fisheye")   # root directory
output_audio_dir = Path("/Users/wendycao/fish/fisheye/audio_output")
output_csv_dir = Path("/Users/wendycao/fish/fisheye/detection_timestamps")

# Create output directories
output_audio_dir.mkdir(exist_ok=True)
output_csv_dir.mkdir(exist_ok=True)

# Recursively get all video files in all species subfolders
video_files = sorted(video_dir.rglob("*.mp4"))

print(f"Found {len(video_files)} video files:")
for vf in video_files:
    # Show path relative to project root for clarity
    print(f"  - {vf.relative_to(video_dir)}")


Found 312 video files:
  - Abudefduf saxatilis/FEC-00136 Abudefduf saxatilis.mp4
  - Abudefduf saxatilis/FEC-00136 Abudefduf saxatilis_cropped.mp4
  - Abudefduf saxatilis/FEC-00137 Abudefduf saxatilis.mp4
  - Abudefduf saxatilis/FEC-00137 Abudefduf saxatilis_cropped.mp4
  - Abudefduf saxatilis/FEC-00138 Abudefduf saxatilis.mp4
  - Abudefduf saxatilis/FEC-00138 Abudefduf saxatilis_cropped.mp4
  - Abudefduf saxatilis/FEC-00139 Abudefduf saxatilis.mp4
  - Abudefduf saxatilis/FEC-00139 Abudefduf saxatilis_cropped.mp4
  - Abudefduf saxatilis/FEC-00140 Abudefduf saxatilis.mp4
  - Abudefduf saxatilis/FEC-00140 Abudefduf saxatilis_cropped.mp4
  - Acanthurus coeruleus/FEC-00006 Acanthurus coeruleus.mp4
  - Acanthurus coeruleus/FEC-00006 Acanthurus coeruleus_cropped.mp4
  - Acanthurus coeruleus/FEC-00007 Acanthurus coeruleus.mp4
  - Acanthurus coeruleus/FEC-00007 Acanthurus coeruleus_cropped.mp4
  - Acanthurus coeruleus/FEC-00008 Acanthurus coeruleus.mp4
  - Acanthurus coeruleus/FEC-00008 Acanth

In [7]:
video_files = sorted([
    vf for vf in video_dir.rglob("*.mp4")
    if "_cropped" not in vf.stem  # exclude cropped versions
])

print(f"Found {len(video_files)} non-cropped video files:")
for vf in video_files:
    print(f"  - {vf.relative_to(video_dir)}")

Found 156 non-cropped video files:
  - Abudefduf saxatilis/FEC-00136 Abudefduf saxatilis.mp4
  - Abudefduf saxatilis/FEC-00137 Abudefduf saxatilis.mp4
  - Abudefduf saxatilis/FEC-00138 Abudefduf saxatilis.mp4
  - Abudefduf saxatilis/FEC-00139 Abudefduf saxatilis.mp4
  - Abudefduf saxatilis/FEC-00140 Abudefduf saxatilis.mp4
  - Acanthurus coeruleus/FEC-00006 Acanthurus coeruleus.mp4
  - Acanthurus coeruleus/FEC-00007 Acanthurus coeruleus.mp4
  - Acanthurus coeruleus/FEC-00008 Acanthurus coeruleus.mp4
  - Acanthurus coeruleus/FEC-00009 Acanthurus coeruleus.mp4
  - Acanthurus coeruleus/FEC-00010 Acanthurus coeruleus.mp4
  - Acanthurus tractus/FEC-00011 Acanthurus tractus.mp4
  - Acanthurus tractus/FEC-00012 Acanthurus tractus.mp4
  - Anisotremus virginicus/FEC-00103 Anisotremus virginicus.mp4
  - Anisotremus virginicus/FEC-00104 Anisotremus virginicus.mp4
  - Anisotremus virginicus/FEC-00105 Anisotremus virginicus.mp4
  - Anisotremus virginicus/FEC-00106 Anisotremus virginicus.mp4
  - Ani

## Process All Videos

In [14]:
def extract_detection_timestamps(video_path, spectrogram_roi=None):
    """
    Process video and extract time intervals when white line crosses red detection boxes.
    
    Returns:
        list of (float, float): [(start_time_1, end_time_1), (start_time_2, end_time_2), ...]
        Times are in seconds.
    """
    cap = cv2.VideoCapture(str(video_path))
    fps = cap.get(cv2.CAP_PROP_FPS)

    detection_intervals = []
    was_detecting = False
    detection_start_frame = None

    frame_count = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        has_detection = detect_white_line_and_red_box(frame, spectrogram_roi)

        if has_detection:
            if not was_detecting:
                # Start of new detection event
                detection_start_frame = frame_count
                was_detecting = True
        else:
            if was_detecting:
                # End of detection - record start and end timestamps
                detection_end_frame = frame_count - 1
                start_time = detection_start_frame / fps
                end_time = detection_end_frame / fps
                detection_intervals.append((start_time, end_time))
                was_detecting = False

        frame_count += 1

    # Handle case where detection extends to end of video
    if was_detecting:
        detection_end_frame = frame_count - 1
        start_time = detection_start_frame / fps
        end_time = detection_end_frame / fps
        detection_intervals.append((start_time, end_time))

    cap.release()
    print(f"  Found {len(detection_intervals)} detection interval(s)")

    return detection_intervals


In [15]:
import pandas as pd

all_results = []

for video_file in video_files:
    print(f"\nProcessing: {video_file.name}")
    print("=" * 60)
    
    # Step 1: Extract audio
    audio_filename = video_file.stem + ".wav"
    audio_path = output_audio_dir / audio_filename
    extract_audio(video_file, audio_path)
    
    # Step 2: Extract detection intervals
    print(f"Analyzing video for white line intersecting red boxes...")
    detection_intervals = extract_detection_timestamps(video_file)
    
    # Step 3: Store results - one row per detection interval
    for start_time, end_time in detection_intervals:
        # Optional midpoint if you still care about it:
        midpoint = (start_time + end_time) / 2.0
        
        all_results.append({
            'video_file': video_file.name,
            'audio_file': audio_filename,
            'detection_start': start_time,
            'detection_end': end_time,
            # uncomment if you want to keep this for continuity
            # 'detection_midpoint': midpoint,
        })

print("\n" + "=" * 60)
print(f"Processing complete! Total detections: {len(all_results)}")




Processing: FEC-00136 Abudefduf saxatilis.mp4
✓ Extracted audio: FEC-00136 Abudefduf saxatilis.wav
Analyzing video for white line intersecting red boxes...
  Found 1 detection interval(s)

Processing: FEC-00137 Abudefduf saxatilis.mp4
✓ Extracted audio: FEC-00137 Abudefduf saxatilis.wav
Analyzing video for white line intersecting red boxes...
  Found 1 detection interval(s)

Processing: FEC-00138 Abudefduf saxatilis.mp4
✓ Extracted audio: FEC-00138 Abudefduf saxatilis.wav
Analyzing video for white line intersecting red boxes...
  Found 4 detection interval(s)

Processing: FEC-00139 Abudefduf saxatilis.mp4
✓ Extracted audio: FEC-00139 Abudefduf saxatilis.wav
Analyzing video for white line intersecting red boxes...
  Found 1 detection interval(s)

Processing: FEC-00140 Abudefduf saxatilis.mp4
✓ Extracted audio: FEC-00140 Abudefduf saxatilis.wav
Analyzing video for white line intersecting red boxes...
  Found 1 detection interval(s)

Processing: FEC-00006 Acanthurus coeruleus.mp4
✓ Extra

In [16]:
# Save to CSV
df = pd.DataFrame(all_results)
csv_path = output_csv_dir / "detection_timestamps.csv"
df.to_csv(csv_path, index=False)

print(f"\n✓ Results saved to: {csv_path}")

print("\nSummary - First 10 detections:")
print(df.head(10))

print(f"\nTotal detections across all videos: {len(df)}")


✓ Results saved to: /Users/wendycao/fish/fisheye/detection_timestamps/detection_timestamps.csv

Summary - First 10 detections:
                           video_file                          audio_file  \
0   FEC-00136 Abudefduf saxatilis.mp4   FEC-00136 Abudefduf saxatilis.wav   
1   FEC-00137 Abudefduf saxatilis.mp4   FEC-00137 Abudefduf saxatilis.wav   
2   FEC-00138 Abudefduf saxatilis.mp4   FEC-00138 Abudefduf saxatilis.wav   
3   FEC-00138 Abudefduf saxatilis.mp4   FEC-00138 Abudefduf saxatilis.wav   
4   FEC-00138 Abudefduf saxatilis.mp4   FEC-00138 Abudefduf saxatilis.wav   
5   FEC-00138 Abudefduf saxatilis.mp4   FEC-00138 Abudefduf saxatilis.wav   
6   FEC-00139 Abudefduf saxatilis.mp4   FEC-00139 Abudefduf saxatilis.wav   
7   FEC-00140 Abudefduf saxatilis.mp4   FEC-00140 Abudefduf saxatilis.wav   
8  FEC-00006 Acanthurus coeruleus.mp4  FEC-00006 Acanthurus coeruleus.wav   
9  FEC-00007 Acanthurus coeruleus.mp4  FEC-00007 Acanthurus coeruleus.wav   

   detection_start  dete

## Extract audio/video clips

In [26]:
import ffmpeg
from pathlib import Path


In [29]:

# Set your project root
root_dir = Path("/Users/wendycao/fish/fisheye")

# Existing folders
audio_output_dir = root_dir / "audio_output"
detection_csv_dir = root_dir / "detection_timestamps"

# Output folders for clips
output_video_clips_dir = root_dir / "video_clips_5s"
output_audio_clips_dir = root_dir / "audio_clips_5s"
output_video_clips_dir.mkdir(exist_ok=True)
output_audio_clips_dir.mkdir(exist_ok=True)

# Load detections CSV
csv_path = detection_csv_dir / "detection_timestamps.csv"
df = pd.read_csv(csv_path)


In [31]:
def get_media_duration(path: Path) -> float:
    """
    Return media duration in seconds using ffprobe via ffmpeg-python.
    """
    probe = ffmpeg.probe(str(path))
    return float(probe["format"]["duration"])


In [32]:
def make_5s_detection_video(video_path: Path,
                            center_time: float,
                            out_path: Path,
                            target_len: float = 5.0):
    """
    Create ~5s video clip centered on center_time (clamped to video duration).
    """
    duration = get_media_duration(video_path)

    start = max(0.0, center_time - target_len / 2.0)
    end = min(duration, center_time + target_len / 2.0)
    clip_len = end - start

    if clip_len <= 0:
        print(f"  [WARN] Skipping video: invalid interval ({start:.3f}, {end:.3f})")
        return

    (
        ffmpeg
        .input(str(video_path), ss=start, t=clip_len)
        .output(
            str(out_path),
            vcodec="libx264",
            acodec="aac",
        )
        .overwrite_output()
        .run(quiet=True)
    )


In [33]:
def make_5s_detection_audio(audio_path: Path,
                            det_start: float,
                            det_end: float,
                            out_path: Path,
                            target_len: float = 5.0):
    """
    Create a 5s WAV where the detected sound is centered in time
    and the rest is silence (using ffmpeg-python).
    """
    audio_duration = get_media_duration(audio_path)

    # Clamp detection interval
    start = max(0.0, det_start)
    end = min(audio_duration, det_end)

    if end <= start:
        print(f"  [WARN] Skipping audio: invalid interval ({start:.3f}, {end:.3f})")
        return

    sound_len = end - start

    # Case 1: detection >= target_len → just take central 5s of detection
    if sound_len >= target_len:
        central_start = start + (sound_len - target_len) / 2.0

        (
            ffmpeg
            .input(str(audio_path), ss=central_start, t=target_len)
            .output(
                str(out_path),
                acodec="pcm_s16le",
                ar=44100,
                ac=1,
            )
            .overwrite_output()
            .run(quiet=True)
        )
        return

    # Case 2: detection < target_len → create 5s silence, overlay detection in middle
    silence_total = target_len - sound_len
    pad_before = silence_total / 2.0
    delay_ms = int(round(pad_before * 1000))

    # 5s silence base
    silence = ffmpeg.input(
        "anullsrc=r=44100:cl=mono",
        f="lavfi",
        t=target_len,
    )

    # Detection segment
    segment = ffmpeg.input(str(audio_path), ss=start, t=sound_len)

    # Delay detection so it starts at pad_before seconds
    delayed = segment.filter("adelay", f"{delay_ms}|{delay_ms}")

    # Mix delayed detection onto silence
    mixed = ffmpeg.filter([silence, delayed], "amix", inputs=2, normalize=0)

    (
        ffmpeg
        .output(
            mixed,
            str(out_path),
            acodec="pcm_s16le",
            ar=44100,
            ac=1,
        )
        .overwrite_output()
        .run(quiet=True)
    )


In [34]:
# All non-cropped videos under root_dir
video_map = {
    p.name: p
    for p in root_dir.rglob("*.mp4")
    if "_cropped" not in p.stem
}

# All pre-extracted audio files in audio_output
audio_map = {
    p.name: p
    for p in audio_output_dir.glob("*.wav")
}

In [35]:
def get_species_name(video_path: Path) -> str:
    return video_path.parent.name


In [36]:
video_det_count = {}

for idx, row in df.iterrows():
    video_name = row["video_file"]
    audio_name = row["audio_file"]

    det_start = float(row["detection_start"])
    det_end = float(row["detection_end"])

    # Compute midpoint (center of detection)
    det_mid = (det_start + det_end) / 2.0

    video_path = video_map.get(video_name)
    audio_path = audio_map.get(audio_name)

    if video_path is None:
        print(f"[WARN] Missing video on disk for: {video_name}")
        continue
    if audio_path is None:
        print(f"[WARN] Missing audio on disk for: {audio_name}")
        continue

    # Species = folder name containing video
    species = get_species_name(video_path)

    # Create species output dirs
    species_video_dir = output_video_clips_dir / species
    species_audio_dir = output_audio_clips_dir / species
    species_video_dir.mkdir(parents=True, exist_ok=True)
    species_audio_dir.mkdir(parents=True, exist_ok=True)

    # Detection index counter per video
    count = video_det_count.get(video_name, 0)
    video_det_count[video_name] = count + 1
    det_id = f"{count:03d}"

    video_out = species_video_dir / f"{video_path.stem}_det{det_id}.mp4"
    audio_out = species_audio_dir / f"{audio_path.stem}_det{det_id}.wav"

    print(f"\nDetection {idx} | {species} | {video_name}")
    print(f"  start={det_start:.3f}, end={det_end:.3f}, mid={det_mid:.3f}")

    # Create clips
    make_5s_detection_video(video_path, det_mid, video_out)
    make_5s_detection_audio(audio_path, det_start, det_end, audio_out)

print("\n✓ Finished exporting all 5-second clips into species folders.")



Detection 0 | Abudefduf saxatilis | FEC-00136 Abudefduf saxatilis.mp4
  start=8.000, end=8.167, mid=8.083

Detection 1 | Abudefduf saxatilis | FEC-00137 Abudefduf saxatilis.mp4
  start=8.000, end=8.267, mid=8.133

Detection 2 | Abudefduf saxatilis | FEC-00138 Abudefduf saxatilis.mp4
  start=5.500, end=5.767, mid=5.633

Detection 3 | Abudefduf saxatilis | FEC-00138 Abudefduf saxatilis.mp4
  start=7.333, end=7.833, mid=7.583

Detection 4 | Abudefduf saxatilis | FEC-00138 Abudefduf saxatilis.mp4
  start=8.600, end=8.867, mid=8.733

Detection 5 | Abudefduf saxatilis | FEC-00138 Abudefduf saxatilis.mp4
  start=10.600, end=10.833, mid=10.717

Detection 6 | Abudefduf saxatilis | FEC-00139 Abudefduf saxatilis.mp4
  start=8.000, end=8.233, mid=8.117

Detection 7 | Abudefduf saxatilis | FEC-00140 Abudefduf saxatilis.mp4
  start=8.000, end=8.233, mid=8.117

Detection 8 | Acanthurus coeruleus | FEC-00006 Acanthurus coeruleus.mp4
  start=8.000, end=8.333, mid=8.167

Detection 9 | Acanthurus coerul

In [1]:
from pathlib import Path
import ffmpeg

# Directory containing species subfolders of 5s clips
ROOT = Path("/Users/wendycao/fish/fisheye/video_clips_5s")

# Crop settings
CROP_W = 1920
CROP_H = 800
CROP_X = 0
CROP_Y = 60


def crop_video(input_path: Path):
    """Crop a single 5s video clip using ffmpeg-python."""
    output_path = input_path.with_name(input_path.stem + "_cropped.mp4")

    if output_path.exists():
        print(f"  Skipping (exists): {output_path.name}")
        return

    print(f"  Cropping → {output_path.name}")

    (
        ffmpeg
        .input(str(input_path))
        .filter("crop", CROP_W, CROP_H, CROP_X, CROP_Y)
        .output(
            str(output_path),
            vcodec="libx264",
            crf=18,
            preset="fast",
            acodec="copy",  # keep audio track intact
        )
        .overwrite_output()
        .run(quiet=True)
    )


def main():
    if not ROOT.exists():
        print(f"ERROR: Folder does not exist: {ROOT}")
        return

    # Recursively find all mp4 files under video_clips_5s
    videos = list(ROOT.rglob("*.mp4"))
    print(f"Found {len(videos)} video clips under {ROOT}")

    for vid in videos:
        print(f"\nProcessing: {vid.relative_to(ROOT)}")
        try:
            crop_video(vid)
        except Exception as e:
            print(f"  ERROR: {e}")


if __name__ == "__main__":
    main()


Found 265 video clips under /Users/wendycao/fish/fisheye/video_clips_5s

Processing: Aulostomus maculatus/FEC-00014 Aulostomus maculatus_det000.mp4
  Cropping → FEC-00014 Aulostomus maculatus_det000_cropped.mp4

Processing: Aulostomus maculatus/FEC-00015 Aulostomus maculatus_det000.mp4
  Cropping → FEC-00015 Aulostomus maculatus_det000_cropped.mp4

Processing: Aulostomus maculatus/FEC-00013 Aulostomus maculatus_det000.mp4
  Cropping → FEC-00013 Aulostomus maculatus_det000_cropped.mp4

Processing: Stegastes planifrons/FEC-00098 Stegastes planifrons_det000.mp4
  Cropping → FEC-00098 Stegastes planifrons_det000_cropped.mp4

Processing: Stegastes planifrons/FEC-00098 Stegastes planifrons_det001.mp4
  Cropping → FEC-00098 Stegastes planifrons_det001_cropped.mp4

Processing: Stegastes planifrons/FEC-00101 Stegastes planifrons_det000.mp4
  Cropping → FEC-00101 Stegastes planifrons_det000_cropped.mp4

Processing: Stegastes planifrons/FEC-00099 Stegastes planifrons_det000.mp4
  Cropping → FEC-0