In [20]:
import os
import re
from collections import defaultdict

In [27]:
# Root folder that is searched recursively for ROI image files (*.png) when the
# ROI lookup is built.
# NOTE(review): presumably holds the images that survived debris removal (going
# by the folder name) — confirm. Absolute, machine-specific path: edit it before
# running this notebook on another machine.
images_base_dir = r"C:\Users\swhit\OneDrive\Desktop\IFCB\DATA\images\redo\debris_removed"

In [28]:
# Maps sample_id (str) -> set of ROI numbers (int) for which an image file exists.
# defaultdict(set) lets the loader add an ROI without first checking whether the
# sample has been seen; membership tests (`sample_id in roi_lookup`) do not
# create entries.
roi_lookup = defaultdict(set)

In [29]:
# Populate roi_lookup from the image filenames found anywhere under
# images_base_dir.
#
# Expected filename: <sample_id>_<5-digit ROI>.png, where sample_id has the form
# D<digits>T<digits>_IFCB<digits>. The ROI is stored as an int, so leading zeros
# are dropped ("00012" -> 12).

# os.walk() silently yields nothing for a missing directory, which would
# otherwise be reported below as a successful load of 0 samples.
if not os.path.isdir(images_base_dir):
    raise FileNotFoundError(f"Image folder not found: {images_base_dir}")

# One pattern, anchored at both ends and compiled once, replaces the earlier
# endswith() + re.search() + re.match() chain. The old capturing match was not
# anchored at the end, so a name with an embedded ".png" could register the
# wrong ROI number.
roi_filename_pattern = re.compile(r"(D\d+T\d+_IFCB\d+)_(\d{5})\.png")

for _root, _dirs, files in os.walk(images_base_dir):
    for filename in files:
        match = roi_filename_pattern.fullmatch(filename)
        if match is None:
            # Not an ROI image (wrong extension or naming scheme): ignore it.
            continue
        sample_id, roi = match.groups()
        roi_lookup[sample_id].add(int(roi))

print(f"✅ Loaded ROI data for {len(roi_lookup)} samples.")

✅ Loaded ROI data for 8 samples.


In [30]:
import pandas as pd

In [31]:
# Folder scanned (non-recursively) for per-sample "*_features_v2.csv" files; the
# filtered copies are written back into this same folder.
# NOTE(review): absolute, machine-specific path — edit it before running this
# notebook on another machine.
features_dir = r"C:\Users\swhit\OneDrive\Desktop\IFCB\DATA\features\redo"

In [32]:
# For every "<sample_id>_features_v2.csv" in features_dir, keep only the rows
# whose roi_number has a matching image in roi_lookup, and write the result next
# to the input as "<sample_id>_features_v2_filtered.csv".
#
# Files are matched on the exact suffix. The outputs land in the same folder and
# also contain "_features_v2", so a looser substring test picks them up again on
# a re-run and reports each one as a sample without images.
features_suffix = "_features_v2.csv"
filtered_suffix = "_features_v2_filtered.csv"

# sorted() makes the processing order, and therefore the log, reproducible;
# os.listdir() returns entries in arbitrary order.
for csv_file in sorted(os.listdir(features_dir)):
    if not csv_file.endswith(features_suffix):
        continue

    sample_id = csv_file[: -len(features_suffix)]
    csv_path = os.path.join(features_dir, csv_file)

    if sample_id not in roi_lookup:
        print(f"⚠️ Skipping {csv_file}: no image ROIs found for {sample_id}")
        continue

    # Best-effort per file: one unreadable or unwritable CSV is reported and the
    # loop moves on to the remaining samples.
    try:
        df = pd.read_csv(csv_path)
        if "roi_number" not in df.columns:
            print(f"⚠️ Skipping {csv_file}: no 'roi_number' column")
            continue

        valid_rois = roi_lookup[sample_id]
        df_filtered = df[df["roi_number"].isin(valid_rois)]

        # Built from sample_id rather than str.replace(".csv", ...), which would
        # rewrite every ".csv" occurrence in the name, not just the extension.
        out_path = os.path.join(features_dir, sample_id + filtered_suffix)
        df_filtered.to_csv(out_path, index=False)

        print(f"✅ Saved {out_path}: {len(df_filtered)} / {len(df)} rows kept.")

    except Exception as e:
        print(f"❌ Error processing {csv_file}: {e}")

✅ Saved C:\Users\swhit\OneDrive\Desktop\IFCB\DATA\features\redo\D20250505T223819_IFCB121_features_v2_filtered.csv: 6649 / 6797 rows kept.
✅ Saved C:\Users\swhit\OneDrive\Desktop\IFCB\DATA\features\redo\D20250505T231847_IFCB121_features_v2_filtered.csv: 4811 / 4893 rows kept.
✅ Saved C:\Users\swhit\OneDrive\Desktop\IFCB\DATA\features\redo\D20250506T000014_IFCB121_features_v2_filtered.csv: 6716 / 6785 rows kept.
✅ Saved C:\Users\swhit\OneDrive\Desktop\IFCB\DATA\features\redo\D20250507T202756_IFCB121_features_v2_filtered.csv: 357 / 447 rows kept.
✅ Saved C:\Users\swhit\OneDrive\Desktop\IFCB\DATA\features\redo\D20250507T210825_IFCB121_features_v2_filtered.csv: 9909 / 10003 rows kept.
✅ Saved C:\Users\swhit\OneDrive\Desktop\IFCB\DATA\features\redo\D20250507T214808_IFCB121_features_v2_filtered.csv: 6357 / 6419 rows kept.
✅ Saved C:\Users\swhit\OneDrive\Desktop\IFCB\DATA\features\redo\D20250507T232457_IFCB121_features_v2_filtered.csv: 3097 / 3210 rows kept.
✅ Saved C:\Users\swhit\OneDrive\Des