In [1]:
import pandas as pd
from pathlib import Path


# Locations of the per-split metadata listings and of the video clips they refer to.
train_data_meta_path = "./notebook_results/mmaction_train_data_v14-inclusive/train_data.txt"
test_data_meta_path = "./notebook_results/mmaction_train_data_v14-inclusive/test_data.txt"
video_dir = Path("./notebook_results/mmaction_train_data_v14-inclusive/videos")

# Both listings are space-separated; the first line is taken as the header
# (pandas default), which provides the column names used further down.
train_data_df = pd.read_csv(train_data_meta_path, sep=" ")
test_data_df = pd.read_csv(test_data_meta_path, sep=" ")


In [2]:
# Tag every row with the split it came from (written in place on the source
# frames), then stack both splits into one frame with a fresh 0..n-1 index.
for _split_name, _split_df in (("train", train_data_df), ("test", test_data_df)):
    _split_df["split"] = _split_name

combined_df = pd.concat((train_data_df, test_data_df), ignore_index=True)


We filtered out a small number of videos because of a decord bug (https://github.com/dmlc/decord/issues/150). To stay consistent with the V14-inclusive livecellaction data, let's keep only the videos listed in the files at these paths.

In [3]:
from pathlib import PosixPath


# Listing files whose first column names the videos to keep.
# Built with `Path` rather than a pasted `PosixPath(...)` repr so the cell also
# runs on non-POSIX platforms; the order (video, mask, combined, all — train
# before test within each) matches the original hard-coded list.
_whitelist_dir = Path("notebook_results/mmaction_train_data_v14-inclusive")
whitelist_paths = [
    _whitelist_dir / f"mmaction_{_split}_data_{_frame_type}.txt"
    for _frame_type in ("video", "mask", "combined", "all")
    for _split in ("train", "test")
]

In [4]:
# Collect the first column of every whitelist listing into one flat list.
# Duplicates are kept here, so the count printed below is the total number of
# listed entries, not the number of distinct videos.
whitelist_video_paths = [
    listed_path
    for listing_file in whitelist_paths
    for listed_path in pd.read_csv(listing_file, sep=" ", header=None)[0].tolist()
]

print("# whitelist videos:", len(whitelist_video_paths))

# whitelist videos: 336168


In [5]:
# De-duplicate the whitelist for the membership check.
whitelist_video_paths = set(whitelist_video_paths)

# Keep only the rows of combined_df whose "path" appears in the whitelist.
_is_whitelisted = combined_df["path"].isin(whitelist_video_paths)
filtered_combined_df = combined_df.loc[_is_whitelisted]

In [6]:
# Report how many rows there were before and after the whitelist filter.
for _label, _frame in (
    ("# combined videos:", combined_df),
    ("# filtered videos:", filtered_combined_df),
):
    print(_label, len(_frame))

# combined videos: 168147
# filtered videos: 168084


In [7]:
# The cells below only use filtered_combined_df, so drop the name of the
# unfiltered frame; it can then be reclaimed once nothing else references it.
del combined_df

Decompose all the videos in `filtered_combined_df` into images (one PNG per frame).

In [20]:
# Output locations: `out_path` receives the CSV indexes, `imgs_dir` the frames.
# `Path` is used instead of `PosixPath` so the cell is not tied to POSIX hosts,
# and `parents=True` lets the cell succeed even if `notebook_results` does not
# exist yet; `exist_ok=True` keeps re-runs harmless.
out_path = Path("notebook_results/mmaction_train_data_v14-inclusive-imgs")
imgs_dir = out_path / "imgs"
imgs_dir.mkdir(parents=True, exist_ok=True)
def decompose_to_images(video_path, row, imgs_dir: Path):
    """Split one video into per-frame PNG files and return one metadata row per frame.

    Args:
        video_path: Path of the video to read (passed to OpenCV as ``str``).
        row: Metadata record of the video. It only needs ``copy()`` and item
            assignment; the notebook passes a row of ``filtered_combined_df``.
        imgs_dir: Directory that receives the frames, written as
            ``<video stem>_<frame index>.png``.

    Returns:
        A list holding one copy of ``row`` per written frame, each extended
        with ``img_path`` (str) and ``frame_idx`` (0-based int). The list is
        empty when the video cannot be opened or yields no frame.

    Raises:
        AssertionError: If ``cv2.imwrite`` reports that a frame was not written.
    """
    # Imported inside the function, as in the original cell — presumably so
    # that each worker process resolves OpenCV itself (TODO confirm).
    import cv2

    cap = cv2.VideoCapture(str(video_path))
    data_rows = []
    try:
        if not cap.isOpened():
            # Name the offending file: with many videos processed in parallel
            # a bare "error" line cannot be traced back to its input.
            print(f"Error opening video file: {video_path}")
            return data_rows

        # NOTE(review): frames are keyed by the video stem only, so two videos
        # sharing a stem would overwrite each other's images — confirm stems
        # are unique across the dataset.
        filename_without_ext = Path(video_path).stem
        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No more frames (or a read failure): stop decoding.
                break

            _img_path = str(imgs_dir / f"{filename_without_ext}_{frame_count}.png")
            success = cv2.imwrite(_img_path, frame)
            assert success, f"Failed to write image via cv2.imwrite: {_img_path}"

            # Copy so every frame gets its own record instead of sharing `row`.
            new_row = row.copy()
            new_row["img_path"] = _img_path
            new_row["frame_idx"] = frame_count
            data_rows.append(new_row)
            frame_count += 1
    finally:
        # Always release the capture handle, including when the write
        # assertion above fires or the video could not be opened.
        cap.release()
    return data_rows

from livecellx.core.parallel import parallelize


# One keyword-argument dict per video row; the keys mirror the parameters of
# decompose_to_images.
inputs = [
    {
        "video_path": video_dir / video_row["path"],
        "imgs_dir": imgs_dir,
        "row": video_row,
    }
    for _, video_row in filtered_combined_df.iterrows()
]

# NOTE(review): assumes `parallelize` returns one result per input dict, each
# being the list produced by decompose_to_images — confirm against livecellx.
outputs = parallelize(decompose_to_images, inputs)

# Flatten the per-video lists into a single per-frame table.
data_rows = [frame_row for video_rows in outputs for frame_row in video_rows]
df = pd.DataFrame(data_rows)

100%|██████████| 168084/168084 [17:34<00:00, 159.40it/s] 


In [21]:
# Write the full per-frame table first, then one CSV per split.
df.to_csv(out_path / "train_and_test.csv", index=False)

_split_labels = df["split"]
train_df = df.loc[_split_labels.eq("train")]
test_df = df.loc[_split_labels.eq("test")]

for _csv_name, _split_frame in (("train.csv", train_df), ("test.csv", test_df)):
    _split_frame.to_csv(out_path / _csv_name, index=False)

In [23]:
# Display the column names of the per-frame training table.
train_df.columns

Index(['path', 'label_index', 'padding_pixels', 'frame_type', 'src_dir',
       'track_id', 'start_time', 'end_time', 'first_sc_id',
       'mitosis_traj_type', 'split', 'img_path', 'frame_idx'],
      dtype='object')