# 1. 폴더 구조 정리

In [1]:
# 드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

BASE_DIR      = "/content/drive/MyDrive/preprocessing"
RAW_ZIP_DIR   = f"{BASE_DIR}/raw_videos_zip"
RAW_DIR       = f"{BASE_DIR}/raw_videos"        # zip 풀린 원본 영상
CROPPED_DIR   = f"{BASE_DIR}/cropped_videos"    # 크롭된 영상
META_DIR      = f"{BASE_DIR}/metadata"

for d in [RAW_DIR, CROPPED_DIR, META_DIR]:
    os.makedirs(d, exist_ok=True)

print(BASE_DIR)
!ls "$BASE_DIR"

/content/drive/MyDrive/preprocessing
cropped_videos	crop_videos.ipynb  metadata  raw_videos  raw_videos_zip


# 2. zip 10개를 사람별 폴더로 압축 해제

zip 이름: sign_medical_videos_01.zip … sign_medical_videos_10.zip
→ 각각을 raw_videos/P01, raw_videos/P02, … 로 풀자.

In [3]:
import zipfile, glob, os

zip_paths = sorted(glob.glob(os.path.join(RAW_ZIP_DIR, "sign_medical_videos_*.zip")))
zip_paths

['/content/drive/MyDrive/preprocessing/raw_videos_zip/sign_medical_videos_01.zip',
 '/content/drive/MyDrive/preprocessing/raw_videos_zip/sign_medical_videos_02.zip',
 '/content/drive/MyDrive/preprocessing/raw_videos_zip/sign_medical_videos_03.zip',
 '/content/drive/MyDrive/preprocessing/raw_videos_zip/sign_medical_videos_04.zip',
 '/content/drive/MyDrive/preprocessing/raw_videos_zip/sign_medical_videos_05.zip',
 '/content/drive/MyDrive/preprocessing/raw_videos_zip/sign_medical_videos_06.zip',
 '/content/drive/MyDrive/preprocessing/raw_videos_zip/sign_medical_videos_07.zip',
 '/content/drive/MyDrive/preprocessing/raw_videos_zip/sign_medical_videos_08.zip',
 '/content/drive/MyDrive/preprocessing/raw_videos_zip/sign_medical_videos_09.zip',
 '/content/drive/MyDrive/preprocessing/raw_videos_zip/sign_medical_videos_10.zip']

In [5]:
for zip_path in zip_paths:
    basename = os.path.basename(zip_path)              # sign_medical_videos_01.zip
    person_idx = basename.split("_")[-1].split(".")[0] # "01"
    person_dir = os.path.join(RAW_DIR, f"P{person_idx}")

    os.makedirs(person_dir, exist_ok=True)
    print(f"Extracting {basename} -> {person_dir}")
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(person_dir)

Extracting sign_medical_videos_01.zip -> /content/drive/MyDrive/preprocessing/raw_videos/P01
Extracting sign_medical_videos_02.zip -> /content/drive/MyDrive/preprocessing/raw_videos/P02
Extracting sign_medical_videos_03.zip -> /content/drive/MyDrive/preprocessing/raw_videos/P03
Extracting sign_medical_videos_04.zip -> /content/drive/MyDrive/preprocessing/raw_videos/P04
Extracting sign_medical_videos_05.zip -> /content/drive/MyDrive/preprocessing/raw_videos/P05
Extracting sign_medical_videos_06.zip -> /content/drive/MyDrive/preprocessing/raw_videos/P06
Extracting sign_medical_videos_07.zip -> /content/drive/MyDrive/preprocessing/raw_videos/P07
Extracting sign_medical_videos_08.zip -> /content/drive/MyDrive/preprocessing/raw_videos/P08
Extracting sign_medical_videos_09.zip -> /content/drive/MyDrive/preprocessing/raw_videos/P09
Extracting sign_medical_videos_10.zip -> /content/drive/MyDrive/preprocessing/raw_videos/P10


# 3. 22개의 단어 메타데이터 읽기

* 이 작업을 하는 이유는 노션에 자세히 정리해둘 것임.
(이 작업 없이는 동작 구간 크롭 불가)

In [6]:
import pandas as pd
import re
import os

# 경로는 아까와 동일하게 사용
BASE_DIR      = "/content/drive/MyDrive/preprocessing"
RAW_DIR       = f"{BASE_DIR}/raw_videos"
CROPPED_DIR   = f"{BASE_DIR}/cropped_videos"
META_DIR      = f"{BASE_DIR}/metadata"

meta_path = os.path.join(META_DIR, "medical_sign_22.csv")
meta = pd.read_csv(meta_path)
meta.head()

Unnamed: 0,video_name,angle,url,duration,exported_on,segment_start,segment_end,label,json_path
0,NIA_SL_WORD0029_REAL01_F.mp4,F,https://blackolivevideo.blob.core.windows.net/...,3.867,2020/12/10,1.055,2.793,검사,C:\Users\user\Desktop\01\NIA_SL_WORD0029_REAL0...
1,NIA_SL_WORD0033_REAL01_F.mp4,F,https://blackolivevideo.blob.core.windows.net/...,5.667,2020/12/10,2.373,4.57,당뇨병,C:\Users\user\Desktop\01\NIA_SL_WORD0033_REAL0...
2,NIA_SL_WORD0036_REAL01_F.mp4,F,https://blackolivevideo.blob.core.windows.net/...,4.467,2020/12/10,1.616,3.315,면역,C:\Users\user\Desktop\01\NIA_SL_WORD0036_REAL0...
3,NIA_SL_WORD0037_REAL01_F.mp4,F,https://blackolivevideo.blob.core.windows.net/...,4.4,2020/12/10,1.523,3.209,감기,C:\Users\user\Desktop\01\NIA_SL_WORD0037_REAL0...
4,NIA_SL_WORD0039_REAL01_F.mp4,F,https://blackolivevideo.blob.core.windows.net/...,4.934,2020/12/10,2.122,3.847,변비,C:\Users\user\Desktop\01\NIA_SL_WORD0039_REAL0...


## 3-1. WORD 번호 추출해서 단어별 구간만 정리

In [7]:
# WORD 번호 추출 (예: NIA_SL_WORD0029_REAL01_F.mp4 → 0029)
meta["word_id"] = meta["video_name"].str.extract(r"WORD(\d+)_")[0]

# 단어당 1행만 남기기
word_meta = meta[["word_id", "segment_start", "segment_end", "label"]].drop_duplicates("word_id")

print("단어 개수:", len(word_meta))   # 22여야 정상
word_meta.head()

단어 개수: 22


Unnamed: 0,word_id,segment_start,segment_end,label
0,29,1.055,2.793,검사
1,33,2.373,4.57,당뇨병
2,36,1.616,3.315,면역
3,37,1.523,3.209,감기
4,39,2.122,3.847,변비


# 4. raw_videos 안의 모든 mp4 인덱싱

P01~P10 안에 있는 mp4를 싹 스캔해서
word_id / person_id / angle(F,D,L,R,U)를 뽑는 단계

In [10]:
import pandas as pd
import re
import os

rows = []

for person_folder in sorted(os.listdir(RAW_DIR)):
    p_dir = os.path.join(RAW_DIR, person_folder)
    if not os.path.isdir(p_dir):
        continue

    for root, dirs, files in os.walk(p_dir):
        # 1) __MACOSX 폴더 자체는 통째로 무시(Mac에서 압축했을 때 생기는 쓰레기 파일)
        if "__MACOSX" in root:
            continue

        for fname in files:
            # 2) Mac 메타데이터 파일 (._로 시작) 무시
            if fname.startswith("._"):
                continue

            if not fname.endswith(".mp4"):
                continue

            m = re.match(r"NIA_SL_WORD(\d+)_REAL(\d+)_([DFLRU])\.mp4", fname)
            if not m:
                # 진짜 이상한 mp4만 있으면 이때만 프린트 (지금은 거의 없을 것)
                print("예상 밖 파일명, 확인 필요:", os.path.join(root, fname))
                continue

            word_id, person_idx, angle = m.groups()
            rows.append({
                "person_folder": person_folder,
                "person_id": person_idx,
                "word_id": word_id,
                "angle": angle,
                "filename": fname,
                "path": os.path.join(root, fname),
            })

raw_df = pd.DataFrame(rows)
print("실제 영상 개수:", len(raw_df))
raw_df.head()

실제 영상 개수: 1100


Unnamed: 0,person_folder,person_id,word_id,angle,filename,path
0,P01,1,39,D,NIA_SL_WORD0039_REAL01_D.mp4,/content/drive/MyDrive/preprocessing/raw_video...
1,P01,1,187,F,NIA_SL_WORD0187_REAL01_F.mp4,/content/drive/MyDrive/preprocessing/raw_video...
2,P01,1,885,L,NIA_SL_WORD0885_REAL01_L.mp4,/content/drive/MyDrive/preprocessing/raw_video...
3,P01,1,39,R,NIA_SL_WORD0039_REAL01_R.mp4,/content/drive/MyDrive/preprocessing/raw_video...
4,P01,1,1496,L,NIA_SL_WORD1496_REAL01_L.mp4,/content/drive/MyDrive/preprocessing/raw_video...


## 4-1. 영상 정보 + 단어 정보를 하나의 표로 합치기

* raw_df -> 원본 영상 파일에서 뽑은 정보
* word_meta -> medical_sign_22.csv에서 가져온 정보
(두 정보를 word_id 기준으로 합침)

In [11]:
raw_with_meta = raw_df.merge(word_meta, on="word_id", how="left")
print("segment_start 없는 행 수:", raw_with_meta["segment_start"].isna().sum())
raw_with_meta.head()

segment_start 없는 행 수: 0


Unnamed: 0,person_folder,person_id,word_id,angle,filename,path,segment_start,segment_end,label
0,P01,1,39,D,NIA_SL_WORD0039_REAL01_D.mp4,/content/drive/MyDrive/preprocessing/raw_video...,2.122,3.847,변비
1,P01,1,187,F,NIA_SL_WORD0187_REAL01_F.mp4,/content/drive/MyDrive/preprocessing/raw_video...,1.51,3.329,간호사
2,P01,1,885,L,NIA_SL_WORD0885_REAL01_L.mp4,/content/drive/MyDrive/preprocessing/raw_video...,1.149,3.163,치료제
3,P01,1,39,R,NIA_SL_WORD0039_REAL01_R.mp4,/content/drive/MyDrive/preprocessing/raw_video...,2.122,3.847,변비
4,P01,1,1496,L,NIA_SL_WORD1496_REAL01_L.mp4,/content/drive/MyDrive/preprocessing/raw_video...,1.679,2.58,병원


In [14]:
# 인덱스 저장
raw_with_meta.to_csv(os.path.join(META_DIR, "video_index_raw.csv"), index=False)

# 5. ffmpeg로 영상 크롭

In [16]:
# ffmpeg로 영상 크롭

import subprocess
import os

def crop_with_ffmpeg(src, dst, start, end):
    duration = max(0, float(end) - float(start))
    if duration < 0.2:
        print("too short, skip:", src)
        return

    os.makedirs(os.path.dirname(dst), exist_ok=True)

    cmd = [
        "ffmpeg",
        "-y",
        "-i", src,
        "-ss", f"{start:.3f}",
        "-t",  f"{duration:.3f}",
        "-c:v", "libx264",
        "-c:a", "aac",
        "-loglevel", "error",
        dst,
    ]
    subprocess.run(cmd, check=True)

In [17]:
!ffmpeg -version | head -n 1

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers


In [19]:
from tqdm import tqdm
import os
import pandas as pd

BASE_DIR    = "/content/drive/MyDrive/preprocessing"
CROPPED_DIR = f"{BASE_DIR}/cropped_videos"
META_DIR    = f"{BASE_DIR}/metadata"

os.makedirs(CROPPED_DIR, exist_ok=True)

valid_rows = raw_with_meta.dropna(subset=["segment_start"])
print("크롭할 영상 수:", len(valid_rows))

cropped_rows = []

for _, row in tqdm(valid_rows.iterrows(), total=len(valid_rows)):
    word_id = row["word_id"]
    label   = row["label"]
    angle   = row["angle"]
    person  = row["person_id"]          # 예: "01"

    src_path = row["path"]

    # 폴더 구조: 단어 / 사람 / 파일
    word_folder_name = f"WORD{word_id}_{label}"
    person_folder_name = f"P{person}"   # P01, P02 ...

    out_folder = os.path.join(CROPPED_DIR, word_folder_name, person_folder_name)
    out_path   = os.path.join(out_folder, row["filename"])  # 원본 파일명 그대로 사용

    try:
        crop_with_ffmpeg(
            src=src_path,
            dst=out_path,
            start=float(row["segment_start"]),
            end=float(row["segment_end"])
        )
    except subprocess.CalledProcessError:
        print("ffmpeg error:", src_path)
        continue

    cropped_rows.append({
        "word_id": word_id,
        "label": label,
        "person_id": person,
        "angle": angle,
        "src_path": src_path,
        "cropped_path": out_path,
        "segment_start": row["segment_start"],
        "segment_end": row["segment_end"],
    })

cropped_df = pd.DataFrame(cropped_rows)
cropped_df.to_csv(os.path.join(META_DIR, "video_index_cropped.csv"), index=False)

print("완료된 크롭 영상 개수:", len(cropped_df))

크롭할 영상 수: 1100


  9%|▉         | 97/1100 [10:15<58:38,  3.51s/it]  

ffmpeg error: /content/drive/MyDrive/preprocessing/raw_videos/P01/sign_medical_videos_02/NIA_SL_WORD0400_REAL01_R.mp4


  9%|▉         | 100/1100 [10:29<1:01:31,  3.69s/it]

ffmpeg error: /content/drive/MyDrive/preprocessing/raw_videos/P01/sign_medical_videos_02/NIA_SL_WORD0400_REAL01_D.mp4


 10%|▉         | 109/1100 [11:11<1:07:29,  4.09s/it]

ffmpeg error: /content/drive/MyDrive/preprocessing/raw_videos/P01/sign_medical_videos_02/NIA_SL_WORD0400_REAL01_U.mp4


100%|██████████| 1100/1100 [1:54:44<00:00,  6.26s/it]

완료된 크롭 영상 개수: 1097





### 실패한 영상 3개 다시 시도

In [20]:
import pandas as pd

cropped_df = pd.DataFrame(cropped_rows)
print("성공한 크롭 개수:", len(cropped_df))

success_src = set(cropped_df["src_path"])
failed_df = raw_with_meta[~raw_with_meta["path"].isin(success_src)]

print("실패/스킵된 영상 수:", len(failed_df))
failed_df[["word_id", "label", "person_id", "angle", "path"]]

성공한 크롭 개수: 1097
실패/스킵된 영상 수: 3


Unnamed: 0,word_id,label,person_id,angle,path
96,400,정밀검사,1,R,/content/drive/MyDrive/preprocessing/raw_video...
99,400,정밀검사,1,D,/content/drive/MyDrive/preprocessing/raw_video...
108,400,정밀검사,1,U,/content/drive/MyDrive/preprocessing/raw_video...


In [22]:
import subprocess, os

def crop_with_ffmpeg_debug(src, dst, start, end):
    duration = max(0, float(end) - float(start))
    if duration < 0.2:
        print("too short, skip:", src)
        return

    os.makedirs(os.path.dirname(dst), exist_ok=True)

    cmd = [
        "ffmpeg",
        "-y",
        "-ss", f"{start:.3f}",
        "-i", src,
        "-t", f"{duration:.3f}",
        "-c", "copy",          # 재인코딩 없이 복사 크롭
        "-loglevel", "error",
        dst,
    ]
    print("▶ ffmpeg:", " ".join(cmd))
    subprocess.run(cmd, check=True)


In [23]:
from tqdm import tqdm

BASE_DIR    = "/content/drive/MyDrive/preprocessing"
CROPPED_DIR = f"{BASE_DIR}/cropped_videos"

new_rows = []

for _, row in tqdm(failed_df.iterrows(), total=len(failed_df)):
    word_id = row["word_id"]
    label   = row["label"]
    person  = row["person_id"]
    angle   = row["angle"]

    src_path = row["path"]

    word_folder_name   = f"WORD{word_id}_{label}"
    person_folder_name = f"P{person}"

    out_folder = os.path.join(CROPPED_DIR, word_folder_name, person_folder_name)
    out_path   = os.path.join(out_folder, row["filename"])

    if os.path.exists(out_path):
        print("이미 존재, 스킵:", out_path)
        continue

    try:
        crop_with_ffmpeg_debug(
            src=src_path,
            dst=out_path,
            start=float(row["segment_start"]),
            end=float(row["segment_end"])
        )
    except subprocess.CalledProcessError:
        print("다시 실패:", src_path)
        continue

    new_rows.append({
        "word_id": word_id,
        "label": label,
        "person_id": person,
        "angle": angle,
        "src_path": src_path,
        "cropped_path": out_path,
        "segment_start": row["segment_start"],
        "segment_end": row["segment_end"],
    })


  0%|          | 0/3 [00:00<?, ?it/s]

▶ ffmpeg: ffmpeg -y -ss 0.950 -i /content/drive/MyDrive/preprocessing/raw_videos/P01/sign_medical_videos_02/NIA_SL_WORD0400_REAL01_R.mp4 -t 1.990 -c copy -loglevel error /content/drive/MyDrive/preprocessing/cropped_videos/WORD0400_정밀검사/P01/NIA_SL_WORD0400_REAL01_R.mp4


 67%|██████▋   | 2/3 [00:00<00:00,  6.54it/s]

다시 실패: /content/drive/MyDrive/preprocessing/raw_videos/P01/sign_medical_videos_02/NIA_SL_WORD0400_REAL01_R.mp4
▶ ffmpeg: ffmpeg -y -ss 0.950 -i /content/drive/MyDrive/preprocessing/raw_videos/P01/sign_medical_videos_02/NIA_SL_WORD0400_REAL01_D.mp4 -t 1.990 -c copy -loglevel error /content/drive/MyDrive/preprocessing/cropped_videos/WORD0400_정밀검사/P01/NIA_SL_WORD0400_REAL01_D.mp4
다시 실패: /content/drive/MyDrive/preprocessing/raw_videos/P01/sign_medical_videos_02/NIA_SL_WORD0400_REAL01_D.mp4
▶ ffmpeg: ffmpeg -y -ss 0.950 -i /content/drive/MyDrive/preprocessing/raw_videos/P01/sign_medical_videos_02/NIA_SL_WORD0400_REAL01_U.mp4 -t 1.990 -c copy -loglevel error /content/drive/MyDrive/preprocessing/cropped_videos/WORD0400_정밀검사/P01/NIA_SL_WORD0400_REAL01_U.mp4


100%|██████████| 3/3 [00:00<00:00,  6.82it/s]

다시 실패: /content/drive/MyDrive/preprocessing/raw_videos/P01/sign_medical_videos_02/NIA_SL_WORD0400_REAL01_U.mp4



