In [None]:
# ==========================================
# Cell 0 : 추가 패키지 설치
# ==========================================

!pip install audio_separator
!pip install -U bitsandbytes

In [1]:
# ==========================================
# Cell 1: 경로 설정 및 토큰 활성화
# ==========================================

from pathlib import Path
import os, sys
import json

# ---- 프로젝트 루트 ----
PROJECT_ROOT = Path("/workspace/baseball_pipeline")
os.chdir(PROJECT_ROOT)

if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

print("PROJECT_ROOT:", PROJECT_ROOT)

# ---- 데이터 디렉토리 ----
from src import DATA_DIR, FAISS_DIR, FISH_ROOT

INPUT_VIDEO_DIR = DATA_DIR / "input_videos"
STT_RAW_DIR = DATA_DIR / "stt_raw"
STT_SEG_DIR = DATA_DIR / "stt_segments"
LLM_OUT_DIR = DATA_DIR / "llm_outputs"
TTS_AUDIO_DIR = DATA_DIR / "tts_audio"
OUTPUT_VIDEO_DIR = DATA_DIR / "output_videos"
AUDIO_ROOT = DATA_DIR / "audio_separator"
FRAMES_ROOT = DATA_DIR / "frames"
SRC_ROOT = PROJECT_ROOT / "src"

if str(SRC_ROOT) not in sys.path:
    sys.path.append(str(SRC_ROOT))

# 디렉토리 생성
for d in (DATA_DIR, INPUT_VIDEO_DIR, STT_RAW_DIR, STT_SEG_DIR, 
          LLM_OUT_DIR, TTS_AUDIO_DIR, OUTPUT_VIDEO_DIR, AUDIO_ROOT, 
          FRAMES_ROOT, FAISS_DIR, SRC_ROOT):
    d.mkdir(parents=True, exist_ok=True)

print("\n✅ 디렉토리 생성 완료")

# ---- API 토큰 설정 ----
CLOVA_INVOKE_URL = ""
CLOVA_SECRET_KEY = ""

HF_TOKEN = ""
# OPENAI_API_KEY = "sk-proj-..."  # 필요시 입력

if HF_TOKEN and "xxx" not in HF_TOKEN:
    os.environ["HF_TOKEN"] = HF_TOKEN
    os.environ["HUGGINGFACE_HUB_TOKEN"] = HF_TOKEN
    os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN

# if OPENAI_API_KEY:
#     os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

print("✅ API 토큰 설정 완료\n")

PROJECT_ROOT: /workspace/baseball_pipeline

✅ 디렉토리 생성 완료
✅ API 토큰 설정 완료



In [2]:
from src.google_mp4_download import download_gdrive_video

# ====== 여기에 구글 드라이브 링크 입력 ======
gdrive_url = "https://drive.google.com/file/d/118P_6YtVh65rwyX3N5wNaAqUeBoMX4-4/view?usp=sharing"
VIDEO_NAME = "한화_삼성_10_21_2025_플레이오프_3차전.mp4"

local_video_path = download_gdrive_video(gdrive_url, dest_name=VIDEO_NAME)
video_stem = Path(VIDEO_NAME).stem

print(f"\n✅ 영상 다운로드 완료")
print(f"  video_stem: {video_stem}")
print(f"  경로: {local_video_path}\n")

[GDRIVE] 파일이 이미 존재합니다: /workspace/skn17_final_runpod_code/baseball_pipeline/data/input_videos/한화_삼성_10_21_2025_플레이오프_3차전.mp4

✅ 영상 다운로드 완료
  video_stem: 한화_삼성_10_21_2025_플레이오프_3차전
  경로: /workspace/skn17_final_runpod_code/baseball_pipeline/data/input_videos/한화_삼성_10_21_2025_플레이오프_3차전.mp4



In [3]:
# ==========================================
# Cell 3: audio_separator 음성 분리
# ==========================================
from src.audio_separator import separate_audio_sota

track_dict = separate_audio_sota(
    video_path=str(local_video_path),
    output_dir=str(AUDIO_ROOT),
    device="cuda"
)

vocals_path = track_dict["vocals"]
no_vocals_path = track_dict["no_vocals"]

print(f"\n✅ 음성 분리 완료! (SOTA Performance)")
print(f"  - 해설(Vocals)    : {vocals_path}")
print(f"  - 현장음(No Vocals): {no_vocals_path}\n")


2025-12-08 01:09:06,421 - INFO - separator - Separator version 0.40.0 instantiating with output_dir: /workspace/skn17_final_runpod_code/baseball_pipeline/data/audio_separator, output_format: wav
2025-12-08 01:09:06,421 - INFO - separator - Using model directory from model_file_dir parameter: /workspace/skn17_final_runpod_code/baseball_pipeline/data/audio_separator/models
2025-12-08 01:09:06,423 - INFO - separator - Operating System: Linux #68~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Tue Jul 15 18:06:34 UTC 2
2025-12-08 01:09:06,423 - INFO - separator - System: Linux Node: 2ba1c709481c Release: 6.8.0-65-generic Machine: x86_64 Proc: x86_64
2025-12-08 01:09:06,424 - INFO - separator - Python Version: 3.10.19
2025-12-08 01:09:06,424 - INFO - separator - PyTorch Version: 2.5.1+cu121
2025-12-08 01:09:06,484 - INFO - separator - FFmpeg installed: ffmpeg version 6.1.1-3ubuntu5 Copyright (c) 2000-2023 the FFmpeg developers
2025-12-08 01:09:06,492 - INFO - separator - ONNX Runtime CPU package install


 오디오 분리 시작: /workspace/skn17_final_runpod_code/baseball_pipeline/data/input_videos/한화_삼성_10_21_2025_플레이오프_3차전.mp4
  사용 모델: model_bs_roformer_ep_317_sdr_12.9755.ckpt (BS-RoFormer-ViperX)
  실행 장치: cuda


2025-12-08 01:09:06,647 - INFO - separator - CUDA is available in Torch, setting Torch device to CUDA
2025-12-08 01:09:06,649 - INFO - separator - Loading model model_bs_roformer_ep_317_sdr_12.9755.ckpt...


  >> 모델 로딩 중... (최초 실행 시 다운로드에 시간이 소요됩니다)


2025-12-08 01:09:11,749 - INFO - mdxc_separator - MDXC Separator initialisation complete
2025-12-08 01:09:11,755 - INFO - separator - Roformer loading stats: {'new_implementation_success': 1, 'total_failures': 0}
2025-12-08 01:09:11,756 - INFO - separator - Load model duration: 00:00:05
2025-12-08 01:09:11,761 - INFO - separator - Processing file: /workspace/skn17_final_runpod_code/baseball_pipeline/data/input_videos/한화_삼성_10_21_2025_플레이오프_3차전.mp4
2025-12-08 01:09:11,762 - INFO - separator - Starting separation process for audio_file_path: /workspace/skn17_final_runpod_code/baseball_pipeline/data/input_videos/한화_삼성_10_21_2025_플레이오프_3차전.mp4


  >> 분리 작업 수행 중...


100%|██████████| 113/113 [01:45<00:00,  1.07it/s]
2025-12-08 01:11:04,046 - INFO - mdxc_separator - Saving Instrumental stem to 한화_삼성_10_21_2025_플레이오프_3차전_(Instrumental)_model_bs_roformer_ep_317_sdr_12.wav...
2025-12-08 01:11:04,242 - INFO - common_separator - Audio duration is 0.25 hours (899.40 seconds).
2025-12-08 01:11:04,243 - INFO - common_separator - Using pydub for writing.
2025-12-08 01:11:04,467 - INFO - common_separator - Writing output with 16-bit depth
2025-12-08 01:11:07,008 - INFO - mdxc_separator - Saving Vocals stem to 한화_삼성_10_21_2025_플레이오프_3차전_(Vocals)_model_bs_roformer_ep_317_sdr_12.wav...
2025-12-08 01:11:07,175 - INFO - common_separator - Audio duration is 0.25 hours (899.40 seconds).
2025-12-08 01:11:07,177 - INFO - common_separator - Using pydub for writing.
2025-12-08 01:11:07,458 - INFO - common_separator - Writing output with 16-bit depth
2025-12-08 01:11:10,208 - INFO - common_separator - Clearing input audio file paths, sources and stems...
2025-12-08 01:11

  >> 파일 정리 및 이름 변경 중...

✅ 음성 분리 완료! (SOTA Performance)
  - 해설(Vocals)    : /workspace/skn17_final_runpod_code/baseball_pipeline/data/audio_separator/vocals.wav
  - 현장음(No Vocals): /workspace/skn17_final_runpod_code/baseball_pipeline/data/audio_separator/no_vocals.wav



In [4]:
# ==========================================
# Cell 4: STT (Clova Speech API)
# ==========================================
from pathlib import Path
import json

from src.stt_pipeline import run_stt_pipeline
from src.stt_event_splitter import stt_json_to_event_sets

print(f"[STT] Clova STT 시작")
print(f"  입력: {vocals_path}")

STT_KEYWORD_XLSX = PROJECT_ROOT / "stt.xlsx"
if STT_KEYWORD_XLSX.exists():
    xlsx_path = STT_KEYWORD_XLSX
    use_domain = False
    print("  키워드: stt.xlsx 사용 (엑셀 부스팅)")
else:
    xlsx_path = None
    use_domain = True
    print("  키워드: 도메인 부스팅만 사용")

# ---- STT 파이프라인 실행 ----
timeline_json_path = run_stt_pipeline(
    audio_path=vocals_path,
    invoke_url=CLOVA_INVOKE_URL,
    secret_key=CLOVA_SECRET_KEY,
    stt_raw_dir=STT_RAW_DIR,
    stt_seg_dir=STT_SEG_DIR,
    xlsx_keywords_path=xlsx_path,
    use_domain_boostings=use_domain,
    speaker_count_min=2,
    speaker_count_max=3,
    save_raw_json=True,
    pause_thresh_ms=50000,  # 필요시 조절
)

print(f"\n✅ STT 완료!")
print(f"timeline.json : {timeline_json_path}\n")

[STT] Clova STT 시작
  입력: /workspace/skn17_final_runpod_code/baseball_pipeline/data/audio_separator/vocals.wav
  키워드: stt.xlsx 사용 (엑셀 부스팅)
[STT_PIPELINE] Clova STT 요청 시작: /workspace/skn17_final_runpod_code/baseball_pipeline/data/audio_separator/vocals.wav
[STT_PIPELINE] raw JSON 저장 -> /workspace/skn17_final_runpod_code/baseball_pipeline/data/stt_raw/vocals.clova_raw.json
[STT_PIPELINE] timeline JSON 저장 -> /workspace/skn17_final_runpod_code/baseball_pipeline/data/stt_segments/vocals.timeline.json

✅ STT 완료!
timeline.json : /workspace/skn17_final_runpod_code/baseball_pipeline/data/stt_segments/vocals.timeline.json



In [5]:
# ==========================================
# Cell 5: STT 데이터 전처리 (이벤트 세트)
# ==========================================
from src.stt_event_splitter import stt_json_to_event_sets

timeline_json_stem = timeline_json_path.stem
# 같은 디렉터리에 *_output.json 으로 저장
json_after_split_path = timeline_json_path.with_name(f"{timeline_json_stem}_set_split.json")

# 입력 JSON 로드
with timeline_json_path.open("r", encoding="utf-8") as f:
    stt_json = json.load(f)

# 이벤트 세트 변환
event_sets = stt_json_to_event_sets(
    stt_json,
    caster_gap=10.0,   # 필요시 튜닝
    silence_gap=2.0,  # 필요시 튜닝
)

print(f"이벤트 세트 개수: {len(event_sets)}")

# 출력 JSON 저장
with json_after_split_path.open("w", encoding="utf-8") as f:
    json.dump(event_sets, f, ensure_ascii=False, indent=2)

print(f"저장 완료: {json_after_split_path.resolve()}")

이벤트 세트 개수: 55
저장 완료: /workspace/skn17_final_runpod_code/baseball_pipeline/data/stt_segments/vocals.timeline_set_split.json


In [6]:
# ==========================================
# Cell 6: 영상 이미지 추출
# ==========================================

import json

from src.image_extraction import capture_frames_for_sets

# 세트 json 로드
with json_after_split_path.open("r", encoding="utf-8") as f:
    sets = json.load(f)   # 리스트 형태여야 함

print(f"세트 개수: {len(sets)}")

# 5) 세트의 set_start_sec 기준으로 이미지 추출
results = capture_frames_for_sets(
    video_path=local_video_path,
    sets=sets,
    output_dir=FRAMES_ROOT
)

# 6) 요약 출력
success_count = sum(1 for r in results if r["success"])
print(f"성공적으로 저장된 이미지 수: {success_count}")


세트 개수: 55
[INFO] 파일: /workspace/skn17_final_runpod_code/baseball_pipeline/data/input_videos/한화_삼성_10_21_2025_플레이오프_3차전.mp4
[INFO] FPS = 59.94005994005994, 총 프레임 수 = 53908, 길이 ≈ 899.365초
[DEBUG] set_id=vocals-1, ts=40.932s, frame_idx=2453, ret=True
[INFO] 세트 vocals-1: 40.932s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-1.jpg
[DEBUG] set_id=vocals-2, ts=57.630s, frame_idx=3454, ret=True
[INFO] 세트 vocals-2: 57.630s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-2.jpg
[DEBUG] set_id=vocals-3, ts=69.690s, frame_idx=4177, ret=True
[INFO] 세트 vocals-3: 69.690s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-3.jpg
[DEBUG] set_id=vocals-4, ts=82.370s, frame_idx=4937, ret=True
[INFO] 세트 vocals-4: 82.370s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-4.jpg
[DEBUG] set_id=vocals-5, ts=107.750s, frame_idx=6459, ret=True
[INFO] 세트 vocals-5: 107.750s 프레임 저장 → /work

[h264 @ 0x40342f40] mmco: unref short failure
[h264 @ 0x40342f40] mmco: unref short failure


[DEBUG] set_id=vocals-20, ts=316.090s, frame_idx=18946, ret=True
[INFO] 세트 vocals-20: 316.090s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-20.jpg
[DEBUG] set_id=vocals-21, ts=328.180s, frame_idx=19671, ret=True
[INFO] 세트 vocals-21: 328.180s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-21.jpg
[DEBUG] set_id=vocals-22, ts=345.810s, frame_idx=20728, ret=True
[INFO] 세트 vocals-22: 345.810s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-22.jpg


[h264 @ 0x40342f40] mmco: unref short failure
[h264 @ 0x40342f40] mmco: unref short failure


[DEBUG] set_id=vocals-23, ts=354.420s, frame_idx=21244, ret=True
[INFO] 세트 vocals-23: 354.420s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-23.jpg
[DEBUG] set_id=vocals-24, ts=369.501s, frame_idx=22148, ret=True
[INFO] 세트 vocals-24: 369.501s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-24.jpg
[DEBUG] set_id=vocals-25, ts=416.690s, frame_idx=24976, ret=True
[INFO] 세트 vocals-25: 416.690s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-25.jpg


[h264 @ 0x40342f40] mmco: unref short failure


[DEBUG] set_id=vocals-26, ts=430.110s, frame_idx=25781, ret=True
[INFO] 세트 vocals-26: 430.110s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-26.jpg
[DEBUG] set_id=vocals-27, ts=438.220s, frame_idx=26267, ret=True
[INFO] 세트 vocals-27: 438.220s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-27.jpg
[SKIP] 세트 vocals-28: analyst_text가 null/빈 문자열 → 이미지 추출 건너뜀
[SKIP] 세트 vocals-29: analyst_text가 null/빈 문자열 → 이미지 추출 건너뜀
[SKIP] 세트 vocals-30: analyst_text가 null/빈 문자열 → 이미지 추출 건너뜀
[DEBUG] set_id=vocals-31, ts=501.570s, frame_idx=30064, ret=True
[INFO] 세트 vocals-31: 501.570s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-31.jpg
[DEBUG] set_id=vocals-32, ts=518.780s, frame_idx=31096, ret=True
[INFO] 세트 vocals-32: 518.780s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-32.jpg
[DEBUG] set_id=vocals-33, ts=536.588s, frame_idx=32163, ret=True
[INFO] 세트 vocals-33: 536.5

[h264 @ 0x40342f40] mmco: unref short failure


[INFO] 세트 vocals-36: 575.520s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-36.jpg
[DEBUG] set_id=vocals-37, ts=584.350s, frame_idx=35026, ret=True
[INFO] 세트 vocals-37: 584.350s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-37.jpg
[SKIP] 세트 vocals-38: analyst_text가 null/빈 문자열 → 이미지 추출 건너뜀
[DEBUG] set_id=vocals-39, ts=614.490s, frame_idx=36833, ret=True
[INFO] 세트 vocals-39: 614.490s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-39.jpg
[DEBUG] set_id=vocals-40, ts=626.212s, frame_idx=37535, ret=True
[INFO] 세트 vocals-40: 626.212s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-40.jpg
[DEBUG] set_id=vocals-41, ts=647.600s, frame_idx=38817, ret=True
[INFO] 세트 vocals-41: 647.600s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-41.jpg
[DEBUG] set_id=vocals-42, ts=669.700s, frame_idx=40142, ret=True
[INFO] 세트 vocals-42: 669.7

[h264 @ 0x40342f40] mmco: unref short failure
[h264 @ 0x40342f40] mmco: unref short failure


[DEBUG] set_id=vocals-47, ts=751.460s, frame_idx=45043, ret=True
[INFO] 세트 vocals-47: 751.460s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-47.jpg
[DEBUG] set_id=vocals-48, ts=762.506s, frame_idx=45705, ret=True
[INFO] 세트 vocals-48: 762.506s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-48.jpg
[DEBUG] set_id=vocals-49, ts=781.510s, frame_idx=46844, ret=True
[INFO] 세트 vocals-49: 781.510s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-49.jpg
[SKIP] 세트 vocals-50: analyst_text가 null/빈 문자열 → 이미지 추출 건너뜀
[DEBUG] set_id=vocals-51, ts=806.430s, frame_idx=48337, ret=True
[INFO] 세트 vocals-51: 806.430s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-51.jpg
[DEBUG] set_id=vocals-52, ts=821.360s, frame_idx=49232, ret=True
[INFO] 세트 vocals-52: 821.360s 프레임 저장 → /workspace/skn17_final_runpod_code/baseball_pipeline/data/frames/vocals-52.jpg
[DEBUG] set_id=vocals-53, 

[h264 @ 0x40342f40] mmco: unref short failure


In [8]:
# ==========================================
# Cell 7: VLM 스코어보드 추출 (최적화 버전)
# ==========================================

from pathlib import Path

from src.vlm_scoreboard import (
    load_scoreboard_model_and_processor,
    attach_scoreboard_to_sets,
)

# ==========================
# 모델 / 프로세서 로드
# ==========================

vlm_model, vlm_processor = load_scoreboard_model_and_processor()


# ==========================
# scoreboard 파이프라인 실행
# ==========================
json_after_split_stem = json_after_split_path.stem
# 같은 디렉터리에 *_output.json 으로 저장
scoreboard_json_path = json_after_split_path.with_name(f"{json_after_split_stem}_scoreboard.json")

updated_sets = attach_scoreboard_to_sets(
    json_after_split_path=json_after_split_path,
    output_json_path=scoreboard_json_path,
    frames_root=FRAMES_ROOT,
    video_path=local_video_path,
    model=vlm_model,
    processor=vlm_processor,
    retry_if_all_null=False,   # all-null이면 +2초 재시도
    retry_offset_sec=2.0,
)

print(f"총 세트 수: {len(updated_sets)}")
print(f"저장 완료: {scoreboard_json_path.resolve()}")

# 일부만 눈으로 확인
for s in updated_sets[:5]:
    print("=" * 80)
    print("set_id:", s["set_id"])
    print("set_start_sec:", s["set_start_sec"])
    print("scoreboard:", s["scoreboard"])

compute_dtype = torch.bfloat16


Loading checkpoint shards: 100%|██████████| 4/4 [00:14<00:00,  3.50s/it]


모델 / 프로세서 로드 완료
[INFO] 세트 개수: 55
[INFO] 파일: /workspace/skn17_final_runpod_code/baseball_pipeline/data/input_videos/한화_삼성_10_21_2025_플레이오프_3차전.mp4
[INFO] FPS = 59.94005994005994, 총 프레임 수 = 53908, 길이 ≈ 899.365초

[SET] set_id=vocals-1, set_start_sec=40.932


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


[INFO] 1차 scoreboard: {'원정팀': '한화 이글스', '홈팀': '삼성 라이온즈', '원정팀 점수': None, '홈팀 점수': None, '이닝': None, '이닝 상황': None, '볼': None, '스트라이크': None, '아웃': None, '주자': None, '투수 이름': None, '투구 수': None, '타자 이름': None, '타자 타순': None, '타자 경기 기록': None}

[SET] set_id=vocals-2, set_start_sec=57.630
[INFO] 1차 scoreboard: {'원정팀': None, '홈팀': None, '원정팀 점수': None, '홈팀 점수': None, '이닝': None, '이닝 상황': None, '볼': None, '스트라이크': None, '아웃': None, '주자': None, '투수 이름': None, '투구 수': None, '타자 이름': None, '타자 타순': None, '타자 경기 기록': None}

[SET] set_id=vocals-3, set_start_sec=69.690
[INFO] 1차 scoreboard: {'원정팀': None, '홈팀': None, '원정팀 점수': None, '홈팀 점수': None, '이닝': None, '이닝 상황': None, '볼': None, '스트라이크': None, '아웃': None, '주자': None, '투수 이름': None, '투구 수': None, '타자 이름': None, '타자 타순': None, '타자 경기 기록': None}

[SET] set_id=vocals-4, set_start_sec=82.370
[INFO] 1차 scoreboard: {'원정팀': '한화', '홈팀': '삼성', '원정팀 점수': 0, '홈팀 점수': 0, '이닝': 1, '이닝 상황': '말', '볼': 0, '스트라이크': 1, '아웃': 2, '주자': {'1루': False, '2루': False,

In [9]:
from pathlib import Path
import json

# from pakchanho_commentary_generator import (
#     load_pakchanho_model,
#     generate_analyst_for_all_sets,
# )

from src.llm_generator import (
    load_pakchanho_model,
    generate_analyst_for_all_sets,
)

# Pakchanho LLM 적용 후 저장할 경로
json_llm_output_path = LLM_OUT_DIR / "vocals_timeline_set_split_scoreboard_pakchanho.json"

# 1) 모델 로드
model, tokenizer = load_pakchanho_model(
    base_model_name="kakaocorp/kanana-1.5-8b-instruct-2505",
    lora_model_id="SeHee8546/kanana-1.5-8b-pakchanho-lora-v2",
    load_in_4bit=True,
)

# 2) 세트별 analyst_text 재생성
game_title = "2025 KBO 준플레이오프 4차전 삼성 vs SSG"  # 경기 정보는 상황에 맞게 바꾸면 됨

result_sets = generate_analyst_for_all_sets(
    json_in_path=scoreboard_json_path,
    json_out_path=json_llm_output_path,
    model=model,
    tokenizer=tokenizer,
    game_title=game_title,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
    no_repeat_ngram_size=3,
    base_max_new_tokens=512,
)

print(f"총 세트 수: {len(result_sets)}")
print(f"저장 완료: {json_llm_output_path.resolve()}")

# 3) 샘플 몇 개 확인
for row in result_sets[:5]:
    print("=" * 80)
    print("set_id:", row["set_id"])
    print("caster_text:", row["caster_text"])
    print("analyst_text:", row["analyst_text"])

Loading checkpoint shards: 100%|██████████| 4/4 [00:14<00:00,  3.69s/it]


[INFO] Pakchanho model + tokenizer loaded.
[INFO] 세트 개수: 55
[LLM] vocals-1: analyst_text 재생성 중...
[LLM] vocals-2: analyst_text 재생성 중...
[LLM] vocals-3: analyst_text 재생성 중...
[LLM] vocals-4: analyst_text 재생성 중...
[LLM] vocals-5: analyst_text 재생성 중...
[LLM] vocals-6: analyst_text 재생성 중...
[LLM] vocals-7: analyst_text 재생성 중...
[LLM] vocals-8: analyst_text 재생성 중...
[LLM] vocals-9: analyst_text 재생성 중...
[SKIP] vocals-10: analyst_text 없음 → LLM 미호출
[LLM] vocals-11: analyst_text 재생성 중...
[LLM] vocals-12: analyst_text 재생성 중...
[LLM] vocals-13: analyst_text 재생성 중...
[LLM] vocals-14: analyst_text 재생성 중...
[LLM] vocals-15: analyst_text 재생성 중...
[LLM] vocals-16: analyst_text 재생성 중...
[SKIP] vocals-17: analyst_text 없음 → LLM 미호출
[LLM] vocals-18: analyst_text 재생성 중...
[LLM] vocals-19: analyst_text 재생성 중...
[LLM] vocals-20: analyst_text 재생성 중...
[LLM] vocals-21: analyst_text 재생성 중...
[LLM] vocals-22: analyst_text 재생성 중...
[LLM] vocals-23: analyst_text 재생성 중...
[LLM] vocals-24: analyst_text 재생성 중...
[LL

In [10]:
# ==========================================
# JSON 기반 TTS 전체 파이프라인
# ==========================================

from pathlib import Path
import pandas as pd

from src.json_tts_pipeline import run_full_tts_pipeline_from_json


# 이 영상에 대한 정보들
video_stem = local_video_path.stem

# LLM을 통과한 최종 세트 JSON (지금 올려준 파일)
# 현재는 이름이 "vocals_timeline_set_split_scoreboard_pakchanho (4).json" 이니까
# 1) 파일명을 위 규칙으로 바꾸거나
# 2) 그냥 정확한 이름을 직접 Path 로 넣어도 됩니다.
# json_sets_path = DATA_DIR / "llm_outputs" / "vocals_timeline_set_split_scoreboard_pakchanho (4).json"

# Fish-Speech API 설정
FISH_API_URL = "http://127.0.0.1:8080/v1/tts"

CASTER_REF_WAVS = [DATA_DIR / "tts_refs" / "caster_prompt_1.wav"]
ANALYST_REF_WAVS = [DATA_DIR / "tts_refs" / "analyst_pakchanho_prompt_1.wav"]

print(f"[TTS] JSON 기반 TTS 파이프라인 시작")
print(f"  JSON 세트 파일: {json_llm_output_path}")
print(f"  원본 영상: {local_video_path}")
print(f"  캐스터 참조: {CASTER_REF_WAVS}")
print(f"  해설 참조: {ANALYST_REF_WAVS}")

try:
    final_tts_wav, aligned_csv, tts_csv_with_paths = run_full_tts_pipeline_from_json(
        json_sets_path=json_llm_output_path,
        video_path=local_video_path,
        caster_ref_wavs=CASTER_REF_WAVS,
        analyst_ref_wavs=ANALYST_REF_WAVS,
        fish_api_url=FISH_API_URL,
        # 아래 파라미터들은 기존 셋업 그대로 사용 (필요하면 튜닝 가능)
        min_text_chars=2,
        merge_same_role=True,
        merge_gap_thresh_sec=0.25,
        merge_short_thresh_sec=1.0,
        min_gap_sec=0.02,
        caster_extra_ratio=0.2,
        analyst_extra_ratio=2.0,
        max_analyst_expand_sec=7.0,
        analyst_priority_min_overlap_sec=0.5,
        min_gap_ms=60,
        tail_margin_ms=80,
        caster_max_speedup=1.3,
        analyst_max_speedup=1.8,
    )

    print("\n✅ JSON → TTS → 정렬 → WSOLA 전체 완료!")
    print(f"  - TTS CSV(with paths): {tts_csv_with_paths}")
    print(f"  - 정렬 CSV: {aligned_csv}")
    print(f"  - 최종 TTS 타임라인 wav: {final_tts_wav}\n")

    # 통계 출력 (선택)
    tts_df = pd.read_csv(tts_csv_with_paths)
    print("📊 TTS 통계:")
    print(f"  - 총 발화 수: {len(tts_df)}")
    print(f"  - TTS 성공: {tts_df['tts_wav_path'].notna().sum()}개")
    print(f"  - TTS 실패: {tts_df['tts_wav_path'].isna().sum()}개")

except Exception as e:
    print(f"\n❌ JSON 기반 TTS 파이프라인 실패: {e}")
    import traceback
    traceback.print_exc()
    raise


[TTS] JSON 기반 TTS 파이프라인 시작
  JSON 세트 파일: /workspace/skn17_final_runpod_code/baseball_pipeline/data/llm_outputs/vocals_timeline_set_split_scoreboard_pakchanho.json
  원본 영상: /workspace/skn17_final_runpod_code/baseball_pipeline/data/input_videos/한화_삼성_10_21_2025_플레이오프_3차전.mp4
  캐스터 참조: [PosixPath('/workspace/skn17_final_runpod_code/baseball_pipeline/data/tts_refs/caster_prompt_1.wav')]
  해설 참조: [PosixPath('/workspace/skn17_final_runpod_code/baseball_pipeline/data/tts_refs/analyst_pakchanho_prompt_1.wav')]
[JSON_TTS] JSON → CSV 변환 완료: /workspace/skn17_final_runpod_code/baseball_pipeline/data/llm_outputs/한화_삼성_10_21_2025_플레이오프_3차전.tts_phrases.from_json.csv
[JSON_TTS] 총 발화 수: 101 (rows)
[TTS_API] 입력 CSV: /workspace/skn17_final_runpod_code/baseball_pipeline/data/llm_outputs/한화_삼성_10_21_2025_플레이오프_3차전.tts_phrases.from_json.csv
[TTS_API] video_stem: 한화_삼성_10_21_2025_플레이오프_3차전
[TTS_API] 출력 디렉토리: /workspace/skn17_final_runpod_code/baseball_pipeline/data/tts_audio/한화_삼성_10_21_2025_플레이오프_3차전
[TTS_A

In [11]:
# ==========================================
# Cell 12: 최종 영상 인코딩
# ==========================================

import subprocess

OUTPUT_VIDEO_DIR = DATA_DIR / "final_videos"
OUTPUT_VIDEO_DIR.mkdir(parents=True, exist_ok=True)

# Demucs 등에서 만든 no_vocals wav 경로 (기존 파이프라인에 맞게 설정)
vocals_path = DATA_DIR / "demucs" / f"{video_stem}.vocals.wav"
no_vocals_path = DATA_DIR / "demucs" / f"{video_stem}.no_vocals.wav"

def merge_audio_and_encode_video(
    original_video_path: Path,
    tts_vocals_path: Path,
    bg_no_vocals_path: Path,
    output_video_path: Path,
    tts_volume: float = 1.0,
    bg_volume: float = 0.7,
) -> Path:
    """
    1) TTS vocals + 배경음(no_vocals) 믹싱
    2) 원본 비디오와 합쳐서 최종 영상 생성
    """
    output_video_path.parent.mkdir(parents=True, exist_ok=True)
    
    cmd = [
        "ffmpeg", "-y",
        "-i", str(original_video_path),
        "-i", str(tts_vocals_path),
        "-i", str(bg_no_vocals_path),
        "-filter_complex",
        f"[1:a]volume={tts_volume}[a1];"
        f"[2:a]volume={bg_volume}[a2];"
        f"[a1][a2]amix=inputs=2:duration=first[amix]",
        "-map", "0:v:0",
        "-map", "[amix]",
        "-c:v", "copy",
        "-c:a", "aac",
        "-shortest",
        str(output_video_path),
    ]
    
    print("[ENCODING] 최종 영상 인코딩 중...")
    print(" ".join(cmd))
    subprocess.run(cmd, check=True)
    
    print(f"[ENCODING] 완료: {output_video_path}")
    return output_video_path


final_video_path = OUTPUT_VIDEO_DIR / f"{video_stem}.final.mp4"

print(f"[ENCODING] 최종 영상 생성 시작")
print(f"  원본 비디오: {local_video_path}")
print(f"  TTS 음성: {final_tts_wav}")
print(f"  배경음: {no_vocals_path}")
print(f"  출력: {final_video_path}")

try:
    final_video = merge_audio_and_encode_video(
        original_video_path=local_video_path,
        tts_vocals_path=final_tts_wav,
        bg_no_vocals_path=no_vocals_path,
        output_video_path=final_video_path,
        tts_volume=1.0,
        bg_volume=0.7,
    )
    
    print("\n" + "="*80)
    print("🎉 전체 파이프라인 완료!")
    print("="*80)
    print(f"최종 영상: {final_video}")
    print(f"파일 크기: {final_video.stat().st_size / (1024**2):.2f} MB")
    print("="*80 + "\n")

except Exception as e:
    print(f"\n❌ 최종 인코딩 실패: {e}")
    import traceback
    traceback.print_exc()
    raise


[ENCODING] 최종 영상 생성 시작
  원본 비디오: /workspace/skn17_final_runpod_code/baseball_pipeline/data/input_videos/한화_삼성_10_21_2025_플레이오프_3차전.mp4
  TTS 음성: /workspace/skn17_final_runpod_code/baseball_pipeline/data/tts_audio/한화_삼성_10_21_2025_플레이오프_3차전/한화_삼성_10_21_2025_플레이오프_3차전.tts_timeline.wav
  배경음: /workspace/skn17_final_runpod_code/baseball_pipeline/data/demucs/한화_삼성_10_21_2025_플레이오프_3차전.no_vocals.wav
  출력: /workspace/skn17_final_runpod_code/baseball_pipeline/data/final_videos/한화_삼성_10_21_2025_플레이오프_3차전.final.mp4
[ENCODING] 최종 영상 인코딩 중...
ffmpeg -y -i /workspace/skn17_final_runpod_code/baseball_pipeline/data/input_videos/한화_삼성_10_21_2025_플레이오프_3차전.mp4 -i /workspace/skn17_final_runpod_code/baseball_pipeline/data/tts_audio/한화_삼성_10_21_2025_플레이오프_3차전/한화_삼성_10_21_2025_플레이오프_3차전.tts_timeline.wav -i /workspace/skn17_final_runpod_code/baseball_pipeline/data/demucs/한화_삼성_10_21_2025_플레이오프_3차전.no_vocals.wav -filter_complex [1:a]volume=1.0[a1];[2:a]volume=0.7[a2];[a1][a2]amix=inputs=2:duration=first[amix

ffmpeg version 6.1.1-3ubuntu5 Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 13 (Ubuntu 13.2.0-23ubuntu3)
  configuration: --prefix=/usr --extra-version=3ubuntu5 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --disable-omx --enable-gnutls --enable-libaom --enable-libass --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libharfbuzz --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --ena

CalledProcessError: Command '['ffmpeg', '-y', '-i', '/workspace/skn17_final_runpod_code/baseball_pipeline/data/input_videos/한화_삼성_10_21_2025_플레이오프_3차전.mp4', '-i', '/workspace/skn17_final_runpod_code/baseball_pipeline/data/tts_audio/한화_삼성_10_21_2025_플레이오프_3차전/한화_삼성_10_21_2025_플레이오프_3차전.tts_timeline.wav', '-i', '/workspace/skn17_final_runpod_code/baseball_pipeline/data/demucs/한화_삼성_10_21_2025_플레이오프_3차전.no_vocals.wav', '-filter_complex', '[1:a]volume=1.0[a1];[2:a]volume=0.7[a2];[a1][a2]amix=inputs=2:duration=first[amix]', '-map', '0:v:0', '-map', '[amix]', '-c:v', 'copy', '-c:a', 'aac', '-shortest', '/workspace/skn17_final_runpod_code/baseball_pipeline/data/final_videos/한화_삼성_10_21_2025_플레이오프_3차전.final.mp4']' returned non-zero exit status 254.