In [None]:
# Environment setup: install the pip packages, clone the TransNetV2 repository,
# and change into its `inference` folder (presumably so that the
# `from transnetv2 import TransNetV2` import in the next cell resolves).
!pip install ffmpeg-python pillow
!git clone https://github.com/soCzech/TransNetV2.git
%cd TransNetV2/inference

In [None]:
import os
import cv2
import json
import glob
import torch
import ffmpeg
import numpy as np
from tqdm.auto import tqdm
from transnetv2 import TransNetV2
from timeit import default_timer as timer

In [None]:
def collect_video_paths(input_dirs):
    """Index every ``.mp4`` stored under ``<input_dir>/Videos_<part>/video``.

    Args:
        input_dirs: Iterable of dataset root directories to scan. Roots that
            do not exist are reported on stdout and skipped.

    Returns:
        dict mapping a part name (the text after the last ``_`` of the
        ``Videos_*`` folder name, e.g. ``L01``) to a dict
        ``{video_id: full_path}``. ``video_id`` is the text after the last
        ``_`` of the file name with ``.mp4`` removed. A part whose ``video``
        sub-folder is missing is still listed, with an empty dict.
    """
    found = dict()

    for videos_dir in input_dirs:
        if not os.path.exists(videos_dir):
            print(f"Directory not found: {videos_dir}")
            continue

        for part in sorted(os.listdir(videos_dir)):
            # Only folders following the "Videos_Lxx" naming are dataset parts
            if not part.startswith("Videos_"):
                continue

            data_part = part.split('_')[-1]  # Extracts L01, L02...
            part_videos = found.setdefault(data_part, dict())

            # Use the real folder name instead of rebuilding it from
            # data_part, so the scanned path is always the folder just listed.
            data_part_path = f'{videos_dir}/{part}/video'
            if not os.path.isdir(data_part_path):
                continue

            for file_name in sorted(os.listdir(data_part_path)):
                if not file_name.endswith('.mp4'):
                    continue
                # The id is derived from the very file it is stored with; the
                # previous zip() of a filtered id list against the unfiltered
                # file list shifted ids onto the wrong files as soon as the
                # folder contained any non-mp4 entry.
                video_id = file_name.replace('.mp4', '').split('_')[-1]
                part_videos[video_id] = f'{data_part_path}/{file_name}'

    return found


# Dataset roots to scan
input_dirs = [
    '/kaggle/input/aic2024-videos-part1-1', # Contains L01 to L06
    # '/kaggle/input/aic2024-videos-part1'     # Contains L07 to L12
]

all_video_paths = collect_video_paths(input_dirs)

print(f"Total Parts Found: {sorted(all_video_paths.keys())}")

In [None]:
import math

# Work-splitting configuration: the full video list is cut into
# num_batch * num_member consecutive slices; this worker takes the slice
# addressed by (BATCH_ID, MEMBER_ID).
num_batch = 1
BATCH_ID = 0
MEMBER_ID = 0
num_member = 1   # a single member, i.e. this worker handles 100% of the videos

# Flatten {part: {video_id: path}} into one list of paths
all_videos = []
for part_videos in all_video_paths.values():
    all_videos.extend(part_videos.values())

# Slice size, rounded up so that no video is left unassigned
# (trailing slices may end up shorter or empty).
batch_len = math.ceil(len(all_videos) / num_batch / num_member) if all_videos else 0

# Always store each slice under its member key, even when num_member == 1,
# so that all_batches_info[batch][member] is a list of paths.
all_batches_info = {}
current_idx = 0
for n in range(num_batch):
    all_batches_info[n] = {}
    for m in range(num_member):
        # Clamp the upper bound to the end of the video list
        end = min(current_idx + batch_len, len(all_videos))
        all_batches_info[n][m] = all_videos[current_idx:end]
        current_idx = end

# Debug aid: the value printed here is expected to be <class 'list'>
print(f"Type check: {type(all_batches_info[BATCH_ID][MEMBER_ID])}")

# Persist the assignment; json.dump turns the integer keys into strings
with open("/kaggle/working/batch_info.json", 'w') as f:
    json.dump(all_batches_info, f)

print(f"Total videos to process: {len(all_videos)}")
print(f"Videos assigned to this worker: {len(all_batches_info[BATCH_ID][MEMBER_ID])}")

In [None]:
# Build the TransNetV2 model once; the scene-detection cell below calls
# model.predict_video(...) and model.predictions_to_scenes(...) on it.
model = TransNetV2()

In [None]:
%%time
import os
import json

save_dir = '/kaggle/working/scenes'
os.makedirs(save_dir, exist_ok=True)


def scene_output_location(video_path):
    """Work out where the scene JSON of ``video_path`` belongs.

    Args:
        video_path: '/'-separated path of the video file.

    Returns:
        Tuple ``(video_batch, video_name)``. For a file named like
        ``L01_V001.mp4`` this is ``('L01', 'V001')``. For a file name without
        any ``_`` the batch is taken from the ``Videos_<batch>`` folder two
        levels above the file, falling back to ``'Uncategorized'``, and the
        name is the file name without ``.mp4``.
    """
    segments = video_path.split('/')
    filename = segments[-1]
    parts = filename.split('_')

    if len(parts) >= 2:
        # Standard naming: L01_V001.mp4 -> batch L01, name V001
        video_batch = parts[0]
        video_name = "_".join(parts[1:])
    else:
        # Non-standard naming (e.g. test.mp4): use the grandparent folder.
        # Guard the lookup so a short path cannot raise IndexError.
        parent_dir = segments[-3] if len(segments) >= 3 else ""
        if "Videos_" in parent_dir:
            video_batch = parent_dir.split('_')[-1]
        else:
            video_batch = "Uncategorized"
        video_name = filename

    return video_batch, video_name.replace('.mp4', '')


# Videos assigned to this worker by the batch-splitting cell
videos_to_process = all_batches_info[BATCH_ID][MEMBER_ID]
print(f"üöÄ Start processing {len(videos_to_process)} videos for Scene Detection...")

count = 0
for video_path in videos_to_process:
    try:
        video_batch, video_name = scene_output_location(video_path)

        # One sub-folder per batch: /scenes/L01
        batch_save_dir = os.path.join(save_dir, video_batch)
        os.makedirs(batch_save_dir, exist_ok=True)

        json_path = f"{batch_save_dir}/{video_name}.json"

        # Resume support: a finished video is not processed again
        if os.path.exists(json_path):
            continue

        # Run TransNetV2 and convert per-frame predictions into scene ranges
        _, single_frame_predictions, _ = model.predict_video(video_path)
        scenes = model.predictions_to_scenes(single_frame_predictions)

        # Write to a temporary file and move it into place afterwards, so an
        # interrupted run never leaves a truncated JSON that the resume check
        # above would mistake for a completed video.
        tmp_path = f"{json_path}.tmp"
        with open(tmp_path, 'w') as f:
            json.dump(scenes.tolist(), f)
        os.replace(tmp_path, json_path)

        count += 1
        if count % 10 == 0:
            print(f"   ...Processed {count}/{len(videos_to_process)} scenes.")

    except Exception as e:
        # Best effort: report the failing video and keep going with the rest
        print(f"‚ö†Ô∏è Error on {video_path}: {e}")
        continue

print(f"‚úÖ Scene Detection FINISHED! Created {count} json files.")

In [None]:
# CELL 7: Optimized Save Frames with Compression
def save_frames(video_path: str, frame_numbers: np.ndarray, save_dir: str):
    """Save selected frames of a video as JPEG files (quality 80).

    Frames are written to ``<save_dir>/<frame index, zero-padded to 4>.jpg``.
    The video is read sequentially from frame 0 up to the largest requested
    index; requested indices beyond the end of the video are skipped.

    Args:
        video_path: Path of the video file to read.
        frame_numbers: Zero-based frame indices to save. Duplicates and order
            do not matter; negative indices are ignored.
        save_dir: Output directory, created if it does not exist.

    Returns:
        None.
    """
    # np.unique already returns its values sorted
    wanted = np.unique(np.asarray(frame_numbers))
    # A negative index can never match a decoded frame and would otherwise
    # stall the cursor, preventing every later frame from being saved.
    wanted = wanted[wanted >= 0]
    if wanted.size == 0:
        # Nothing to save: do not even open the video
        return

    # cv2.imwrite does not create directories; without this every write
    # into a missing folder fails silently.
    os.makedirs(save_dir, exist_ok=True)

    video = cv2.VideoCapture(video_path)
    try:
        if not video.isOpened():
            print(f"Could not open video: {video_path}")
            return

        targets = iter(wanted.tolist())
        target = next(targets)
        last_frame = int(wanted[-1])

        for i in range(last_frame + 1):
            # grab() advances to the next frame; the frame is only retrieved
            # when it is one we actually want (read() == grab() + retrieve()).
            if not video.grab():
                break
            if i != target:
                continue

            ret, frame = video.retrieve()
            if not ret:
                break

            filename = "{}/{:0>4d}.jpg".format(save_dir, i)
            cv2.imwrite(filename, frame, [int(cv2.IMWRITE_JPEG_QUALITY), 80])

            target = next(targets, None)
            if target is None:
                break
    finally:
        # Always free the decoder, even if reading or writing raised
        video.release()

In [None]:
# --- CELL 8: Split Strategy & Safe Direct-Write ---
import numpy as np
from tqdm.auto import tqdm
import cv2
import os
import json
import zipfile
import shutil
import glob

# ------------------------------------------------------------------
# Run configuration: choose which batches this run exports
# ------------------------------------------------------------------
# Run 1: batches L01 through L05
OUTPUT_ZIP_NAME = 'keyframes_part1_set1.zip'
TARGET_BATCHES = ['L01', 'L02', 'L03', 'L04', 'L05']

# Run 2 (another notebook or a later run): uncomment the two lines below
# TARGET_BATCHES = ['L06']
# OUTPUT_ZIP_NAME = 'keyframes_part1_set2.zip'
# ------------------------------------------------------------------

# Working locations
work_dir = '/kaggle/working'
scene_json_dirs = '/kaggle/working/scenes'
final_zip_path = os.path.join(work_dir, OUTPUT_ZIP_NAME)

# Remove an archive left over from an earlier run so this run starts clean
zip_already_present = os.path.exists(final_zip_path)
if zip_already_present:
    print(f"‚ö†Ô∏è Found existing {OUTPUT_ZIP_NAME}, removing to start fresh...")
    os.remove(final_zip_path)

# --- H√ÄM CHI·∫æN L∆Ø·ª¢C FRAME (GI·ªÆ NGUY√äN) ---
def get_adaptive_frames(scenes, short_max=25, medium_max=150, step=50):
    """Choose which frame indices to capture for a list of scenes.

    For each ``(start, end)`` scene, with ``duration = end - start``:

    * ``duration <= 1``: nothing is captured;
    * ``duration < short_max``: the middle frame;
    * ``duration < medium_max``: ``start``, the middle frame and ``end - 1``;
    * otherwise: every ``step``-th frame from ``start`` (exclusive of ``end``)
      plus ``end - 1``.

    Args:
        scenes: Iterable of ``(start, end)`` pairs (lists, tuples or rows of a
            2-D array); values are converted with ``int()``.
        short_max: Durations below this get a single frame.
        medium_max: Durations below this (and at least ``short_max``) get
            three frames.
        step: Sampling stride for longer scenes; must be positive.

    Returns:
        Sorted list of unique frame indices as plain Python ints.
    """
    # A set removes duplicates as frames are added, so no per-scene
    # "already captured?" scan over a growing list is needed.
    picked = set()
    for start, end in scenes:
        start, end = int(start), int(end)
        duration = end - start
        if duration <= 1:
            continue

        middle = (start + end) // 2
        if duration < short_max:
            picked.add(middle)
        elif duration < medium_max:
            picked.update((start, middle, end - 1))
        else:
            picked.update(range(start, end, step))
            picked.add(end - 1)
    return sorted(picked)

# --- H√ÄM KI·ªÇM TRA DUNG L∆Ø·ª¢NG TR·ªêNG ---
def get_free_space_gb(path=None):
    """Return the free disk space, in GiB, of the filesystem holding ``path``.

    Args:
        path: Any existing path on the filesystem to inspect. Defaults to the
            notebook-level ``work_dir`` when omitted.

    Returns:
        Free space as a float, in units of 1024**3 bytes.
    """
    target = work_dir if path is None else path
    return shutil.disk_usage(target).free / (1024**3)

# --- V√íNG L·∫∂P CH√çNH ---
def write_video_keyframes(zf, video_path, frame_numbers, entry_prefix):
    """Encode selected frames of one video as JPEG and add them to a zip.

    Args:
        zf: Open, writable ``zipfile.ZipFile``.
        video_path: Path of the video to decode.
        frame_numbers: Non-empty, ascending list of zero-based frame indices.
        entry_prefix: Archive folder for the frames; entries are named
            ``<entry_prefix>/<frame index, zero-padded to 4>.jpg``.

    Returns:
        Number of frames written to the archive.
    """
    written = 0
    vid_cap = cv2.VideoCapture(video_path)
    try:
        f_idx = 0
        max_f = frame_numbers[-1] + 1

        for i in range(max_f):
            ret, frame = vid_cap.read()
            if not ret:
                break

            if i == frame_numbers[f_idx]:
                # Encode the JPEG (quality 80) in memory and store it directly
                ret_enc, buffer = cv2.imencode('.jpg', frame, [int(cv2.IMWRITE_JPEG_QUALITY), 80])
                if ret_enc:
                    zf.writestr(f"{entry_prefix}/{i:04d}.jpg", buffer.tobytes())
                    written += 1

                f_idx += 1
                if f_idx >= len(frame_numbers):
                    break
    finally:
        # Release the decoder even when decoding or zip writing fails
        vid_cap.release()
    return written


# --- Main loop ---
print(f"üöÄ Starting Extraction for Batches: {TARGET_BATCHES}")
print(f"üíæ Saving to: {OUTPUT_ZIP_NAME}")

# Videos whose extraction raised, reported at the end instead of being hidden
failed_videos = []

# The archive is opened once in 'w' mode and finalized when the block exits
with zipfile.ZipFile(final_zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:

    stop_processing = False

    for key in TARGET_BATCHES:
        if stop_processing: break

        # Skip batches that were not found among the input sources
        if key not in all_video_paths:
            print(f"‚ö†Ô∏è Batch {key} not found in input sources. Skipping.")
            continue

        video_paths_dict = all_video_paths[key]
        video_ids = sorted(video_paths_dict.keys())

        print(f"üëâ Processing Batch {key} ({len(video_ids)} videos)...")

        # One progress bar per batch, advancing per video
        pbar = tqdm(video_ids, desc=f"Zipping {key}", unit="vid")

        for video_id in pbar:
            # Safety check: stop while there is still room to finalize the zip
            if get_free_space_gb() < 1.0:
                print(f"\nüõë CRITICAL WARNING: Disk space low ({get_free_space_gb():.2f} GB left).")
                print("üõë Stopping gracefully to save current Zip file.")
                stop_processing = True
                break

            # The scene JSON may be named either <batch>_<id>.json or <id>.json
            scene_candidates = (
                f'{scene_json_dirs}/{key}/{key}_{video_id}.json',
                f'{scene_json_dirs}/{key}/{video_id}.json',
            )
            final_scene_path = next((p for p in scene_candidates if os.path.exists(p)), None)

            if not final_scene_path: continue

            try:
                with open(final_scene_path) as f:
                    scenes = json.load(f)
                scenes = np.array([list(row) for row in scenes])
                if len(scenes) == 0: continue

                frame_numbers = get_adaptive_frames(scenes)
                if not frame_numbers: continue

                # Archive layout: keyframes/L01/V001/0001.jpg
                write_video_keyframes(
                    zf,
                    video_paths_dict[video_id],
                    frame_numbers,
                    f"keyframes/{key}/{video_id}",
                )

            except Exception as e:
                # Keep going with the remaining videos, but never silently:
                # a swallowed error here means missing keyframes downstream.
                failed_videos.append(f"{key}/{video_id}")
                print(f"Error {video_id}: {e}")

        pbar.close()

    if stop_processing:
        print("‚ö†Ô∏è Process stopped early due to disk limits.")
    else:
        print(f"‚úÖ Finished processing all requested batches: {TARGET_BATCHES}")

    if failed_videos:
        print(f"Videos that failed during extraction ({len(failed_videos)}): {failed_videos}")

    # Printed while the archive is still open; leaving the block finalizes it
    print(f"üì¶ Closing Zip file...")

final_size = os.path.getsize(final_zip_path) / (1024**3)
print(f"üéâ DONE! File created: {OUTPUT_ZIP_NAME} ({final_size:.2f} GB)")

In [None]:
# %cd /kaggle/working/

# print("üì¶ Zipping Scenes...")
# !zip -rq scenes.zip scenes/

# import os
# def get_size(path):
#     if os.path.exists(path):
#         size = os.path.getsize(path) / (1024 * 1024 * 1024)
#         print(f"   -> {path}: {size:.2f} GB")
#     else:
#         print(f"   -> {path}: NOT FOUND")

# print("üìä Checking Output Sizes:")
# get_size('keyframes.zip')
# get_size('scenes.zip')

# from IPython.display import FileLink
# print("\n‚¨áÔ∏è DOWNLOAD LINKS:")
# display(FileLink('keyframes.zip'))
# display(FileLink('scenes.zip'))

# # Cleanup (Optional)
# # !rm -rf scenes
# # !rm -rf temp_keyframes