In [None]:
from datasets import load_dataset
from pymongo import MongoClient
import webvtt
import av
import tempfile
from io import BytesIO
from PIL import Image

# Setup MongoDB connection
# The client targets a MongoDB server on localhost:27017; the URI carries no
# credentials or options.
mongo_client = MongoClient("mongodb://localhost:27017/")
# Database handle for "video_processing_db".
mongo_db = mongo_client["video_processing_db"]
# Collection "video_frames": process_video_sample inserts one document per
# saved frame here (video id/title, timestamp, subtitle text, JPEG bytes).
video_frames_collection = mongo_db["video_frames"]

def convert_vtt_timestamp_to_seconds(timestamp):
    """Convert a WebVTT timestamp string to seconds.

    Accepts ``HH:MM:SS.sss``, ``MM:SS.sss`` (WebVTT allows the hours field to
    be omitted) and a bare number of seconds such as ``"7.25"``. The
    fractional part of the seconds field is optional.

    Args:
        timestamp: Timestamp text made of one to three colon-separated
            numeric fields.

    Returns:
        The timestamp expressed as a float number of seconds.

    Raises:
        ValueError: If the timestamp has more than three fields or a field is
            not numeric.
    """
    time_parts = timestamp.split(":")
    if len(time_parts) > 3:
        raise ValueError(f"Invalid VTT timestamp: {timestamp!r}")
    # Weight the fields from right to left: seconds * 1, minutes * 60,
    # hours * 3600. A single field is simply a number of seconds.
    return sum(
        float(part) * 60 ** idx
        for idx, part in enumerate(reversed(time_parts))
    )

def extract_subtitles_from_vtt(vtt_data):
    """Extract subtitle text and cue times from the contents of a VTT file.

    Args:
        vtt_data: Contents of a WebVTT file. It is wrapped in ``BytesIO``
            before parsing, so it is presumably ``bytes`` -- verify against
            the dataset loader.

    Returns:
        A list of ``(start_seconds, end_seconds, text)`` tuples sorted by
        start time. Line breaks inside a cue are replaced with spaces and
        the text is stripped. Cues that fail to convert are reported on
        stdout and left out of the result.
    """
    subtitles_list = []
    for caption in webvtt.read_buffer(BytesIO(vtt_data)):
        try:
            # Pass the complete timestamp so the millisecond part is kept.
            # Cutting it off at "." floors both cue boundaries to whole
            # seconds, which shifts the start earlier and drops frames that
            # fall in the final fractional second of the cue.
            start_time = convert_vtt_timestamp_to_seconds(caption.start)
            end_time = convert_vtt_timestamp_to_seconds(caption.end)
            subtitle_text = caption.text.replace("\n", " ").strip()
            subtitles_list.append((start_time, end_time, subtitle_text))
        except Exception as e:
            # Deliberately best-effort: one malformed cue must not discard
            # the remaining subtitles of the file.
            print(f"Error processing subtitle: {str(e)}")
    return sorted(subtitles_list, key=lambda x: x[0])

def process_video_sample(video_sample):
    """Store one JPEG frame per subtitle change of a video sample in MongoDB.

    The video bytes are written to a temporary ``.mp4`` file and decoded
    frame by frame. Each frame is matched against the subtitle cues by its
    timestamp; a frame is saved only when the matched subtitle text is
    non-empty and differs from the text of the last saved frame. Every saved
    frame becomes one document in ``video_frames_collection``.

    Args:
        video_sample: Mapping with the keys ``"mp4"`` (video data written to
            the temporary file), ``"en.vtt"`` (subtitle data) and, optionally,
            ``"info.json"`` (mapping that may hold ``"id"`` and ``"title"``).

    Returns:
        None. Success or failure is reported on stdout; exceptions raised
        while processing are caught and printed rather than propagated.
    """
    # Bound before the try block so the error handler can always read it,
    # even when the failure happens before any metadata was extracted.
    video_details = {"video_id": "", "video_title": ""}

    try:
        # "info.json" is optional; treat a missing or None entry as empty.
        video_metadata = video_sample.get("info.json") or {}

        # Extract video metadata first so later errors can name the video
        video_details = {
            "video_id": video_metadata.get("id", ""),
            "video_title": video_metadata.get("title", ""),
        }

        video_data = video_sample["mp4"]
        subtitle_data = video_sample["en.vtt"]

        # Extract subtitles from VTT data
        subtitles = extract_subtitles_from_vtt(subtitle_data)

        with tempfile.NamedTemporaryFile(suffix=".mp4") as temp_video_file:
            temp_video_file.write(video_data)
            # Flush so the decoder, which reopens the file by name, sees
            # the complete contents.
            temp_video_file.flush()

            # The context manager closes the container (and its underlying
            # file handle) even when decoding or the insert fails.
            with av.open(temp_video_file.name) as container:
                video_stream = container.streams.video[0]

                previous_subtitle = None  # Text of the last saved frame

                for video_frame in container.decode(video_stream):
                    # Frames without a presentation timestamp cannot be
                    # placed on the subtitle timeline; skip them instead of
                    # failing the whole video with a TypeError.
                    if video_frame.pts is None:
                        continue
                    frame_timestamp = float(
                        video_frame.pts * video_stream.time_base
                    )

                    # First cue (in start-time order) covering this frame
                    matching_subtitle = next(
                        (
                            text
                            for start, end, text in subtitles
                            if start <= frame_timestamp <= end
                        ),
                        None,
                    )

                    # Skip frames with no subtitle and frames whose subtitle
                    # was already saved (to avoid redundancy)
                    if (
                        not matching_subtitle
                        or matching_subtitle == previous_subtitle
                    ):
                        continue

                    image_buffer = BytesIO()
                    video_frame.to_image().save(image_buffer, format="JPEG")

                    # Create a document with video details and frame data
                    frame_document = {
                        **video_details,  # video_id, video_title
                        "timestamp": frame_timestamp,
                        "subtitle_text": matching_subtitle,
                        "image_data": image_buffer.getvalue(),
                    }

                    # Insert the frame document into MongoDB
                    video_frames_collection.insert_one(frame_document)
                    previous_subtitle = matching_subtitle

        print(f"Successfully processed video: {video_details['video_id']} - {video_details['video_title']}")

    except Exception as e:
        # Top-level boundary for one sample: report and return so the caller
        # can continue with the next sample.
        print(f"Error processing video {video_details.get('video_id') or 'unknown'}: {str(e)}")

# Load dataset from webdataset
# The "webdataset" loader reads the local tar archive; streaming=True and the
# "torch" format are requested from the datasets library.
# NOTE(review): how "torch" formatting affects the raw "mp4"/"en.vtt" payloads
# is not visible here -- confirm they still arrive as bytes.
dataset = load_dataset("webdataset", data_files="dataset/youtube_dataset.tar", streaming=True).with_format("torch")

# Iterate over the dataset and process each video sample
# Each sample is expected to expose "mp4", "en.vtt" and optionally "info.json",
# which are the keys process_video_sample reads. Only the "train" split is used.
for video_sample in dataset["train"]:
    process_video_sample(video_sample)

  from .autonotebook import tqdm as notebook_tqdm


Successfully processed video: 9CGGh6ivg68 - 1 introduction
Successfully processed video: WXoOohWU28Y - 2 convolution
Successfully processed video: TV-DjM8242s - 3 cnn architectures
Successfully processed video: rCVlIVKqqGE - 4 image classification
Successfully processed video: lb_5AdUpfuA - 5 what cnns learn
Successfully processed video: FCQ-rih6cHY - 6 residual networks
Successfully processed video: eQ6UE968Xe4 - 1 the learning problem classification
Successfully processed video: eFgkZKhNUdM - 2 logistic regression
