<a href="https://colab.research.google.com/github/vikaspathak0911/MyPython/blob/main/video_notes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install -U yt-dlp
!pip install --upgrade scenedetect[opencv]


Collecting scenedetect[opencv]
  Downloading scenedetect-0.6.6-py3-none-any.whl.metadata (4.0 kB)
Downloading scenedetect-0.6.6-py3-none-any.whl (131 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.6/131.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scenedetect
Successfully installed scenedetect-0.6.6


In [6]:
import os
import json
import shutil
import logging
import concurrent.futures
import subprocess
from PIL import Image
import cv2
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector

# --- Configuration ---
import getpass

In [7]:
# User input for playlist URL and output directory
YOUTUBE_PLAYLIST_URL = input('Paste your YouTube playlist URL: ').strip()
OUTPUT_BASE_DIR = input('Enter Google Drive folder path for PDFs (e.g., /content/drive/MyDrive/Slides): ').strip()
TEMP_VIDEO_DIR = '/content/temp_videos'
SCENE_DETECTION_THRESHOLD = 15.0  # Adjust as needed

# Ensure output directory exists
os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)
os.makedirs(TEMP_VIDEO_DIR, exist_ok=True)

# --- Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


Paste your YouTube playlist URL: https://www.youtube.com/playlist?list=PLxWXx9SeSNsMYhCRYkb5lDXJKmIiv6wfh
Enter Google Drive folder path for PDFs (e.g., /content/drive/MyDrive/Slides): /content/drive/My Drive/YoutubeVideoNotes


In [8]:
# --- Helper Functions ---

def get_video_urls_from_playlist(playlist_url):
    """Extracts video URLs from a YouTube playlist using yt-dlp with safe JSON parsing."""
    logging.info("Extracting video URLs from playlist...")
    command = [
        'yt-dlp', '--flat-playlist', '--print-json', playlist_url
    ]
    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        video_urls = []
        for line in result.stdout.strip().split('\n'):
            try:
                data = json.loads(line)
                if data.get('_type') == 'url':
                    video_urls.append(f"https://www.youtube.com/watch?v={data['id']}")
            except Exception:
                continue
        logging.info(f"Found {len(video_urls)} videos in the playlist.")
        return video_urls
    except Exception as e:
        logging.error(f"Error extracting playlist URLs: {e}")
        return []





In [None]:
# import glob

# def download_video(video_url, output_dir):
#     """Downloads a single video using yt-dlp and returns the downloaded file path."""
#     os.makedirs(output_dir, exist_ok=True)
#     # Use a simple output template for easier file detection
#     output_template = os.path.join(output_dir, '%(id)s.%(ext)s')
#     command = [
#         'yt-dlp',
#         '-f', '22/best',
#         '-o', output_template,
#         '--no-playlist',
#         video_url
#     ]
#     try:
#         process = subprocess.run(command, capture_output=True, text=True, check=True)
#         # After download, search for the file by video ID
#         video_id = video_url.split('v=')[1]
#         # Find any file in output_dir that starts with the video ID
#         files = glob.glob(os.path.join(output_dir, f"{video_id}.*"))
#         if files:
#             logging.info(f"Downloaded video: {files[0]}")
#             return files[0]
#         else:
#             logging.warning(f"Video not downloaded or unavailable: {video_url}")
#             return None
#     except subprocess.CalledProcessError as e:
#         logging.error(f"yt-dlp failed for {video_url}: {e.stderr}")
#         return None


In [9]:
import glob
import re

def sanitize_filename(name):
    # Remove characters that are invalid in filenames
    return re.sub(r'[\\/*?:"<>|]', "", name)

def download_video(video_url, output_dir):
    """Downloads a single video using yt-dlp and returns the downloaded file path and video title."""
    os.makedirs(output_dir, exist_ok=True)
    # Get video title
    info_command = [
        'yt-dlp',
        '--skip-download',
        '--print', '%(title)s',
        '--print', '%(id)s',
        video_url
    ]
    try:
        info_process = subprocess.run(info_command, capture_output=True, text=True, check=True)
        lines = info_process.stdout.strip().split('\n')
        video_title = sanitize_filename(lines[0])
        video_id = lines[1]
        output_template = os.path.join(output_dir, f'{video_id}.%(ext)s')
        # Download the video
        download_command = [
            'yt-dlp',
            '-f', 'bestvideo[ext=mp4][height<=2160]+bestaudio[ext=m4a]/best[ext=mp4]/best',
            '-o', output_template,
            '--no-playlist',
            video_url
        ]
        subprocess.run(download_command, check=True)
        # Find the downloaded file
        files = glob.glob(os.path.join(output_dir, f"{video_id}.*"))
        if files:
            logging.info(f"Downloaded video: {files[0]}")
            return files[0], video_title
        else:
            logging.warning(f"Video not downloaded or unavailable: {video_url}")
            return None, None
    except subprocess.CalledProcessError as e:
        logging.error(f"yt-dlp failed for {video_url}: {e.stderr}")
        return None, None


In [13]:
from scenedetect import open_video, SceneManager
from scenedetect.detectors import ContentDetector

def extract_slide_frames(video_path, output_dir, threshold):
    """Detects slide changes and saves frames for each change using the new PySceneDetect API."""
    import cv2
    import os
    os.makedirs(output_dir, exist_ok=True)
    frame_paths = []

    # Open the video using the new API
    video = open_video(video_path)
    scene_manager = SceneManager()
    scene_manager.add_detector(ContentDetector(threshold=threshold))

    # Detect scenes
    scene_manager.detect_scenes(video)
    scene_list = scene_manager.get_scene_list()

    # Extract frames at scene boundaries
    cap = cv2.VideoCapture(video_path)
    for i, (start_time, end_time) in enumerate(scene_list):
        # Get frame number at scene start
        frame_num = int(start_time.get_frames())
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if ret:
            from PIL import Image
            import numpy as np
            frame_filename = os.path.join(output_dir, f"slide_{i+1:04d}.png")
            # Convert BGR (OpenCV) to RGB (Pillow)
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img = Image.fromarray(rgb_frame)
            img.save(frame_filename, format='PNG', compress_level=0)  # No compression
            frame_paths.append(frame_filename)

    cap.release()
    return frame_paths




In [14]:
def create_pdf_from_images(image_paths, pdf_filename):
    """Creates a PDF from a list of image paths using Pillow."""
    if not image_paths:
        logging.warning("No images to convert to PDF.")
        return
    image_paths.sort()
    images = []
    for img_path in image_paths:
        try:
            img = Image.open(img_path).convert('RGB')
            images.append(img)
        except Exception as e:
            logging.error(f"Error opening image {img_path}: {e}")
    if images:
        images[0].save(pdf_filename, save_all=True, append_images=images[1:])
        logging.info(f"PDF saved: {pdf_filename}")

def process_video(video_url):
    video_path, video_title = download_video(video_url, TEMP_VIDEO_DIR)
    if not video_path or not video_title:
        return
    frame_dir = os.path.join(TEMP_VIDEO_DIR, f"{os.path.splitext(os.path.basename(video_path))[0]}_frames")
    pdf_path = os.path.join(OUTPUT_BASE_DIR, f"{video_title}.pdf")
    try:
        frames = extract_slide_frames(video_path, frame_dir, SCENE_DETECTION_THRESHOLD)
        create_pdf_from_images(frames, pdf_path)
    finally:
        # Cleanup
        if os.path.exists(video_path):
            os.remove(video_path)
        if os.path.exists(frame_dir):
            shutil.rmtree(frame_dir)


In [None]:

# --- Main Execution ---

video_urls = get_video_urls_from_playlist(YOUTUBE_PLAYLIST_URL)
if not video_urls:
    logging.error("No videos found or error occurred. Exiting.")
else:
    logging.info(f"Processing {len(video_urls)} videos in parallel...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        list(executor.map(process_video, video_urls))
    # Final cleanup
    if os.path.exists(TEMP_VIDEO_DIR):
        shutil.rmtree(TEMP_VIDEO_DIR)
    logging.info("All playlist videos processed. PDFs saved to your specified Google Drive folder.")

INFO:pyscenedetect:Detecting scenes...
INFO:pyscenedetect:Detecting scenes...
INFO:pyscenedetect:Detecting scenes...
INFO:pyscenedetect:Detecting scenes...
INFO:pyscenedetect:Detecting scenes...
INFO:pyscenedetect:Detecting scenes...
INFO:pyscenedetect:Detecting scenes...
INFO:pyscenedetect:Detecting scenes...
INFO:pyscenedetect:Detecting scenes...
INFO:pyscenedetect:Detecting scenes...
INFO:pyscenedetect:Detecting scenes...
INFO:pyscenedetect:Detecting scenes...
INFO:pyscenedetect:Detecting scenes...
INFO:pyscenedetect:Detecting scenes...
INFO:pyscenedetect:Detecting scenes...
INFO:pyscenedetect:Detecting scenes...
INFO:pyscenedetect:Detecting scenes...
INFO:pyscenedetect:Detecting scenes...


In [None]:
!yt-dlp -F "https://youtu.be/fMtvn0j1rOI?si=jWDaplKLB1pRUpy5"


In [None]:
!yt-dlp -f "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best" -o "video.%(ext)s" "https://youtu.be/fMtvn0j1rOI?si=jWDaplKLB1pRUpy5"
