In [None]:
pip install youtube-transcript-api

In [None]:
pip install pyspark

In [None]:
# Import necessary libraries
from pyspark.sql import SparkSession
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import SRTFormatter
from googleapiclient.discovery import build
import os
import re

# Obtain a Spark session for this application (an existing one is reused
# if present, per getOrCreate()).
spark = (
    SparkSession.builder
    .appName("YouTubeTranscriptAnalysis")
    .getOrCreate()
)

# Build the YouTube Data API v3 client. The developer key is read from the
# YOUTUBE_API_KEY environment variable and is None when the variable is unset.
api_key = os.environ.get("YOUTUBE_API_KEY")
youtube = build('youtube', 'v3', developerKey=api_key)

def get_video_ids_with_captions(query, max_results=50):
    """Search YouTube for captioned videos matching *query*.

    Issues a single ``search().list`` request through the module-level
    ``youtube`` client, restricted to results of type ``video`` with
    ``videoCaption='closedCaption'``.

    Args:
        query: Search string forwarded as the ``q`` parameter.
        max_results: Value forwarded as ``maxResults`` for the request.

    Returns:
        A list with the ``videoId`` of every item in the response, in
        response order. Empty when the response has no ``items`` key.
    """
    request = youtube.search().list(
        q=query,
        part='id',
        maxResults=max_results,
        type='video',
        videoCaption='closedCaption',
    )
    response = request.execute()
    return [item['id']['videoId'] for item in response.get('items', [])]

def sanitize_filename(filename):
    """Turn arbitrary text into a lowercase, dash-separated file stem.

    Characters other than word characters, whitespace and ``-`` are
    dropped; the result is trimmed and lowercased; finally each run of
    dashes and/or whitespace is collapsed into a single ``-``.
    """
    # Step 1: discard everything except word chars, whitespace and dashes.
    kept = re.sub(r'[^\w\s-]', '', filename)
    # Step 2: trim surrounding whitespace and normalize case.
    normalized = kept.strip().lower()
    # Step 3: collapse separator runs into one dash.
    return re.sub(r'[-\s]+', '-', normalized)

def download_transcript(video_id, output_folder):
    """Fetch the English transcript of a video and save it as an SRT file.

    The transcript is written to ``<output_folder>/<video_id>.srt``; the
    folder is created when it does not exist. This is a best-effort
    operation: any failure (no transcript, network error, write error) is
    reported on stdout and swallowed so that a batch run can continue with
    the next video.

    Args:
        video_id: YouTube video ID whose transcript should be downloaded.
        output_folder: Directory that receives the ``.srt`` file.

    Returns:
        None.
    """
    try:
        # Releases of youtube-transcript-api that expose an instance-level
        # fetch() are preferred; older releases only provide the static
        # get_transcript(). Support both so an unpinned `pip install`
        # keeps working.
        if hasattr(YouTubeTranscriptApi, "fetch"):
            transcript = YouTubeTranscriptApi().fetch(video_id, languages=['en'])
        else:
            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        srt_transcript = SRTFormatter().format_transcript(transcript)

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_folder, exist_ok=True)
        filename = os.path.join(output_folder, f"{video_id}.srt")
        # Explicit UTF-8: transcripts may contain non-ASCII text, which the
        # platform default encoding cannot always represent.
        with open(filename, "w", encoding="utf-8") as text_file:
            text_file.write(srt_transcript)
        print(f"Transcript for video {video_id} has been saved in '{output_folder}' folder.")
    except Exception as e:  # deliberate catch-all: one bad video must not abort the batch
        print(f"An error occurred with video {video_id}: {e}")

def process_video_ids(directory, output_folder):
    """Download transcripts for every video ID listed in *directory*.

    Each ``.txt`` file directly inside *directory* is read as a list of
    video IDs, one per line. Surrounding whitespace is stripped and blank
    lines are ignored, so a trailing newline or empty line never triggers
    a request for an empty ID.

    Args:
        directory: Folder containing the ``.txt`` ID lists.
        output_folder: Folder passed to ``download_transcript`` for output.

    Returns:
        None.
    """
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(directory, filename)
        # Read the IDs up front so the file is closed before the downloads
        # (which do network I/O) start.
        with open(filepath, 'r', encoding="utf-8") as file:
            stripped_lines = (line.strip() for line in file.read().splitlines())
            video_ids = [video_id for video_id in stripped_lines if video_id]
        for video_id in video_ids:
            download_transcript(video_id, output_folder)

if __name__ == "__main__":
    # Script entry point: search for captioned videos, persist their IDs to
    # video_ids/<sanitized-query>.txt, then download a transcript for every
    # ID found in that folder into subtitles/.
    try:
        query = "dhruv rathee"
        video_ids = get_video_ids_with_captions(query)

        output_dir = "video_ids"
        os.makedirs(output_dir, exist_ok=True)
        filename = os.path.join(output_dir, f"{sanitize_filename(query)}.txt")
        with open(filename, "w", encoding="utf-8") as file:
            for video_id in video_ids:
                file.write(f"{video_id}\n")
        print(f"Saved {len(video_ids)} video IDs to {filename}.")

        # NOTE: process_video_ids reads every .txt file in output_dir, so ID
        # lists left over from earlier runs are processed again as well.
        subtitles_output_folder = "subtitles"
        process_video_ids(output_dir, subtitles_output_folder)
    finally:
        # Stop the Spark session even when a step above fails, so the
        # session is not left running after an error.
        spark.stop()
