In [4]:
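# Uncomment to install this notebook's third-party dependencies into the kernel's environment:
# %pip install google-api-python-client youtube-transcript-api python-dotenv requests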

In [1]:
import os, json, sys
from urllib.parse import parse_qs, urlparse

import requests
from dotenv import load_dotenv
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi

In [2]:
# Load a local .env file (if any) into the process environment before the
# API key lookup below reads it.
load_dotenv()

# API key for the YouTube Data API; None when YTB_API_KEY is not set.
API_KEY = os.environ.get("YTB_API_KEY")

# Text file listing the YouTube URLs to process, one per line.
urls_file = "url.txt"

In [3]:
def fetch_video_details(video_id, timeout=10):
    """Look up a video's title, publish timestamp and channel name.

    Queries the YouTube Data API v3 ``videos`` endpoint with
    ``part=snippet``, authenticating with the module-level ``API_KEY``.

    Args:
        video_id: YouTube video id to look up.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A ``(video_title, published_dt, host)`` tuple holding the first
        item's ``snippet`` values ``title``, ``publishedAt`` and
        ``channelTitle``.

    Raises:
        requests.HTTPError: If the API answers with an error status
            (for example when the key is missing or rejected).
        LookupError: If the response contains no item for ``video_id``.
    """
    response = requests.get(
        "https://www.googleapis.com/youtube/v3/videos",
        # Let requests URL-encode the values instead of splicing them into
        # the query string by hand.
        params={"id": video_id, "part": "snippet", "key": API_KEY},
        # Without a timeout a stalled connection would hang the notebook.
        timeout=timeout,
    )
    # Surface API errors here rather than as a KeyError on "items" below.
    response.raise_for_status()

    items = response.json().get("items") or []
    if not items:
        raise LookupError(f"No YouTube video found for id {video_id!r}")

    snippet = items[0]["snippet"]
    return snippet["title"], snippet["publishedAt"], snippet["channelTitle"]

In [4]:
def fetch_transcript(video_id, languages=("en",)):
    """Return a video's transcript as a single space-joined string.

    Args:
        video_id: YouTube video id whose transcript is requested.
        languages: Language code, or iterable of language codes, passed on
            to the transcript API. Defaults to English only.

    Returns:
        The ``text`` of every transcript entry joined with single spaces
        (timestamps are dropped). If anything fails, the exception message
        is returned instead, so the result is always a string; the failure
        is also reported on stderr so it does not pass unnoticed.
    """
    # A bare string would otherwise be split into single characters below.
    if isinstance(languages, str):
        languages = [languages]

    try:
        # NOTE(review): get_transcript is the library's older static entry
        # point; newer youtube-transcript-api releases appear to favour
        # YouTubeTranscriptApi().fetch(...) -- confirm against the
        # installed version.
        transcript_list = YouTubeTranscriptApi.get_transcript(
            video_id, languages=list(languages)
        )

        # Combine transcript text without timestamps
        return ' '.join(entry['text'] for entry in transcript_list)
    except Exception as e:
        # Best effort: callers expect a string back, so keep returning the
        # message, but make the failure visible to whoever runs the notebook.
        print(f"Transcript unavailable for video {video_id!r}: {e}", file=sys.stderr)
        return str(e)

In [5]:
def save_metadata(metadata, filename):
    """Serialize ``metadata`` as JSON with a 4-space indent into ``filename``.

    The file is opened in text write mode, so any existing content is
    replaced.
    """
    with open(filename, mode='w') as sink:
        json.dump(metadata, fp=sink, indent=4)

def save_transcript(transcript, filename):
    """Write ``transcript`` to ``filename`` as UTF-8 text.

    Any existing file at ``filename`` is replaced.

    Args:
        transcript: Text to write.
        filename: Destination path; its directory must already exist.
    """
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # cannot represent every character captions may contain and would raise
    # UnicodeEncodeError.
    with open(filename, 'w', encoding='utf-8') as md_file:
        md_file.write(transcript)

In [6]:
def _extract_video_id(url):
    """Return the video id embedded in a YouTube URL.

    Handles ``...watch?v=<id>`` (with or without further query
    parameters), ``youtu.be/<id>`` and ``/shorts/<id>``, ``/embed/<id>``,
    ``/live/<id>`` and ``/v/<id>`` paths.

    Raises:
        ValueError: If no video id can be found in ``url``.
    """
    parsed = urlparse(url)

    # Standard watch URLs carry the id in the "v" query parameter. Parsing
    # the query keeps trailing parameters such as &t= or &list= out of the id.
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0]

    path_parts = [part for part in parsed.path.split("/") if part]
    host = (parsed.hostname or "").lower()
    if host.endswith("youtu.be") and path_parts:
        return path_parts[0]
    if len(path_parts) >= 2 and path_parts[0] in {"shorts", "embed", "live", "v"}:
        return path_parts[1]

    raise ValueError(f"Cannot extract a YouTube video id from URL: {url!r}")


# Read the non-blank lines of the URL list, stripped of surrounding whitespace.
with open(urls_file, 'r') as file:
    youtube_urls = [line.strip() for line in file if line.strip()]
video_ids = [_extract_video_id(url) for url in youtube_urls]

In [7]:
metadata = {}
transcript_folder = "transcripts"

# Characters that are reserved in file names on at least one major OS.
# Channel titles are free text and may contain any of them.
UNSAFE_FILENAME_CHARS = '<>:"/\\|?*'

# open() does not create missing directories, so make sure the output
# folder exists before the first file is written.
os.makedirs(transcript_folder, exist_ok=True)

for index, video_id in enumerate(video_ids, start=1):
    video_title, published_dt, host = fetch_video_details(video_id)
    # The running index keeps names distinct when several videos share a channel.
    unique_name = f"{host}_{index}"

    # Save metadata
    metadata[unique_name] = {
        "video_title": video_title,
        "published_dt": published_dt,
        "host": host
    }

    # Fetch and save transcript.
    # NOTE: fetch_transcript returns the error message when fetching fails,
    # so in that case the message is what ends up in the transcript file.
    transcript = fetch_transcript(video_id)
    # Turn line breaks into spaces (dropping them outright would glue the
    # neighbouring words together); hyphens are likewise replaced by spaces.
    transcript = transcript.replace("\n", " ").replace("-", " ")

    # The file stem equals the metadata key except where a character that is
    # unsafe in file names (or a control character) had to be replaced.
    safe_name = "".join(
        "_" if ch in UNSAFE_FILENAME_CHARS or ord(ch) < 32 else ch
        for ch in unique_name
    )
    transcript_filename = os.path.join(transcript_folder, f"{safe_name}.md")
    save_transcript(transcript, transcript_filename)

# Save all metadata to JSON file
save_metadata(metadata, os.path.join(transcript_folder, 'video_metadata.json'))