In [2]:
# Relative path to the JSON file of video details from which the video IDs are read
video_list_f = 'videos_data/5760-L01-video_details.json'

In [3]:
# Import the json module to work with JSON data
import json

# Load the video list. The encoding is passed explicitly so decoding does not
# depend on the platform's default locale encoding, and the file is closed
# as soon as parsing is done.
with open(video_list_f, encoding='utf-8') as f:
    video_list = json.load(f)

# Collect the 'id' of every entry found under each record's 'items' list,
# preserving file order.
video_ids = [
    item['id']
    for video in video_list
    for item in video['items']
]


In [4]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound

def check_transcript_availability(video_id):
    """Report whether transcripts can be listed for a video.

    Args:
        video_id: Video ID passed to ``YouTubeTranscriptApi.list_transcripts``.

    Returns:
        True if the listing call succeeds; False if it raises
        ``TranscriptsDisabled`` or ``NoTranscriptFound``.

    Any other exception raised by the API call is not handled here and
    propagates to the caller.
    """
    try:
        # Only success or failure of the call matters; the listing itself
        # is not used.
        YouTubeTranscriptApi.list_transcripts(video_id)
    except (TranscriptsDisabled, NoTranscriptFound):
        return False
    return True


In [5]:
# Print one status line per video ID saying whether a transcript listing
# could be obtained for it.
for videos in video_ids:
    transcript_available = check_transcript_availability(videos)
    message = (
        f"Transcripts are available for video ID: {videos}"
        if transcript_available
        else f"No transcripts found for video ID: {videos}"
    )
    print(message)

Transcripts are available for video ID: zbFHYjaqjzw
Transcripts are available for video ID: sjV7NNwm1GU
Transcripts are available for video ID: -Xj7zDwwU_I
Transcripts are available for video ID: eKN22NFl58U
Transcripts are available for video ID: sM-VI3alvAI
Transcripts are available for video ID: QLT1vrnJXWI
Transcripts are available for video ID: dhRIHF1DENI
Transcripts are available for video ID: rphiCdR68TE
Transcripts are available for video ID: z7aqJpKfPC0
Transcripts are available for video ID: SXmVnHgwOZs


In [17]:
from pathlib import Path

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter

video_id = video_ids[1]

# Must be a single transcript.
transcript = YouTubeTranscriptApi.get_transcript(video_id)

formatter = JSONFormatter()

# .format_transcript(transcript) turns the transcript into a JSON string.
json_formatted = formatter.format_transcript(transcript)


# Use the video ID as the filename.
filename = f'{video_id}.json'

# Make sure the output directory exists; otherwise open() below raises
# FileNotFoundError when 'outputs/transcripts/' has not been created yet.
output_dir = Path('outputs/transcripts')
output_dir.mkdir(parents=True, exist_ok=True)

# Now we can write it out to a file.
with open(output_dir / filename, 'w', encoding='utf-8') as json_file:
    json_file.write(json_formatted)

# Now should have a new JSON file that you can easily read back into Python.

In [18]:
from youtube_transcript_api import YouTubeTranscriptApi
import re

def extract_chapters(video_id):
    """Split a video's transcript into titled chapters.

    The transcript is fetched with ``YouTubeTranscriptApi.get_transcript`` and
    walked entry by entry. An entry whose text starts with a
    ``digits:digits`` pattern, or whose text is entirely upper-case, is
    treated as a chapter heading; every other entry's text is appended to the
    chapter currently being built. Text before the first heading goes into a
    chapter titled "Introduction". A chapter that collected no content is
    dropped.

    Args:
        video_id: Video ID passed to the transcript API.

    Returns:
        A list of ``{"title": str, "content": list[str]}`` dicts, or None if
        any exception occurred (the error is printed, not raised).
    """
    try:
        sections = []
        active = {"title": "Introduction", "content": []}

        for entry in YouTubeTranscriptApi.get_transcript(video_id):
            line = entry['text']
            looks_like_heading = bool(re.match(r'^\d+:\d+', line)) or line.isupper()

            if not looks_like_heading:
                active["content"].append(line)
                continue

            # A heading closes the chapter in progress (if it has any text)
            # and opens a new one titled with the heading text.
            if active["content"]:
                sections.append(active)
            active = {"title": line, "content": []}

        # Flush the chapter still open when the transcript ends.
        if active["content"]:
            sections.append(active)
    except Exception as exc:
        print(f"Error: {exc}")
        return None

    return sections

def save_chapters_to_file(chapters, filename):
    """Write chapters to a UTF-8 text file as ``## title`` sections.

    Each chapter becomes a ``## <title>`` line followed by a blank line, then
    its content strings joined with single spaces, followed by a blank line.

    Args:
        chapters: Iterable of dicts with a 'title' string and a 'content'
            list of strings.
        filename: Path of the file to create or overwrite.
    """
    def _render(sections):
        # Yield the heading and the body as separate pieces, in write order.
        for section in sections:
            yield f"## {section['title']}\n\n"
            yield ' '.join(section['content']) + '\n\n'

    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(_render(chapters))

# Example usage: extract the chapters for one video and write them to disk
video_id = "sjV7NNwm1GU"  # Replace with your video ID
chapters = extract_chapters(video_id)

# extract_chapters returns None on error, and an empty list is falsy too,
# so both cases take the failure branch.
if not chapters:
    print("Failed to extract chapters from the transcript")
else:
    save_chapters_to_file(chapters, "transcript_chapters.txt")
    print("Transcript saved with chapters to transcript_chapters.txt")


Transcript saved with chapters to transcript_chapters.txt
