In [None]:
"""
This script extracts the transcript of a YouTube video using the `pytube` and `innertube` libraries and writes it to `transcript.md`.

Constants:
    PANEL_IDENTIFIER_TRANSCRIPT (str): Identifier for the transcript panel in the YouTube engagement panels.

Functions:
    extract_transcript_params(next_data):
        Extracts the transcript parameters from the engagement panels in the provided `next_data`.
        Args:
            next_data (dict): The data containing engagement panels.
        Returns:
            str: The parameters required to fetch the transcript.

Variables:
    video_id (str): The ID of the YouTube video to extract the transcript from.
    client (InnerTube): An instance of the InnerTube client to interact with YouTube's internal API.
    data (dict): The data retrieved from the InnerTube client for the specified video ID.
    transcript_params (str): The parameters required to fetch the transcript.
    transcript (dict): The transcript data retrieved using the transcript parameters.
    transcript_segments (list): The list of transcript segments extracted from the transcript data.

Main Logic:
    - Fetches the video data using the InnerTube client.
    - Extracts the transcript parameters from the video data.
    - Retrieves the transcript using the extracted parameters.
    - Iterates over the transcript segments and writes each text snippet, or each section header with its start and end times, to `transcript.md`.
"""

In [17]:
from pytube import YouTube
from innertube import InnerTube
from datetime import timedelta

# Identifier of the transcript panel within the YouTube engagement panels
PANEL_IDENTIFIER_TRANSCRIPT = "engagement-panel-searchable-transcript"


def extract_transcript_params(next_data):
    """Return the transcript `params` value found in `next_data`.

    Scans the "engagementPanels" list of `next_data` for the panel whose
    "panelIdentifier" equals PANEL_IDENTIFIER_TRANSCRIPT and returns the
    "params" value of its `getTranscriptEndpoint`.

    Args:
        next_data (dict): The data containing engagement panels.

    Returns:
        str | None: The parameters required to fetch the transcript, or
        None when `next_data` has no engagement panels or none of them is
        the transcript panel.

    Raises:
        KeyError: If the transcript panel is present but does not contain
            the nested content/continuation keys read below.
    """
    # A missing or empty "engagementPanels" entry simply means "no transcript"
    for engagement_panel in next_data.get("engagementPanels") or []:
        panel_section = engagement_panel.get("engagementPanelSectionListRenderer")

        # Panels built on a different renderer cannot be the transcript panel
        if not panel_section:
            continue

        if panel_section.get("panelIdentifier") != PANEL_IDENTIFIER_TRANSCRIPT:
            continue

        # Walk down to the parameters required to fetch the transcript
        continuation = panel_section["content"]["continuationItemRenderer"]
        endpoint = continuation["continuationEndpoint"]["getTranscriptEndpoint"]
        return endpoint["params"]

    # No transcript panel was found
    return None


# Source video; the pytube object is used below for the video ID and the title
video_url = "https://www.youtube.com/watch?v=1Dx7LDwINLU"
yt = YouTube(video_url)
video_id = yt.video_id

# Title lookup is best-effort: on any failure, report it and fall back to a
# placeholder so the transcript extraction below can still run
try:
    video_title = yt.title
    print(video_title)
except Exception as e:
    print(f"Error retrieving video title: {e}")
    video_title = "Unknown Title"


# Create an instance of the InnerTube client
client = InnerTube(client_name="WEB")

# Fetch the initial data for the video
data = client.next(video_id)

# Extract the transcript parameters from the fetched data
transcript_params = extract_transcript_params(data)

# extract_transcript_params yields None when no transcript panel is present;
# stop here with a clear message instead of sending None to the API
if transcript_params is None:
    raise RuntimeError(
        f"No transcript panel found for video {video_id!r}; "
        "the video may not have a transcript."
    )

# Retrieve the transcript using the extracted parameters
transcript = client.get_transcript(transcript_params)

# Access the list of transcript segments nested inside the first action
transcript_segments = (
    transcript["actions"][0]["updateEngagementPanelAction"]["content"]
    ["transcriptRenderer"]["content"]["transcriptSearchPanelRenderer"]["body"]
    ["transcriptSegmentListRenderer"]["initialSegments"]
)

# Open a new Markdown file for writing. UTF-8 is requested explicitly so that
# non-ASCII transcript text does not depend on the platform's default encoding.
with open("transcript.md", "w", encoding="utf-8") as file:
    # Iterate over each segment in the list of transcript segments
    for transcript_segment in transcript_segments:
        # Check if the segment contains a transcript snippet
        if "transcriptSegmentRenderer" in transcript_segment:
            transcript_segment_renderer = transcript_segment["transcriptSegmentRenderer"]
            # A snippet is a list of runs; join all of them so that no text is
            # dropped when a snippet is split into more than one run
            snippet = "".join(
                run["text"] for run in transcript_segment_renderer["snippet"]["runs"]
            )
            # Clean up the text snippet (non-breaking-space artifacts)
            snippet = snippet.replace("\xa0\xa0", " ")
            snippet = snippet.replace("\xa0\n", " ")
            # Write the cleaned snippet to the file
            file.write(f"{snippet}\n")
        # Check if the segment contains chapter information
        elif "transcriptSectionHeaderRenderer" in transcript_segment:
            section = transcript_segment["transcriptSectionHeaderRenderer"]
            # Check if the required keys are present in the section
            if "startMs" in section and "endMs" in section and "accessibility" in section:
                # startMs / endMs are divided by 1000 to get seconds for timedelta
                start = float(section["startMs"]) / 1000.
                end = float(section["endMs"]) / 1000.
                label = section["accessibility"]["accessibilityData"]["label"]
                # Write the label as a heading, followed by the start and end times
                file.write(f"## {label}\n")
                file.write(f"{timedelta(seconds=start)} - {timedelta(seconds=end)}\n")
            else:
                # Write a message if the required keys are not found in the section
                file.write("Required keys not found in section\n")
        else:
            # Write a message if the segment type is unknown
            file.write("Unknown segment type\n")

Error retrieving video title: Exception while accessing title of https://youtube.com/watch?v=1Dx7LDwINLU. Please file a bug report at https://github.com/pytube/pytube
