In [1]:
!pip install youtube-transcript-api markdownify googletrans==4.0.0-rc1



In [3]:
# Import necessary libraries
import re
import textwrap

import markdownify
from googletrans import Translator
from youtube_transcript_api import YouTubeTranscriptApi

def fetch_youtube_transcript(video_id):
    """
    Retrieve the transcript for the YouTube video identified by ``video_id``.

    Returns whatever ``YouTubeTranscriptApi.get_transcript`` produces, or
    ``None`` when that call raises; the error is printed, not propagated.
    """
    # NOTE(review): get_transcript appears to be deprecated/removed in newer
    # youtube-transcript-api releases -- confirm against the installed version.
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as error:
        # Best-effort by design: callers test the result for falsiness.
        print(f"Error fetching transcript: {error}")
        return None
    return segments

def clean_transcript(transcript):
    """
    Flattens transcript segments into one single-spaced string.

    Only the ``'text'`` entry of each segment is read; any other keys
    (such as timing information) are ignored.

    Args:
        transcript: Iterable of mappings, each with a ``'text'`` string.

    Returns:
        All segment texts joined by single spaces, with every run of
        whitespace (including newlines inside a segment) collapsed to one
        space and no leading/trailing whitespace. Empty or whitespace-only
        segments contribute nothing, so they cannot introduce double spaces.

    Raises:
        KeyError: If a segment has no ``'text'`` entry.
    """
    words = []
    for segment in transcript:
        # split() with no separator trims the ends and collapses internal
        # whitespace in one pass; a blank segment simply yields no words.
        words.extend(segment['text'].split())
    # Join once at the end instead of growing a string inside the loop.
    return " ".join(words)

def translate_text(text, target_language="ur", max_chunk_chars=4500):
    """
    Translates the given text into the specified target language (default: Urdu).

    Text longer than ``max_chunk_chars`` is split at whitespace into pieces of
    at most that many characters and translated piece by piece, because a
    whole video transcript sent as one request can exceed what the
    translation backend accepts.
    NOTE(review): the default of 4500 is a conservative guess -- confirm the
    actual per-request limit of the pinned googletrans version.

    Args:
        text: The text to translate.
        target_language: Language code passed to the translator as ``dest``.
        max_chunk_chars: Maximum number of characters sent per request.

    Returns:
        The translated text. Pieces of a split text are re-joined with single
        spaces. The original ``text`` is returned unchanged when it is empty
        or blank, or when anything fails during translation (the error is
        printed, not raised).
    """
    # Nothing to translate: skip the network round trip entirely.
    if not text or not text.strip():
        return text

    try:
        # Constructed inside the try so a failure here also falls back to
        # returning the original text instead of propagating.
        translator = Translator()
        if len(text) <= max_chunk_chars:
            pieces = [text]
        else:
            # Break only at whitespace so words are never cut in half.
            pieces = textwrap.wrap(
                text,
                width=max_chunk_chars,
                break_long_words=False,
                break_on_hyphens=False,
                replace_whitespace=False,
            )
        return " ".join(
            translator.translate(piece, dest=target_language).text
            for piece in pieces
        )
    except Exception as e:
        print(f"Error during translation: {e}")
        return text  # Return original text if translation fails

def format_transcript_for_markdown(cleaned_text, sentences_per_paragraph=3):
    """
    Formats the cleaned transcript into a structured Markdown format.

    The text is split into sentences, and every ``sentences_per_paragraph``
    sentences form one paragraph placed under a ``### Paragraph N`` heading.

    Args:
        cleaned_text: The transcript text to format.
        sentences_per_paragraph: Number of sentences grouped per paragraph.

    Returns:
        The Markdown document as a string, or an empty string when
        ``cleaned_text`` contains no sentences.

    Raises:
        ValueError: If ``sentences_per_paragraph`` is less than 1.
    """
    if sentences_per_paragraph < 1:
        raise ValueError("sentences_per_paragraph must be at least 1")

    # A sentence ends at '.', '!' or '?' followed by whitespace. The Urdu full
    # stop (U+06D4) and Arabic question mark (U+061F) are included as well:
    # this function receives Urdu text, which would otherwise never be split
    # and would end up as one giant paragraph.
    sentences = [
        sentence
        for sentence in re.split(r'(?<=[.!?\u06D4\u061F])\s+', cleaned_text.strip())
        if sentence
    ]
    paragraphs = [
        ' '.join(sentences[i:i + sentences_per_paragraph])
        for i in range(0, len(sentences), sentences_per_paragraph)
    ]

    # Build all sections first and join once instead of repeated +=.
    return "".join(
        f"### Paragraph {idx}\n\n{paragraph}\n\n"
        for idx, paragraph in enumerate(paragraphs, start=1)
    )

def save_to_markdown(markdown_content, output_file="transcript.md"):
    """
    Write ``markdown_content`` to ``output_file`` as UTF-8, replacing any
    existing file, then print a confirmation naming the file.
    """
    handle = open(output_file, "w", encoding="utf-8")
    try:
        handle.write(markdown_content)
    finally:
        # Always release the file, even if the write fails.
        handle.close()
    print("Markdown file saved as '{}'".format(output_file))

def convert_markdown_to_text(markdown_file, text_file="transcript.txt"):
    """
    Converts a Markdown file into a plain text file.

    Heading markers (``#`` to ``######`` followed by spaces or tabs) are
    removed from the start of lines, and runs of blank lines are collapsed to
    a single blank line. Heading text itself is kept.

    Args:
        markdown_file: Path of the UTF-8 Markdown file to read.
        text_file: Path of the UTF-8 plain text file to write (overwritten).
    """
    with open(markdown_file, "r", encoding="utf-8") as md_file:
        markdown_content = md_file.read()

    # Anchor to the start of a line so a '#' inside the body text (e.g. "C#"
    # or a hashtag) is left alone, and match only spaces/tabs after the marker
    # so the pattern cannot swallow following newlines.
    plain_text = re.sub(r"^#{1,6}[ \t]+", "", markdown_content, flags=re.MULTILINE)
    # Collapse three or more consecutive newlines into one blank line.
    plain_text = re.sub(r"\n{3,}", "\n\n", plain_text)

    with open(text_file, "w", encoding="utf-8") as txt_file:
        txt_file.write(plain_text)
    print(f"Plain text file saved as '{text_file}'")

# Main Execution: fetch -> clean -> translate -> format -> save as Markdown
# -> convert to plain text. Each step prints a short status line.
if __name__ == "__main__":
    # Input: YouTube Video ID
    video_id = input("Enter the YouTube Video ID: ").strip()

    # Step 1: Fetch Transcript
    transcript = fetch_youtube_transcript(video_id)
    if not transcript:
        print("Failed to fetch transcript. Exiting...")
        # SystemExit works in every interpreter; the exit() helper is only
        # injected by the site module and, inside IPython/Jupyter, targets
        # the session rather than just stopping this script.
        raise SystemExit(1)

    # Step 2: Clean Transcript
    cleaned_text = clean_transcript(transcript)
    print("Transcript cleaned successfully.")

    # Step 3: Translate Transcript to Urdu
    translated_text = translate_text(cleaned_text, target_language="ur")
    if translated_text == cleaned_text:
        # translate_text hands back its input unchanged when translation
        # fails, so do not report success in that case.
        print("Warning: translation returned the text unchanged; "
              "continuing with the untranslated transcript.")
    else:
        print("Transcript translated to Urdu successfully.")

    # Step 4: Format Transcript for Markdown
    markdown_content = format_transcript_for_markdown(translated_text)
    print("Transcript formatted for Markdown successfully.")

    # Step 5: Save to Markdown File
    markdown_file = "transcript_urdu.md"
    save_to_markdown(markdown_content, markdown_file)

    # Step 6: Convert Markdown to Plain Text
    text_file = "transcript_urdu.txt"
    convert_markdown_to_text(markdown_file, text_file)

Enter the YouTube Video ID: e-OXgunuz_4
Transcript cleaned successfully.
Transcript translated to Urdu successfully.
Transcript formatted for Markdown successfully.
Markdown file saved as 'transcript_urdu.md'
Plain text file saved as 'transcript_urdu.txt'
