In [15]:
import requests
from urllib.parse import urljoin
import random

def get_random_user_agent():
    """Return a random modern browser user agent."""
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15'
    ]
    return random.choice(user_agents)

def fetch_youtube_channel(channel_url):
    """
    Fetch YouTube channel content while mimicking a browser request.
    
    Args:
        channel_url (str): The URL of the YouTube channel
        
    Returns:
        requests.Response: The response from the server
    """
    # Headers to mimic a browser request
    headers = {
        'User-Agent': get_random_user_agent(),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',  # Do Not Track
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0'
    }

    # Additional parameters that browsers typically send
    params = {
        'hl': 'en',  # Language
        'gl': 'US',  # Geographic location
    }

    try:
        # Send the request with a timeout
        response = requests.get(
            channel_url,
            headers=headers,
            params=params,
            timeout=10,
            allow_redirects=True
        )
        
        # Raise an exception for bad status codes
        response.raise_for_status()
        
        return response
        
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the channel: {str(e)}")
        return None


channel_url = 'https://www.youtube.com/@TED/videos'
response = fetch_youtube_channel(channel_url)

if response and response.status_code == 200:
    print(f"Successfully fetched the channel. Response length: {len(response.text)} bytes")
    # Here you can process the response content as needed
    # Note: YouTube's actual content might be loaded dynamically via JavaScript
    print(response.text)
else:
    print("Failed to fetch the channel")

Successfully fetched the channel. Response length: 827561 bytes
<!DOCTYPE html><html style="font-size: 10px;font-family: Roboto, Arial, sans-serif;" lang="en" darker-dark-theme darker-dark-theme-deprecate system-icons typography typography-spacing refresh><head><script data-id="_gd" nonce="40pG0Y5mKSGJVqw0zR0nsQ">window.WIZ_global_data = {"MUE6Ne":"youtube_web","MuJWjd":false,"UUFaWc":"%.@.null,1000,2]","cfb2h":"youtube.web-front-end-critical_20241201.10_p0","fPDxwd":[],"iCzhFc":false,"nQyAE":{},"oxN3nb":{"1":false,"0":false,"610401301":false,"899588437":false,"188588736":true,"691955189":true,"651175828":false,"653718497":false,"660014094":false},"u4g7r":"%.@.null,1000,2]","xnI9P":true,"xwAfE":true,"yFnxrf":2486};</script><meta http-equiv="origin-trial" content="ApvK67ociHgr2egd6c2ZjrfPuRs8BHcvSggogIOPQNH7GJ3cVlyJ1NOq/COCdj0+zxskqHt9HgLLETc8qqD+vwsAAABteyJvcmlnaW4iOiJodHRwczovL3lvdXR1YmUuY29tOjQ0MyIsImZlYXR1cmUiOiJQcml2YWN5U2FuZGJveEFkc0FQSXMiLCJleHBpcnkiOjE2OTUxNjc5OTksImlzU3ViZG9tYW

In [16]:
import json
import re
from typing import List, Dict, Optional

def extract_initial_data(html_content: str) -> Optional[dict]:
    """
    Extract ytInitialData from YouTube page HTML content.
    
    Args:
        html_content (str): The HTML content of the YouTube page
        
    Returns:
        Optional[dict]: Parsed ytInitialData or None if not found
    """
    # Look for ytInitialData in script tag
    pattern = r'var ytInitialData = ({.*?});</script>'
    match = re.search(pattern, html_content)
    
    if not match:
        return None
        
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError as e:
        print(f"Error parsing ytInitialData: {str(e)}")
        return None

def extract_video_metadata(html_content: str) -> List[Dict]:
    """
    Extract video metadata from YouTube channel page HTML content.
    
    Args:
        html_content (str): The HTML content of the YouTube channel page
        
    Returns:
        List[Dict]: List of dictionaries containing video metadata
    """
    initial_data = extract_initial_data(html_content)
    if not initial_data:
        return []
    
    videos = []
    try:
        # Navigate through the JSON structure to find video content
        tabs = (initial_data.get('contents', {})
                .get('twoColumnBrowseResultsRenderer', {})
                .get('tabs', []))
        
        # Find the Videos tab
        videos_tab = next(
            (tab.get('tabRenderer', {}).get('content', {})
             for tab in tabs
             if tab.get('tabRenderer', {}).get('title') == 'Videos'),
            {}
        )
        
        # Get the video list
        contents = (videos_tab.get('richGridRenderer', {})
                   .get('contents', []))
        
        for item in contents:
            video_renderer = (item.get('richItemRenderer', {})
                            .get('content', {})
                            .get('videoRenderer', {}))
            
            if not video_renderer:
                continue
            
            # Extract basic metadata
            video_id = video_renderer.get('videoId')
            if not video_id:
                continue
            
            # Get the highest quality thumbnail
            thumbnails = (video_renderer.get('thumbnail', {})
                         .get('thumbnails', []))
            # Sort thumbnails by width to get the highest quality
            sorted_thumbnails = sorted(thumbnails, 
                                     key=lambda x: x.get('width', 0), 
                                     reverse=True)
            thumbnail_url = sorted_thumbnails[0].get('url') if sorted_thumbnails else None
            
            # Extract title
            title = ''
            title_runs = (video_renderer.get('title', {})
                         .get('runs', []))
            if title_runs:
                title = title_runs[0].get('text', '')
            
            # Extract additional metadata
            view_count = (video_renderer.get('viewCountText', {})
                         .get('simpleText', '0 views'))
            
            published_time = (video_renderer.get('publishedTimeText', {})
                            .get('simpleText', ''))
            
            duration = (video_renderer.get('lengthText', {})
                       .get('simpleText', ''))
            
            description = ''
            desc_runs = (video_renderer.get('descriptionSnippet', {})
                        .get('runs', []))
            if desc_runs:
                description = desc_runs[0].get('text', '')
            
            # Get accessibility label which sometimes contains additional info
            accessibility_label = (video_renderer.get('title', {})
                                 .get('accessibility', {})
                                 .get('accessibilityData', {})
                                 .get('label', ''))
            
            # Create video metadata object
            video_data = {
                'video_id': video_id,
                'title': title,
                'thumbnail_url': thumbnail_url,
                'view_count': view_count,
                'published_time': published_time,
                'duration': duration,
                'description': description,
                'accessibility_label': accessibility_label,
                'url': f'https://www.youtube.com/watch?v={video_id}'
            }
            
            videos.append(video_data)
            
    except Exception as e:
        print(f"Error extracting video metadata: {str(e)}")
    
    return videos

def save_metadata_to_file(videos: List[Dict], filename: str = 'youtube_videos.json'):
    """
    Save the extracted video metadata to a JSON file.
    
    Args:
        videos (List[Dict]): List of video metadata dictionaries
        filename (str): Output filename
    """
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(videos, f, ensure_ascii=False, indent=2)
        print(f"Successfully saved metadata to {filename}")
    except Exception as e:
        print(f"Error saving metadata to file: {str(e)}")


In [17]:
videos = extract_video_metadata(response.text)
        
if videos:
    print(f"Successfully extracted metadata for {len(videos)} videos")
    # Save to file
    save_metadata_to_file(videos)
    
    # Print first video as example
    print("\nExample of first video metadata:")
    print(json.dumps(videos[0], indent=2))
else:
    print("No videos found in the response")

Successfully extracted metadata for 30 videos
Successfully saved metadata to youtube_videos.json

Example of first video metadata:
{
  "video_id": "YbnS39GAcQo",
  "title": "Your Relationship Expectations Could Be Holding You Back | Stephanie R. Yates-Anyabwile | TED",
  "thumbnail_url": "https://i.ytimg.com/vi/YbnS39GAcQo/hqdefault.jpg?sqp=-oaymwEcCNACELwBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLA02BBVQQagxDRm68KD6kJ8kXT3Ew",
  "view_count": "21,102 views",
  "published_time": "16 hours ago",
  "duration": "8:27",
  "description": "What if the secret to a happy relationship isn\u2019t following the rules, but rewriting them? In this refreshing talk, couples therapist Stephanie R. Yates-Anyabwile explores why letting go of...",
  "accessibility_label": "Your Relationship Expectations Could Be Holding You Back | Stephanie R. Yates-Anyabwile | TED by TED 21,102 views 16 hours ago 8 minutes, 27 seconds",
  "url": "https://www.youtube.com/watch?v=YbnS39GAcQo"
}


In [23]:
print(videos)



In [19]:
video_list = [v['video_id'] for v in videos]

In [24]:
print(video_list)

['YbnS39GAcQo', 'VEgltjydwZ8', 'g3AU44HfpfE', 'do27uAjfKbg', 'Nh1QvWm0BrQ', 'dDKozwjlt-Q', 'SoI9-PP5Tmk', 'dzBj5rRmTv8', 'HLwvyD1MbSM', 'Tq6_7XyCyyI', 'wRa3sycERxA', 'UFNRxEmoMaw', '1Bj-zXkVi68', 'YKufhUZRJ1E', '44lbeIb6TjA', 'MMaWtHnyP04', 'VuJbzKpbIVk', 'vh8q8ySORFs', 'hHYe3O7_TUA', 'PYRFxrLt30A', 'OjuYFNR1aWo', 'YP-iH-7WRLI', '2LkDU0iKaro', 'HDHbjYNBwXI', 'RmXrwKydM9k', 'Efp1EiSwDO4', 'Kcq-FxK-GK0', 'wUpZ181ATfI', 'DnqNS6fThuY', 'cbtkoZUOR1A']


In [25]:
# your_custom_script.py

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter

# Must be a single transcript.
transcript_list = YouTubeTranscriptApi.get_transcripts(video_list[0:5], languages=['en'])

formatter = JSONFormatter()

# .format_transcript(transcript) turns the transcript into a JSON string.
json_formatted = formatter.format_transcript(transcript_list)


# Now we can write it out to a file.
with open('transcripts.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json_formatted)

# Now should have a new JSON file that you can easily read back into Python.

In [35]:
import json
from typing import Dict, List

def clean_text(text: str) -> str:
    """
    深度清理文本中的特殊字符和转义符号
    """
    # 第一步：替换所有Unicode字符
    text = text.replace('\u2019', "'")
    text = text.replace('\u201c', '"')
    text = text.replace('\u201d', '"')
    
    # 第二步：处理转义字符
    text = text.replace('\\n', ' ')
    text = text.replace('\\"', '"')
    text = text.replace("\\'", "'")
    text = text.replace('\\', '')
    
    # 第三步：规范化空格
    text = ' '.join(text.split())
    
    return text.strip()

def process_transcripts(data: List[Dict]) -> Dict[str, str]:
    """
    处理视频文本，合并为清理过的段落
    """
    transcript_dict = {}
    
    if not data:
        return transcript_dict
        
    transcript_data = data[0]
    
    for video_id, segments in transcript_data.items():
        text_segments = []
        
        for segment in segments:
            text = segment.get('text', '').strip()
            
            # 跳过音效标注
            if text.startswith('(') and text.endswith(')'):
                continue
                
            if text:
                cleaned_text = clean_text(text)
                if cleaned_text:
                    text_segments.append(cleaned_text)
        
        full_transcript = ' '.join(text_segments)
        full_transcript = clean_text(full_transcript)
        transcript_dict[video_id] = full_transcript
            
    return transcript_dict

def save_transcripts(processed_transcripts: Dict[str, str], output_file: str) -> None:
    """
    保存处理后的文本，确保没有多余的转义字符
    """
    class NonEscapingJSONEncoder(json.JSONEncoder):
        def encode(self, obj):
            return super().encode(obj).replace('\\', '')
    
    output = {
        'transcripts': {
            video_id: {
                'paragraph': clean_text(paragraph)
            }
            for video_id, paragraph in processed_transcripts.items()
        }
    }
    
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, 
                 cls=NonEscapingJSONEncoder,
                 ensure_ascii=False, 
                 indent=2)

def main():
    # 读取JSON数据
    with open('transcripts.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    # 处理文本
    processed_transcripts = process_transcripts(data)
    
    # 保存结果
    save_transcripts(processed_transcripts, 'processed_transcripts-v2.json')
    
    # 打印示例
    for video_id, text in processed_transcripts.items():
        print(f"\nVideo {video_id}:")
        print(text[:200] + "...")  # 只打印前200个字符

if __name__ == "__main__":
    main()


Video YbnS39GAcQo:
I'm a couples therapist and an absolute romance fiend. I'm talking about everything from "The Notebook" to "Twilight" to a show some of you may remember called "The Flavor of Love." It's a reality com...

Video VEgltjydwZ8:
Indonesia does not have a food crisis. What we have is actually a food policy crisis. We have the answer, actually, in our food biodiversity. We have over 1,200 grains, over 600 edible roots, not to m...

Video g3AU44HfpfE:
(Sakha) Neruen-Nerguy. (English) This is the way Indigenous Sakha people greet one another. This phrase has no exact English translation, but it means "I greet the universe in your person." My name is...

Video do27uAjfKbg:
I call myself an "apocalyptic optimist," but I wasn't always this way. I used to believe that technology could save us from the climate crisis, that all the big brains in the world would come up with ...

Video Nh1QvWm0BrQ:
Two years ago, I became an accidental intrapreneur. Meaning an entrepreneur, but wit

In [37]:
from openai import OpenAI
client = OpenAI()

# System prompt to guide the model
system_prompt = """
Analyze the given text and extract key phrases or expressions that are:
1. Important concepts or ideas
2. Innovative or unique phrases
3. Compelling expressions

Format the response as a JSON object with:
phrases json objects like below example
output example
{
  en: "compelling ideas",
  cn: "引人注目的想法",
  color: "text-blue-600",
},
{
  en: "innovative solutions",
  cn: "创新解决方案",
  color: "text-green-600",
},

"""

text = "We're here today to hear compelling ideas, new innovations and thinking\nin science and medicine. And innovative solutions to our most\nvexing problems in society. So they said to me, \"Eric, do you have anything\nthat you can add to this compelling list\nof stories and ideas? Something you can talk about\nhere on the TED stage?\" And I said, \"Of course I do.\" And the question I want\nto explore with you today is: Why don’t vampires\ncast reflections in mirrors? So, you've probably seen\nthis before in movies. The humans are suspicious\nof the new dark stranger, and they band together\nin this moment of confrontation"
completion = client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": text}
  ],
  response_format={"type": "json_object"}
)


print(completion.choices[0].message)

ChatCompletionMessage(content='{\n  "phrases": [\n    {\n      "en": "compelling ideas",\n      "cn": "引人注目的想法",\n      "color": "text-blue-600"\n    },\n    {\n      "en": "new innovations",\n      "cn": "新的创新",\n      "color": "text-green-600"\n    },\n    {\n      "en": "innovative solutions",\n      "cn": "创新解决方案",\n      "color": "text-green-600"\n    },\n    {\n      "en": "vexing problems",\n      "cn": "令人困惑的问题",\n      "color": "text-yellow-600"\n    },\n    {\n      "en": "TED stage",\n      "cn": "TED舞台",\n      "color": "text-purple-600"\n    },\n    {\n      "en": "dark stranger",\n      "cn": "黑暗陌生人",\n      "color": "text-gray-600"\n    },\n    {\n      "en": "confrontation moment",\n      "cn": "对抗时刻",\n      "color": "text-red-600"\n    },\n    {\n      "en": "explore the question",\n      "cn": "探讨问题",\n      "color": "text-blue-600"\n    }\n  ]\n}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None)


In [41]:
video_id="YbnS39GAcQo"
video_transcript = YouTubeTranscriptApi.get_transcript(video_id)

In [42]:
print(video_transcript)



In [46]:
from openai import OpenAI
from typing import List, Dict
import json

def process_transcript(text: str) -> str:
    """Process transcript text using OpenAI API."""
    
    client = OpenAI()
    
    # Clean text data - remove \n and (Laughter) tags
    text = text.replace('\n', ' ').replace('(Laughter)', '')
    
    system_prompt = """
    You are a professional translator and subtitle editor. Convert the provided text into a JSON array of bilingual subtitles.
    Requirements:
    1. Break the text into natural segments of 5-20 seconds
    2. Keep the context and meaning complete in each segment
    3. Provide natural and accurate Chinese translations
    4. Maintain conversational flow and speaker's style
    
    Output format:
    {
      "subtitles": [
        {
          "timestamp": "00:04",  // MM:SS format
          "en": "English text here",
          "cn": "Chinese translation here"
        }
      ]
    }
    
    Guidelines:
    - Break at natural pauses and complete thoughts
    - Keep subtitle length comfortable for reading
    - Translate for clarity and cultural context
    - Keep translations concise but accurate
    """

    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",  # or your preferred model
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text}
            ],
            response_format={"type": "json_object"}
        )
        
        return completion.choices[0].message.content

    except Exception as e:
        print(f"Error processing transcript: {e}")
        return None

result = process_transcript(video_transcript)
if result:
    print(json.dumps(json.loads(result), indent=2, ensure_ascii=False))

In [51]:

from openai import OpenAI
from typing import List, Dict
import json

def process_transcript(segments: List[Dict]) -> str:
    """Process transcript segments using OpenAI API."""
    
    client = OpenAI()
    
    # Combine segments into single text while preserving timing
    processed_text = ""
    current_time = 0
    
    for segment in segments:
        if '(Laughter)' not in segment['text']:
            # Clean text and add timing info
            clean_text = segment['text'].replace('\n', ' ')
            time_info = f"[{segment['start']:.1f}s] "
            processed_text += time_info + clean_text + " "
    
    system_prompt = """
    You are a professional translator and subtitle editor. Convert the provided text (with timestamps) into a JSON array of bilingual subtitles.
    Requirements:
    1. Use the provided timestamps [Xs] to create segments of 5-20 seconds
    2. Keep the context and meaning complete in each segment
    3. Provide natural and accurate Chinese translations
    4. Maintain conversational flow and speaker's style
    
    Output format:
    {
      "subtitles": [
        {
          "timestamp": "MM:SS",
          "en": "English text here",
          "cn": "Chinese translation here"
        }
      ]
    }
    
    Guidelines:
    - Break at natural pauses and complete thoughts
    - Keep subtitle length comfortable for reading
    - Translate for clarity and cultural context
    - Keep translations concise but accurate
    - Use the provided timestamps to guide segmentation
    """

    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": processed_text}
            ],
            response_format={"type": "json_object"},
            temperature=0.3
        )
        
        return completion.choices[0].message.content

    except Exception as e:
        print(f"Error processing transcript: {e}")
        return None


In [52]:
result = process_transcript(video_transcript)
if result:
    print(json.dumps(json.loads(result), indent=2, ensure_ascii=False))

{
  "subtitles": [
    {
      "timestamp": "00:04",
      "en": "I’m a couples therapist and an absolute romance fiend.",
      "cn": "我是一名情侣治疗师，绝对是个浪漫爱好者。"
    },
    {
      "timestamp": "00:09",
      "en": "I'm talking about everything from 'The Notebook' to 'Twilight'",
      "cn": "我说的包括《恋恋笔记本》到《暮光之城》。"
    },
    {
      "timestamp": "00:13",
      "en": "to a show some of you may remember called 'The Flavor of Love.'",
      "cn": "还有一档你们可能记得的节目，叫做《爱的味道》。"
    },
    {
      "timestamp": "00:18",
      "en": "It's a reality competition show",
      "cn": "这是一档真人秀竞赛节目，"
    },
    {
      "timestamp": "00:20",
      "en": "where the prize was the love of Flavor Flav.",
      "cn": "奖品是Flavor Flav的爱。"
    },
    {
      "timestamp": "00:24",
      "en": "I think about relationships a lot,",
      "cn": "我经常思考人际关系，"
    },
    {
      "timestamp": "00:26",
      "en": "and something that comes up a lot in my work",
      "cn": "而在我的工作中，常常会出现一个问题，"
    },
    {
      "timestamp": 