In [4]:
# get youtube transcript
from youtube_transcript_api import YouTubeTranscriptApi

video_id = 'HDHbjYNBwXI'

In [None]:
YouTubeTranscriptApi.get_transcript(video_id)

In [8]:
import requests
from urllib.parse import urljoin
import random

def get_random_user_agent():
    """Return a random modern browser user agent."""
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15'
    ]
    return random.choice(user_agents)

def fetch_youtube_channel(channel_url):
    """
    Fetch YouTube channel content while mimicking a browser request.
    
    Args:
        channel_url (str): The URL of the YouTube channel
        
    Returns:
        requests.Response: The response from the server
    """
    # Headers to mimic a browser request
    headers = {
        'User-Agent': get_random_user_agent(),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',  # Do Not Track
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Cache-Control': 'max-age=0'
    }

    # Additional parameters that browsers typically send
    params = {
        'hl': 'en',  # Language
        'gl': 'US',  # Geographic location
    }

    try:
        # Send the request with a timeout
        response = requests.get(
            channel_url,
            headers=headers,
            params=params,
            timeout=10,
            allow_redirects=True
        )
        
        # Raise an exception for bad status codes
        response.raise_for_status()
        
        return response
        
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the channel: {str(e)}")
        return None



# def main():
#     channel_url = 'https://www.youtube.com/@TED/videos'
#     response = fetch_youtube_channel(channel_url)
    
#     if response and response.status_code == 200:
#         print(f"Successfully fetched the channel. Response length: {len(response.text)} bytes")
#         # Here you can process the response content as needed
#         # Note: YouTube's actual content might be loaded dynamically via JavaScript
#         return response.text
#     else:
#         print("Failed to fetch the channel")
#         return None

# if __name__ == "__main__":
#     main()

channel_url = 'https://www.youtube.com/@TED/videos'
response = fetch_youtube_channel(channel_url)

if response and response.status_code == 200:
    print(f"Successfully fetched the channel. Response length: {len(response.text)} bytes")
    # Here you can process the response content as needed
    # Note: YouTube's actual content might be loaded dynamically via JavaScript
    print(response.text)
else:
    print("Failed to fetch the channel")

Successfully fetched the channel. Response length: 799073 bytes
<!DOCTYPE html><html style="font-size: 10px;font-family: Roboto, Arial, sans-serif;" lang="en" darker-dark-theme darker-dark-theme-deprecate system-icons typography typography-spacing refresh><head><script data-id="_gd" nonce="TI61BQHde9aBDkMBb42rtA">window.WIZ_global_data = {"MUE6Ne":"youtube_web","MuJWjd":false,"UUFaWc":"%.@.null,1000,2]","cfb2h":"youtube.web-front-end-critical_20241030.05_p0","fPDxwd":[],"iCzhFc":false,"nQyAE":{},"oxN3nb":{"1":false,"0":false,"610401301":false,"899588437":false,"188588736":true,"676937399":true,"651175828":false,"653718497":false,"660014094":false},"u4g7r":"%.@.null,1000,2]","xnI9P":true,"xwAfE":true,"yFnxrf":2486};</script><meta http-equiv="origin-trial" content="ApvK67ociHgr2egd6c2ZjrfPuRs8BHcvSggogIOPQNH7GJ3cVlyJ1NOq/COCdj0+zxskqHt9HgLLETc8qqD+vwsAAABteyJvcmlnaW4iOiJodHRwczovL3lvdXR1YmUuY29tOjQ0MyIsImZlYXR1cmUiOiJQcml2YWN5U2FuZGJveEFkc0FQSXMiLCJleHBpcnkiOjE2OTUxNjc5OTksImlzU3ViZG9tYW

In [14]:
import json
import re
from typing import List, Dict, Optional

def extract_initial_data(html_content: str) -> Optional[dict]:
    """
    Extract ytInitialData from YouTube page HTML content.
    
    Args:
        html_content (str): The HTML content of the YouTube page
        
    Returns:
        Optional[dict]: Parsed ytInitialData or None if not found
    """
    # Look for ytInitialData in script tag
    pattern = r'var ytInitialData = ({.*?});</script>'
    match = re.search(pattern, html_content)
    
    if not match:
        return None
        
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError as e:
        print(f"Error parsing ytInitialData: {str(e)}")
        return None

def extract_video_metadata(html_content: str) -> List[Dict]:
    """
    Extract video metadata from YouTube channel page HTML content.
    
    Args:
        html_content (str): The HTML content of the YouTube channel page
        
    Returns:
        List[Dict]: List of dictionaries containing video metadata
    """
    initial_data = extract_initial_data(html_content)
    if not initial_data:
        return []
    
    videos = []
    try:
        # Navigate through the JSON structure to find video content
        tabs = (initial_data.get('contents', {})
                .get('twoColumnBrowseResultsRenderer', {})
                .get('tabs', []))
        
        # Find the Videos tab
        videos_tab = next(
            (tab.get('tabRenderer', {}).get('content', {})
             for tab in tabs
             if tab.get('tabRenderer', {}).get('title') == 'Videos'),
            {}
        )
        
        # Get the video list
        contents = (videos_tab.get('richGridRenderer', {})
                   .get('contents', []))
        
        for item in contents:
            video_renderer = (item.get('richItemRenderer', {})
                            .get('content', {})
                            .get('videoRenderer', {}))
            
            if not video_renderer:
                continue
            
            # Extract basic metadata
            video_id = video_renderer.get('videoId')
            if not video_id:
                continue
            
            # Get the highest quality thumbnail
            thumbnails = (video_renderer.get('thumbnail', {})
                         .get('thumbnails', []))
            # Sort thumbnails by width to get the highest quality
            sorted_thumbnails = sorted(thumbnails, 
                                     key=lambda x: x.get('width', 0), 
                                     reverse=True)
            thumbnail_url = sorted_thumbnails[0].get('url') if sorted_thumbnails else None
            
            # Extract title
            title = ''
            title_runs = (video_renderer.get('title', {})
                         .get('runs', []))
            if title_runs:
                title = title_runs[0].get('text', '')
            
            # Extract additional metadata
            view_count = (video_renderer.get('viewCountText', {})
                         .get('simpleText', '0 views'))
            
            published_time = (video_renderer.get('publishedTimeText', {})
                            .get('simpleText', ''))
            
            duration = (video_renderer.get('lengthText', {})
                       .get('simpleText', ''))
            
            description = ''
            desc_runs = (video_renderer.get('descriptionSnippet', {})
                        .get('runs', []))
            if desc_runs:
                description = desc_runs[0].get('text', '')
            
            # Get accessibility label which sometimes contains additional info
            accessibility_label = (video_renderer.get('title', {})
                                 .get('accessibility', {})
                                 .get('accessibilityData', {})
                                 .get('label', ''))
            
            # Create video metadata object
            video_data = {
                'video_id': video_id,
                'title': title,
                'thumbnail_url': thumbnail_url,
                'view_count': view_count,
                'published_time': published_time,
                'duration': duration,
                'description': description,
                'accessibility_label': accessibility_label,
                'url': f'https://www.youtube.com/watch?v={video_id}'
            }
            
            videos.append(video_data)
            
    except Exception as e:
        print(f"Error extracting video metadata: {str(e)}")
    
    return videos

def save_metadata_to_file(videos: List[Dict], filename: str = 'youtube_videos.json'):
    """
    Save the extracted video metadata to a JSON file.
    
    Args:
        videos (List[Dict]): List of video metadata dictionaries
        filename (str): Output filename
    """
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(videos, f, ensure_ascii=False, indent=2)
        print(f"Successfully saved metadata to {filename}")
    except Exception as e:
        print(f"Error saving metadata to file: {str(e)}")



# def main():
#     # Reuse the previous scraper code
#     from youtube_scraper import fetch_youtube_channel
    
#     channel_url = 'https://www.youtube.com/@TED/videos'
#     response = fetch_youtube_channel(channel_url)
    
#     if response and response.status_code == 200:
#         # Extract metadata from the response
#         videos = extract_video_metadata(response.text)
        
#         if videos:
#             print(f"Successfully extracted metadata for {len(videos)} videos")
#             # Save to file
#             save_metadata_to_file(videos)
            
#             # Print first video as example
#             print("\nExample of first video metadata:")
#             print(json.dumps(videos[0], indent=2))
#         else:
#             print("No videos found in the response")
#     else:
#         print("Failed to fetch the channel")

# if __name__ == "__main__":
#     main()


In [18]:
videos = extract_video_metadata(response.text)
        
if videos:
    print(f"Successfully extracted metadata for {len(videos)} videos")
    # Save to file
    save_metadata_to_file(videos)
    
    # Print first video as example
    print("\nExample of first video metadata:")
    print(json.dumps(videos[0], indent=2))
else:
    print("No videos found in the response")

Successfully extracted metadata for 30 videos
Successfully saved metadata to youtube_videos.json

Example of first video metadata:
{
  "video_id": "2LkDU0iKaro",
  "title": "Networking Doesn\u2019t Have to Feel Gross | Daniel Hallak | TED",
  "thumbnail_url": "https://i.ytimg.com/vi/2LkDU0iKaro/hqdefault.jpg?sqp=-oaymwEjCNACELwBSFryq4qpAxUIARUAAAAAGAElAADIQj0AgKJDeAE=&rs=AOn4CLDiAkoBtGLV8g0HJUSkOPUBbXBgcg",
  "view_count": "24,616 views",
  "published_time": "2 days ago",
  "duration": "14:09",
  "description": "Networking doesn\u2019t always have to feel like a self-serving transaction, says executive coach Daniel Hallak. Highlighting the importance of focusing on giving rather than taking when it comes...",
  "accessibility_label": "Networking Doesn\u2019t Have to Feel Gross | Daniel Hallak | TED by TED 24,616 views 2 days ago 14 minutes, 9 seconds",
  "url": "https://www.youtube.com/watch?v=2LkDU0iKaro"
}


In [16]:
print(videos)

[{'video_id': '2LkDU0iKaro', 'title': 'Networking Doesn’t Have to Feel Gross | Daniel Hallak | TED', 'thumbnail_url': 'https://i.ytimg.com/vi/2LkDU0iKaro/hqdefault.jpg?sqp=-oaymwEjCNACELwBSFryq4qpAxUIARUAAAAAGAElAADIQj0AgKJDeAE=&rs=AOn4CLDiAkoBtGLV8g0HJUSkOPUBbXBgcg', 'view_count': '24,616 views', 'published_time': '2 days ago', 'duration': '14:09', 'description': 'Networking doesn’t always have to feel like a self-serving transaction, says executive coach Daniel Hallak. Highlighting the importance of focusing on giving rather than taking when it comes...', 'accessibility_label': 'Networking Doesn’t Have to Feel Gross | Daniel Hallak | TED by TED 24,616 views 2 days ago 14 minutes, 9 seconds', 'url': 'https://www.youtube.com/watch?v=2LkDU0iKaro'}, {'video_id': 'HDHbjYNBwXI', 'title': 'Poetry and Music That Reaches Across the Digital Void | Elle Cordova | TED', 'thumbnail_url': 'https://i.ytimg.com/vi/HDHbjYNBwXI/hqdefault.jpg?sqp=-oaymwEjCNACELwBSFryq4qpAxUIARUAAAAAGAElAADIQj0AgKJDeAE=

In [19]:
print(type(videos))

<class 'list'>


In [27]:
video_list = [v['video_id'] for v in videos]

In [28]:
print(video_list)

['2LkDU0iKaro', 'HDHbjYNBwXI', 'RmXrwKydM9k', 'Efp1EiSwDO4', 'Kcq-FxK-GK0', 'wUpZ181ATfI', 'DnqNS6fThuY', 'cbtkoZUOR1A', 'JfeRLwlnuHo', 'x6oM9hQMjUY', 'PnBMdJ5KeHk', 'TiwcAjRuhoE', 'WrxJKj71c9o', 'jFl9kFms7nA', 'Z6bxX3mcfJg', 'sgHHRVH0NFo', 'Y1kO-0Yo1R0', '4R7mX6pChSA', '_cPj3b9BDPI', 'hktbamn2jX4', 'u6m2rwNfkrU', 'V3Cav6WhwZc', 'WicBE2Uwz0A', '1GRt0j698T4', 'vNCVrtwrAWg', 'RsXLT2z3X8g', 'cYW8ntaw_v8', 'PkGCtSkbnjQ', '8jkFNm8lKOs', 'VDP27kIe7-s']


In [25]:
YouTubeTranscriptApi.get_transcripts(video_list, languages=['en'])

({'2LkDU0iKaro': [{'text': 'When I was in graduate school,',
    'start': 4.543,
    'duration': 2.335},
   {'text': 'there was a student who I looked up to.',
    'start': 6.92,
    'duration': 2.294},
   {'text': 'His name was Peter.', 'start': 9.839, 'duration': 1.21},
   {'text': 'Peter was the type of person\nyou wanted to be like.',
    'start': 11.55,
    'duration': 3.211},
   {'text': 'He was smart, articulate and winsome.',
    'start': 15.136,
    'duration': 3.504},
   {'text': 'One day I saw Peter in the library.',
    'start': 19.224,
    'duration': 2.461},
   {'text': 'It was his final quarter in our program\nand he was about to graduate.',
    'start': 21.726,
    'duration': 4.463},
   {'text': '"Peter, congratulations.', 'start': 26.648, 'duration': 2.377},
   {'text': 'You must be so excited."', 'start': 29.067, 'duration': 2.419},
   {'text': 'His response surprised me.', 'start': 32.445, 'duration': 1.71},
   {'text': '"I am, but I haven\'t built\nmy network like 

In [29]:
# your_custom_script.py

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter

# Must be a single transcript.
transcript_list = YouTubeTranscriptApi.get_transcripts(video_list, languages=['en'])

formatter = JSONFormatter()

# .format_transcript(transcript) turns the transcript into a JSON string.
json_formatted = formatter.format_transcript(transcript_list)


# Now we can write it out to a file.
with open('transcripts.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json_formatted)

# Now should have a new JSON file that you can easily read back into Python.

ParseError: no element found: line 1, column 0 (<string>)

In [31]:
transcript_list = YouTubeTranscriptApi.get_transcripts(video_list[:9], languages=['en'])

In [32]:
print(type(transcript_list))

<class 'tuple'>


In [33]:
transcript_list

({'2LkDU0iKaro': [{'text': 'When I was in graduate school,',
    'start': 4.543,
    'duration': 2.335},
   {'text': 'there was a student who I looked up to.',
    'start': 6.92,
    'duration': 2.294},
   {'text': 'His name was Peter.', 'start': 9.839, 'duration': 1.21},
   {'text': 'Peter was the type of person\nyou wanted to be like.',
    'start': 11.55,
    'duration': 3.211},
   {'text': 'He was smart, articulate and winsome.',
    'start': 15.136,
    'duration': 3.504},
   {'text': 'One day I saw Peter in the library.',
    'start': 19.224,
    'duration': 2.461},
   {'text': 'It was his final quarter in our program\nand he was about to graduate.',
    'start': 21.726,
    'duration': 4.463},
   {'text': '"Peter, congratulations.', 'start': 26.648, 'duration': 2.377},
   {'text': 'You must be so excited."', 'start': 29.067, 'duration': 2.419},
   {'text': 'His response surprised me.', 'start': 32.445, 'duration': 1.71},
   {'text': '"I am, but I haven\'t built\nmy network like 

In [36]:
transcript_list2 = YouTubeTranscriptApi.get_transcripts(video_list[10:20], languages=['en'])

In [35]:
transcript_list3 = YouTubeTranscriptApi.get_transcripts(video_list[21:30], languages=['en'])

In [37]:
formatter = JSONFormatter()

In [38]:
json_formatted = formatter.format_transcript(transcript_list)

In [39]:
print(json_formatted)



In [40]:
with open('transcript_list0.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json_formatted)

In [41]:
import json
from typing import Dict, List

def process_transcripts(data: List[Dict]) -> Dict[str, str]:
    """
    Process video transcripts by combining all text segments into a single paragraph for each video.
    
    Args:
        data: List containing transcripts for videos
        
    Returns:
        Dict mapping video IDs to their combined transcript paragraphs
    """
    transcript_dict = {}
    
    # Get first item which contains the transcripts
    if data and len(data) > 0:
        transcript_data = data[0]
        
        # Process each video's transcript
        for video_id, segments in transcript_data.items():
            # Combine all text segments, filtering out sound effects/actions in parentheses
            text_segments = []
            
            for segment in segments:
                text = segment.get('text', '').strip()
                
                # Skip pure sound effect/action segments
                if text.startswith('(') and text.endswith(')'):
                    continue
                    
                if text:
                    text_segments.append(text)
            
            # Join all text segments with spaces
            full_transcript = ' '.join(text_segments)
            
            # Store in dictionary
            transcript_dict[video_id] = full_transcript
            
    return transcript_dict

# Load JSON data from file
with open('transcript_list0.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Process transcripts
processed_transcripts = process_transcripts(data)

# Save processed transcripts to new JSON file
output = {
    'transcripts': {
        video_id: {
            'paragraph': paragraph
        }
        for video_id, paragraph in processed_transcripts.items()
    }
}

print(output)



In [44]:
with open('processed_transcripts.json', 'w', encoding='utf-8') as f:
    json.dump(output, f, ensure_ascii=False, indent=2)

# Print sample of processed data
print("\nProcessed transcripts for {} videos".format(len(processed_transcripts)))
if processed_transcripts:
    sample_id = next(iter(processed_transcripts))
    print(f"\nSample transcript for video {sample_id}:")
    print(processed_transcripts[sample_id][:200] + "...")


Processed transcripts for 10 videos

Sample transcript for video PnBMdJ5KeHk:
Few insects have captured our imagination
like the monarch butterfly. Their migration is one of the most iconic
wildlife spectacles in North America, but they are also one of the best
environmental in...


In [11]:
# 1. python-slugify (最流行的选择)
from slugify import slugify

# def slugify_example():
#     # 基础用法
#     title = "How to Learn Python 2024 完整教程!"
#     basic_slug = slugify(title)  # 输出: how-to-learn-python-2024-wan-zheng-jiao-cheng
    
#     # 自定义配置
    # custom_slug = slugify(title,
    #     lowercase=True,          # 转换为小写
    #     separator='-',           # 分隔符
    #     allow_unicode=True,      # 允许 Unicode 字符
    #     replacements=[('&', 'and')],  # 自定义替换
    #     max_length=100          # 最大长度
    # )
    
#     return basic_slug, custom_slug

# 2. awesome-slugify (更好的Unicode支持)
# from slugify import Slugify, LOWERCASE, UPPERCASE

# def awesome_slugify_example():
#     custom_slugify = Slugify(to_lower=True)
    
#     # 添加自定义替换规则
#     custom_slugify.separator = '-'
#     custom_slugify.safe_chars = '-'
#     custom_slugify.stop_words = ('and', 'the', 'in')
    
#     title = "Python Programming & Data Science 教程"
#     slug = custom_slugify(title)  # 输出: python-programming-data-science-jiao-cheng
    
#     return slug

titles = [
    "Poetry and Music That Reaches Across the Digital Void | Elle Cordova | TED",
    "The Tipping Point I Got Wrong | Malcolm Gladwell | TED",
    "A Food System That Fights Climate Change — Instead of Causing It | Gonzalo Muñoz | TED",
]

# 使用不同的库生成slug
for title in titles:
    print(f"\n原标题: {title}")
    print(f"python-slugify: {slugify(title,allow_unicode=True)}")


原标题: Poetry and Music That Reaches Across the Digital Void | Elle Cordova | TED
python-slugify: poetry-and-music-that-reaches-across-the-digital-void-elle-cordova-ted

原标题: The Tipping Point I Got Wrong | Malcolm Gladwell | TED
python-slugify: the-tipping-point-i-got-wrong-malcolm-gladwell-ted

原标题: A Food System That Fights Climate Change — Instead of Causing It | Gonzalo Muñoz | TED
python-slugify: a-food-system-that-fights-climate-change-instead-of-causing-it-gonzalo-muñoz-ted


In [1]:
!pip install python-slugify

Collecting python-slugify
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting text-unidecode>=1.3 (from python-slugify)
  Downloading text_unidecode-1.3-py2.py3-none-any.whl.metadata (2.4 kB)
Downloading python_slugify-8.0.4-py2.py3-none-any.whl (10 kB)
Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
Installing collected packages: text-unidecode, python-slugify
Successfully installed python-slugify-8.0.4 text-unidecode-1.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [16]:
from openai import OpenAI
client = OpenAI()

# System prompt to guide the model
system_prompt = """
Analyze the given text and extract key phrases or expressions that are:
1. Important concepts or ideas
2. Innovative or unique phrases
3. Compelling expressions

Format the response as a JSON object with:
phrases json objects like below example
output example
{
  en: "compelling ideas",
  cn: "引人注目的想法",
  color: "text-blue-600",
},
{
  en: "innovative solutions",
  cn: "创新解决方案",
  color: "text-green-600",
},

"""

text = "We're here today to hear compelling ideas, new innovations and thinking\nin science and medicine. And innovative solutions to our most\nvexing problems in society. So they said to me, \"Eric, do you have anything\nthat you can add to this compelling list\nof stories and ideas? Something you can talk about\nhere on the TED stage?\" And I said, \"Of course I do.\" And the question I want\nto explore with you today is: Why don’t vampires\ncast reflections in mirrors? So, you've probably seen\nthis before in movies. The humans are suspicious\nof the new dark stranger, and they band together\nin this moment of confrontation"
completion = client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": text}
  ],
  response_format={"type": "json_object"}
)


print(completion.choices[0].message)

ChatCompletionMessage(content='{\n  "keyPhrases": [\n    {\n      "en": "compelling ideas",\n      "cn": "引人注目的想法",\n      "color": "text-blue-600"\n    },\n    {\n      "en": "new innovations",\n      "cn": "新创新",\n      "color": "text-green-600"\n    },\n    {\n      "en": "science and medicine",\n      "cn": "科学与医学",\n      "color": "text-purple-600"\n    },\n    {\n      "en": "innovative solutions",\n      "cn": "创新解决方案",\n      "color": "text-orange-600"\n    },\n    {\n      "en": "vexing problems",\n      "cn": "棘手问题",\n      "color": "text-red-600"\n    },\n    {\n      "en": "TED stage",\n      "cn": "TED舞台",\n      "color": "text-blue-800"\n    },\n    {\n      "en": "dark stranger",\n      "cn": "黑暗的陌生人",\n      "color": "text-gray-600"\n    },\n    {\n      "en": "moment of confrontation",\n      "cn": "对抗的时刻",\n      "color": "text-yellow-600"\n    },\n    {\n      "en": "vampires cast reflections",\n      "cn": "吸血鬼的倒影",\n      "color": "text-silver-600"\n    }\n  ]\n}',

In [None]:
        try:
            # System prompt to guide the model
            system_prompt = """
            Analyze the given text and extract key phrases or expressions that are:
            1. Important concepts or ideas
            2. Innovative or unique phrases
            3. Compelling expressions
            
            Format the response as a JSON object with:
            - The original text
            - An array of highlights, each containing:
              - phrase: the extracted phrase
              - color: suggested highlight color (choose from: text-blue-600, text-green-600, text-yellow-600)
            """

            # Make the API call
            response = self.client.chat.completions.create(
                model="gpt-4",  # Or another appropriate model
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": text}
                ],
                response_format={"type": "json_object"}
            )

            # Parse the response
            result = json.loads(response.choices[0].message.content)
            
            # Format the final output
            output = {
                "text": text,
                "highlights": result.get("highlights", [])
            }

            return output

        except Exception as e:
            return {
                "error": str(e),
                "text": text,
                "highlights": []
            }