In [1]:
import pandas as pd
import os
from openai import OpenAI
import json
from tqdm import tqdm
import time

In [11]:
def generate_ground_truth(df, output_file='podcast_ground_truth.json', api_key=None, max_episodes=None, sample_random=False, save_every=10):
    """
    Generate ground-truth search queries for podcast episodes with the OpenAI chat API.

    For every row that has a summary, the model receives the episode title and
    summary and is asked (system prompt below) to answer with a JSON object whose
    'word' key lists five search terms. The parsed reply is stored under the
    stringified episode id. Rows without a summary are skipped, and episodes whose
    request ultimately fails are reported and left out of the result.

    Args:
        df: Pandas DataFrame containing podcast episode data with 'episode', 'title', and 'summary' columns
        output_file: Path to save the ground truth data as JSON
        api_key: OpenAI API key (will use the OPENAI_API_KEY environment variable if None)
        max_episodes: Maximum number of episodes to process (useful for testing with fewer episodes)
        sample_random: If True and max_episodes is set, randomly sample episodes instead of taking first few
        save_every: Write an intermediate snapshot to output_file after this many
            successfully processed episodes. 0 or None disables intermediate saves;
            the final save always happens.

    Returns:
        Dictionary mapping str(episode id) to the parsed JSON reply (always containing a 'word' key)

    Raises:
        ValueError: If no API key is given and OPENAI_API_KEY is not set.
    """
    # Resolve the API key: explicit argument first, then the environment.
    if api_key is None:
        api_key = os.environ.get("OPENAI_API_KEY")
        if api_key is None:
            raise ValueError("OpenAI API key not found. Please provide it as an argument or set the OPENAI_API_KEY environment variable.")

    client = OpenAI(api_key=api_key)

    system_prompt = """你是一位知道台灣Podcast「通勤第一品牌」(Commute For Me)的聽眾，熟悉台灣文化和用語。
你將獲得「通勤第一品牌」(Commute For Me) podcast的標題和內容摘要。
你的任務是：
1. 識別該集節目中討論的15個關鍵主題
2. 扮演只記得部分內容的聽眾，且不知道確切集數
3. 根據第1點與第2點，寫出4個搜尋關鍵詞與1個關鍵句子(名詞+動詞)，總共5個
4. 搜尋關鍵詞避免只出現"家倫"、"李毅誠"

輸出格式應為有效的JSON，格式如下：
'word': ["word1", "word2", ..., "word5"]
"""

    ground_truth = {}

    def _write_snapshot():
        # Persist everything collected so far (UTF-8, non-ASCII kept readable).
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(ground_truth, f, ensure_ascii=False, indent=2)

    # Limit the number of episodes if specified
    if max_episodes is not None:
        if sample_random:
            df_to_process = df.sample(min(max_episodes, len(df)))
        else:
            df_to_process = df.head(max_episodes)
        print(f"Testing with {len(df_to_process)} episodes out of {len(df)} total episodes")
    else:
        df_to_process = df

    max_retries = 3
    processed_count = 0

    for _, row in tqdm(df_to_process.iterrows(), total=len(df_to_process), desc="Processing episodes"):
        episode_id = row['episode']
        title = row['title']
        summary = row['summary']

        # Skip if summary is NaN
        if pd.isna(summary):
            print(f"Skipping episode {episode_id} - No summary available")
            continue

        user_prompt = f"Episode {episode_id}: {title}\n\nSummary:\n{summary}"

        # Only the API call and the parsing live inside the retry loop, so that
        # bookkeeping and file errors are never mistaken for request failures.
        episode_data = None
        for attempt in range(max_retries):
            is_last_attempt = attempt == max_retries - 1
            try:
                response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt}
                    ],
                    temperature=0.2,  # Lower temperature for more consistent outputs
                    response_format={"type": "json_object"}
                )

                parsed = json.loads(response.choices[0].message.content)
                if not isinstance(parsed, dict):
                    # A non-object reply cannot carry the 'word' key; treat it
                    # as a non-retryable failure for this episode.
                    raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")

                episode_data = parsed
                break

            except json.JSONDecodeError:
                print(f"Error: Invalid JSON response for episode {episode_id}. Retrying...")
                if not is_last_attempt:
                    time.sleep(2)

            except Exception as e:
                print(f"Error processing episode {episode_id}: {str(e)}")
                if "rate limit" not in str(e).lower():
                    break  # Non-rate-limit errors are not retried
                if not is_last_attempt:
                    wait_time = (attempt + 1) * 5  # Linear backoff: 5s, 10s, ...
                    print(f"Rate limited. Waiting {wait_time} seconds before retry...")
                    time.sleep(wait_time)

        if episode_data is None:
            print(f"Giving up on episode {episode_id}; no entry recorded.")
        else:
            # Guarantee the 'word' key so consumers can index it unconditionally.
            if "word" not in episode_data:
                episode_data["word"] = []

            ground_truth[str(episode_id)] = episode_data
            processed_count += 1

            # Checkpoint on the number of stored episodes, not on the episode id:
            # ids may be skipped or sampled, so an id-based rule might never fire.
            if save_every and processed_count % save_every == 0:
                _write_snapshot()

        # Add a small delay between requests to avoid rate limits
        time.sleep(0.5)

    # Save final results
    _write_snapshot()

    print(f"Ground truth data generated and saved to {output_file}")
    return ground_truth

In [12]:
# Load the episode table; generate_ground_truth reads its 'episode', 'title' and 'summary' columns.
df = pd.read_csv('podcast_data.csv')
    
# Trial run on 3 randomly sampled episodes (API key is taken from the OPENAI_API_KEY environment variable)
ground_truth = generate_ground_truth(df, max_episodes=3, sample_random=True)

Testing with 3 episodes out of 444 total episodes


Processing episodes: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.80s/it]

Ground truth data generated and saved to podcast_ground_truth.json





In [13]:
# Display the dictionary returned by the trial run.
ground_truth

{'57': {'word': ['成人之美', '情感回顧', '分手經驗', '友誼名片', '日本咖啡罐']},
 '215': {'word': ['國際視野', '葉耀元', '台灣研究', '教育制度', '考試制度']},
 '202': {'word': ['春困', '嗜睡症', '托嬰中心', '超商遊戲', '確診']}}

In [14]:
# Full run over every row of df (the recorded progress bar shows roughly 13 minutes for 444 rows).
# Uses the default output_file, so it overwrites the JSON written by the trial run.
ground_truth = generate_ground_truth(df)

Processing episodes:   1%|██                                                                                                                                                                                  | 5/444 [00:08<13:10,  1.80s/it]

Skipping episode 6 - No summary available
Skipping episode 7 - No summary available
Skipping episode 8 - No summary available
Skipping episode 9 - No summary available
Skipping episode 10 - No summary available
Skipping episode 11 - No summary available
Skipping episode 12 - No summary available
Skipping episode 13 - No summary available


Processing episodes:  14%|█████████████████████████▍                                                                                                                                                         | 63/444 [01:44<10:48,  1.70s/it]

Skipping episode 64 - No summary available


Processing episodes:  33%|██████████████████████████████████████████████████████████▏                                                                                                                       | 145/444 [04:22<09:43,  1.95s/it]

Skipping episode 146 - No summary available


Processing episodes:  50%|████████████████████████████████████████████████████████████████████████████████████████▌                                                                                         | 221/444 [06:52<06:23,  1.72s/it]

Skipping episode 222 - No summary available


Processing episodes:  51%|██████████████████████████████████████████████████████████████████████████████████████████▏                                                                                       | 225/444 [06:57<05:31,  1.51s/it]

Skipping episode 226 - No summary available


Processing episodes:  51%|███████████████████████████████████████████████████████████████████████████████████████████▍                                                                                      | 228/444 [07:01<04:46,  1.33s/it]

Skipping episode 229 - No summary available


Processing episodes:  56%|████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                             | 250/444 [07:40<06:10,  1.91s/it]

Skipping episode 251 - No summary available


Processing episodes:  59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                         | 262/444 [07:58<04:56,  1.63s/it]

Skipping episode 263 - No summary available


Processing episodes:  60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                      | 268/444 [08:07<04:51,  1.66s/it]

Skipping episode 269 - No summary available


Processing episodes:  61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                     | 270/444 [08:09<03:40,  1.26s/it]

Skipping episode 271 - No summary available


Processing episodes:  62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                   | 275/444 [08:16<04:30,  1.60s/it]

Skipping episode 276 - No summary available


Processing episodes:  64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                | 283/444 [08:29<04:43,  1.76s/it]

Skipping episode 284 - No summary available


Processing episodes:  68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                         | 300/444 [08:59<05:08,  2.14s/it]

Skipping episode 301 - No summary available


Processing episodes:  72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                 | 320/444 [09:38<07:05,  3.43s/it]

Skipping episode 321 - No summary available


Processing episodes:  73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 325/444 [09:45<04:10,  2.11s/it]

Skipping episode 326 - No summary available


Processing episodes:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                 | 361/444 [10:44<02:33,  1.85s/it]

Skipping episode 362 - No summary available


Processing episodes:  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                | 364/444 [10:48<02:00,  1.51s/it]

Skipping episode 365 - No summary available


Processing episodes:  84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 375/444 [11:05<01:52,  1.63s/it]

Skipping episode 376 - No summary available


Processing episodes:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 385/444 [11:21<01:42,  1.74s/it]

Skipping episode 386 - No summary available


Processing episodes:  90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                 | 400/444 [11:45<01:10,  1.60s/it]

Skipping episode 401 - No summary available
Skipping episode 402 - No summary available
Skipping episode 403 - No summary available


Processing episodes:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 404/444 [11:46<00:33,  1.20it/s]

Skipping episode 405 - No summary available
Skipping episode 406 - No summary available


Processing episodes:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏              | 407/444 [11:48<00:27,  1.36it/s]

Skipping episode 408 - No summary available


Processing episodes: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 442/444 [12:51<00:03,  1.75s/it]

Skipping episode 443 - No summary available


Processing episodes: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 444/444 [12:53<00:00,  1.74s/it]

Ground truth data generated and saved to podcast_ground_truth.json





In [19]:
# Spot-check the generated search terms for episode 410.
ground_truth['410']

{'word': ['真粉定義', '網路文化', '彩蛋解析', '深度討論', '社群反應']}

In [2]:
def generate_topic(df, output_file='podcast_topic.json', api_key=None, max_episodes=None, sample_random=False, save_every=10):
    """
    Extract discussion topics for podcast episodes with the OpenAI chat API.

    For every row that has a summary, the model receives the episode title and
    summary and is asked (system prompt below) to answer with a JSON object whose
    'topic' key lists the episode's key topics or sentences. The parsed reply,
    plus the episode's song recommendation under 'song', is stored under the
    stringified episode id. Rows without a summary are skipped, and episodes whose
    request ultimately fails are reported and left out of the result.

    Args:
        df: Pandas DataFrame containing podcast episode data with 'episode', 'title', and 'summary' columns.
            An optional 'song_recommendation' column is copied into each entry;
            when the column or the value is missing, the string 'null' is stored.
        output_file: Path to save the topic data as JSON
        api_key: OpenAI API key (will use the OPENAI_API_KEY environment variable if None)
        max_episodes: Maximum number of episodes to process (useful for testing with fewer episodes)
        sample_random: If True and max_episodes is set, randomly sample episodes instead of taking first few
        save_every: Write an intermediate snapshot to output_file after this many
            successfully processed episodes. 0 or None disables intermediate saves;
            the final save always happens.

    Returns:
        Dictionary mapping str(episode id) to the parsed JSON reply (always containing 'topic' and 'song' keys)

    Raises:
        ValueError: If no API key is given and OPENAI_API_KEY is not set.
    """
    # Resolve the API key: explicit argument first, then the environment.
    if api_key is None:
        api_key = os.environ.get("OPENAI_API_KEY")
        if api_key is None:
            raise ValueError("OpenAI API key not found. Please provide it as an argument or set the OPENAI_API_KEY environment variable.")

    client = OpenAI(api_key=api_key)

    system_prompt = """你是一位知道台灣Podcast「通勤第一品牌」(Commute For Me)的聽眾，熟悉台灣文化和用語。
你將獲得「通勤第一品牌」(Commute For Me) podcast的標題和內容摘要。
你的任務是：
1. 識別該集節目中討論的10~20個關鍵主題或句子

輸出格式應為有效的JSON，格式如下：
'topic': ["topic1", "topic2", ..., "topic20"]
"""

    podcast_topic = {}

    def _write_snapshot():
        # Persist everything collected so far (UTF-8, non-ASCII kept readable).
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(podcast_topic, f, ensure_ascii=False, indent=2)

    # Limit the number of episodes if specified
    if max_episodes is not None:
        if sample_random:
            df_to_process = df.sample(min(max_episodes, len(df)))
        else:
            df_to_process = df.head(max_episodes)
        print(f"Testing with {len(df_to_process)} episodes out of {len(df)} total episodes")
    else:
        df_to_process = df

    max_retries = 3
    processed_count = 0

    for _, row in tqdm(df_to_process.iterrows(), total=len(df_to_process), desc="Processing episodes"):
        episode_id = row['episode']
        title = row['title']
        summary = row['summary']

        # Skip if summary is NaN
        if pd.isna(summary):
            print(f"Skipping episode {episode_id} - No summary available")
            continue

        # .get() keeps frames without a 'song_recommendation' column usable;
        # missing values are stored as the string 'null'.
        song = row.get('song_recommendation')
        if pd.isnull(song):
            song = 'null'

        user_prompt = f"Episode {episode_id}: {title}\n\nSummary:\n{summary}"

        # Only the API call and the parsing live inside the retry loop, so that
        # bookkeeping and file errors are never mistaken for request failures.
        episode_data = None
        for attempt in range(max_retries):
            is_last_attempt = attempt == max_retries - 1
            try:
                response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt}
                    ],
                    temperature=0.2,  # Lower temperature for more consistent outputs
                    response_format={"type": "json_object"}
                )

                parsed = json.loads(response.choices[0].message.content)
                if not isinstance(parsed, dict):
                    # A non-object reply cannot carry the 'topic' key; treat it
                    # as a non-retryable failure for this episode.
                    raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")

                episode_data = parsed
                break

            except json.JSONDecodeError:
                print(f"Error: Invalid JSON response for episode {episode_id}. Retrying...")
                if not is_last_attempt:
                    time.sleep(2)

            except Exception as e:
                print(f"Error processing episode {episode_id}: {str(e)}")
                if "rate limit" not in str(e).lower():
                    break  # Non-rate-limit errors are not retried
                if not is_last_attempt:
                    wait_time = (attempt + 1) * 5  # Linear backoff: 5s, 10s, ...
                    print(f"Rate limited. Waiting {wait_time} seconds before retry...")
                    time.sleep(wait_time)

        if episode_data is None:
            print(f"Giving up on episode {episode_id}; no entry recorded.")
        else:
            # Guarantee the 'topic' key so consumers can index it unconditionally.
            if "topic" not in episode_data:
                episode_data["topic"] = []

            episode_data["song"] = song

            podcast_topic[str(episode_id)] = episode_data
            processed_count += 1

            # Checkpoint on the number of stored episodes, not on the episode id:
            # ids may be skipped or sampled, so an id-based rule might never fire.
            if save_every and processed_count % save_every == 0:
                _write_snapshot()

        # Add a small delay between requests to avoid rate limits
        time.sleep(0.5)

    # Save final results
    _write_snapshot()

    print(f"Process data generated and saved to {output_file}")
    return podcast_topic

In [3]:
# Load the episode table; generate_topic reads its 'episode', 'title', 'summary' and 'song_recommendation' columns.
df = pd.read_csv('podcast_data.csv')
    
# Trial run of topic extraction on the first 3 rows (API key is taken from the OPENAI_API_KEY environment variable)
podcast_topic = generate_topic(df, max_episodes=3)

Testing with 3 episodes out of 444 total episodes


Processing episodes: 100%|████████████████████████████████████████████████████| 3/3 [00:11<00:00,  3.85s/it]

Process data generated and saved to podcast_topic.json





In [4]:
# Display the topics returned by the 3-episode trial run.
# NOTE(review): the recorded output shows 'song': nan, while generate_topic as written
# stores the string 'null' for a missing song - the output may predate the current code; re-run to confirm.
podcast_topic

{'1': {'topic': ['NMSL的起源',
   'CS遊戲文化',
   '丹麥二線玩家',
   'envy團體',
   'NMSL的含義',
   '刺青文化',
   '中泰twitter大戰',
   '避敏感詞的用法',
   '台灣的委婉表達',
   '吳淑珍的網路搜尋',
   '馬鶴凌的提及',
   '人類集體淺意識',
   '文字獄的討論',
   '5000年的傳統',
   '白色恐怖的歷史',
   '誠誠的遊戲名字',
   '手肘走路的練習',
   '社會政治的隱喻',
   '文化認同的探討'],
  'song': nan},
 '2': {'topic': ['多人運動',
   '偷情',
   'VR做愛',
   '通姦罪',
   '公投',
   '性成癮',
   '開放性關係',
   '羞辱感',
   '身心分離派',
   '空幹影片',
   '母系社會',
   '出軌外遇',
   '應變能力',
   '誠實應對',
   '名人稅',
   '林俊傑',
   '台獨',
   '羅志祥',
   '柯粉',
   '和解'],
  'song': nan},
 '3': {'topic': ['清大人社學費爭論',
   '宿舍費不退',
   '綽號文化',
   '條直性格',
   '自我介紹方式',
   '連戰的父親身份',
   '綽號的創造與使用',
   '李敖語錄',
   '家庭暴力話題',
   '性騷擾事件',
   '社交壓力',
   '父母對工作升遷的影響',
   '支持民眾黨',
   '西瓜挑戰',
   '家庭背景與努力的關係',
   '搞笑的原則',
   '社會期待與個人努力',
   '韓流影響政治',
   '印象管理'],
  'song': nan}}

In [5]:
# Second trial: 5 randomly sampled episodes (unseeded, so the selection differs between runs).
podcast_topic = generate_topic(df, max_episodes=5, sample_random=True)

Testing with 5 episodes out of 444 total episodes


Processing episodes: 100%|████████████████████████████████████████████████████| 5/5 [00:18<00:00,  3.64s/it]

Process data generated and saved to podcast_topic.json





In [6]:
# Display the topics returned for the randomly sampled episodes.
podcast_topic

{'203': {'topic': ['威滅的廣告劇',
   '何ㄟ女友和維力炸醬麵',
   '泡麵冷暴力',
   '咖啡盤子的用途',
   '奶茶機的回憶',
   '台大學習的理論',
   '整鰭器的使用',
   '電影私法爭鋒',
   '吳斐莉代言的鞋子',
   '電影阿凡達的評價',
   '品味的懷疑',
   '重看Star Trek 2009',
   '小何的故事',
   '網頁瀏覽器的分類',
   '情緒管理',
   '台裔醫師的勇氣',
   '釣魚行為的影響',
   '負面情緒的勾動'],
  'song': nan},
 '281': {'topic': ['夢境',
   '張菲',
   '睡袍',
   '乃哥家',
   '採訪',
   '打麻將',
   '成功學',
   '筆記',
   '超慢跑',
   '披薩',
   '減肥學',
   '減肥失敗',
   '成功學的可信度',
   '邊做邊吃',
   '生活方式',
   '幽默',
   '文化',
   '台灣名人',
   '休閒活動'],
  'song': '超時空要塞'},
 '41': {'topic': ['春艷的合作經歷',
   '第一次拿到的報酬',
   '黑貓宅急便的工作經歷',
   '家庭背景與佛法信仰',
   '冥想在監獄生活中的作用',
   '集體意志的重要性',
   '夜生活的享受與失控',
   '對於他人觀點的看法',
   '慾望的空虛',
   '夜生活中的抓馬事件',
   '人生的需求與想要的區分',
   '交大八社的烤香腸',
   '音樂與情感的連結',
   '夜貓組的代言歌',
   '社交與人際關係的挑戰',
   '對於生活中發生事情的接受',
   '倫的學習與生活態度',
   '春艷的音樂創作背景',
   '夜生活的社會文化影響',
   '對於失控的恐懼與接受'],
  'song': '/'},
 '152': {'topic': ['倫迷看治療蹄底皮膚指甲病的影片',
   '何ㄟ開始節食',
   '168斷食法',
   '186斷食法',
   '何特愛小熊軟糖',
   '哈根達斯冰淇淋',
   '倫是馬鈴薯狂粉',
   '薯餅蛋吐司',
   

In [7]:
# Full run over every row of df (the recorded progress bar shows roughly 28 minutes for 444 rows).
# Uses the default output_file, so it overwrites the JSON written by the trial runs.
podcast_topic = generate_topic(df)

Processing episodes:   1%|▌                                                 | 5/444 [00:17<26:21,  3.60s/it]

Skipping episode 6 - No summary available
Skipping episode 7 - No summary available
Skipping episode 8 - No summary available
Skipping episode 9 - No summary available
Skipping episode 10 - No summary available
Skipping episode 11 - No summary available
Skipping episode 12 - No summary available
Skipping episode 13 - No summary available


Processing episodes:  14%|██████▉                                          | 63/444 [03:57<25:34,  4.03s/it]

Skipping episode 64 - No summary available


Processing episodes:  33%|███████████████▋                                | 145/444 [09:25<19:37,  3.94s/it]

Skipping episode 146 - No summary available


Processing episodes:  50%|███████████████████████▉                        | 221/444 [14:31<13:26,  3.62s/it]

Skipping episode 222 - No summary available


Processing episodes:  51%|████████████████████████▎                       | 225/444 [14:42<11:51,  3.25s/it]

Skipping episode 226 - No summary available


Processing episodes:  51%|████████████████████████▋                       | 228/444 [14:50<10:57,  3.04s/it]

Skipping episode 229 - No summary available


Processing episodes:  56%|███████████████████████████                     | 250/444 [16:16<16:25,  5.08s/it]

Skipping episode 251 - No summary available


Processing episodes:  59%|████████████████████████████▎                   | 262/444 [16:58<11:03,  3.64s/it]

Skipping episode 263 - No summary available


Processing episodes:  60%|████████████████████████████▉                   | 268/444 [17:19<10:53,  3.71s/it]

Skipping episode 269 - No summary available


Processing episodes:  61%|█████████████████████████████▏                  | 270/444 [17:22<08:01,  2.77s/it]

Skipping episode 271 - No summary available


Processing episodes:  62%|█████████████████████████████▋                  | 275/444 [17:40<10:35,  3.76s/it]

Skipping episode 276 - No summary available


Processing episodes:  64%|██████████████████████████████▌                 | 283/444 [18:01<08:09,  3.04s/it]

Skipping episode 284 - No summary available


Processing episodes:  68%|████████████████████████████████▍               | 300/444 [19:11<09:51,  4.11s/it]

Skipping episode 301 - No summary available


Processing episodes:  72%|██████████████████████████████████▌             | 320/444 [20:24<07:28,  3.61s/it]

Skipping episode 321 - No summary available


Processing episodes:  73%|███████████████████████████████████▏            | 325/444 [20:41<08:12,  4.14s/it]

Skipping episode 326 - No summary available


Processing episodes:  81%|███████████████████████████████████████         | 361/444 [23:02<05:15,  3.80s/it]

Skipping episode 362 - No summary available


Processing episodes:  82%|███████████████████████████████████████▎        | 364/444 [23:07<03:36,  2.71s/it]

Skipping episode 365 - No summary available


Processing episodes:  84%|████████████████████████████████████████▌       | 375/444 [23:50<04:47,  4.16s/it]

Skipping episode 376 - No summary available


Processing episodes:  87%|█████████████████████████████████████████▌      | 385/444 [24:24<03:54,  3.98s/it]

Skipping episode 386 - No summary available


Processing episodes:  90%|███████████████████████████████████████████▏    | 400/444 [25:21<02:36,  3.56s/it]

Skipping episode 401 - No summary available
Skipping episode 402 - No summary available
Skipping episode 403 - No summary available


Processing episodes:  91%|███████████████████████████████████████████▋    | 404/444 [25:24<01:13,  1.83s/it]

Skipping episode 405 - No summary available
Skipping episode 406 - No summary available


Processing episodes:  92%|████████████████████████████████████████████    | 407/444 [25:28<00:56,  1.53s/it]

Skipping episode 408 - No summary available


Processing episodes: 100%|███████████████████████████████████████████████▊| 442/444 [27:48<00:07,  3.66s/it]

Skipping episode 443 - No summary available


Processing episodes: 100%|████████████████████████████████████████████████| 444/444 [27:51<00:00,  3.77s/it]

Process data generated and saved to podcast_topic.json



