In [2]:
import pandas as pd
import os
from openai import OpenAI
import json
from tqdm import tqdm
import time

In [8]:
def generate_ground_truth(df, output_file='podcast_ground_truth.json', api_key=None, max_episodes=None, sample_random=False):
    """
    Generate ground truth data for podcast episodes using GPT API.
    
    Args:
        df: Pandas DataFrame containing podcast episode data with 'episode', 'title', and 'summary' columns
        output_file: Path to save the ground truth data as JSON
        api_key: OpenAI API key (will use environment variable if None)
        max_episodes: Maximum number of episodes to process (useful for testing with fewer episodes)
        sample_random: If True and max_episodes is set, randomly sample episodes instead of taking first few
    
    Returns:
        Dictionary containing ground truth data
    """
    # Initialize OpenAI client
    if api_key is None:
        api_key = os.environ.get("OPENAI_API_KEY")
        if api_key is None:
            raise ValueError("OpenAI API key not found. Please provide it as an argument or set the OPENAI_API_KEY environment variable.")
    
    client = OpenAI(api_key=api_key)
    
    # Create a system prompt template in Traditional Chinese
    system_prompt = """你是一位精通中文的podcast分析專家，熟悉台灣文化和用語。
你將獲得「通勤第一品牌」(Commute For Me) podcast的標題和內容摘要。
你的任務是：
1. 識別該集節目中討論的5-10個關鍵主題
2. 並根據這些關鍵主題創建5個與節目內容相關的問題，這些問題可以用來搜索找到這一集

輸出格式應為有效的JSON，並不要用code blocks，格式如下：
["question1", "question2", ..., "question5"]
"""

    # Process each episode
    ground_truth = {}
    
    # Limit the number of episodes if specified
    if max_episodes is not None:
        if sample_random:
            df_to_process = df.sample(min(max_episodes, len(df)))
        else:
            df_to_process = df.head(max_episodes)
        print(f"Testing with {len(df_to_process)} episodes out of {len(df)} total episodes")
    else:
        df_to_process = df
    
    for _, row in tqdm(df_to_process.iterrows(), total=len(df_to_process), desc="Processing episodes"):
        episode_id = row['episode']
        title = row['title']
        summary = row['summary']
        
        # Skip if summary is NaN
        if pd.isna(summary):
            print(f"Skipping episode {episode_id} - No summary available")
            continue
            
        # Create user prompt for this episode
        user_prompt = f"Episode {episode_id}: {title}\n\nSummary:\n{summary}"
        
        # Call OpenAI API with retry mechanism for rate limits
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = client.chat.completions.create(
                    model="gpt-4o-mini",  # Using GPT-4o-mini as requested
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt}
                    ],
                    temperature=0.2,  # Lower temperature for more consistent outputs
                    response_format={"type": "json_object"}
                )
                
                # Extract and parse JSON response
                response_text = response.choices[0].message.content
                episode_data = json.loads(response_text)
                
                # Ensure questions field exists (for backward compatibility)
                if "questions" not in episode_data:
                    episode_data["questions"] = []
                
                # Store in ground truth dictionary
                ground_truth[str(episode_id)] = episode_data
                
                # Save intermediate results periodically
                if episode_id % 10 == 0:
                    with open(output_file, 'w', encoding='utf-8') as f:
                        json.dump(ground_truth, f, ensure_ascii=False, indent=2)
                
                # Successful response, break out of retry loop
                break
                
            except json.JSONDecodeError:
                print(f"Error: Invalid JSON response for episode {episode_id}. Retrying...")
                time.sleep(2)
                
            except Exception as e:
                print(f"Error processing episode {episode_id}: {str(e)}")
                if "rate limit" in str(e).lower():
                    wait_time = (attempt + 1) * 5  # Exponential backoff
                    print(f"Rate limited. Waiting {wait_time} seconds before retry...")
                    time.sleep(wait_time)
                else:
                    break  # Break on non-rate-limit errors
        
        # Add a small delay between requests to avoid rate limits
        time.sleep(0.5)
    
    # Save final results
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(ground_truth, f, ensure_ascii=False, indent=2)
    
    print(f"Ground truth data generated and saved to {output_file}")
    return ground_truth

In [9]:
df = pd.read_csv('podcast_data.csv')
    
# Generate ground truth (using environment variable for API key)
ground_truth = generate_ground_truth(df, max_episodes=3)

ValueError: OpenAI API key not found. Please provide it as an argument or set the OPENAI_API_KEY environment variable.

In [11]:
print(os.environ.get("OPENAI_API_KEY"))

None


In [12]:
print(os.environ.get("HUGGINGFACE_TOKEN"))

None
