In [6]:
import feedparser
import pandas as pd
import requests
from datetime import datetime
import time
import xml.etree.ElementTree as ET
from typing import List, Dict

# Load news sources from CSV file
def load_news_sources_from_csv(csv_path='news_sources.csv'):
    """Read the news-source CSV and group its feeds by continent.

    The CSV must provide 'continent', 'name' and 'url' columns.

    Returns:
        dict mapping each continent (in order of first appearance in the
        file) to a list of ``{'name': ..., 'url': ...}`` dicts.

    Raises:
        FileNotFoundError: if ``csv_path`` does not exist.
        Exception: any other read/parse error; it is reported on stdout
            and then re-raised unchanged.
    """
    try:
        table = pd.read_csv(csv_path)
        print(f"✅ Loaded {len(table)} news sources from {csv_path}")

        # One key per distinct continent; each value lists that continent's
        # feeds in file order.
        return {
            region: [
                {'name': record['name'], 'url': record['url']}
                for _, record in table[table['continent'] == region].iterrows()
            ]
            for region in table['continent'].unique()
        }

    except FileNotFoundError:
        print(f"❌ CSV file '{csv_path}' not found!")
        raise
    except Exception as e:
        print(f"❌ Error loading CSV: {str(e)}")
        raise

# Load news feeds from CSV
# Executed when the cell/module runs: reads the default 'news_sources.csv' and
# lets any load error propagate, so nothing below runs without a feed list.
# NEWS_FEEDS maps continent -> list of {'name', 'url'} dicts.
NEWS_FEEDS = load_news_sources_from_csv()

def _entry_description(entry):
    """Return the first non-empty description-like text of a feed entry, or ''."""
    for field in ('summary', 'description'):
        text = getattr(entry, field, None)
        if text:
            return text.strip()

    # Some feeds only provide a 'content' field (usually a list of dicts)
    content = getattr(entry, 'content', None)
    if content:
        if isinstance(content, list):
            return content[0].get('value', '').strip()
        return str(content).strip()

    return ""


def fetch_feed_titles(feed_url, feed_name, max_titles=10):
    """Download a feed and extract titles and descriptions from its entries.

    Args:
        feed_url: URL of the RSS/Atom feed.
        feed_name: Human-readable source name, stored with every result and
            used in progress output.
        max_titles: Maximum number of entries to read from the feed.

    Returns:
        A list of dicts with the keys 'source', 'title', 'description' and
        'timestamp' (local fetch time in ISO format). Errors are reported on
        stdout and never raised; whatever was collected before the error
        (possibly nothing) is returned.
    """
    titles = []

    try:
        # Fetch with timeout
        print(f"  Fetching {feed_name}...", end="", flush=True)
        response = requests.get(feed_url, timeout=10, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        # Surface HTTP errors (403/404/5xx) as errors instead of parsing the
        # error page and misreporting it as an empty feed.
        response.raise_for_status()

        # Parse with feedparser
        feed = feedparser.parse(response.content)

        # Debug: Print first entry details for each source
        if feed.entries:
            first_entry = feed.entries[0]
            print(f"\n    📋 First entry from {feed_name}:")
            print(f"      Title: {getattr(first_entry, 'title', 'NO TITLE')}")
            print(f"      Link: {getattr(first_entry, 'link', 'NO LINK')}")
            print(f"      Published: {getattr(first_entry, 'published', 'NO DATE')}")
            print(f"      Summary: {getattr(first_entry, 'summary', 'NO SUMMARY')[:100]}...")
            print(f"      Description: {getattr(first_entry, 'description', 'NO DESCRIPTION')[:100]}...")
            print(f"      Available fields: {list(first_entry.keys())}")
        else:
            print(f"\n    ❌ No entries found in feed for {feed_name}")

        # Extract titles and descriptions; entries without a title are skipped
        for entry in feed.entries[:max_titles]:
            if not hasattr(entry, 'title'):
                continue
            titles.append({
                'source': feed_name,
                'title': entry.title.strip(),
                'description': _entry_description(entry),
                'timestamp': datetime.now().isoformat()
            })

        print(f"    ✓ ({len(titles)} titles extracted)")

    except Exception as e:
        # Best-effort scanner: one broken feed must not stop the whole run,
        # but include the message so the failure can be diagnosed.
        print(f" ✗ (Error: {type(e).__name__}: {e})")

    return titles

# Main execution
print("Starting news feed scanner...\n")
all_results = []

# Visit every continent and collect the headlines of each of its feeds
for continent, feeds in NEWS_FEEDS.items():
    print(f"\n{continent}:")
    print("-" * 40)

    for feed in feeds:
        # Tag each headline with its continent and originating feed URL
        for title in fetch_feed_titles(feed['url'], feed['name']):
            title['continent'] = continent
            title['feed_url'] = feed['url']
            all_results.append(title)

        # Small delay to be respectful
        time.sleep(0.5)

# Create DataFrame
print(f"\n\nCreating DataFrame...")
news_df = pd.DataFrame(all_results)

# Show summary (nothing to summarize when no feed produced any article)
if news_df.empty:
    print("❌ No articles collected!")
else:
    print(f"\n✅ Success! Collected {len(news_df)} articles from {news_df['source'].nunique()} sources")
    print(f"\nArticles per continent:")
    for continent in news_df['continent'].unique():
        count = (news_df['continent'] == continent).sum()
        print(f"  {continent}: {count}")

    print("\n📰 Sample headlines:")
    print("=" * 80)
    for _, row in news_df.iloc[:5].iterrows():
        print(f"{row['source']}: {row['title'][:80]}...")
        if row['description']:
            print(f"  📄 Description: {row['description'][:150]}...")
        print()

    # Boolean mask: True where the article carries a non-empty description
    has_description = news_df['description'] != ''
    print(f"\n📊 Description statistics:")
    print(f"  Articles with descriptions: {has_description.sum()}")
    print(f"  Articles without descriptions: {(~has_description).sum()}")
    print(f"  Average description length: {news_df['description'].str.len().mean():.0f} characters")

# The DataFrame is now available as 'news_df'
print(f"\n\n✨ DataFrame stored in variable 'news_df' with {len(news_df)} articles")

✅ Loaded 45 news sources from news_sources.csv
Starting news feed scanner...


North America:
----------------------------------------
  Fetching BBC News Americas...
    📋 First entry from BBC News Americas:
      Title: Trump and Musk enter bitter feud - and Washington buckles up
      Link: https://www.bbc.com/news/articles/c3wd2215q08o
      Published: Fri, 06 Jun 2025 05:56:04 GMT
      Summary: A knock-down fight between the world's the richest person and the most powerful politician is playin...
      Description: A knock-down fight between the world's the richest person and the most powerful politician is playin...
      Available fields: ['title', 'title_detail', 'summary', 'summary_detail', 'links', 'link', 'id', 'guidislink', 'published', 'published_parsed', 'media_thumbnail', 'href']
    ✓ (10 titles extracted)
  Fetching NPR News...
    📋 First entry from NPR News:
      Title: Judge puts temporary hold on Trump's latest ban on Harvard's foreign students
      Link: https:

In [19]:
import anthropic
import json
import os
from datetime import datetime
from pathlib import Path

# Load environment variables from .env file
try:
    from dotenv import load_dotenv
except ImportError:
    print("❌ python-dotenv not installed! Run: pip install python-dotenv")
    raise

# Try the likely locations in order: current directory, parent directory,
# then python-dotenv's own search.
if Path('.env').exists():
    load_dotenv('.env')
    print("✅ Loaded .env file from current directory")
elif Path('../.env').exists():
    load_dotenv('../.env')
    print("✅ Loaded .env file from parent directory")
elif load_dotenv():
    # load_dotenv() returns True only when it actually found and applied a file
    print("✅ Loaded .env file from default location")
else:
    # Do not claim success when nothing was loaded; the API key may still be
    # present in the process environment, which is validated below.
    print("⚠️ No .env file found - relying on variables already set in the environment")

# Get API key from environment variable
API_KEY = os.environ.get('ANTHROPIC_API_KEY')

# Abort early, with setup hints, when the key is missing or empty
if not API_KEY:
    print("❌ ANTHROPIC_API_KEY not found in environment variables!")
    print("Available environment variables:", list(os.environ.keys()))
    print("\nPlease ensure:")
    print("1. Your .env file is in the same directory as this notebook")
    print("2. The .env file contains: ANTHROPIC_API_KEY=your-key-here")
    print("3. You've installed python-dotenv: pip install python-dotenv")
    raise ValueError("ANTHROPIC_API_KEY environment variable not set!")

# Show partial key for confirmation (security: only the first 10 and last 4
# characters, and only when the key is longer than 20 characters)
if len(API_KEY) > 20:
    masked_key = f"{API_KEY[:10]}...{API_KEY[-4:]}"
else:
    masked_key = "KEY_TOO_SHORT"
print(f"✅ API Key loaded successfully: {masked_key}")

# Initialize the Claude client with your API key
client = anthropic.Anthropic(api_key=API_KEY)

# Load prompt from external file
def load_prompt_template(prompt_file='analysis_prompt.txt'):
    """Return the full text of the prompt template file (read as UTF-8).

    A missing file or any other read error is reported on stdout and then
    re-raised to the caller.
    """
    try:
        with open(prompt_file, 'r', encoding='utf-8') as source:
            template_text = source.read()
    except FileNotFoundError:
        print(f"❌ Prompt file '{prompt_file}' not found!")
        raise
    except Exception as e:
        print(f"❌ Error loading prompt file: {str(e)}")
        raise
    else:
        print(f"✅ Loaded prompt template from {prompt_file}")
        return template_text

# Craft the prompt based on 20min.ch's audience profile
def create_prompt(df_json):
    """Build the analysis prompt for the given JSON-serialized articles.

    Loads the template through load_prompt_template() and fills its
    {df_json} placeholder via str.format, so any other literal braces in
    the template must be escaped as {{ and }}.
    """
    return load_prompt_template().format(df_json=df_json)

# Function to make the API call
def analyze_news_for_20min(news_df, model="claude-3-5-sonnet-20241022",
                           max_tokens=2000, temperature=0.7):
    """
    Send news DataFrame to Claude API and get recommendations for 20min.ch

    Args:
        news_df: DataFrame of collected articles; must contain a 'source'
            column when non-empty.
        model: Claude model identifier passed to the Messages API.
        max_tokens: Upper bound on the length of the generated analysis.
        temperature: Sampling temperature for the request.

    Returns:
        The analysis text, or None when there is nothing to analyze or the
        API call fails (the failure is reported on stdout).
    """
    # An empty DataFrame (no feed reachable) has no 'source' column, so the
    # summary line below would raise KeyError; there is nothing to send anyway.
    if news_df is None or news_df.empty:
        print("❌ No articles to analyze - skipping Claude API call.")
        return None

    # Convert DataFrame to JSON for the prompt
    df_json = news_df.to_json(orient='records', indent=2)

    # Create the prompt
    prompt = create_prompt(df_json)

    print("Sending request to Claude API...")
    print(f"Analyzing {len(news_df)} articles from {news_df['source'].nunique()} sources...")

    try:
        # Make the API call
        response = client.messages.create(
            model=model,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        )

        # Extract the response
        analysis = response.content[0].text

        # A reply cut off at the token limit yields incomplete sections that
        # downstream parsing would silently accept, so make it visible.
        if getattr(response, 'stop_reason', None) == "max_tokens":
            print(f"⚠️ Response was truncated at max_tokens={max_tokens}; "
                  "the analysis may be incomplete.")

        print("\n" + "="*80)
        print("CLAUDE'S RECOMMENDATIONS FOR 20MIN.CH")
        print("="*80 + "\n")
        print(analysis)

        return analysis

    except Exception as e:
        print(f"\n❌ Error calling Claude API: {str(e)}")
        return None

# Run the analysis
print("🚀 Starting 20min.ch content analysis...")
print(f"📊 DataFrame contains {len(news_df)} articles")

# Call the function with your news DataFrame
analysis_result = analyze_news_for_20min(news_df)

# Follow-up hints, shown only when Claude returned a non-empty analysis
if analysis_result:
    print("\n".join((
        "\n✅ Analysis complete! Check the output above for Claude's recommendations.",
        "\nTip: You can copy the recommended titles and search for them in your original DataFrame:",
        "Example: news_df[news_df['title'].str.contains('search_term', case=False)]",
    )))

✅ Loaded .env file from current directory
✅ API Key loaded successfully: sk-ant-api...vwAA
🚀 Starting 20min.ch content analysis...
📊 DataFrame contains 444 articles
✅ Loaded prompt template from analysis_prompt.txt
Sending request to Claude API...
Analyzing 444 articles from 44 sources...

CLAUDE'S RECOMMENDATIONS FOR 20MIN.CH

Based on the news data, here are the 10 stories with the strongest narrative potential:

## TOP 5 NEWSWORTHY STORIES WITH DEPTH

1. **The Guardian US**: "Trump v Musk: The Two Worst People in the World Finally Have a Big, Beautiful Breakup"
   - **The Journey**: From close allies to bitter enemies in one week - Musk went from receiving a ceremonial key to threatening NASA contracts
   - **Why it resonates**: Shows how quickly powerful alliances can crumble when ego and ideology clash

2. **The Japan Times**: "A Return to Normalcy in South Korea, But Hard Work Lies Ahead"
   - **The Journey**: Lee's rise from slum dweller to president after predecessor's impeachm

In [21]:
import requests
import json
import os
from datetime import datetime
from jinja2 import Template

# Load environment variables with dotenv if available (for local development)
try:
    from dotenv import load_dotenv
except ImportError:
    print("📝 dotenv not available - using system environment variables")
else:
    # load_dotenv() returns False when no .env file was found, so only report
    # success when something was actually loaded.
    if load_dotenv():
        print("✅ Loaded .env file")
    else:
        print("📝 No .env file found - using system environment variables")

# Your Zapier webhook URL
ZAPIER_WEBHOOK_URL = os.environ.get('ZAPIER_WEBHOOK_URL')

# Fail fast: every function below posts to this URL
if not ZAPIER_WEBHOOK_URL:
    raise ValueError("ZAPIER_WEBHOOK_URL environment variable not set!")

def load_template(template_file):
    """Read an HTML template file (UTF-8) and wrap it in a Jinja2 Template.

    A missing file or any other error is reported on stdout and re-raised.
    """
    try:
        with open(template_file, 'r', encoding='utf-8') as handle:
            return Template(handle.read())
    except FileNotFoundError:
        print(f"❌ Template file '{template_file}' not found!")
        raise
    except Exception as e:
        print(f"❌ Error loading template file: {str(e)}")
        raise

def _bold_to_html(text):
    """Convert markdown ``**bold**`` spans in *text* to ``<strong>`` tags."""
    return ''.join(
        f'<strong>{part}</strong>' if index % 2 == 1 else part
        for index, part in enumerate(text.split('**'))
    )


def _is_story_header(line):
    """Return True for numbered list items such as '1. ...' or '10. ...'."""
    number, dot, rest = line.partition('.')
    return (
        bool(dot)
        and number.isdigit()
        and len(number) <= 2            # list positions, not years like "2025."
        and not rest[:1].isdigit()      # "3.5 million" is prose, not an item
    )


def _new_section(header):
    """Build an empty section dict (German title, icon) for a '## ' header."""
    if "NEWSWORTHY STORIES WITH DEPTH" in header or "TOP 5 NEWSWORTHY" in header:
        return {'title': 'Stories mit narrativer Tiefe', 'icon': '🔥', 'stories': []}
    if "UNIQUE PERSPECTIVE STORIES" in header or "TOP 5 UNIQUE" in header or "TOP 5 TALKING" in header:
        return {'title': 'Einzigartige Perspektiven', 'icon': '💡', 'stories': []}
    return {'title': header, 'icon': '', 'stories': []}


def _parse_story_header(line):
    """Parse '1. **Source**: "Title"' into a story dict, or None if malformed."""
    parts = line.split('**')
    if len(parts) < 3:
        return None

    title = parts[2].strip().lstrip(':').strip()

    # Remove quotes if present
    if title.startswith('"') and title.endswith('"'):
        title = title[1:-1]
    elif title.startswith('"'):
        title = title[1:]

    return {
        'number': parts[0].strip(),
        'source': parts[1].strip(),
        'title': title,
        'details': []
    }


def _parse_detail(content, label_translations):
    """Split a bullet's text into a translated label and a value, or None."""
    if ':' not in content:
        return None

    label_md, value_md = content.split(':', 1)

    # "**Label:** text" keeps the colon inside the bold span; close the span
    # on the label side so neither half ends up with unbalanced markup.
    if label_md.count('**') % 2 == 1:
        label_md += '**'
        value_md = value_md.replace('**', '', 1)

    # Translate on the plain label text so bold markers cannot hide a match
    plain_label = label_md.replace('**', '').strip()
    german_label = label_translations.get(f'{plain_label}:')
    if german_label is not None:
        label_md = label_md.replace(plain_label, german_label.rstrip(':'))

    return {
        'label': _bold_to_html(label_md).strip(),
        'value': _bold_to_html(value_md).strip()
    }


def parse_analysis_text(analysis_text):
    """Parse the Claude analysis text into structured data for templates.

    The text is expected to be markdown with '## ' section headers, numbered
    story lines of the form ``1. **Source**: "Title"`` and '- ' detail
    bullets of the form ``**Label**: value``.

    Args:
        analysis_text: The analysis markdown; None or '' yields no sections.

    Returns:
        A list of section dicts ``{'title', 'icon', 'stories'}``. Each story
        is ``{'number', 'source', 'title', 'details'}`` and each detail is
        ``{'label', 'value'}``, where bold markdown has been converted to
        <strong> tags and known English labels are translated to German.
        Sections without any story are omitted.
    """
    if not analysis_text:
        return []

    sections = []
    current_section = None
    current_story = None

    # German translations for labels
    label_translations = {
        'The Journey:': 'Die Entwicklung:',
        'Why it resonates:': 'Warum es berührt:',
        'The Angle:': 'Der Blickwinkel:',
        'Discussion value:': 'Diskussionswert:',
        'Why it matters to Swiss readers:': 'Warum es für Schweizer Leser wichtig ist:',
        'Key angle for 20min:': '20min-Winkel:',
        'Conversation starter:': 'Gesprächsanstoß:',
        '20min angle:': '20min-Winkel:'
    }

    for line in analysis_text.split('\n'):
        line = line.strip()

        if not line:
            continue

        # Handle section headers
        if line.startswith('## '):
            # The open story belongs to the section that is ending; store it
            # there before switching, otherwise it leaks into the next one.
            if current_story:
                current_section['stories'].append(current_story)
                current_story = None
            if current_section and current_section['stories']:
                sections.append(current_section)

            current_section = _new_section(line.replace('## ', '').replace('**', ''))

        # Handle story entries
        elif current_section and _is_story_header(line):
            if current_story:
                current_section['stories'].append(current_story)
            # None for a malformed header, so its bullets are ignored instead
            # of being attached to (and duplicating) the previous story.
            current_story = _parse_story_header(line)

        # Handle detail bullet points
        elif line.startswith('- ') and current_story:
            detail = _parse_detail(line[2:].strip(), label_translations)
            if detail:
                current_story['details'].append(detail)

    # Save final story and section
    if current_story:
        current_section['stories'].append(current_story)
    if current_section and current_section['stories']:
        sections.append(current_section)

    return sections

def format_for_google_docs(analysis_text, news_df):
    """Render the analysis into HTML for Google Docs via 'doc_template.html'.

    The template receives formatted date/time strings, overall statistics,
    the parsed analysis sections and per-continent article counts (with
    German continent names where a translation is known). Errors are
    reported on stdout and re-raised.
    """
    try:
        doc_template = load_template('doc_template.html')

        # Parse analysis into structured data
        parsed_sections = parse_analysis_text(analysis_text)

        # Article count per continent, most frequent first
        german_names = {
            'North America': 'Nordamerika',
            'Europe': 'Europa', 
            'Asia': 'Asien',
            'Africa': 'Afrika',
            'South America': 'Südamerika',
            'Oceania': 'Ozeanien'
        }
        continent_stats = [
            {'name': german_names.get(name, name), 'count': int(total)}
            for name, total in news_df['continent'].value_counts().items()
        ]

        overall_stats = {
            'total_articles': len(news_df),
            'total_sources': news_df['source'].nunique(),
            'continents': news_df['continent'].nunique()
        }

        # Render template
        rendered = doc_template.render(
            date_formatted=datetime.now().strftime('%d. %B %Y'),
            datetime_full=datetime.now().strftime('%d.%m.%Y um %H:%M CET'),
            time_generated=datetime.now().strftime('%H:%M:%S CET'),
            stats=overall_stats,
            sections=parsed_sections,
            continents=continent_stats
        )

        print("✅ Generated Google Docs content using template")
        return rendered

    except Exception as e:
        print(f"❌ Error formatting for Google Docs: {str(e)}")
        raise

def create_email_html(analysis_text, news_df):
    """Render the analysis into email HTML via 'email_template.html'.

    The template receives the current date and time, overall statistics and
    the parsed analysis sections. Errors are reported on stdout and
    re-raised.
    """
    try:
        email_template = load_template('email_template.html')

        # Parse analysis into structured data
        parsed_sections = parse_analysis_text(analysis_text)

        overall_stats = {
            'total_articles': len(news_df),
            'total_sources': news_df['source'].nunique(),
            'continents': news_df['continent'].nunique()
        }

        # Render template
        rendered = email_template.render(
            date=datetime.now().strftime('%d.%m.%Y'),
            time=datetime.now().strftime('%H:%M CET'),
            stats=overall_stats,
            sections=parsed_sections
        )

        print("✅ Generated email content using template")
        return rendered

    except Exception as e:
        print(f"❌ Error creating email HTML: {str(e)}")
        raise

def extract_top_stories_with_details(analysis_text):
    """
    Build an HTML preview of every parsed story, including its detail lines.
    (Legacy function - now handled by templates, kept for compatibility)

    Each section becomes an <h4> heading followed by one styled <div> per
    story; every detail is appended inside the div as a <br>-separated span.
    """
    fragments = []

    for section in parse_analysis_text(analysis_text):
        fragments.append(f"<h4>{section['icon']} {section['title']}:</h4>")

        for story in section['stories']:
            fragments.append(
                f"<div style='margin: 15px 0; padding: 10px; background-color: #f9f9f9; border-radius: 5px;'><strong>{story['source']}</strong>: {story['title']}"
            )
            fragments.extend(
                f"<br><span style='color: #666; font-size: 14px;'>• <strong>{detail['label']}:</strong>{detail['value']}</span>"
                for detail in story['details']
            )
            fragments.append("</div>")

    return "".join(fragments)

def extract_top_stories(analysis_text):
    """
    Build a compact HTML preview: one <h4> heading and one <ul> per section,
    with a list item (source and title) for every parsed story.
    (Legacy function - kept for compatibility)
    """
    fragments = []

    for section in parse_analysis_text(analysis_text):
        fragments.append(f"<h4>{section['icon']} {section['title']}:</h4><ul>")
        fragments.extend(
            f"<li><strong>{story['source']}</strong>: {story['title']}</li>"
            for story in section['stories']
        )
        fragments.append("</ul>")

    return "".join(fragments)

def send_to_zapier(analysis_text, news_df,
                   recipient_email="tom.vaillant@20minuten.ch", timeout=30):
    """
    Send the formatted data to Zapier webhook

    Args:
        analysis_text: Claude's analysis as markdown text.
        news_df: DataFrame of collected articles ('source' and 'continent'
            columns are used for the statistics).
        recipient_email: Address placed in the payload's 'recipient_email'.
        timeout: Seconds to wait for the webhook before giving up.

    Returns:
        True when the webhook answered with HTTP 200, otherwise False
        (the failure is reported on stdout). Errors while rendering the
        document or email content propagate to the caller.
    """
    print("📤 Sending data to Zapier...")

    # One timestamp for the whole payload so date, time and titles agree
    # even when the call straddles a minute or day boundary.
    now = datetime.now()
    date_label = now.strftime("%d.%m.%Y")

    # Prepare the payload
    payload = {
        "date": date_label,
        "time": now.strftime("%H:%M CET"),
        "document_title": f"20min.ch News-Analyse - {date_label}",
        "document_content": format_for_google_docs(analysis_text, news_df),
        "email_content_html": create_email_html(analysis_text, news_df),
        "stats": {
            "total_articles": len(news_df),
            "total_sources": news_df['source'].nunique(),
            "continents": news_df['continent'].nunique()
        },
        "recipient_email": recipient_email,
        "email_subject": f"Tägliche News-Analyse - {date_label}"
    }

    try:
        # Send to Zapier; without a timeout requests can block indefinitely
        response = requests.post(ZAPIER_WEBHOOK_URL, json=payload, timeout=timeout)

        if response.status_code == 200:
            print("✅ Successfully sent to Zapier!")
            print(f"📧 Email will be sent to: {payload['recipient_email']}")
            print(f"📄 Document title: {payload['document_title']}")
            print("\n🎯 Next steps in Zapier:")
            print("1. Google Doc will be created automatically")
            print("2. Email will be sent with the doc link")
            return True
        else:
            print(f"❌ Error sending to Zapier: {response.status_code}")
            print(f"Response: {response.text}")
            return False

    except Exception as e:
        print(f"❌ Exception occurred: {str(e)}")
        return False

# Test the webhook with sample data (optional)
def test_zapier_webhook(timeout=30):
    """
    Test the Zapier webhook with minimal data

    Posts a small marker payload to ZAPIER_WEBHOOK_URL and reports on stdout
    whether the webhook answered with HTTP 200. Network errors (including a
    timeout after ``timeout`` seconds) propagate to the caller.
    """
    test_payload = {
        "test": True,
        "message": "Testing webhook connection",
        "timestamp": datetime.now().isoformat()
    }

    print("🧪 Testing Zapier webhook...")
    # Without a timeout requests can block indefinitely on a dead endpoint
    response = requests.post(ZAPIER_WEBHOOK_URL, json=test_payload, timeout=timeout)

    if response.status_code == 200:
        print("✅ Webhook test successful!")
        print("Check your Zapier dashboard to see the test data")
    else:
        print(f"❌ Webhook test failed: {response.status_code}")

# Main execution for your Jupyter notebook
# Both variables are produced by earlier cells; without them there is nothing to send.
if 'analysis_result' not in globals() or 'news_df' not in globals():
    print("❌ No analysis results found. Please run the Claude analysis first.")
    print("\n💡 To test your webhook connection, run:")
    print("test_zapier_webhook()")
else:
    # Send the analysis to Zapier
    success = send_to_zapier(analysis_result, news_df)

    if success:
        print("\n🎉 All done! Check your Zapier dashboard to monitor the workflow.")

✅ Loaded .env file
📤 Sending data to Zapier...
✅ Generated Google Docs content using template
✅ Generated email content using template
✅ Successfully sent to Zapier!
📧 Email will be sent to: tom.vaillant@20minuten.ch
📄 Document title: 20min.ch News-Analyse - 06.06.2025

🎯 Next steps in Zapier:
1. Google Doc will be created automatically
2. Email will be sent with the doc link

🎉 All done! Check your Zapier dashboard to monitor the workflow.
