In [5]:
import feedparser
import pandas as pd
import requests
from datetime import datetime
import time
import xml.etree.ElementTree as ET
from typing import List, Dict

# News feeds organized by continent (excluding paid/API-required sources).
# Structure: continent name -> list of feed descriptors, each a dict with
#   "name": human-readable source label (stored with every collected title)
#   "url":  address of the feed that gets downloaded and parsed
NEWS_FEEDS = {
    "North America": [
        {"name": "BBC News Americas", "url": "https://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml"},
        {"name": "NPR News", "url": "https://feeds.npr.org/1001/rss.xml"},
        {"name": "The New York Times", "url": "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"},
        {"name": "CBC News Canada", "url": "https://rss.cbc.ca/lineup/topstories.xml"},
        {"name": "The Guardian US", "url": "https://www.theguardian.com/us-news/rss"},
        {"name": "Mexico News Daily", "url": "https://mexiconewsdaily.com/feed/"}
    ],
    "Europe": [
        {"name": "BBC News Europe", "url": "https://feeds.bbci.co.uk/news/world/europe/rss.xml"},
        {"name": "The Guardian UK", "url": "https://www.theguardian.com/uk/rss"},
        {"name": "Deutsche Welle English", "url": "https://rss.dw.com/rdf/rss-en-all"},
        {"name": "France24 English", "url": "https://www.france24.com/en/rss"},
        {"name": "Euronews", "url": "https://www.euronews.com/rss"},
        {"name": "POLITICO Europe", "url": "https://www.politico.eu/feed/"},
        {"name": "Swiss Info", "url": "https://www.swissinfo.ch/eng/latest-news/rss"},
        {"name": "Blick", "url": "https://www.blick.ch/news/rss.xml"}
    ],
    "Asia": [
        {"name": "Al Jazeera", "url": "https://www.aljazeera.com/xml/rss/all.xml"},
        {"name": "The Japan Times", "url": "https://www.japantimes.co.jp/feed/"},
        {"name": "South China Morning Post", "url": "https://www.scmp.com/rss/91/feed"},
        {"name": "The Hindu India", "url": "https://www.thehindu.com/news/national/feeder/default.rss"},
        {"name": "Times of India", "url": "https://timesofindia.indiatimes.com/rssfeedstopstories.cms"},
        {"name": "Arab News", "url": "https://www.arabnews.com/rss.xml"}
    ],
    "Africa": [
        {"name": "BBC News Africa", "url": "https://feeds.bbci.co.uk/news/world/africa/rss.xml"},
        {"name": "AllAfrica", "url": "https://allafrica.com/tools/headlines/rdf/latest/headlines.rdf"},
        {"name": "Mail & Guardian SA", "url": "https://mg.co.za/feed/"},
        {"name": "News24 South Africa", "url": "https://feeds.news24.com/articles/news24/TopStories/rss"},
        {"name": "Morocco World News", "url": "https://www.moroccoworldnews.com/feed/"}
    ],
    "South America": [
        {"name": "BBC News Latin America", "url": "https://feeds.bbci.co.uk/news/world/latin_america/rss.xml"},
        {"name": "Buenos Aires Times", "url": "https://www.batimes.com.ar/feed"},
        {"name": "MercoPress", "url": "https://en.mercopress.com/rss/v2/headlines"},
        {"name": "Colombia Reports", "url": "https://colombiareports.com/feed/"}
    ],
    "Oceania": [
        {"name": "ABC News Australia", "url": "https://www.abc.net.au/news/feed/2942460/rss.xml"},
        {"name": "Sydney Morning Herald", "url": "https://www.smh.com.au/rss/feed.xml"},
        {"name": "The Guardian Australia", "url": "https://www.theguardian.com/australia-news/rss"},
        {"name": "Stuff.co.nz", "url": "https://www.stuff.co.nz/rss"},
        {"name": "Radio New Zealand", "url": "https://www.rnz.co.nz/rss/national.xml"}
    ]
}

def fetch_feed_titles(feed_url, feed_name, max_titles=10):
    """Download one feed and return its first entry titles.

    Progress is printed on a single line per feed. Any failure (network
    error, timeout, HTTP error status, unexpected entry content) is reported
    on that line and yields an empty list, so one broken feed never aborts
    the whole scan.

    Args:
        feed_url: URL of the feed to download.
        feed_name: Human-readable source name, stored with every title.
        max_titles: Maximum number of feed entries to read.

    Returns:
        A list of dicts with the keys ``source``, ``title`` and
        ``timestamp`` (ISO-8601 local time at which the feed was processed).
    """
    print(f"  Fetching {feed_name}...", end="", flush=True)

    try:
        # Some servers reject the default python-requests user agent, so a
        # browser-like one is sent; the timeout keeps a dead host from
        # stalling the whole scan.
        response = requests.get(feed_url, timeout=10, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        # Without this check a 403/404 error page is handed to feedparser,
        # produces no entries and is misreported as "✓ (0 titles)".
        response.raise_for_status()

        feed = feedparser.parse(response.content)

        # One timestamp per feed: all entries were retrieved by the same request.
        fetched_at = datetime.now().isoformat()
        titles = [
            {
                'source': feed_name,
                'title': entry.title.strip(),
                'timestamp': fetched_at,
            }
            for entry in feed.entries[:max_titles]
            if hasattr(entry, 'title')
        ]
    except Exception as e:
        # Deliberately broad: this is a best-effort scan and the failure is
        # surfaced on the progress line rather than swallowed silently.
        print(f" ✗ (Error: {type(e).__name__})")
        return []

    print(f" ✓ ({len(titles)} titles)")
    return titles

# Main execution: scan every configured feed, then summarise what was collected
print("Starting news feed scanner...\n")
all_results = []

for continent, feeds in NEWS_FEEDS.items():
    # Section header for this continent
    print(f"\n{continent}:")
    print("-" * 40)

    for feed in feeds:
        # Tag every headline with its continent and feed URL while collecting it
        all_results.extend(
            {**item, 'continent': continent, 'feed_url': feed['url']}
            for item in fetch_feed_titles(feed['url'], feed['name'])
        )

        # Brief pause between requests to be polite to the servers
        time.sleep(0.5)

# One row per collected headline
print("\n\nCreating DataFrame...")
news_df = pd.DataFrame(all_results)

if news_df.empty:
    print("❌ No articles collected!")
else:
    source_total = news_df['source'].nunique()
    print(f"\n✅ Success! Collected {len(news_df)} articles from {source_total} sources")

    # Per-continent counts, listed in order of first appearance
    print("\nArticles per continent:")
    for region in news_df['continent'].unique():
        region_rows = news_df[news_df['continent'] == region]
        print(f"  {region}: {len(region_rows)}")

    # First ten headlines, each cut to at most 100 characters
    print("\n📰 Sample headlines:")
    print("=" * 80)
    sample = news_df.head(10)
    for source_name, headline in zip(sample['source'], sample['title']):
        print(f"{source_name}: {headline[:100]}...")

# Later cells read the result from the global 'news_df'
print(f"\n\n✨ DataFrame stored in variable 'news_df' with {len(news_df)} articles")

Starting news feed scanner...


North America:
----------------------------------------
  Fetching BBC News Americas... ✓ (10 titles)
  Fetching NPR News... ✓ (10 titles)
  Fetching The New York Times... ✓ (10 titles)
  Fetching CBC News Canada... ✓ (10 titles)
  Fetching The Guardian US... ✓ (10 titles)
  Fetching Mexico News Daily... ✓ (10 titles)

Europe:
----------------------------------------
  Fetching BBC News Europe... ✓ (10 titles)
  Fetching The Guardian UK... ✓ (10 titles)
  Fetching Deutsche Welle English... ✓ (10 titles)
  Fetching France24 English... ✓ (10 titles)
  Fetching Euronews... ✓ (10 titles)
  Fetching POLITICO Europe... ✓ (10 titles)
  Fetching Swiss Info... ✓ (0 titles)
  Fetching Blick... ✓ (4 titles)

Asia:
----------------------------------------
  Fetching Al Jazeera... ✓ (10 titles)
  Fetching The Japan Times... ✓ (10 titles)
  Fetching South China Morning Post... ✓ (10 titles)
  Fetching The Hindu India... ✓ (10 titles)
  Fetching Times of India... ✓ (10

In [6]:
import anthropic
import json
import os
from datetime import datetime

# Pick up a local .env file when python-dotenv is installed (local development);
# otherwise rely on variables already present in the process environment.
try:
    from dotenv import load_dotenv
except ImportError:
    print("📝 dotenv not available - using system environment variables")
else:
    load_dotenv()
    print("✅ Loaded .env file")

# The Anthropic API key is read from the environment, never hard-coded
API_KEY = os.getenv('ANTHROPIC_API_KEY')

# Fail fast with a clear message instead of on the first API call
if API_KEY in (None, ''):
    raise ValueError("ANTHROPIC_API_KEY environment variable not set!")

# Shared client used by the analysis function below
client = anthropic.Anthropic(api_key=API_KEY)

# Prompt template based on 20min.ch's audience profile. The single
# placeholder, {df_json}, is filled in by create_prompt().
_PROMPT_TEMPLATE = """You are a news editor for 20min.ch, Switzerland's leading free commuter tabloid with over 2 million readers. Your audience consists of:

**TARGET AUDIENCE PROFILE:**
- Young urban commuters (15-40 years old)
- German-speaking Swiss citizens
- Quick readers who consume news during their commute (average 20 minutes)
- Prefer bite-sized, engaging content with visual appeal
- Interested in local Swiss news, lifestyle, entertainment, and conversational stories
- Enjoy interactive content, social media integration, and stories that spark discussion
- Appreciate humor, human interest stories, and relatable content

**YOUR TASK:**
Analyze the following news articles from various international sources and identify the most suitable content for 20min.ch readers.

**SELECTION CRITERIA:**
1. **Newsworthy Stories**: Must be genuinely important, impact Swiss readers, or have global significance
2. **Talking Pieces**: Stories that will generate conversation, debate, or emotional response among young Swiss readers

Consider:
- Swiss relevance or connection
- Shareability on social media
- Visual story potential
- Emotional impact (surprising, funny, shocking, heartwarming)
- Relatability to young urban lifestyle
- Potential for reader engagement/comments

**NEWS DATA:**
{df_json}

**REQUIRED OUTPUT:**
Please provide exactly 10 stories in the following format:

## TOP 5 NEWSWORTHY STORIES
1. **[Original Source]**: [Title]
   - **Why it matters to Swiss readers**: [Brief explanation]
   - **Key angle for 20min**: [How to present it]

2-5. [Continue same format]

## TOP 5 TALKING PIECES
1. **[Original Source]**: [Title]
   - **Conversation starter**: [What makes this discussable]
   - **20min angle**: [How to make it engaging]

2-5. [Continue same format]

Remember: 20min.ch readers want quick, impactful stories they can discuss with friends or share on social media. Focus on human stories, surprising facts, and content that connects to Swiss life."""


def create_prompt(df_json):
    """Return the 20min.ch editor prompt with the news records embedded.

    Args:
        df_json: The collected articles serialised as text; it is inserted
            verbatim under the "NEWS DATA" heading of the prompt.

    Returns:
        The complete prompt string to send to the model.
    """
    # Only the template is parsed for placeholders, so braces inside the
    # JSON payload are inserted untouched.
    return _PROMPT_TEMPLATE.format(df_json=df_json)

# Function to make the API call
def analyze_news_for_20min(news_df):
    """Ask Claude to pick the stories best suited to 20min.ch readers.

    The DataFrame is serialised to JSON records, embedded in the editor
    prompt and sent in a single user message. The model's answer is printed
    and returned.

    Args:
        news_df: DataFrame of collected articles; must contain a ``source``
            column when non-empty.

    Returns:
        The model's markdown answer as a string, or ``None`` when there is
        nothing to analyse or the API call fails (the reason is printed).
    """
    # A scan that collected nothing yields a DataFrame without any columns;
    # bail out early instead of failing on the 'source' lookup below.
    if news_df is None or news_df.empty:
        print("\n❌ No articles to analyze - skipping Claude API call.")
        return None

    # Convert DataFrame to JSON for the prompt
    df_json = news_df.to_json(orient='records', indent=2)
    prompt = create_prompt(df_json)

    print("Sending request to Claude API...")
    print(f"Analyzing {len(news_df)} articles from {news_df['source'].nunique()} sources...")

    try:
        response = client.messages.create(
            model="claude-3-5-sonnet-20241022",  # Using Sonnet 3.5
            max_tokens=2000,
            temperature=0.7,
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        )
        # The answer text is read from the first content block of the reply
        analysis = response.content[0].text
    except Exception as e:
        # Deliberately broad: the notebook should report the failure and
        # carry on (callers check for None) rather than stop with a traceback.
        print(f"\n❌ Error calling Claude API: {str(e)}")
        return None

    print("\n" + "="*80)
    print("CLAUDE'S RECOMMENDATIONS FOR 20MIN.CH")
    print("="*80 + "\n")
    print(analysis)

    return analysis

# Run the analysis on the DataFrame produced by the feed scanner
print("🚀 Starting 20min.ch content analysis...")
print(f"📊 DataFrame contains {len(news_df)} articles")

# Result is the model's answer text, or None when the call failed
analysis_result = analyze_news_for_20min(news_df)

# On success, explain how to look the recommended titles up again
if analysis_result:
    print("\n".join((
        "\n✅ Analysis complete! Check the output above for Claude's recommendations.",
        "\nTip: You can copy the recommended titles and search for them in your original DataFrame:",
        "Example: news_df[news_df['title'].str.contains('search_term', case=False)]",
    )))

✅ Loaded .env file
🚀 Starting 20min.ch content analysis...
📊 DataFrame contains 304 articles
Sending request to Claude API...
Analyzing 304 articles from 31 sources...

CLAUDE'S RECOMMENDATIONS FOR 20MIN.CH

Here's my selection and analysis for 20min.ch readers:

## TOP 5 NEWSWORTHY STORIES
1. **BBC News**: "What Merz wants from Trump showdown meeting"
   - **Why it matters to Swiss readers**: Direct Swiss connection with leading politician Merz meeting Trump, impacts Swiss-US relations
   - **Key angle for 20min**: "5 things at stake for Switzerland in the Merz-Trump meeting"

2. **Deutsche Welle**: "ECB eyes end to cuts after trimming key interest rate to 2%"
   - **Why it matters to Swiss readers**: Direct impact on Swiss economy and mortgage rates
   - **Key angle for 20min**: "What the ECB's latest move means for your wallet"

3. **POLITICO Europe**: "VPN signups surge after Pornhub pulls out of France"
   - **Why it matters to Swiss readers**: Similar age verification laws being 

In [ ]:
import requests
import json
import os
from datetime import datetime

# Pick up a local .env file when python-dotenv is installed (local development);
# otherwise rely on variables already present in the process environment.
try:
    from dotenv import load_dotenv
except ImportError:
    print("📝 dotenv not available - using system environment variables")
else:
    load_dotenv()
    print("✅ Loaded .env file")

# Webhook address that receives the finished analysis
ZAPIER_WEBHOOK_URL = os.getenv('ZAPIER_WEBHOOK_URL')

# Fail fast with a clear message instead of on the first POST
if ZAPIER_WEBHOOK_URL in (None, ''):
    raise ValueError("ZAPIER_WEBHOOK_URL environment variable not set!")

# English bullet labels requested in the prompt, mapped to their German wording.
_LABELS_DE = {
    'Why it matters to Swiss readers': 'Warum es für Schweizer Leser wichtig ist',
    'Key angle for 20min': '20min-Winkel',
    'Conversation starter': 'Gesprächsanstoß',
    '20min angle': '20min-Winkel',
}

# Numbered story lines start with one of these markers.
_STORY_PREFIXES = ('1.', '2.', '3.', '4.', '5.')


def _bold_markdown_to_html(text):
    """Turn balanced ``**bold**`` spans into ``<strong>`` tags; drop stray markers."""
    import re

    converted = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', text)
    return converted.replace('**', '')


def _format_story(line):
    """Render a numbered ``N. **Source**: Title`` line as one HTML paragraph."""
    parts = line.split('**')
    if len(parts) < 3:
        # Not in the expected shape: keep the text instead of dropping it.
        return f"<p>{line.replace('**', '')}</p>\n"

    number = parts[0].strip()
    # The colon may sit inside or outside the bold markers.
    source = parts[1].strip().rstrip(':').strip()
    # Join the remainder so a title containing bold markers is not cut off.
    title = ''.join(parts[2:]).strip().lstrip(':').strip()
    return f'<p><strong>{number} {source}:</strong> {title}</p>\n'


def _format_bullet(content):
    """Render the text of a ``- `` bullet as an indented paragraph with a German label."""
    if ':' not in content:
        return f'<p style="margin-left: 20px;">• {_bold_markdown_to_html(content)}</p>\n'

    raw_label, value = content.split(':', 1)
    if raw_label.count('**') % 2:
        # Colon was inside the bold span ("**Label:** value"): the closing
        # marker landed in the value part, so remove it there.
        value = value.replace('**', '', 1)

    label = raw_label.replace('**', '').strip()
    label = _LABELS_DE.get(label, label)
    value = _bold_markdown_to_html(value.strip())
    return f'<p style="margin-left: 20px;">• <strong>{label}:</strong> {value}</p>\n'


def format_for_google_docs(analysis_text, news_df):
    """Format the analysis as clean, compact HTML for Google Docs in German.

    Only three kinds of lines from the analysis are rendered: ``## `` section
    headers, numbered story lines (``1.``-``5.``) and ``- `` bullets; any
    other line is skipped. Story titles are kept untranslated, the known
    bullet labels are translated to German. All text taken from the analysis
    is HTML-escaped before it is embedded.

    Args:
        analysis_text: Markdown answer produced by the model (``None`` or an
            empty string yields a document without story content).
        news_df: DataFrame of analysed articles with ``source`` and
            ``continent`` columns; used for the statistics sections.

    Returns:
        The document body as an HTML string.
    """
    from html import escape

    # NOTE(review): timestamps come from naive datetime.now() but are labelled
    # "CET", and %B follows the process locale - confirm both match the
    # environment this runs in.
    now = datetime.now()
    article_count = len(news_df)
    source_count = news_df['source'].nunique()
    continent_count = news_df['continent'].nunique()

    # Simple, compact header in German
    chunks = [f"""<h1>20MIN.CH TÄGLICHE NEWS-ANALYSE</h1>
<h2>{now.strftime('%d. %B %Y')}</h2>

<h3>ANALYSE-ZUSAMMENFASSUNG</h3>
<p>
<strong>Erstellt:</strong> {now.strftime('%d.%m.%Y um %H:%M CET')}<br>
<strong>Analysierte Artikel:</strong> {article_count}<br>
<strong>Nachrichtenquellen:</strong> {source_count}<br>
<strong>Kontinente abgedeckt:</strong> {continent_count}
</p>

<hr>
"""]

    for raw_line in (analysis_text or '').splitlines():
        # Escaping up front is safe for the parsing below: it never adds or
        # removes '#', '*', '-' or ':' characters.
        line = escape(raw_line.strip(), quote=False)
        if not line:
            continue

        if line.startswith('## '):
            header = line[3:].replace('## ', '').replace('**', '')
            chunks.append(f'<h2>{header}</h2>\n')
        elif line.startswith(_STORY_PREFIXES):
            chunks.append(_format_story(line))
        elif line.startswith('- '):
            chunks.append(_format_bullet(line[2:].strip()))

    # Compact methodology section in German
    chunks.append(f"""
<hr>
<h3>METHODIK</h3>
<p>Diese Analyse wurde erstellt durch:</p>
<p style="margin-left: 20px;">
• Scannen von RSS-Feeds großer Nachrichtenportale aller Kontinente<br>
• Sammeln der neuesten {article_count} Artikel von {source_count} Quellen<br>
• Verwendung von Claude AI zur Identifikation der für 20min.ch-Leser relevantesten Stories<br>
• Anwendung von Auswahlkriterien: Schweiz-Relevanz, Teilbarkeit, visuelles Potenzial, emotionale Wirkung, Gesprächspotenzial
</p>

<h3>QUELLENVERTEILUNG</h3>
<p>""")

    # Article counts per continent, largest first, with German continent names
    continent_names_de = {
        'North America': 'Nordamerika',
        'Europe': 'Europa',
        'Asia': 'Asien',
        'Africa': 'Afrika',
        'South America': 'Südamerika',
        'Oceania': 'Ozeanien',
    }
    chunks.append(' | '.join(
        f'<strong>{continent_names_de.get(continent, continent)}:</strong> {count} Artikel'
        for continent, count in news_df['continent'].value_counts().items()
    ))

    chunks.append(f"""
</p>

<hr>
<p><em>Automatisch generiert via GitHub Actions um {now.strftime('%H:%M:%S CET')}</em></p>
""")

    return ''.join(chunks)

def create_email_html(analysis_text, news_df):
    """Build the German HTML body of the notification email.

    The email shows a quick-stats box (article, source and continent counts
    plus creation time) followed by a preview list of the recommended
    stories extracted from the analysis.

    Args:
        analysis_text: Markdown answer produced by the model.
        news_df: DataFrame of analysed articles with ``source`` and
            ``continent`` columns.

    Returns:
        The email body as an HTML string.
    """
    # Story list shown under "Heutige Empfehlungen"
    stories_preview = extract_top_stories(analysis_text)

    # Values for the greeting line and the quick-overview box
    report_date = datetime.now().strftime('%d.%m.%Y')
    article_total = len(news_df)
    source_total = news_df['source'].nunique()
    continent_total = news_df['continent'].nunique()
    created_at = datetime.now().strftime('%H:%M CET')

    return f"""
<div style="font-family: Arial, sans-serif; max-width: 600px; margin: 0 auto;">
    <h2 style="color: #d32f2f;">20min.ch Tägliche News-Analyse</h2>
    
    <p>Hallo Tom,</p>
    
    <p>Deine automatisierte News-Analyse für <strong>{report_date}</strong> ist bereit.</p>
    
    <div style="background-color: #f5f5f5; padding: 15px; border-radius: 5px; margin: 20px 0;">
        <h3 style="margin-top: 0;">Schnellübersicht:</h3>
        <ul style="list-style-type: none; padding-left: 0;">
            <li>📊 Analysierte Artikel: {article_total}</li>
            <li>📰 Nachrichtenquellen: {source_total}</li>
            <li>🌍 Kontinente abgedeckt: {continent_total}</li>
            <li>⏰ Erstellt um: {created_at}</li>
        </ul>
    </div>
    
    <div style="margin: 20px 0;">
        <h3>Heutige Empfehlungen:</h3>
        {stories_preview}
    </div>
    
    <p style="margin-top: 30px;">
        <em>💡 Die vollständige Analyse enthält detaillierte Empfehlungen für jede Story, 
        einschließlich Schweizer Relevanz und vorgeschlagener Präsentationswinkel.</em>
    </p>
    
    <hr style="border: none; border-top: 1px solid #ddd; margin: 30px 0;">
    
    <p style="color: #666; font-size: 12px;">
        Diese Analyse wurde automatisch mit KI erstellt, um die für das 20min.ch-Publikum 
        relevantesten Stories zu identifizieren.
    </p>
</div>
"""

def extract_top_stories(analysis_text):
    """Build the HTML story preview for the email from the model's answer.

    Scans the analysis for the "TOP 5 NEWSWORTHY" and "TOP 5 TALKING"
    section headings and, within each, collects up to five numbered
    ``N. **Source**: Title`` lines into a ``<ul>`` list. Source and title
    are HTML-escaped, and titles containing colons are kept in full.

    Args:
        analysis_text: Markdown answer produced by the model; ``None`` or
            an empty string is treated as "no stories".

    Returns:
        An HTML fragment with one heading and list per section found, or an
        empty string when the text contains neither section.
    """
    from html import escape

    section_headings = (
        ("TOP 5 NEWSWORTHY", "<h4>🔥 Top 5 Newsworthy Stories:</h4><ul>"),
        ("TOP 5 TALKING", "<h4>💬 Top 5 Talking Pieces:</h4><ul>"),
    )
    story_prefixes = ('1.', '2.', '3.', '4.', '5.')

    pieces = []
    list_open = False  # True while a <ul> still needs its closing tag
    story_count = 0

    for raw_line in (analysis_text or "").splitlines():
        line = raw_line.strip()

        heading = next((html for marker, html in section_headings if marker in line), None)
        if heading is not None:
            # Close the previous section's list before starting a new one.
            if list_open:
                pieces.append("</ul>")
            pieces.append(heading)
            list_open = True
            story_count = 0
            continue

        # Ignore everything outside a section and anything past five stories.
        if not list_open or story_count >= 5:
            continue
        if not line.startswith(story_prefixes):
            continue

        parts = line.split('**')
        if len(parts) < 3:
            continue

        # The colon after the source may sit inside or outside the bold markers.
        source = parts[1].strip().rstrip(':').strip()
        # Strip only the separator colon so titles containing ':' stay intact.
        title = ''.join(parts[2:]).strip().lstrip(':').strip()
        pieces.append(
            f"<li><strong>{escape(source, quote=False)}</strong>: "
            f"{escape(title, quote=False)}</li>"
        )
        story_count += 1

    if list_open:
        pieces.append("</ul>")

    return "".join(pieces)

def send_to_zapier(analysis_text, news_df):
    """Send the formatted analysis to the Zapier webhook.

    Builds a JSON payload containing the Google Docs HTML, the email HTML,
    summary statistics and the email metadata, then POSTs it to
    ``ZAPIER_WEBHOOK_URL``.

    Args:
        analysis_text: Markdown answer produced by the model.
        news_df: DataFrame of analysed articles with ``source`` and
            ``continent`` columns.

    Returns:
        ``True`` when the webhook answered with HTTP 200, ``False`` on any
        other status or when the request itself failed (details are printed).
    """
    print("📤 Sending data to Zapier...")

    # One clock reading so every date/time field in the payload agrees.
    now = datetime.now()
    today = now.strftime("%d.%m.%Y")

    payload = {
        "date": today,
        "time": now.strftime("%H:%M CET"),
        "document_title": f"20min.ch News-Analyse - {today}",
        "document_content": format_for_google_docs(analysis_text, news_df),
        "email_content_html": create_email_html(analysis_text, news_df),
        "stats": {
            "total_articles": len(news_df),
            "total_sources": news_df['source'].nunique(),
            "continents": news_df['continent'].nunique()
        },
        "recipient_email": "tom.vaillant@20minuten.ch",
        "email_subject": f"Tägliche News-Analyse - {today}"
    }

    try:
        # requests waits forever by default; the timeout keeps an
        # unresponsive webhook from hanging the notebook or CI job.
        response = requests.post(ZAPIER_WEBHOOK_URL, json=payload, timeout=30)
    except Exception as e:
        # Deliberately broad: report the failure and let the caller decide.
        print(f"❌ Exception occurred: {str(e)}")
        return False

    if response.status_code != 200:
        print(f"❌ Error sending to Zapier: {response.status_code}")
        print(f"Response: {response.text}")
        return False

    print("✅ Successfully sent to Zapier!")
    print(f"📧 Email will be sent to: {payload['recipient_email']}")
    print(f"📄 Document title: {payload['document_title']}")
    print("\n🎯 Next steps in Zapier:")
    print("1. Google Doc will be created automatically")
    print("2. Email will be sent with the doc link")
    return True

# Test the webhook with sample data (optional)
def test_zapier_webhook():
    """Check the Zapier webhook connection with a minimal test payload.

    POSTs a small JSON document (a ``test`` flag, a message and the current
    timestamp) to ``ZAPIER_WEBHOOK_URL`` and prints the outcome.

    Returns:
        ``True`` when the webhook answered with HTTP 200, ``False`` on any
        other status or when the request could not be completed.
    """
    test_payload = {
        "test": True,
        "message": "Testing webhook connection",
        "timestamp": datetime.now().isoformat()
    }

    print("🧪 Testing Zapier webhook...")
    try:
        # requests waits forever by default; bound the wait so a dead
        # endpoint shows up as a failed test instead of a hang.
        response = requests.post(ZAPIER_WEBHOOK_URL, json=test_payload, timeout=30)
    except requests.RequestException as e:
        # A connection problem is a failed test result, not a crash.
        print(f"❌ Webhook test failed: {e}")
        return False

    if response.status_code == 200:
        print("✅ Webhook test successful!")
        print("Check your Zapier dashboard to see the test data")
        return True

    print(f"❌ Webhook test failed: {response.status_code}")
    return False

# Main execution for your Jupyter notebook
# analysis_result exists but is None when the Claude call failed, so check
# its value and not just its presence; sending None would crash the formatters.
if globals().get('analysis_result') and 'news_df' in globals():
    # Send the analysis to Zapier
    success = send_to_zapier(analysis_result, news_df)

    if success:
        print("\n🎉 All done! Check your Zapier dashboard to monitor the workflow.")
else:
    print("❌ No analysis results found. Please run the Claude analysis first.")
    print("\n💡 To test your webhook connection, run:")
    print("test_zapier_webhook()")