In [6]:
import feedparser
import pandas as pd
import requests
from datetime import datetime
import time
import xml.etree.ElementTree as ET
from typing import List, Dict

# Load news sources from CSV file
def load_news_sources_from_csv(csv_path='news_sources.csv'):
    """Read the news-source CSV and group its feeds by continent.

    Args:
        csv_path: Path to a CSV file with at least the columns
            ``name``, ``url`` and ``continent``.

    Returns:
        A dict mapping each continent (in order of first appearance in the
        file) to a list of ``{'name': ..., 'url': ...}`` dicts, in file order.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
        Exception: Any other error raised while reading or grouping the file
            is reported on stdout and re-raised unchanged.
    """
    try:
        table = pd.read_csv(csv_path)
        print(f"✅ Loaded {len(table)} news sources from {csv_path}")

        feeds_by_continent = {}
        for continent in table['continent'].unique():
            # Boolean mask selecting the rows that belong to this continent
            in_continent = table['continent'] == continent
            names = table.loc[in_continent, 'name']
            urls = table.loc[in_continent, 'url']
            feeds_by_continent[continent] = [
                {'name': name, 'url': url} for name, url in zip(names, urls)
            ]

        return feeds_by_continent

    except Exception as e:
        # A missing file gets its own, friendlier message; everything else
        # is reported generically. Either way the caller sees the exception.
        if isinstance(e, FileNotFoundError):
            print(f"❌ CSV file '{csv_path}' not found!")
        else:
            print(f"❌ Error loading CSV: {str(e)}")
        raise

# Load news feeds from the default CSV when this cell runs.
# NEWS_FEEDS maps continent -> list of {'name': ..., 'url': ...} dicts.
NEWS_FEEDS = load_news_sources_from_csv()

def fetch_feed_titles(feed_url, feed_name, max_titles=10):
    """Download one feed and return its first entries as plain dicts.

    The fetch is best-effort: any failure (network error, HTTP error status,
    unexpected feed structure) is reported on stdout and whatever was
    collected so far is returned, so one broken feed never aborts a scan.

    Args:
        feed_url: URL of the RSS/Atom feed.
        feed_name: Human-readable source name, used in log output and stored
            in each result under ``'source'``.
        max_titles: Maximum number of entries to take from the feed.

    Returns:
        A list of dicts with the keys ``'source'``, ``'title'``,
        ``'description'`` (may be ``""``) and ``'timestamp'`` (ISO-8601
        local time at which the entry was collected).
    """
    titles = []

    try:
        # Fetch with timeout
        print(f"  Fetching {feed_name}...", end="", flush=True)
        response = requests.get(feed_url, timeout=10, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        # Treat 4xx/5xx as a failed fetch instead of parsing an error page
        # as if it were a (seemingly empty) feed.
        response.raise_for_status()

        # Parse with feedparser
        feed = feedparser.parse(response.content)

        # Debug: Print first entry details for each source
        if feed.entries:
            first_entry = feed.entries[0]
            print(f"\n    📋 First entry from {feed_name}:")
            print(f"      Title: {getattr(first_entry, 'title', 'NO TITLE')}")
            print(f"      Link: {getattr(first_entry, 'link', 'NO LINK')}")
            print(f"      Published: {getattr(first_entry, 'published', 'NO DATE')}")
            print(f"      Summary: {getattr(first_entry, 'summary', 'NO SUMMARY')[:100]}...")
            print(f"      Description: {getattr(first_entry, 'description', 'NO DESCRIPTION')[:100]}...")
            print(f"      Available fields: {list(first_entry.keys())}")
        else:
            print(f"\n    ❌ No entries found in feed for {feed_name}")

        # Extract titles and descriptions
        for entry in feed.entries[:max_titles]:
            if not hasattr(entry, 'title'):
                continue

            # Try multiple fields for description content: the first
            # non-empty one of summary / description / content wins.
            description = ""
            summary = getattr(entry, 'summary', None) or getattr(entry, 'description', None)
            content = getattr(entry, 'content', None)
            if summary:
                description = summary.strip()
            elif content:
                # Some feeds use content field
                if isinstance(content, list) and len(content) > 0:
                    description = content[0].get('value', '').strip()
                else:
                    description = str(content).strip()

            titles.append({
                'source': feed_name,
                'title': entry.title.strip(),
                'description': description,
                'timestamp': datetime.now().isoformat()
            })

        print(f"    ✓ ({len(titles)} titles extracted)")

    except Exception as e:
        # Deliberately broad: this is the per-feed boundary of a best-effort
        # scan. Include the message so failures can actually be diagnosed.
        print(f" ✗ (Error: {type(e).__name__}: {e})")

    return titles

# Main execution: scan every configured feed and build the news DataFrame
print("Starting news feed scanner...\n")
all_results = []

# Walk the feeds continent by continent
for continent, feeds in NEWS_FEEDS.items():
    print(f"\n{continent}:")
    print("-" * 40)

    for feed in feeds:
        titles = fetch_feed_titles(feed['url'], feed['name'])

        # Tag each article with where it came from, then collect it
        for title in titles:
            title.update(continent=continent, feed_url=feed['url'])
        all_results.extend(titles)

        # Small delay to be respectful to the feed servers
        time.sleep(0.5)

# Create DataFrame
print("\n\nCreating DataFrame...")
news_df = pd.DataFrame(all_results)

# Show summary
if news_df.empty:
    print("❌ No articles collected!")
else:
    print(f"\n✅ Success! Collected {len(news_df)} articles from {news_df['source'].nunique()} sources")
    print("\nArticles per continent:")
    for continent in news_df['continent'].unique():
        count = int((news_df['continent'] == continent).sum())
        print(f"  {continent}: {count}")

    print("\n📰 Sample headlines:")
    print("=" * 80)
    for _, row in news_df.head(5).iterrows():
        print(f"{row['source']}: {row['title'][:80]}...")
        if row['description']:
            print(f"  📄 Description: {row['description'][:150]}...")
        print()

    # One mask serves both counts: rows whose description is non-empty
    has_description = news_df['description'] != ''
    print("\n📊 Description statistics:")
    print(f"  Articles with descriptions: {int(has_description.sum())}")
    print(f"  Articles without descriptions: {int((~has_description).sum())}")
    print(f"  Average description length: {news_df['description'].str.len().mean():.0f} characters")

# The DataFrame is now available as 'news_df'
print(f"\n\n✨ DataFrame stored in variable 'news_df' with {len(news_df)} articles")

✅ Loaded 45 news sources from news_sources.csv
Starting news feed scanner...


North America:
----------------------------------------
  Fetching BBC News Americas...
    📋 First entry from BBC News Americas:
      Title: Trump and Musk enter bitter feud - and Washington buckles up
      Link: https://www.bbc.com/news/articles/c3wd2215q08o
      Published: Fri, 06 Jun 2025 05:56:04 GMT
      Summary: A knock-down fight between the world's the richest person and the most powerful politician is playin...
      Description: A knock-down fight between the world's the richest person and the most powerful politician is playin...
      Available fields: ['title', 'title_detail', 'summary', 'summary_detail', 'links', 'link', 'id', 'guidislink', 'published', 'published_parsed', 'media_thumbnail', 'href']
    ✓ (10 titles extracted)
  Fetching NPR News...
    📋 First entry from NPR News:
      Title: Judge puts temporary hold on Trump's latest ban on Harvard's foreign students
      Link: https:

In [17]:
import anthropic
import json
import os
from datetime import datetime
from pathlib import Path

# Load environment variables from .env file
try:
    from dotenv import load_dotenv
except ImportError:
    print("❌ python-dotenv not installed! Run: pip install python-dotenv")
    raise

# Try multiple possible locations for the .env file:
# the current directory first, then the parent directory, and finally
# python-dotenv's own search.
if Path('.env').exists():
    load_dotenv('.env')
    print("✅ Loaded .env file from current directory")
elif Path('../.env').exists():
    load_dotenv('../.env')
    print("✅ Loaded .env file from parent directory")
elif load_dotenv():
    # load_dotenv() reports whether it actually loaded anything, so only
    # claim success when it did.
    print("✅ Loaded .env file from default location")
else:
    print("⚠️ No .env file found - relying on existing environment variables")

# Get API key from environment variable
API_KEY = os.environ.get('ANTHROPIC_API_KEY')

# Validate that we have the API key
if not API_KEY:
    # Do not dump the environment here: notebook output is often shared and
    # the variable names alone can reveal unrelated configuration.
    print("❌ ANTHROPIC_API_KEY not found in environment variables!")
    print("\nPlease ensure:")
    print("1. Your .env file is in the same directory as this notebook")
    print("2. The .env file contains: ANTHROPIC_API_KEY=your-key-here")
    print("3. You've installed python-dotenv: pip install python-dotenv")
    raise ValueError("ANTHROPIC_API_KEY environment variable not set!")
else:
    # Show partial key for confirmation (security: only show first and last few characters)
    masked_key = f"{API_KEY[:10]}...{API_KEY[-4:]}" if len(API_KEY) > 20 else "KEY_TOO_SHORT"
    print(f"✅ API Key loaded successfully: {masked_key}")

# Initialize the Claude client with your API key
client = anthropic.Anthropic(api_key=API_KEY)

# Craft the prompt based on 20min.ch's audience profile
def create_prompt(df_json):
    """Build the story-selection prompt sent to Claude.

    Args:
        df_json: The collected news articles serialized as a JSON string;
            it is embedded verbatim under the ``**NEWS DATA:**`` heading.

    Returns:
        The complete prompt text, which asks for exactly ten stories split
        into two "TOP 5" sections in a fixed markdown layout.
    """
    return f"""You are a senior story editor for 20min.ch, Switzerland's leading free commuter tabloid. Your specialty is identifying stories with exceptional narrative power that cut through the noise of daily news.

**TARGET AUDIENCE:**
- Young urban Swiss commuters (15-40 years old) 
- Quick readers seeking stories worth talking about
- Appreciate compelling narratives over routine news updates

**YOUR EXPERTISE:**
Find stories with **narrative depth** and **unique angles** that make them stand out. The best stories often feature:

**STORY PATTERNS TO IDENTIFY:**
1. **Dramatic Evolution**: Relationships/situations that have transformed over time
   - Former allies becoming enemies (or vice versa)
   - Complete reversals of fortune or position
   - Long-term consequences finally surfacing

2. **Compelling Contrasts**: Stories highlighting stark juxtapositions
   - Rich vs. poor, powerful vs. powerless
   - Public image vs. private reality
   - Expectations vs. outcomes

3. **Hidden Connections**: Stories revealing unexpected links
   - How seemingly unrelated events connect
   - Behind-the-scenes relationships affecting public events
   - Cause-and-effect chains spanning time/geography

4. **Human Drama with Stakes**: Personal stories that illuminate larger issues
   - Individual journeys that reflect broader trends
   - Choices with far-reaching consequences
   - People at crossroads or tipping points

**AVOID:**
- Routine political announcements
- Standard economic updates
- Predictable celebrity coverage
- Breaking news without context

**NEWS DATA:**
{df_json}

**REQUIRED OUTPUT:**
Identify exactly 10 stories with the strongest narrative potential:

## TOP 5 NEWSWORTHY STORIES WITH DEPTH
1. **[Source]**: [Title]
   - **The Journey**: [What evolution/transformation makes this compelling]
   - **Why it resonates**: [Human element that connects with readers]

2-5. [Continue same format]

## TOP 5 UNIQUE PERSPECTIVE STORIES  
1. **[Source]**: [Title]
   - **The Angle**: [What fresh perspective or hidden story this reveals]
   - **Discussion value**: [Why this sparks meaningful conversation]

2-5. [Continue same format]

Focus on stories that have **layers** - where there's more than meets the eye, where the present situation has an interesting backstory, or where personal drama illuminates larger truths."""

# Function to make the API call
def analyze_news_for_20min(news_df):
    """
    Send news DataFrame to Claude API and get recommendations for 20min.ch

    Args:
        news_df: DataFrame of collected articles. It must contain a
            ``source`` column; all columns are serialized into the prompt.

    Returns:
        The text of Claude's reply, or ``None`` when there is nothing to
        analyze or the API call fails (the error is printed, not raised).
    """
    # A scan that collected nothing yields a DataFrame without any columns;
    # bail out before touching news_df['source'] or spending an API call.
    if news_df is None or news_df.empty:
        print("❌ No articles to analyze - skipping Claude API call")
        return None

    # Convert DataFrame to JSON for the prompt
    df_json = news_df.to_json(orient='records', indent=2)

    # Create the prompt
    prompt = create_prompt(df_json)

    print("Sending request to Claude API...")
    print(f"Analyzing {len(news_df)} articles from {news_df['source'].nunique()} sources...")

    try:
        # Make the API call
        response = client.messages.create(
            model="claude-3-5-sonnet-20241022",  # Using Sonnet 3.5
            max_tokens=2000,
            temperature=0.7,
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        )

        # Extract the response (first content block carries the text)
        analysis = response.content[0].text

        print("\n" + "="*80)
        print("CLAUDE'S RECOMMENDATIONS FOR 20MIN.CH")
        print("="*80 + "\n")
        print(analysis)

        return analysis

    except Exception as e:
        # Best-effort boundary: report the failure and let the caller
        # decide what to do with a missing analysis.
        print(f"\n❌ Error calling Claude API: {str(e)}")
        return None

# Run the analysis on the DataFrame produced by the feed-scanner cell
print("🚀 Starting 20min.ch content analysis...")
print(f"📊 DataFrame contains {len(news_df)} articles")

# Call the function with your news DataFrame
analysis_result = analyze_news_for_20min(news_df)

# On success, print a hint on how to look up the recommended articles
if analysis_result:
    print(
        "\n✅ Analysis complete! Check the output above for Claude's recommendations.\n"
        "\nTip: You can copy the recommended titles and search for them in your original DataFrame:\n"
        "Example: news_df[news_df['title'].str.contains('search_term', case=False)]"
    )

✅ Loaded .env file from current directory
✅ API Key loaded successfully: sk-ant-api...vwAA
🚀 Starting 20min.ch content analysis...
📊 DataFrame contains 304 articles
Sending request to Claude API...
Analyzing 304 articles from 31 sources...

CLAUDE'S RECOMMENDATIONS FOR 20MIN.CH

Based on the target audience and selection criteria, here are the most suitable stories for 20min.ch:

## TOP 5 NEWSWORTHY STORIES
1. **[Deutsche Welle]**: "NATO ministers back defense spending increase"
   - **Why it matters to Swiss readers**: Though not NATO members, Swiss defense spending and security alignment with European neighbors is a major ongoing debate
   - **Key angle for 20min**: "What NATO's new spending targets mean for Swiss security"

2. **[BBC News Europe]**: "Eight injured on Ryanair flight hit by 'severe turbulence'"
   - **Why it matters to Swiss readers**: Many Swiss use Ryanair for European travel
   - **Key angle for 20min**: "Dramatic footage: Swiss passengers describe 'terrifying' Rya

In [18]:
import requests
import json
import os
from datetime import datetime

# Load environment variables with dotenv if available (for local development)
try:
    from dotenv import load_dotenv
except ImportError:
    print("📝 dotenv not available - using system environment variables")
else:
    # load_dotenv() reports whether it actually loaded anything; on CI there
    # is typically no .env file, so do not claim one was loaded.
    if load_dotenv():
        print("✅ Loaded .env file")
    else:
        print("📝 No .env file found - using system environment variables")

# Your Zapier webhook URL
ZAPIER_WEBHOOK_URL = os.environ.get('ZAPIER_WEBHOOK_URL')

# Fail early with a clear message instead of posting to None later on
if not ZAPIER_WEBHOOK_URL:
    raise ValueError("ZAPIER_WEBHOOK_URL environment variable not set!")

def format_for_google_docs(analysis_text, news_df):
    """
    Format the analysis as clean, compact HTML for Google Docs in German

    Only three kinds of lines from ``analysis_text`` are rendered: ``## ``
    headers, story lines starting with ``1.``-``5.`` and ``- `` bullets.
    Every other line (e.g. introductory prose) is skipped.

    Args:
        analysis_text: Markdown-style analysis as returned by Claude.
        news_df: DataFrame of the analyzed articles; its ``source`` and
            ``continent`` columns feed the statistics sections.

    Returns:
        An HTML fragment (header, formatted stories, methodology and
        per-continent article counts) as a single string.
    """
    # Local imports keep this function self-contained within the notebook cell
    import html
    import re

    # English bullet labels -> German. Looked up on the bare label text so
    # the translation works regardless of where the model puts ** and ':'.
    label_translations = {
        'Why it matters to Swiss readers': 'Warum es für Schweizer Leser wichtig ist',
        'Key angle for 20min': '20min-Winkel',
        'Conversation starter': 'Gesprächsanstoß',
        '20min angle': '20min-Winkel',
        # Labels requested by the prompt built in create_prompt()
        'The Journey': 'Die Entwicklung',
        'Why it resonates': 'Warum es berührt',
        'The Angle': 'Der Blickwinkel',
        'Discussion value': 'Diskussionswert',
    }

    # Matches "**Label**: value" as well as "**Label:** value"
    bold_label_re = re.compile(r'^\*\*([^*]+?)(?::\*\*|\*\*\s*:)\s*(.*)$')

    def to_html(text):
        """Escape text for HTML, then turn **bold** spans into <strong> tags."""
        pieces = html.escape(text, quote=False).split('**')
        return ''.join(
            f'<strong>{piece}</strong>' if index % 2 else piece
            for index, piece in enumerate(pieces)
        )

    # NOTE(review): timestamps come from the machine's local clock but are
    # labelled "CET" - confirm the runner's timezone matches.
    now = datetime.now()
    article_count = len(news_df)
    source_count = news_df['source'].nunique()

    # Start with a simple, compact header in German
    sections = [f"""<h1>20MIN.CH TÄGLICHE NEWS-ANALYSE</h1>
<h2>{now.strftime('%d. %B %Y')}</h2>

<h3>ANALYSE-ZUSAMMENFASSUNG</h3>
<p>
<strong>Erstellt:</strong> {now.strftime('%d.%m.%Y um %H:%M CET')}<br>
<strong>Analysierte Artikel:</strong> {article_count}<br>
<strong>Nachrichtenquellen:</strong> {source_count}<br>
<strong>Kontinente abgedeckt:</strong> {news_df['continent'].nunique()}
</p>

<hr>
"""]

    # Process the analysis text with minimal formatting
    for raw_line in analysis_text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Handle main headers
        if line.startswith('## '):
            header = line.replace('## ', '').replace('**', '')
            sections.append(f'<h2>{html.escape(header, quote=False)}</h2>\n')

        # Handle story entries (numbered items) - keep original titles without translation
        elif line.startswith(('1.', '2.', '3.', '4.', '5.')):
            # Split off only the bold source so a bold title stays intact
            parts = line.split('**', 2)
            if len(parts) == 3:
                number = html.escape(parts[0].strip(), quote=False)
                source = html.escape(parts[1].strip(), quote=False)
                title = to_html(parts[2].strip().lstrip(':').strip())
                sections.append(f'<p><strong>{number} {source}:</strong> {title}</p>\n')
            else:
                # No bold source marker: keep the line rather than lose the story
                sections.append(f'<p>{to_html(line)}</p>\n')

        # Handle bullet points with German labels
        elif line.startswith('- '):
            content = line[2:].strip()

            match = bold_label_re.match(content)
            if match:
                label, value = match.group(1), match.group(2)
            elif ':' in content:
                label, value = content.split(':', 1)
            else:
                label, value = None, content

            if label is None:
                sections.append(f'<p style="margin-left: 20px;">• {to_html(value)}</p>\n')
            else:
                label = label.replace('**', '').strip()
                label = html.escape(label_translations.get(label, label), quote=False)
                sections.append(
                    f'<p style="margin-left: 20px;">• <strong>{label}:</strong> '
                    f'{to_html(value.strip())}</p>\n'
                )

    # Add compact methodology section in German
    sections.append(f"""
<hr>
<h3>METHODIK</h3>
<p>Diese Analyse wurde erstellt durch:</p>
<p style="margin-left: 20px;">
• Scannen von RSS-Feeds großer Nachrichtenportale aller Kontinente<br>
• Sammeln der neuesten {article_count} Artikel von {source_count} Quellen<br>
• Verwendung von Claude AI zur Identifikation der für 20min.ch-Leser relevantesten Stories<br>
• Anwendung von Auswahlkriterien: Schweiz-Relevanz, Teilbarkeit, visuelles Potenzial, emotionale Wirkung, Gesprächspotenzial
</p>

<h3>QUELLENVERTEILUNG</h3>
<p>""")

    # Add source statistics in a compact format with German continent names
    continent_names_de = {
        'North America': 'Nordamerika',
        'Europe': 'Europa',
        'Asia': 'Asien',
        'Africa': 'Afrika',
        'South America': 'Südamerika',
        'Oceania': 'Ozeanien'
    }

    # value_counts() already yields (continent, count), most frequent first
    source_lines = []
    for continent, count in news_df['continent'].value_counts().items():
        continent_de = html.escape(str(continent_names_de.get(continent, continent)), quote=False)
        source_lines.append(f'<strong>{continent_de}:</strong> {count} Artikel')

    sections.append(' | '.join(source_lines))

    sections.append(f"""
</p>

<hr>
<p><em>Automatisch generiert via GitHub Actions um {now.strftime('%H:%M:%S CET')}</em></p>
""")

    return ''.join(sections)

def create_email_html(analysis_text, news_df):
    """
    Build the German HTML body of the notification email.

    Args:
        analysis_text: Claude's analysis; its top stories are rendered via
            ``extract_top_stories`` into the recommendations section.
        news_df: DataFrame of analyzed articles; supplies the article,
            source and continent counts shown in the quick overview.

    Returns:
        The email body as an HTML string.
    """
    # Extract top stories for email preview
    stories_preview = extract_top_stories(analysis_text)

    # Gather the values shown in the template, in the order they appear
    report_date = datetime.now().strftime('%d.%m.%Y')
    article_count = len(news_df)
    source_count = news_df['source'].nunique()
    continent_count = news_df['continent'].nunique()
    created_at = datetime.now().strftime('%H:%M CET')

    return f"""
<div style="font-family: Arial, sans-serif; max-width: 600px; margin: 0 auto;">
    <h2 style="color: #d32f2f;">20min.ch Tägliche News-Analyse</h2>
    
    <p>Hallo Tom,</p>
    
    <p>Deine automatisierte News-Analyse für <strong>{report_date}</strong> ist bereit.</p>
    
    <div style="background-color: #f5f5f5; padding: 15px; border-radius: 5px; margin: 20px 0;">
        <h3 style="margin-top: 0;">Schnellübersicht:</h3>
        <ul style="list-style-type: none; padding-left: 0;">
            <li>📊 Analysierte Artikel: {article_count}</li>
            <li>📰 Nachrichtenquellen: {source_count}</li>
            <li>🌍 Kontinente abgedeckt: {continent_count}</li>
            <li>⏰ Erstellt um: {created_at}</li>
        </ul>
    </div>
    
    <div style="margin: 20px 0;">
        <h3>Heutige Empfehlungen:</h3>
        {stories_preview}
    </div>
    
    <p style="margin-top: 30px;">
        <em>💡 Die vollständige Analyse enthält detaillierte Empfehlungen für jede Story, 
        einschließlich Schweizer Relevanz und vorgeschlagener Präsentationswinkel.</em>
    </p>
    
    <hr style="border: none; border-top: 1px solid #ddd; margin: 30px 0;">
    
    <p style="color: #666; font-size: 12px;">
        Diese Analyse wurde automatisch mit KI erstellt, um die für das 20min.ch-Publikum 
        relevantesten Stories zu identifizieren.
    </p>
</div>
"""

def extract_top_stories(analysis_text):
    """
    Extract up to 5 stories from each "TOP 5" section for the email preview

    A section starts at any line containing one of the known section
    markers. Within a section, lines starting with ``1.``-``5.`` that carry
    a bold ``**source**`` are turned into list items.

    Args:
        analysis_text: Markdown-style analysis as returned by Claude.

    Returns:
        An HTML fragment with one ``<h4>`` + ``<ul>`` per section found, or
        an empty string when the text contains no known section.
    """
    # Local import keeps this function self-contained within the notebook cell
    import html

    # Marker to look for in a heading line -> HTML heading to emit.
    # "UNIQUE PERSPECTIVE" is the second section requested by create_prompt();
    # "TALKING" is kept for analyses produced with the earlier wording.
    section_headings = (
        ("TOP 5 NEWSWORTHY", "<h4>🔥 Top 5 Newsworthy Stories:</h4>"),
        ("TOP 5 TALKING", "<h4>💬 Top 5 Talking Pieces:</h4>"),
        ("TOP 5 UNIQUE PERSPECTIVE", "<h4>💬 Top 5 Unique Perspective Stories:</h4>"),
    )
    numbered_prefixes = ('1.', '2.', '3.', '4.', '5.')

    chunks = []
    list_open = False  # True while a <ul> is waiting for its closing tag
    story_count = 0

    for line in analysis_text.split('\n'):
        heading_html = next(
            (heading for marker, heading in section_headings if marker in line),
            None,
        )
        if heading_html is not None:
            if list_open:
                chunks.append("</ul>")
            chunks.append(heading_html + "<ul>")
            list_open = True
            story_count = 0
            continue

        if not list_open or story_count >= 5:
            continue
        if not line.strip().startswith(numbered_prefixes):
            continue

        # "1. **[Source]**: Title" -> ['1. ', '[Source]', ': Title'].
        # Splitting at most twice keeps a bold or colon-containing title whole.
        parts = line.split('**', 2)
        if len(parts) < 3:
            continue

        source = parts[1].strip().rstrip(':').strip()
        # Drop only the separator colon, never colons inside the title
        title = parts[2].strip().lstrip(':').replace('**', '').strip()
        chunks.append(
            f"<li><strong>{html.escape(source, quote=False)}</strong>: "
            f"{html.escape(title, quote=False)}</li>"
        )
        story_count += 1

    # Only close a list that was actually opened (no stray </ul>)
    if list_open:
        chunks.append("</ul>")

    return "".join(chunks)

def send_to_zapier(analysis_text, news_df):
    """
    Send the formatted data to Zapier webhook

    Args:
        analysis_text: Claude's analysis text; rendered into both the
            Google Docs HTML and the email HTML.
        news_df: DataFrame of analyzed articles, used for the statistics.

    Returns:
        True if Zapier answered with HTTP 200, False on any other status
        code or when the request itself failed (the error is printed).
    """
    print("📤 Sending data to Zapier...")

    now = datetime.now()
    report_date = now.strftime('%d.%m.%Y')

    # Prepare the payload
    payload = {
        "date": report_date,
        "time": now.strftime("%H:%M CET"),
        "document_title": f"20min.ch News-Analyse - {report_date}",
        "document_content": format_for_google_docs(analysis_text, news_df),
        "email_content_html": create_email_html(analysis_text, news_df),
        "stats": {
            "total_articles": len(news_df),
            "total_sources": news_df['source'].nunique(),
            "continents": news_df['continent'].nunique()
        },
        "recipient_email": "tom.vaillant@20minuten.ch",
        "email_subject": f"Tägliche News-Analyse - {report_date}"
    }

    try:
        # Send to Zapier. requests waits forever by default, so set an
        # explicit timeout to keep an unattended run from hanging.
        response = requests.post(ZAPIER_WEBHOOK_URL, json=payload, timeout=30)

        if response.status_code == 200:
            print("✅ Successfully sent to Zapier!")
            print(f"📧 Email will be sent to: {payload['recipient_email']}")
            print(f"📄 Document title: {payload['document_title']}")
            print("\n🎯 Next steps in Zapier:")
            print("1. Google Doc will be created automatically")
            print("2. Email will be sent with the doc link")
            return True
        else:
            print(f"❌ Error sending to Zapier: {response.status_code}")
            print(f"Response: {response.text}")
            return False

    except Exception as e:
        # Best-effort boundary: report and signal failure to the caller
        print(f"❌ Exception occurred: {str(e)}")
        return False

# Test the webhook with sample data (optional)
def test_zapier_webhook():
    """
    Test the Zapier webhook with minimal data

    Posts a small marker payload to ZAPIER_WEBHOOK_URL and prints whether
    the webhook answered with HTTP 200. Connection problems are reported
    the same way as in send_to_zapier instead of raising.
    """
    test_payload = {
        "test": True,
        "message": "Testing webhook connection",
        "timestamp": datetime.now().isoformat()
    }

    print("🧪 Testing Zapier webhook...")
    try:
        # Explicit timeout: requests would otherwise wait indefinitely
        response = requests.post(ZAPIER_WEBHOOK_URL, json=test_payload, timeout=30)
    except requests.RequestException as e:
        print(f"❌ Exception occurred: {str(e)}")
        return

    if response.status_code == 200:
        print("✅ Webhook test successful!")
        print("Check your Zapier dashboard to see the test data")
    else:
        print(f"❌ Webhook test failed: {response.status_code}")

# Main execution for your Jupyter notebook
# Both names must exist AND hold usable values: analysis_result is None when
# the Claude call failed, and formatting None would crash in send_to_zapier.
if globals().get('analysis_result') and globals().get('news_df') is not None:
    # Send the analysis to Zapier
    success = send_to_zapier(analysis_result, news_df)

    if success:
        print("\n🎉 All done! Check your Zapier dashboard to monitor the workflow.")
else:
    print("❌ No analysis results found. Please run the Claude analysis first.")
    print("\n💡 To test your webhook connection, run:")
    print("test_zapier_webhook()")

✅ Loaded .env file
📤 Sending data to Zapier...
✅ Successfully sent to Zapier!
📧 Email will be sent to: tom.vaillant@20minuten.ch
📄 Document title: 20min.ch News-Analyse - 05.06.2025

🎯 Next steps in Zapier:
1. Google Doc will be created automatically
2. Email will be sent with the doc link

🎉 All done! Check your Zapier dashboard to monitor the workflow.
