In [1]:
import feedparser
import pandas as pd
import requests
from datetime import datetime
import time
import xml.etree.ElementTree as ET
from typing import List, Dict

# Load news sources from CSV file
def load_news_sources_from_csv(csv_path='news_sources.csv'):
    """Read the news-source CSV and group its rows by continent.

    The CSV is expected to provide ``continent``, ``name`` and ``url``
    columns.  The result maps each continent (in order of first appearance
    in the file) to a list of ``{'name': ..., 'url': ...}`` dicts.

    Any failure is reported on stdout and then re-raised unchanged.
    """
    try:
        table = pd.read_csv(csv_path)
        print(f"✅ Loaded {len(table)} news sources from {csv_path}")

        grouped = {}
        for region in table['continent'].unique():
            members = table[table['continent'] == region]
            # One small dict per source row belonging to this continent
            grouped[region] = [
                {'name': source_name, 'url': source_url}
                for source_name, source_url in zip(members['name'], members['url'])
            ]

        return grouped

    except FileNotFoundError:
        print(f"❌ CSV file '{csv_path}' not found!")
        raise
    except Exception as exc:
        print(f"❌ Error loading CSV: {str(exc)}")
        raise

# Load the feed catalogue once, when this cell runs, from the default
# 'news_sources.csv' path. The result maps continent -> list of
# {'name', 'url'} dicts; a missing or unreadable CSV aborts the cell.
NEWS_FEEDS = load_news_sources_from_csv()

def _entry_description(entry):
    """Return the first non-empty description-like field of a feed entry, or ''."""
    for field in ('summary', 'description'):
        text = getattr(entry, field, None)
        if text:
            return text.strip()

    # Some feeds only populate the 'content' field
    content = getattr(entry, 'content', None)
    if not content:
        return ""
    if isinstance(content, list):
        return content[0].get('value', '').strip()
    return str(content).strip()


def fetch_feed_titles(feed_url, feed_name, max_titles=10, timeout=10):
    """Download one feed and return its first entries as plain dicts.

    Args:
        feed_url: URL of the RSS/Atom feed.
        feed_name: Human-readable source name; used in progress output and
            stored as ``'source'`` on every article.
        max_titles: Maximum number of feed entries to inspect.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        A list of dicts with the keys ``source``, ``title``, ``description``,
        ``url`` and ``timestamp`` (time of collection, not of publication).
        The list is empty when the feed cannot be fetched; errors are
        reported on stdout and never raised.
    """
    titles = []

    try:
        print(f"  Fetching {feed_name}...", end="", flush=True)
        response = requests.get(feed_url, timeout=timeout, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        # Without this, a 4xx/5xx error page is handed to feedparser and the
        # feed is reported as a successful fetch with 0 articles.
        response.raise_for_status()

        # Parse with feedparser
        feed = feedparser.parse(response.content)

        # Extract titles, descriptions, and article URLs
        for entry in feed.entries[:max_titles]:
            if not hasattr(entry, 'title'):
                continue

            titles.append({
                'source': feed_name,
                'title': entry.title.strip(),
                'description': _entry_description(entry),
                'url': getattr(entry, 'link', ''),
                'timestamp': datetime.now().isoformat()
            })

        print(f" ✓ ({len(titles)} articles)")

    except Exception as e:
        # Deliberately broad: one broken feed must not abort the whole scan.
        print(f" ✗ (Error: {type(e).__name__})")

    return titles

# Main execution: walk every configured feed, continent by continent
print("Starting news feed scanner...\n")
all_results = []

for continent, feeds in NEWS_FEEDS.items():
    print(f"\n{continent}:")
    print("-" * 40)

    for feed in feeds:
        source_url = feed['url']
        # Tag each article with the continent and the feed it came from
        all_results.extend(
            {**article, 'continent': continent, 'feed_url': source_url}
            for article in fetch_feed_titles(source_url, feed['name'])
        )

        # Small delay to be respectful
        time.sleep(0.5)

# Create DataFrame (one row per collected article)
print("\n\nCreating DataFrame...")
news_df = pd.DataFrame(all_results)

# Show summary
if news_df.empty:
    print("❌ No articles collected!")
else:
    source_total = news_df['source'].nunique()
    print(f"\n✅ Success! Collected {len(news_df)} articles from {source_total} sources")
    print("📊 Articles per continent:")
    for continent in news_df['continent'].unique():
        matching = (news_df['continent'] == continent).sum()
        print(f"  {continent}: {matching}")

    with_url = (news_df['url'] != '').sum()
    with_description = (news_df['description'] != '').sum()
    print("\n📊 Statistics:")
    print(f"  Articles with URLs: {with_url}")
    print(f"  Articles with descriptions: {with_description}")

print(f"\n✨ DataFrame stored in variable 'news_df' with {len(news_df)} articles")

✅ Loaded 45 news sources from news_sources.csv
Starting news feed scanner...


North America:
----------------------------------------
  Fetching BBC News Americas...

 ✓ (10 articles)


  Fetching NPR News...

 ✓ (10 articles)


  Fetching The New York Times...

 ✓ (10 articles)


  Fetching CBC News Canada...

 ✓ (10 articles)


  Fetching The Guardian US...

 ✓ (10 articles)


  Fetching Mexico News Daily...

 ✓ (10 articles)



Europe:
----------------------------------------
  Fetching BBC News Europe...

 ✓ (10 articles)


  Fetching The Guardian UK...

 ✓ (10 articles)


  Fetching Deutsche Welle English...

 ✓ (10 articles)


  Fetching France24 English...

 ✓ (10 articles)


  Fetching Euronews...

 ✓ (10 articles)


  Fetching POLITICO Europe...

 ✓ (10 articles)


  Fetching Swiss Info...

 ✓ (10 articles)


  Fetching Blick...

 ✓ (0 articles)


  Fetching Spiegel Online...

 ✓ (10 articles)


  Fetching Die Zeit...

 ✓ (10 articles)


  Fetching Süddeutsche Zeitung...

 ✓ (10 articles)


  Fetching Frankfurter Allgemeine...

 ✓ (10 articles)


  Fetching Bild...

 ✓ (10 articles)


  Fetching Blick...

 ✓ (10 articles)


  Fetching Tagesschau...

 ✓ (10 articles)


  Fetching Neue Zürcher Zeitung...

 ✓ (10 articles)


  Fetching Tages-Anzeiger...

 ✓ (10 articles)


  Fetching Der Standard AT...

 ✓ (10 articles)


  Fetching Kurier...

 ✓ (10 articles)


  Fetching ORF News...

 ✓ (10 articles)



Asia:
----------------------------------------
  Fetching Al Jazeera...

 ✓ (10 articles)


  Fetching The Japan Times...

 ✓ (10 articles)


  Fetching South China Morning Post...

 ✓ (10 articles)


  Fetching The Hindu India...

 ✓ (10 articles)


  Fetching Times of India...

 ✓ (10 articles)



Africa:
----------------------------------------
  Fetching BBC News Africa...

 ✓ (10 articles)


  Fetching AllAfrica...

 ✓ (10 articles)


  Fetching Mail & Guardian SA...

 ✓ (10 articles)


  Fetching News24 South Africa...

 ✓ (10 articles)


  Fetching Morocco World News...

 ✓ (10 articles)



South America:
----------------------------------------
  Fetching BBC News Latin America...

 ✓ (10 articles)


  Fetching Buenos Aires Times...

 ✓ (10 articles)


  Fetching MercoPress...

 ✓ (10 articles)


  Fetching Colombia Reports...

 ✓ (10 articles)



Oceania:
----------------------------------------
  Fetching ABC News Australia...

 ✓ (10 articles)


  Fetching Sydney Morning Herald...

 ✓ (10 articles)


  Fetching The Guardian Australia...

 ✓ (10 articles)


  Fetching Stuff.co.nz...

 ✓ (10 articles)


  Fetching Radio New Zealand...

 ✓ (10 articles)




Creating DataFrame...

✅ Success! Collected 440 articles from 44 sources
📊 Articles per continent:
  North America: 60
  Europe: 190
  Asia: 50
  Africa: 50
  South America: 40
  Oceania: 50

📊 Statistics:
  Articles with URLs: 440
  Articles with descriptions: 431

✨ DataFrame stored in variable 'news_df' with 440 articles


In [2]:
import anthropic
import json
import os
from datetime import datetime
from pathlib import Path

# Load environment variables from a .env file. python-dotenv is mandatory for
# this cell, so a missing package aborts with an install hint.
try:
    from dotenv import load_dotenv
except ImportError:
    print("❌ python-dotenv not installed! Run: pip install python-dotenv")
    raise

# Prefer a .env next to the notebook, then one in the parent directory, then
# python-dotenv's own search. load_dotenv() returns False when it loaded no
# variables, so success is only reported when something was actually read.
if Path('.env').exists():
    load_dotenv('.env')
    print("✅ Loaded .env file")
elif Path('../.env').exists():
    load_dotenv('../.env')
    print("✅ Loaded .env file from parent directory")
elif load_dotenv():
    print("✅ Loaded .env file from default location")
else:
    print("📝 No variables loaded from a .env file - using system environment variables")

# Get API key from environment variable
API_KEY = os.environ.get('ANTHROPIC_API_KEY')

# Validate that we have the API key
if not API_KEY:
    print("❌ ANTHROPIC_API_KEY not found in environment variables!")
    raise ValueError("ANTHROPIC_API_KEY environment variable not set!")

# Show only a masked preview for confirmation; never print the full key
masked_key = f"{API_KEY[:10]}...{API_KEY[-4:]}" if len(API_KEY) > 20 else "KEY_TOO_SHORT"
print(f"✅ API Key loaded: {masked_key}")

# Initialize the Claude client
client = anthropic.Anthropic(api_key=API_KEY)

def load_prompt_template(prompt_file='analysis_prompt.txt'):
    """Return the full text of the prompt template stored in *prompt_file*.

    The file is read as UTF-8.  A missing file is reported on stdout and the
    ``FileNotFoundError`` is re-raised to the caller.
    """
    try:
        handle = open(prompt_file, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"❌ Prompt file '{prompt_file}' not found!")
        raise

    with handle:
        template_text = handle.read()

    print(f"✅ Loaded prompt template from {prompt_file}")
    return template_text

def create_prompt(df_json):
    """Fill the prompt template's ``{df_json}`` placeholder with *df_json*.

    The template is re-read from its default file on every call and expanded
    with ``str.format``, so any literal braces in the template file must be
    doubled (``{{`` / ``}}``).
    """
    return load_prompt_template().format(df_json=df_json)

def analyze_news_for_20min(news_df, model="claude-3-5-sonnet-20241022",
                           max_tokens=2000, temperature=0.7):
    """Send the news DataFrame to the Claude API and return its recommendations.

    Args:
        news_df: DataFrame of collected articles; must contain a ``source``
            column when non-empty.
        model: Claude model identifier passed to the Messages API.
        max_tokens: Upper bound on the length of the generated answer.
        temperature: Sampling temperature for the request.

    Returns:
        The analysis text, or ``None`` when there are no articles or the API
        call fails (the failure is reported on stdout, not raised).
    """
    # A DataFrame built from zero articles has no columns at all, so the
    # 'source' lookup below would raise KeyError; bail out before spending
    # an API call on an empty prompt.
    if news_df is None or news_df.empty:
        print("❌ No articles to analyze - skipping Claude API call.")
        return None

    # Convert DataFrame to JSON for the prompt
    df_json = news_df.to_json(orient='records', indent=2)

    # Create the prompt
    prompt = create_prompt(df_json)

    print(f"🚀 Analyzing {len(news_df)} articles from {news_df['source'].nunique()} sources...")

    try:
        # Make the API call
        response = client.messages.create(
            model=model,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        )

        # Extract the response
        analysis = response.content[0].text

        # A reply cut off by the token limit still parses, but its last
        # stories may be incomplete - make that visible to the operator.
        if getattr(response, 'stop_reason', None) == 'max_tokens':
            print(f"⚠️ Response hit the {max_tokens}-token limit and may be truncated.")

        print("\n" + "="*80)
        print("CLAUDE'S RECOMMENDATIONS FOR 20MIN.CH")
        print("="*80 + "\n")
        print(analysis)

        return analysis

    except Exception as e:
        # Deliberately broad: callers rely on None to signal any API failure.
        print(f"\n❌ Error calling Claude API: {str(e)}")
        return None

# Run the analysis on the `news_df` DataFrame built by the feed-scanner cell
print(f"📊 DataFrame contains {len(news_df)} articles")
analysis_result = analyze_news_for_20min(news_df)

# analyze_news_for_20min returns None when the API call fails, so success is
# only announced for a non-empty result.
if analysis_result:
    print("\n✅ Analysis complete! Ready for Zapier push.")

✅ Loaded .env file from default location
✅ API Key loaded: sk-ant-api...vwAA
📊 DataFrame contains 440 articles
✅ Loaded prompt template from analysis_prompt.txt
🚀 Analyzing 440 articles from 44 sources...



CLAUDE'S RECOMMENDATIONS FOR 20MIN.CH

## TOP 5 SCHWEIZ/LOKALE STORIES MIT MAXIMALER WIRKUNG

1. **Blick**: "Diese Gebiete sind besonders bedroht: Erhebliche Bergsturzgefahr herrscht in vielen Bündner Talschaften"
   - **URL**: https://www.blick.ch/schweiz/graubuenden/diese-gebiete-sind-besonders-bedroht-erhebliche-bergsturzgefahr-herrscht-in-vielen-buendner-talschaften-id20941156.html
   - **Der Hook**: 😱 Unmittelbare Bedrohung vor der Haustür: Jedes zehnte Gebäude im Kanton steht in einer Gefahrenzone. Die Kombination aus persönlicher Betroffenheit und existenzieller Angst macht die Geschichte unwiderstehlich.

2. **Swiss Info**: "Budgetkürzungen für die humanitäre UNO-Hilfe sorgen für Unruhe"
   - **URL**: https://www.srf.ch/news/international/humanitaere-hilfe-budgetkuerzungen-fuer-die-humanitaere-uno-hilfe-sorgen-fuer-unruhe
   - **Der Hook**: 😔 Die Schweiz als humanitäre Weltmacht in Gefahr: Berührt sowohl unser nationales Selbstverständnis als auch unsere moralische Verantwortu

In [3]:
import requests
import json
import os
from datetime import datetime
from jinja2 import Template

# Load environment variables with dotenv if available (for local development).
# Here python-dotenv is optional: without it the system environment is used.
try:
    from dotenv import load_dotenv
except ImportError:
    print("📝 dotenv not available - using system environment variables")
else:
    # load_dotenv() returns False when it loaded no variables, so success is
    # only reported when a .env file was actually read.
    if load_dotenv():
        print("✅ Loaded .env file")
    else:
        print("📝 No variables loaded from a .env file - using system environment variables")

# Your Zapier webhook URL
ZAPIER_WEBHOOK_URL = os.environ.get('ZAPIER_WEBHOOK_URL')

# Fail fast: nothing below can work without a webhook to post to
if not ZAPIER_WEBHOOK_URL:
    raise ValueError("ZAPIER_WEBHOOK_URL environment variable not set!")

def load_template(template_file):
    """Load an HTML template file and return it as a Jinja2 ``Template``.

    Autoescaping is enabled because the templates render text that originates
    from external feeds and from the model's answer; without it a title
    containing ``<``, ``&`` or quotes would be emitted as raw HTML.

    Args:
        template_file: Path of the UTF-8 encoded template file.

    Raises:
        FileNotFoundError: If the file does not exist.
        Exception: Any other read or template-compilation error; it is
            reported on stdout and re-raised unchanged.
    """
    try:
        with open(template_file, 'r', encoding='utf-8') as f:
            template_content = f.read()
        return Template(template_content, autoescape=True)
    except FileNotFoundError:
        print(f"❌ Template file '{template_file}' not found!")
        raise
    except Exception as e:
        print(f"❌ Error loading template file: {str(e)}")
        raise

def parse_analysis_text(analysis_text):
    """Parse the Claude analysis text into structured data for templates.

    The text is expected to look like::

        ## <section header>
        1. **<source>**: "<title>"
           - **URL**: <link>
           - **Der Hook**: <text>
           - **Warum es funktioniert**: <text>

    Args:
        analysis_text: Markdown-like analysis text.

    Returns:
        A list of section dicts ``{'title', 'icon', 'stories'}``.  Each story
        is ``{'number', 'source', 'title', 'url', 'details'}``, where
        ``details`` is a list of ``{'label', 'value'}`` dicts.  Sections
        without any story are omitted.
    """
    print("\n🔍 Parsing analysis text...")

    # Bullet prefixes worth keeping, with and without bold markers
    target_prefixes = (
        '**URL**:', '**Der Hook**:', '**Warum es funktioniert**:',
        'URL:', 'Der Hook:', 'Warum es funktioniert:',
    )

    sections = []
    current_section = None
    current_story = None

    for raw_line in analysis_text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Handle section headers
        if line.startswith('## '):
            # The pending story belongs to the section that is ending; it
            # must be stored there before the section is switched, otherwise
            # it ends up in the next section.
            if current_story and current_section:
                current_section['stories'].append(current_story)
            current_story = None
            if current_section and current_section['stories']:
                sections.append(current_section)

            header = line.replace('## ', '').replace('**', '')

            # Determine section type and title
            if "SCHWEIZ/LOKALE STORIES" in header or "SWISS" in header:
                current_section = {
                    'title': 'Schweiz/Lokale Stories mit maximaler Wirkung',
                    'icon': '🇨🇭',
                    'stories': []
                }
            elif "INTERNATIONALE STORIES" in header or "INTERNATIONAL" in header:
                current_section = {
                    'title': 'Internationale Stories mit narrativer Kraft',
                    'icon': '🌍',
                    'stories': []
                }
            else:
                current_section = {
                    'title': header,
                    'icon': '',
                    'stories': []
                }
            continue

        # Handle story entries: "<number>. **<source>**: <title>"
        number_text, dot, _ = line.partition('.')
        if dot and number_text.isdigit() and current_section:
            parts = line.split('**')
            if len(parts) < 3:
                # Numbered line without a bold source: not a story header.
                # Ignore it so the current story is neither closed nor
                # stored twice.
                continue

            # Save previous story
            if current_story:
                current_section['stories'].append(current_story)

            title = parts[2].strip().lstrip(':').strip()

            # Remove quotes if present
            if title.startswith('"') and title.endswith('"'):
                title = title[1:-1]
            elif title.startswith('"'):
                title = title[1:]

            current_story = {
                'number': parts[0].strip(),
                'source': parts[1].strip(),
                'title': title,
                'url': '',  # Will be populated from details
                'details': []  # Keep details structure for the labels
            }
            continue

        # Handle detail bullet points - capture URL, Der Hook, or Warum es funktioniert
        if line.startswith('- ') and current_story:
            content = line.lstrip('- ').strip()
            if not content.startswith(target_prefixes):
                continue

            # Every target prefix contains ':', so the split always yields two parts
            label, value = content.replace('**', '').split(':', 1)
            label = label.strip()
            value = value.strip()

            # Handle URL field specially
            if label == 'URL':
                current_story['url'] = value
                print(f"🔗 Found URL for {current_story['source']}: {value[:60]}...")
            else:
                # Regular detail field (Der Hook or Warum es funktioniert)
                current_story['details'].append({'label': label, 'value': value})

    # Save final story and section
    if current_story and current_section:
        current_section['stories'].append(current_story)
    if current_section and current_section['stories']:
        sections.append(current_section)

    print(f"✅ Parsed {len(sections)} sections with URLs and details")

    return sections

def format_for_google_docs(analysis_text, news_df):
    """Render 'doc_template.html' with the parsed analysis and feed statistics.

    The template receives formatted date/time strings, overall stats, the
    parsed sections and a per-continent article count with German names.
    Returns the rendered HTML string.
    """
    doc_template = load_template('doc_template.html')
    parsed_sections = parse_analysis_text(analysis_text)

    # German display names; unknown continents fall back to the original label
    german_names = {
        'North America': 'Nordamerika',
        'Europe': 'Europa',
        'Asia': 'Asien',
        'Africa': 'Afrika',
        'South America': 'Südamerika',
        'Oceania': 'Ozeanien'
    }

    # value_counts() already carries the totals, most frequent continent first
    continent_rows = [
        {'name': german_names.get(label, label), 'count': int(total)}
        for label, total in news_df['continent'].value_counts().items()
    ]

    summary = {
        'total_articles': len(news_df),
        'total_sources': news_df['source'].nunique(),
        'continents': news_df['continent'].nunique()
    }

    # Render template
    return doc_template.render(
        date_formatted=datetime.now().strftime('%d. %B %Y'),
        datetime_full=datetime.now().strftime('%d.%m.%Y um %H:%M CET'),
        time_generated=datetime.now().strftime('%H:%M:%S CET'),
        stats=summary,
        sections=parsed_sections,
        continents=continent_rows
    )

def create_email_html(analysis_text, news_df):
    """Render 'email_template.html' with the parsed analysis and overall stats.

    Returns the rendered HTML string for the notification email.
    """
    email_template = load_template('email_template.html')
    parsed_sections = parse_analysis_text(analysis_text)

    summary = {
        'total_articles': len(news_df),
        'total_sources': news_df['source'].nunique(),
        'continents': news_df['continent'].nunique()
    }

    # Render template
    return email_template.render(
        date=datetime.now().strftime('%d.%m.%Y'),
        time=datetime.now().strftime('%H:%M CET'),
        stats=summary,
        sections=parsed_sections
    )

def send_to_zapier(analysis_text, news_df,
                   recipient_email="tom.vaillant@20minuten.ch", timeout=30):
    """Render the document and email bodies and POST them to the Zapier webhook.

    Args:
        analysis_text: Analysis text returned by the Claude call.
        news_df: DataFrame of collected articles (``source`` and
            ``continent`` columns are read for the stats).
        recipient_email: Address placed in the payload's ``recipient_email``.
        timeout: Seconds to wait for the webhook; requests has no default
            timeout, so without it a stalled connection would hang forever.

    Returns:
        True when the webhook answered with HTTP 200, otherwise False.
        Network errors are reported on stdout and not raised.
    """
    print("\n📤 Sending to Zapier...")

    # Generate content
    document_content = format_for_google_docs(analysis_text, news_df)
    email_content = create_email_html(analysis_text, news_df)

    # One timestamp for the whole payload so its date fields cannot disagree
    now = datetime.now()
    date_text = now.strftime("%d.%m.%Y")

    # Prepare the payload
    payload = {
        "date": date_text,
        "time": now.strftime("%H:%M CET"),
        "document_title": f"20min.ch News-Analyse - {date_text}",
        "document_content": document_content,
        "email_content_html": email_content,
        "stats": {
            "total_articles": len(news_df),
            "total_sources": news_df['source'].nunique(),
            "continents": news_df['continent'].nunique()
        },
        "recipient_email": recipient_email,
        "email_subject": f"Tägliche News-Analyse - {date_text}"
    }

    try:
        # Send to Zapier
        response = requests.post(ZAPIER_WEBHOOK_URL, json=payload, timeout=timeout)
    except requests.RequestException as e:
        # Covers connection errors, timeouts and invalid URLs
        print(f"❌ Exception occurred: {str(e)}")
        return False

    if response.status_code == 200:
        print("✅ Successfully sent to Zapier!")
        return True

    print(f"❌ Error sending to Zapier: {response.status_code}")
    print(f"Response: {response.text}")
    return False

# Main execution for your Jupyter notebook
# `analysis_result` is defined but None when the Claude call failed, so check
# its value rather than only its existence; otherwise None would be handed to
# the parser, which calls .split() on it.
if globals().get('analysis_result') and 'news_df' in globals():
    # Send the analysis to Zapier
    success = send_to_zapier(analysis_result, news_df)

    if success:
        print("🎉 All done! Check your Zapier dashboard.")
else:
    print("❌ No analysis results found. Please run the Claude analysis first.")

✅ Loaded .env file

📤 Sending to Zapier...

🔍 Parsing analysis text...
🔗 Found URL for Blick: https://www.blick.ch/schweiz/graubuenden/diese-gebiete-sind-...
🔗 Found URL for Swiss Info: https://www.srf.ch/news/international/humanitaere-hilfe-budg...
🔗 Found URL for Blick: https://www.blick.ch/schweiz/verkehr-am-pfingstwochenende-20...
🔗 Found URL for Blick: https://www.blick.ch/schweiz/duftbaeume-mit-rechtsextremen-m...
🔗 Found URL for Swiss Info: https://www.srf.ch/news/schweiz/neue-studie-von-pro-senectut...
🔗 Found URL for BBC News Americas: https://www.bbc.com/news/articles/czdyeld2m2yo...
🔗 Found URL for The Guardian: https://www.theguardian.com/lifeandstyle/2025/jun/07/beckxit...
🔗 Found URL for BBC News Europe: https://www.bbc.com/news/articles/cwy30nqwy99o...
🔗 Found URL for France24: https://www.france24.com/en/live-news/20250608-colombian-pre...
🔗 Found URL for The Japan Times: https://www.japantimes.co.jp/news/2025/06/08/asia-pacific/po...
✅ Parsed 2 sections with URLs and d