In [1]:
import feedparser
import pandas as pd
import requests
from datetime import datetime
import time
import xml.etree.ElementTree as ET
from typing import List, Dict

# News feeds organized by continent (excluding paid/API-required sources).
#
# Structure: {continent label: [feed descriptor, ...]} where each descriptor
# is a dict with two keys:
#   "name" - display label, later stored as the 'source' of every headline
#   "url"  - address of the RSS/RDF document that is downloaded and parsed
#
# Feeds are grouped by the region they report on, so one publisher can appear
# under several continents (the BBC regional feeds, for example).
NEWS_FEEDS = {
    "North America": [
        {"name": "BBC News Americas", "url": "https://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml"},
        {"name": "NPR News", "url": "https://feeds.npr.org/1001/rss.xml"},
        {"name": "The New York Times", "url": "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"},
        {"name": "CBC News Canada", "url": "https://rss.cbc.ca/lineup/topstories.xml"},
        {"name": "The Guardian US", "url": "https://www.theguardian.com/us-news/rss"},
        {"name": "Mexico News Daily", "url": "https://mexiconewsdaily.com/feed/"}
    ],
    "Europe": [
        {"name": "BBC News Europe", "url": "https://feeds.bbci.co.uk/news/world/europe/rss.xml"},
        {"name": "The Guardian UK", "url": "https://www.theguardian.com/uk/rss"},
        {"name": "Deutsche Welle English", "url": "https://rss.dw.com/rdf/rss-en-all"},
        {"name": "France24 English", "url": "https://www.france24.com/en/rss"},
        {"name": "Euronews", "url": "https://www.euronews.com/rss"},
        {"name": "POLITICO Europe", "url": "https://www.politico.eu/feed/"},
        # NOTE(review): this feed yielded 0 titles in the recorded run below;
        # the URL or its response format probably needs checking.
        {"name": "Swiss Info", "url": "https://www.swissinfo.ch/eng/latest-news/rss"}
    ],
    "Asia": [
        {"name": "Al Jazeera", "url": "https://www.aljazeera.com/xml/rss/all.xml"},
        {"name": "The Japan Times", "url": "https://www.japantimes.co.jp/feed/"},
        {"name": "South China Morning Post", "url": "https://www.scmp.com/rss/91/feed"},
        {"name": "The Hindu India", "url": "https://www.thehindu.com/news/national/feeder/default.rss"},
        {"name": "Times of India", "url": "https://timesofindia.indiatimes.com/rssfeedstopstories.cms"},
        {"name": "Arab News", "url": "https://www.arabnews.com/rss.xml"}
    ],
    "Africa": [
        {"name": "BBC News Africa", "url": "https://feeds.bbci.co.uk/news/world/africa/rss.xml"},
        {"name": "AllAfrica", "url": "https://allafrica.com/tools/headlines/rdf/latest/headlines.rdf"},
        {"name": "Mail & Guardian SA", "url": "https://mg.co.za/feed/"},
        {"name": "News24 South Africa", "url": "https://feeds.news24.com/articles/news24/TopStories/rss"},
        {"name": "Morocco World News", "url": "https://www.moroccoworldnews.com/feed/"}
    ],
    "South America": [
        {"name": "BBC News Latin America", "url": "https://feeds.bbci.co.uk/news/world/latin_america/rss.xml"},
        {"name": "Buenos Aires Times", "url": "https://www.batimes.com.ar/feed"},
        {"name": "MercoPress", "url": "https://en.mercopress.com/rss/v2/headlines"},
        {"name": "Colombia Reports", "url": "https://colombiareports.com/feed/"}
    ],
    "Oceania": [
        {"name": "ABC News Australia", "url": "https://www.abc.net.au/news/feed/2942460/rss.xml"},
        {"name": "Sydney Morning Herald", "url": "https://www.smh.com.au/rss/feed.xml"},
        {"name": "The Guardian Australia", "url": "https://www.theguardian.com/australia-news/rss"},
        {"name": "Stuff.co.nz", "url": "https://www.stuff.co.nz/rss"},
        {"name": "Radio New Zealand", "url": "https://www.rnz.co.nz/rss/national.xml"}
    ]
}

def fetch_feed_titles(feed_url, feed_name, max_titles=5):
    """Download one feed and return up to ``max_titles`` headline records.

    Progress is printed on a single line per feed: the feed name followed by
    either a check mark with the number of titles or a cross with the name of
    the exception class. Failures never propagate to the caller.

    Args:
        feed_url: Address of the feed document to download.
        feed_name: Label stored as ``source`` in every returned record.
        max_titles: How many leading feed entries to inspect.

    Returns:
        A list of dicts with the keys ``source``, ``title`` (whitespace
        stripped) and ``timestamp`` (ISO-8601 time of the fetch, not the
        article's publication time). Empty when the request fails, the server
        answers with an HTTP error status, or no entry carries a title.
    """
    titles = []

    try:
        print(f"  Fetching {feed_name}...", end="", flush=True)
        response = requests.get(feed_url, timeout=10, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        # Without this check a 403/404/5xx error page is handed to feedparser,
        # which finds no entries, and the feed is reported as a success with
        # 0 titles instead of as a failure.
        response.raise_for_status()

        feed = feedparser.parse(response.content)

        # One timestamp per fetch so every record of a feed carries the same value.
        fetched_at = datetime.now().isoformat()
        for entry in feed.entries[:max_titles]:
            if hasattr(entry, 'title'):
                titles.append({
                    'source': feed_name,
                    'title': entry.title.strip(),
                    'timestamp': fetched_at
                })

        print(f" ✓ ({len(titles)} titles)")

    except Exception as e:
        # Best effort by design: one broken feed must not abort the whole scan.
        print(f" ✗ (Error: {type(e).__name__})")

    return titles

# Scan driver: walk every configured feed, collect headline records into
# `all_results`, then turn them into the DataFrame `news_df` used by later cells.
print("Starting news feed scanner...\n")
all_results = []

for continent, feeds in NEWS_FEEDS.items():
    print(f"\n{continent}:")
    print("-" * 40)

    for feed in feeds:
        titles = fetch_feed_titles(feed['url'], feed['name'])

        # Tag each record with where it came from before collecting it.
        for title in titles:
            title.update(continent=continent, feed_url=feed['url'])
        all_results.extend(titles)

        # Pause between requests to stay polite towards the publishers.
        time.sleep(0.5)

print(f"\n\nCreating DataFrame...")
news_df = pd.DataFrame(all_results)

if news_df.empty:
    print("❌ No articles collected!")
else:
    # Summary of what was collected.
    print(f"\n✅ Success! Collected {len(news_df)} articles from {news_df['source'].nunique()} sources")
    print(f"\nArticles per continent:")
    for continent in news_df['continent'].unique():
        count = int((news_df['continent'] == continent).sum())
        print(f"  {continent}: {count}")

    # Persist the scan under a timestamped file name.
    filename = f"news_scan_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    news_df.to_csv(filename, index=False)
    print(f"\n💾 Saved to {filename}")

    # Preview of the first ten rows; titles are cut at 100 characters.
    print("\n📰 Sample headlines:")
    print("=" * 80)
    for _, row in news_df.head(10).iterrows():
        print(f"{row['source']}: {row['title'][:100]}...")

# `news_df` stays defined for the following notebook cells.
print(f"\n\n✨ DataFrame stored in variable 'news_df' with {len(news_df)} articles")

Starting news feed scanner...


North America:
----------------------------------------
  Fetching BBC News Americas... ✓ (5 titles)
  Fetching NPR News... ✓ (5 titles)
  Fetching The New York Times... ✓ (5 titles)
  Fetching CBC News Canada... ✓ (5 titles)
  Fetching The Guardian US... ✓ (5 titles)
  Fetching Mexico News Daily... ✓ (5 titles)

Europe:
----------------------------------------
  Fetching BBC News Europe... ✓ (5 titles)
  Fetching The Guardian UK... ✓ (5 titles)
  Fetching Deutsche Welle English... ✓ (5 titles)
  Fetching France24 English... ✓ (5 titles)
  Fetching Euronews... ✓ (5 titles)
  Fetching POLITICO Europe... ✓ (5 titles)
  Fetching Swiss Info... ✓ (0 titles)

Asia:
----------------------------------------
  Fetching Al Jazeera... ✓ (5 titles)
  Fetching The Japan Times... ✓ (5 titles)
  Fetching South China Morning Post... ✓ (5 titles)
  Fetching The Hindu India... ✓ (5 titles)
  Fetching Times of India... ✓ (5 titles)
  Fetching Arab News... ✓ (5 titles)

Afr

In [4]:
import anthropic
import json
from datetime import datetime

import os

# Initialize the Claude client. The API key is read from the environment so
# that it is never stored in (or shared together with) the notebook.
# NOTE(review): a literal key used to be embedded at this spot; treat that key
# as leaked and revoke it in the Anthropic console.
API_KEY = os.environ.get("ANTHROPIC_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "ANTHROPIC_API_KEY is not set. Export it in the environment that "
        "starts the notebook before running this cell."
    )
client = anthropic.Anthropic(api_key=API_KEY)

# Craft the prompt based on 20min.ch's audience profile
def create_prompt(df_json):
    """Assemble the editorial prompt that is sent to Claude.

    The prompt is a fixed briefing (audience profile, task, selection
    criteria) followed by ``df_json`` under the "NEWS DATA" heading and a
    fixed description of the required output format.
    """
    briefing = """You are a news editor for 20min.ch, Switzerland's leading free commuter tabloid with over 2 million readers. Your audience consists of:

**TARGET AUDIENCE PROFILE:**
- Young urban commuters (15-40 years old)
- German-speaking Swiss citizens
- Quick readers who consume news during their commute (average 20 minutes)
- Prefer bite-sized, engaging content with visual appeal
- Interested in local Swiss news, lifestyle, entertainment, and conversational stories
- Enjoy interactive content, social media integration, and stories that spark discussion
- Appreciate humor, human interest stories, and relatable content

**YOUR TASK:**
Analyze the following news articles from various international sources and identify the most suitable content for 20min.ch readers.

**SELECTION CRITERIA:**
1. **Newsworthy Stories**: Must be genuinely important, impact Swiss readers, or have global significance
2. **Talking Pieces**: Stories that will generate conversation, debate, or emotional response among young Swiss readers

Consider:
- Swiss relevance or connection
- Shareability on social media
- Visual story potential
- Emotional impact (surprising, funny, shocking, heartwarming)
- Relatability to young urban lifestyle
- Potential for reader engagement/comments

**NEWS DATA:**
"""

    output_spec = """

**REQUIRED OUTPUT:**
Please provide exactly 10 stories in the following format:

## TOP 5 NEWSWORTHY STORIES
1. **[Original Source]**: [Title]
   - **Why it matters to Swiss readers**: [Brief explanation]
   - **Key angle for 20min**: [How to present it]

2-5. [Continue same format]

## TOP 5 TALKING PIECES
1. **[Original Source]**: [Title]
   - **Conversation starter**: [What makes this discussable]
   - **20min angle**: [How to make it engaging]

2-5. [Continue same format]

Remember: 20min.ch readers want quick, impactful stories they can discuss with friends or share on social media. Focus on human stories, surprising facts, and content that connects to Swiss life."""

    # The data is interpolated exactly once, between the two fixed parts.
    return f"{briefing}{df_json}{output_spec}"

# Function to make the API call
def analyze_news_for_20min(news_df):
    """
    Send the news DataFrame to the Claude API and return its recommendations.

    The DataFrame is serialized to JSON, embedded in the prompt built by
    create_prompt() and sent as a single user message. The reply is printed
    and also written to a timestamped text file in the working directory.

    Args:
        news_df: DataFrame of collected headlines; must have a 'source' column.

    Returns:
        The analysis text, or None when there is nothing to analyze or the
        API call fails. A failure to write the text file does not discard
        the analysis.
    """
    # An empty scan produces a DataFrame without columns, so the
    # news_df['source'] lookup below would raise KeyError; stop early instead
    # of spending an API call on nothing.
    if news_df is None or news_df.empty:
        print("❌ No articles to analyze; skipping the Claude API call.")
        return None

    # Convert DataFrame to JSON for the prompt
    df_json = news_df.to_json(orient='records', indent=2)
    prompt = create_prompt(df_json)
    source_count = news_df['source'].nunique()

    print("Sending request to Claude API...")
    print(f"Analyzing {len(news_df)} articles from {source_count} sources...")

    # Only the API round trip is guarded here, so that an unrelated error
    # further down is not reported as an API failure.
    try:
        response = client.messages.create(
            model="claude-3-5-sonnet-20241022",  # Using Sonnet 3.5
            max_tokens=2000,
            temperature=0.7,
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        )
        analysis = response.content[0].text
    except Exception as e:
        print(f"\n❌ Error calling Claude API: {str(e)}")
        return None

    print("\n" + "="*80)
    print("CLAUDE'S RECOMMENDATIONS FOR 20MIN.CH")
    print("="*80 + "\n")
    print(analysis)

    # Save the analysis; one timestamp is used for both file name and header.
    generated_at = datetime.now()
    filename = f"20min_analysis_{generated_at.strftime('%Y%m%d_%H%M%S')}.txt"

    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(f"Analysis generated at: {generated_at}\n")
            f.write(f"Total articles analyzed: {len(news_df)}\n")
            f.write(f"Sources: {source_count}\n\n")
            f.write("="*80 + "\n")
            f.write("RECOMMENDATIONS\n")
            f.write("="*80 + "\n\n")
            f.write(analysis)
    except OSError as e:
        # The analysis itself succeeded; losing the local copy is not fatal.
        print(f"\n⚠️ Could not save analysis to {filename}: {e}")
    else:
        print(f"\n💾 Analysis saved to: {filename}")

    return analysis

# Analysis driver: feed the collected headlines (`news_df`, built by the scan
# cell) to Claude and keep the reply in `analysis_result` for later cells.
print("🚀 Starting 20min.ch content analysis...")
print(f"📊 DataFrame contains {len(news_df)} articles")

analysis_result = analyze_news_for_20min(news_df)

# A falsy result means no analysis was produced, so there is nothing to announce.
if analysis_result:
    print(
        "\n✅ Analysis complete! Check the output above for Claude's recommendations.",
        "\nTip: You can copy the recommended titles and search for them in your original DataFrame:",
        "Example: news_df[news_df['title'].str.contains('search_term', case=False)]",
        sep="\n",
    )

🚀 Starting 20min.ch content analysis...
📊 DataFrame contains 155 articles
Sending request to Claude API...
Analyzing 155 articles from 31 sources...

CLAUDE'S RECOMMENDATIONS FOR 20MIN.CH

Here's my selection and analysis for 20min.ch readers:

## TOP 5 NEWSWORTHY STORIES
1. **BBC News Europe**: "Eight injured on Ryanair flight hit by 'severe turbulence'"
   - **Why it matters to Swiss readers**: Aviation safety directly impacts Swiss travelers who frequently use low-cost carriers
   - **Key angle for 20min**: Focus on passenger experiences and safety tips for budget airline travel

2. **Deutsche Welle English**: "Germany updates: Merz in Washington for talks with Trump"
   - **Why it matters to Swiss readers**: Germany's relationship with Trump could affect Swiss-EU-US relations
   - **Key angle for 20min**: What Trump's potential return means for Switzerland's key trading partner

3. **POLITICO Europe**: "Lagarde insists she'll complete her term at ECB"
   - **Why it matters to Swiss

In [11]:
def send_to_zapier(analysis_text, news_df):
    """
    Send the formatted analysis to the Zapier webhook.

    Builds one JSON payload containing the Google Docs HTML, the email HTML,
    a few statistics and the email routing fields, then POSTs it to
    ZAPIER_WEBHOOK_URL.

    NOTE(review): this cell defines send_to_zapier a second time further
    down. The later definition is the one in effect at runtime and sends the
    document under the key 'document_content' instead of
    'document_content_html'. Confirm which key the Zap expects and delete the
    other definition.

    Args:
        analysis_text: Claude's analysis as returned by analyze_news_for_20min().
        news_df: DataFrame of collected headlines with 'source' and
            'continent' columns.

    Returns:
        True when Zapier answers with HTTP 200, otherwise False (non-200
        status, timeout or any other error while sending).
    """
    print("📤 Sending data to Zapier...")

    # Take the clock once so date, time and titles cannot disagree when the
    # call happens to straddle a minute or day boundary.
    # NOTE(review): the "CET" label is a fixed string on naive local time; it
    # is wrong during daylight saving time or on a host in another time zone.
    now = datetime.now()
    today = now.strftime('%d.%m.%Y')

    # Prepare the payload - now with HTML as primary format
    payload = {
        "date": today,
        "time": now.strftime("%H:%M CET"),
        "document_title": f"20min.ch News Analysis - {today}",
        "document_content_html": format_for_google_docs(analysis_text, news_df),
        "email_content_html": create_email_html(analysis_text, news_df),
        "stats": {
            "total_articles": len(news_df),
            "total_sources": news_df['source'].nunique(),
            "continents": news_df['continent'].nunique()
        },
        "recipient_email": "tom.vaillant@20minuten.ch",
        "email_subject": f"Daily News Analysis - {today}"
    }

    try:
        # requests waits forever by default; bound the wait so a stalled
        # connection cannot hang the notebook.
        response = requests.post(ZAPIER_WEBHOOK_URL, json=payload, timeout=30)

        if response.status_code == 200:
            print("✅ Successfully sent to Zapier!")
            print(f"📧 Email will be sent to: {payload['recipient_email']}")
            print(f"📄 Document title: {payload['document_title']}")
            print("\n🎯 Next steps in Zapier:")
            print("1. Google Doc will be created with HTML formatting")
            print("2. Email will be sent with the doc link")
            print("\n💡 In Zapier, use 'document_content_html' field for Google Docs")
            return True
        else:
            print(f"❌ Error sending to Zapier: {response.status_code}")
            print(f"Response: {response.text}")
            return False

    except Exception as e:
        print(f"❌ Exception occurred: {str(e)}")
        return False
        
import requests
import json
from datetime import datetime

import os

# Zapier "catch hook" address. Anyone who knows the URL can trigger the Zap,
# so it is read from the ZAPIER_WEBHOOK_URL environment variable instead of
# being stored in the notebook. When the variable is unset the value is an
# empty string and any attempt to post to it fails.
# NOTE(review): a literal hook URL used to be embedded at this spot; consider
# rotating that hook in Zapier.
ZAPIER_WEBHOOK_URL = os.environ.get("ZAPIER_WEBHOOK_URL", "")

def format_for_google_docs(analysis_text, news_df):
    """
    Format the analysis as clean, compact HTML for Google Docs.

    The document consists of a header with run statistics, the converted
    analysis, a methodology note and the number of articles per continent.

    Conversion rules for the analysis text (applied per stripped line):
      * '## ' lines become <h2> headings.
      * Lines starting with '1.' to '5.' are story entries; the expected
        shape is 'N. **Source**: Title'. Entries without '**' markers are
        kept as plain paragraphs.
      * '- ' lines become indented bullet paragraphs; '**bold**' spans turn
        into <strong>, and the text before the first ':' is emphasised as a
        label.
      * Blank lines and any other line (for example an introductory
        sentence) are omitted.

    All text taken from the analysis is HTML-escaped before it is embedded.

    Args:
        analysis_text: Claude's markdown-style analysis.
        news_df: DataFrame of collected headlines with 'source' and
            'continent' columns.

    Returns:
        The HTML document as one string.
    """
    import html  # standard library; local so this cell needs no extra import

    def esc(text):
        # The analysis quotes headlines from external feeds; neutralise any
        # markup in them. Quotes are left alone because the text only ever
        # lands in element content, never in an attribute.
        return html.escape(text, quote=False)

    def inline_bold(text):
        # Odd-numbered pieces of a '**' split are the spans inside the markers.
        return ''.join(
            f'<strong>{esc(piece)}</strong>' if index % 2 else esc(piece)
            for index, piece in enumerate(text.split('**'))
        )

    # One clock reading so every time stamp in the document agrees.
    # NOTE(review): "CET" is a fixed label on naive local time; it is wrong
    # during daylight saving time or on a host in another time zone.
    now = datetime.now()
    article_count = len(news_df)
    source_count = news_df['source'].nunique()
    continent_counts = news_df['continent'].value_counts()

    # Start with a simple, compact header
    chunks = [f"""<h1>20MIN.CH DAILY NEWS ANALYSIS</h1>
<h2>{now.strftime('%d %B %Y')}</h2>

<h3>ANALYSIS SUMMARY</h3>
<p>
<strong>Generated:</strong> {now.strftime('%d.%m.%Y at %H:%M CET')}<br>
<strong>Articles Analyzed:</strong> {article_count}<br>
<strong>News Sources:</strong> {source_count}<br>
<strong>Continents Covered:</strong> {len(continent_counts)}
</p>

<hr>
"""]

    story_prefixes = ('1.', '2.', '3.', '4.', '5.')

    for raw_line in analysis_text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Handle main headers
        if line.startswith('## '):
            header = line.replace('## ', '').replace('**', '')
            chunks.append(f'<h2>{esc(header)}</h2>\n')

        # Handle story entries (numbered items)
        elif line.startswith(story_prefixes):
            parts = line.split('**')
            if len(parts) >= 3:
                number = parts[0].strip()
                source = parts[1].strip()
                # Re-join everything after the source so a title that itself
                # contains '**' is not cut off at the marker.
                title = ''.join(parts[2:]).strip().lstrip(':').strip()
                chunks.append(
                    f'<p><strong>{esc(number)} {esc(source)}:</strong> {esc(title)}</p>\n'
                )
            else:
                # No bold source marker: keep the story rather than drop it.
                chunks.append(f'<p>{esc(line)}</p>\n')

        # Handle bullet points
        elif line.startswith('- '):
            content = line[2:].strip()
            if ':' in content:
                # Split before converting the bold markers so that each side
                # yields balanced tags even if the colon sits inside a span.
                label, value = content.split(':', 1)
                chunks.append(
                    '<p style="margin-left: 20px;">• '
                    f'<strong>{inline_bold(label)}:</strong>{inline_bold(value)}</p>\n'
                )
            else:
                chunks.append(
                    f'<p style="margin-left: 20px;">• {inline_bold(content)}</p>\n'
                )

    # Add compact methodology section
    chunks.append(f"""
<hr>
<h3>METHODOLOGY</h3>
<p>This analysis was generated by:</p>
<p style="margin-left: 20px;">
• Scanning RSS feeds from major news outlets across all continents<br>
• Collecting the latest {article_count} articles from {source_count} sources<br>
• Using Claude AI to identify stories most relevant to 20min.ch's audience<br>
• Applying selection criteria: Swiss relevance, shareability, visual potential, emotional engagement, conversation potential
</p>

<h3>SOURCE DISTRIBUTION</h3>
<p>""")

    # Add source statistics in a compact format (most frequent continent first)
    chunks.append(' | '.join(
        f'<strong>{esc(str(continent))}:</strong> {count} articles'
        for continent, count in continent_counts.items()
    ))

    chunks.append(f"""
</p>

<hr>
<p><em>Generated automatically via GitHub Actions at {now.strftime('%H:%M:%S CET')}</em></p>
""")

    return ''.join(chunks)

def create_email_html(analysis_text, news_df):
    """
    Build the HTML body of the notification email.

    The mail greets the recipient, shows the run statistics taken from
    news_df and embeds the short story preview produced by
    extract_top_stories(); the full analysis itself is not included.
    """
    # Gather every dynamic value up front, in the order the template uses them.
    highlights = extract_top_stories(analysis_text)
    report_date = datetime.now().strftime('%d.%m.%Y')
    article_total = len(news_df)
    source_total = news_df['source'].nunique()
    continent_total = news_df['continent'].nunique()
    generated_at = datetime.now().strftime('%H:%M CET')

    return f"""
<div style="font-family: Arial, sans-serif; max-width: 600px; margin: 0 auto;">
    <h2 style="color: #d32f2f;">20min.ch Daily News Analysis</h2>
    
    <p>Hallo Tom,</p>
    
    <p>Your automated news analysis for <strong>{report_date}</strong> is ready.</p>
    
    <div style="background-color: #f5f5f5; padding: 15px; border-radius: 5px; margin: 20px 0;">
        <h3 style="margin-top: 0;">Quick Stats:</h3>
        <ul style="list-style-type: none; padding-left: 0;">
            <li>📊 Articles analyzed: {article_total}</li>
            <li>📰 News sources: {source_total}</li>
            <li>🌍 Continents covered: {continent_total}</li>
            <li>⏰ Generated at: {generated_at}</li>
        </ul>
    </div>
    
    <div style="margin: 20px 0;">
        <h3>Today's Highlights:</h3>
        {highlights}
    </div>
    
    <p style="margin-top: 30px;">
        <em>💡 The full analysis includes detailed recommendations for each story, 
        including Swiss relevance and suggested presentation angles.</em>
    </p>
    
    <hr style="border: none; border-top: 1px solid #ddd; margin: 30px 0;">
    
    <p style="color: #666; font-size: 12px;">
        This analysis was automatically generated using AI to identify stories 
        most relevant to 20min.ch's audience.
    </p>
</div>
"""

def extract_top_stories(analysis_text):
    """
    Extract the first 2 stories from each category for the email preview.

    A category starts at a line containing 'TOP 5 NEWSWORTHY' or
    'TOP 5 TALKING'. Within a category, lines that start with '1.' or '2.',
    contain a colon and follow the 'N. **Source**: Title' shape are turned
    into list items. Source and title are HTML-escaped.

    Args:
        analysis_text: Claude's markdown-style analysis.

    Returns:
        An HTML fragment with one <h4> heading and one <ul> per category
        found, or an empty string when the text contains no category heading.
    """
    import html  # standard library; local so this cell needs no extra import

    # Checked in this order, so the first marker wins if a line has both.
    section_headings = (
        ("TOP 5 NEWSWORTHY", "<h4>🔥 Top Newsworthy:</h4><ul>"),
        ("TOP 5 TALKING", "<h4>💬 Top Talking Pieces:</h4><ul>"),
    )

    fragments = []
    list_open = False  # True while a <ul> still needs its closing tag
    story_count = 0

    for line in analysis_text.split('\n'):
        heading = next(
            (markup for marker, markup in section_headings if marker in line),
            None,
        )
        if heading is not None:
            # Close the previous list only if one was actually opened, so no
            # stray </ul> is produced when the first heading is encountered.
            if list_open:
                fragments.append("</ul>")
            fragments.append(heading)
            list_open = True
            story_count = 0
            continue

        if not list_open or story_count >= 2:
            continue
        if not line.strip().startswith(('1.', '2.')) or ':' not in line:
            continue

        parts = line.split('**')
        if len(parts) < 3:
            continue

        source = parts[1].strip()
        # Drop only the separator colon that follows the source; colons inside
        # the title ("Germany updates: ...") must survive.
        title = ''.join(parts[2:]).strip().lstrip(':').strip()
        fragments.append(
            f"<li><strong>{html.escape(source, quote=False)}</strong>: "
            f"{html.escape(title, quote=False)}</li>"
        )
        story_count += 1

    if list_open:
        fragments.append("</ul>")

    return "".join(fragments)

def send_to_zapier(analysis_text, news_df):
    """
    Send the formatted analysis to the Zapier webhook.

    Builds one JSON payload containing the Google Docs HTML, the email HTML,
    a few statistics and the email routing fields, then POSTs it to
    ZAPIER_WEBHOOK_URL.

    NOTE(review): an earlier definition of send_to_zapier in this same cell
    sends the document under the key 'document_content_html'; this later
    definition replaces it at runtime and uses 'document_content'. Confirm
    which key the Zap expects and delete the other definition.

    Args:
        analysis_text: Claude's analysis as returned by analyze_news_for_20min().
        news_df: DataFrame of collected headlines with 'source' and
            'continent' columns.

    Returns:
        True when Zapier answers with HTTP 200, otherwise False (non-200
        status, timeout or any other error while sending).
    """
    print("📤 Sending data to Zapier...")

    # Take the clock once so date, time and titles cannot disagree when the
    # call happens to straddle a minute or day boundary.
    # NOTE(review): the "CET" label is a fixed string on naive local time; it
    # is wrong during daylight saving time or on a host in another time zone.
    now = datetime.now()
    today = now.strftime('%d.%m.%Y')

    # Prepare the payload
    payload = {
        "date": today,
        "time": now.strftime("%H:%M CET"),
        "document_title": f"20min.ch News Analysis - {today}",
        "document_content": format_for_google_docs(analysis_text, news_df),
        "email_content_html": create_email_html(analysis_text, news_df),
        "stats": {
            "total_articles": len(news_df),
            "total_sources": news_df['source'].nunique(),
            "continents": news_df['continent'].nunique()
        },
        "recipient_email": "tom.vaillant@20minuten.ch",
        "email_subject": f"Daily News Analysis - {today}"
    }

    try:
        # requests waits forever by default; bound the wait so a stalled
        # connection cannot hang the notebook.
        response = requests.post(ZAPIER_WEBHOOK_URL, json=payload, timeout=30)

        if response.status_code == 200:
            print("✅ Successfully sent to Zapier!")
            print(f"📧 Email will be sent to: {payload['recipient_email']}")
            print(f"📄 Document title: {payload['document_title']}")
            print("\n🎯 Next steps in Zapier:")
            print("1. Google Doc will be created automatically")
            print("2. Email will be sent with the doc link")
            return True
        else:
            print(f"❌ Error sending to Zapier: {response.status_code}")
            print(f"Response: {response.text}")
            return False

    except Exception as e:
        print(f"❌ Exception occurred: {str(e)}")
        return False

# Test the webhook with sample data (optional)
def test_zapier_webhook():
    """
    Test the Zapier webhook with minimal data.

    Posts a small marker payload to ZAPIER_WEBHOOK_URL and prints whether
    Zapier accepted it (HTTP 200). Connection problems and timeouts are
    reported as a failed test instead of raising. Returns None.
    """
    test_payload = {
        "test": True,
        "message": "Testing webhook connection",
        "timestamp": datetime.now().isoformat()
    }

    print("🧪 Testing Zapier webhook...")
    try:
        # Bounded wait: requests would otherwise block indefinitely on a
        # stalled connection.
        response = requests.post(ZAPIER_WEBHOOK_URL, json=test_payload, timeout=30)
    except requests.RequestException as e:
        print(f"❌ Webhook test failed: {e}")
        return

    if response.status_code == 200:
        print("✅ Webhook test successful!")
        print("Check your Zapier dashboard to see the test data")
    else:
        print(f"❌ Webhook test failed: {response.status_code}")

# Main execution for your Jupyter notebook.
# The analysis must exist AND be non-empty: analyze_news_for_20min() returns
# None when the API call fails, and forwarding None would crash while the
# payload is being formatted.
if globals().get('analysis_result') and 'news_df' in globals():
    # Send the analysis to Zapier
    success = send_to_zapier(analysis_result, news_df)

    if success:
        print("\n🎉 All done! Check your Zapier dashboard to monitor the workflow.")
else:
    print("❌ No analysis results found. Please run the Claude analysis first.")
    print("\n💡 To test your webhook connection, run:")
    print("test_zapier_webhook()")

📤 Sending data to Zapier...
✅ Successfully sent to Zapier!
📧 Email will be sent to: tom.vaillant@20minuten.ch
📄 Document title: 20min.ch News Analysis - 05.06.2025

🎯 Next steps in Zapier:
1. Google Doc will be created automatically
2. Email will be sent with the doc link

🎉 All done! Check your Zapier dashboard to monitor the workflow.
