# In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
from typing import List, Dict
import re

# Load competitor sources from CSV file
def load_competitor_sources_from_csv(csv_path='competitor_sources.csv'):
    """Read the competitor source list and group its rows by continent.

    The CSV is expected to provide ``continent``, ``name`` and ``url``
    columns (these are the columns this function reads).

    Args:
        csv_path: Location of the CSV file.

    Returns:
        A dict mapping each continent value, in order of first appearance,
        to a list of ``{'name': ..., 'url': ...}`` dicts.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
        Exception: Any other read/parse error; it is reported and re-raised.
    """
    try:
        table = pd.read_csv(csv_path)
        print(f"✅ Loaded {len(table)} competitor sources from {csv_path}")

        # One entry per distinct continent, each holding that continent's feeds
        grouped = {}
        for region in table['continent'].unique():
            region_rows = table[table['continent'] == region]
            grouped[region] = [
                {'name': entry['name'], 'url': entry['url']}
                for _, entry in region_rows.iterrows()
            ]
        return grouped

    except FileNotFoundError:
        print(f"❌ CSV file '{csv_path}' not found!")
        raise
    except Exception as exc:
        print(f"❌ Error loading CSV: {str(exc)}")
        raise

# Function to extract time ago from span text
# Sort key used for labels whose age cannot be determined
_UNKNOWN_AGE_MINUTES = 999999

# Minutes per unit token; a month is counted as 30 days and a year as 365 days
_MINUTES_PER_UNIT = {
    'm': 1, 'min': 1, 'mins': 1, 'minute': 1, 'minutes': 1,
    'h': 60, 'hr': 60, 'hrs': 60, 'hour': 60, 'hours': 60,
    'd': 1440, 'day': 1440, 'days': 1440,
    'w': 10080, 'wk': 10080, 'wks': 10080, 'week': 10080, 'weeks': 10080,
    'mo': 43200, 'mos': 43200, 'mon': 43200, 'month': 43200, 'months': 43200,
    'y': 525600, 'yr': 525600, 'yrs': 525600, 'year': 525600, 'years': 525600,
}


def extract_time_ago(time_text):
    """Convert a relative-age label such as ``"5m"`` or ``"2 hours ago"`` to minutes.

    The label is scanned for a number directly followed (optional whitespace
    allowed) by a unit word. The unit is looked up as a whole token, so
    ``"2 months ago"`` is not mistaken for two minutes just because it
    contains the letter ``m``.

    Args:
        time_text: Label text; may be empty or ``None``.

    Returns:
        The age in minutes, or 999999 when no number with a recognised unit
        is found, so that unknown ages sort last.
    """
    if not time_text:
        return _UNKNOWN_AGE_MINUTES

    # Use the first number+unit pair whose unit is recognised
    for found in re.finditer(r'(\d+)\s*([a-z]+)', str(time_text).lower()):
        factor = _MINUTES_PER_UNIT.get(found.group(2))
        if factor is not None:
            return int(found.group(1)) * factor

    return _UNKNOWN_AGE_MINUTES

# Function to scrape Bing News results
def _parse_news_card(card, source_name):
    """Build one article record from a single search-result card.

    Args:
        card: Parsed HTML element for one result card.
        source_name: Name of the feed the card was found under.

    Returns:
        A dict with ``title``, ``link``, ``description``, ``time_ago``,
        ``minutes_ago``, ``original_source``, ``source`` and ``timestamp``,
        or ``None`` when the card has no title link.
    """
    title_elem = card.find('a', class_='title')
    if not title_elem:
        title_elem = card.find('a', {'class': re.compile('title|headline')})
    if not title_elem:
        return None

    snippet_elem = card.find('div', class_='snippet')
    if not snippet_elem:
        snippet_elem = card.find('div', {'class': re.compile('snippet|summary|description')})

    time_elem = card.find('span', {'aria-label': re.compile('ago|minutes|hours|days')})
    if not time_elem:
        # `string=` replaces BeautifulSoup's deprecated `text=` filter argument
        time_elem = card.find('span', string=re.compile(r'\d+[mhd]'))
    time_text = time_elem.get_text(strip=True) if time_elem else 'Unknown'

    # The publisher shown on the card may differ from the feed being searched
    source_elem = card.find('a', {'aria-label': re.compile('Search news from')})

    return {
        'title': title_elem.get_text(strip=True),
        'link': title_elem.get('href', ''),
        'description': snippet_elem.get_text(strip=True) if snippet_elem else '',
        'time_ago': time_text,
        'minutes_ago': extract_time_ago(time_text),
        'original_source': source_elem.get_text(strip=True) if source_elem else source_name,
        'source': source_name,
        'timestamp': datetime.now().isoformat(),
    }


def scrape_bing_news(url, source_name, max_articles=10):
    """Scrape news articles from a Bing News search results page.

    Network and parsing failures are reported on stdout and never raised;
    whatever was collected up to that point is returned.

    Args:
        url: Search results URL to fetch.
        source_name: Feed name stored on every article as ``source``.
        max_articles: Maximum number of result cards to inspect.

    Returns:
        A list of article dicts sorted by ``minutes_ago`` (most recent
        first). The list is empty when the request fails.
    """
    articles = []

    try:
        print(f"  Scraping {source_name}...", end="", flush=True)

        # Headers to mimic a real browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }

        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        news_cards = soup.find_all('div', class_='news-card')
        if not news_cards:
            # Fall back to alternative card class names
            news_cards = soup.find_all('div', {'class': re.compile('newsitem|news-card-body')})

        print(f"\n    📋 Found {len(news_cards)} news items")

        for i, card in enumerate(news_cards[:max_articles]):
            # A malformed card must not abort the rest of the page
            try:
                article_data = _parse_news_card(card, source_name)
            except Exception as e:
                print(f"\n    ⚠️  Error parsing article {i+1}: {str(e)}")
                continue
            if article_data is not None:
                articles.append(article_data)

        # Sort by recency
        articles.sort(key=lambda x: x['minutes_ago'])

        print(f"    ✓ ({len(articles)} articles extracted)")

        # Debug: Print first article
        if articles:
            first = articles[0]
            print(f"      First article: {first['title'][:60]}...")
            print(f"      Time: {first['time_ago']}, Link: {first['link'][:50]}...")

    except requests.exceptions.RequestException as e:
        print(f" ✗ (Network error: {type(e).__name__})")
    except Exception as e:
        print(f" ✗ (Error: {type(e).__name__}: {str(e)})")

    return articles

# Main execution: scrape every configured feed and summarise the outcome
print("Starting Bing News competitor scraper...\n")

# Feed definitions come from the CSV file, grouped by continent
NEWS_FEEDS = load_competitor_sources_from_csv()

all_results = []

for continent, feeds in NEWS_FEEDS.items():
    print(f"\n{continent}:")
    print("-" * 40)

    for feed in feeds:
        articles = scrape_bing_news(feed['url'], feed['name'])

        # Tag each article with where it came from before collecting it
        for article in articles:
            article['continent'] = continent
            article['feed_url'] = feed['url']
        all_results.extend(articles)

        # Pause between feeds to be respectful and avoid rate limiting
        time.sleep(2)

print("\n\nCreating DataFrame...")
news_df = pd.DataFrame(all_results)

if news_df.empty:
    print("❌ No articles collected!")
else:
    print(f"\n✅ Success! Collected {len(news_df)} articles from {news_df['source'].nunique()} sources")

    # Ten sources with the most articles
    print("\nArticles per source:")
    for source, count in news_df['source'].value_counts().head(10).items():
        print(f"  {source}: {count}")

    print("\n📰 Sample headlines:")
    print("=" * 80)
    for _, row in news_df.head(5).iterrows():
        print(f"{row['source']}: {row['title'][:80]}...")
        if row['description']:
            print(f"  📄 Description: {row['description'][:150]}...")
        print(f"  ⏰ Posted: {row['time_ago']}")
        print()

    described = news_df['description'] != ''
    blank = news_df['description'] == ''
    print("\n📊 Content statistics:")
    print(f"  Articles with descriptions: {described.sum()}")
    print(f"  Articles without descriptions: {blank.sum()}")
    print(f"  Average description length: {news_df['description'].str.len().mean():.0f} characters")

    # Ten most frequent age labels
    print("\n⏱️  Recency distribution:")
    recent_counts = news_df['time_ago'].value_counts().head(10)
    for time_val, count in recent_counts.items():
        print(f"  {time_val}: {count} articles")

# The DataFrame stays available to later cells as 'news_df'
print(f"\n\n✨ DataFrame stored in variable 'news_df' with {len(news_df)} articles")

# In [2]:
import requests
import pandas as pd
import json
from datetime import datetime

def fetch_20min_titles():
    """Fetch article titles from the 20min.ch content API.

    The response is read as JSON; titles are taken from every entry of
    ``content.elements`` that itself has a ``content.title`` value. Entries
    with a different shape are skipped.

    Returns:
        A DataFrame with ``title``, ``source`` and ``timestamp`` columns, or
        an empty DataFrame when the request or the JSON decoding fails (the
        error is printed, not raised).
    """
    url = "https://api.20min.ch/content/6/category/1"

    # Headers to simulate a request coming from 20min.ch
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'de-CH,de;q=0.9,en;q=0.8',
        'Origin': 'https://www.20min.ch',
        'Referer': 'https://www.20min.ch/',
    }

    try:
        print("Fetching data from 20min.ch API...")

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()

        titles = []

        # Walk the payload defensively: any level may be missing or mis-typed
        content = data.get('content') if isinstance(data, dict) else None
        elements = content.get('elements') if isinstance(content, dict) else None

        if isinstance(elements, (list, tuple)):
            print(f"Found {len(elements)} elements")

            for element in elements:
                inner = element.get('content') if isinstance(element, dict) else None
                # Explicit shape check instead of a bare `except:` that would
                # also hide unrelated errors
                if isinstance(inner, dict) and 'title' in inner:
                    titles.append({
                        'title': inner['title'],
                        'source': '20min.ch',
                        'timestamp': datetime.now().isoformat()
                    })

        print(f"✅ Extracted {len(titles)} article titles from 20min.ch")

        twentymin_df = pd.DataFrame(titles)

        if not twentymin_df.empty:
            print("\n📰 Sample 20min.ch headlines:")
            print("=" * 80)
            for _, row in twentymin_df.head(10).iterrows():
                print(f"- {row['title']}")

        return twentymin_df

    except Exception as e:
        # Best-effort boundary: callers always get a DataFrame back
        print(f"❌ Error: {e}")
        return pd.DataFrame()

# Fetch the 20min.ch headlines and keep them for the following cells
twentymin_df = fetch_20min_titles()

print(f"\n✨ DataFrame stored in variable 'twentymin_df' with {twentymin_df.shape[0]} titles")

# In [3]:
import anthropic
import json
import os
from datetime import datetime
from pathlib import Path

# Load environment variables from a .env file (python-dotenv is required here)
try:
    from dotenv import load_dotenv
except ImportError:
    print("❌ python-dotenv not installed! Run: pip install python-dotenv")
    raise

if Path('.env').exists():
    load_dotenv('.env')
    print("✅ Loaded .env file from current directory")
elif Path('../.env').exists():
    load_dotenv('../.env')
    print("✅ Loaded .env file from parent directory")
elif load_dotenv():
    # Without a path, load_dotenv() searches for a .env file itself and
    # reports through its return value whether anything was loaded
    print("✅ Loaded .env file from default location")
else:
    print("⚠️  No .env file found - relying on existing environment variables")

# Get API key from environment variable
API_KEY = os.environ.get('ANTHROPIC_API_KEY')

# Fail early when the key is missing; nothing below works without it
if not API_KEY:
    print("❌ ANTHROPIC_API_KEY not found in environment variables!")
    raise ValueError("ANTHROPIC_API_KEY environment variable not set!")

# Only a masked form of the key is ever printed
masked_key = f"{API_KEY[:10]}...{API_KEY[-4:]}" if len(API_KEY) > 20 else "KEY_TOO_SHORT"
print(f"✅ API Key loaded successfully: {masked_key}")

# Initialize the Claude client
client = anthropic.Anthropic(api_key=API_KEY)

# Load prompt from external file
def load_prompt_template(prompt_file='analysis_prompt.txt'):
    """Return the full text of the prompt template file.

    Args:
        prompt_file: Path of the UTF-8 encoded template file.

    Returns:
        The file content as a single string.

    Raises:
        FileNotFoundError: If ``prompt_file`` does not exist.
        Exception: Any other read error; it is reported and re-raised.
    """
    try:
        with open(prompt_file, mode='r', encoding='utf-8') as handle:
            template_text = handle.read()
    except FileNotFoundError:
        print(f"❌ Prompt file '{prompt_file}' not found!")
        raise
    except Exception as err:
        print(f"❌ Error loading prompt file: {str(err)}")
        raise
    else:
        print(f"✅ Loaded prompt template from {prompt_file}")
        return template_text

# Create combined data structure for analysis
def prepare_data_for_analysis(competitor_df, twentymin_df):
    """Bundle both article tables and a few summary figures into one dict.

    Args:
        competitor_df: DataFrame of competitor articles.
        twentymin_df: DataFrame of 20min articles.

    Returns:
        A dict with the keys ``competitor_articles`` and
        ``twentymin_articles`` (row dicts of each frame) and ``statistics``
        (row counts, number of distinct ``source`` values in
        ``competitor_df`` or 0 when that column is absent, and the current
        time as an ISO string).
    """
    if 'source' in competitor_df.columns:
        distinct_sources = competitor_df['source'].nunique()
    else:
        distinct_sources = 0

    summary = {
        "total_competitor_articles": len(competitor_df),
        "total_twentymin_articles": len(twentymin_df),
        "competitor_sources": distinct_sources,
        "timestamp": datetime.now().isoformat(),
    }

    return {
        "competitor_articles": competitor_df.to_dict('records'),
        "twentymin_articles": twentymin_df.to_dict('records'),
        "statistics": summary,
    }

# Function to make the API call
def analyze_competitor_gaps(
    competitor_df,
    twentymin_df,
    *,
    model="claude-3-5-sonnet-20241022",
    max_tokens=4000,
    temperature=0.7,
    prompt_file='analysis_prompt.txt',
):
    """Send both DataFrames to the Claude API for a gap analysis.

    Both frames are serialised to JSON and substituted for the ``df_json``
    placeholder of the prompt template via ``str.format``, so any other
    literal braces in the template file must be doubled.

    Args:
        competitor_df: DataFrame of competitor articles.
        twentymin_df: DataFrame of 20min articles.
        model: Model identifier passed to the API.
            NOTE(review): confirm the default model ID is still served.
        max_tokens: Upper bound on the length of the reply.
        temperature: Sampling temperature passed to the API.
        prompt_file: Prompt template file to load.

    Returns:
        The text of the first content block of the reply, or ``None`` when
        the API call fails or the reply has no content.

    Raises:
        FileNotFoundError: If the prompt template file is missing (loading
            the prompt happens before the guarded API call).
    """
    combined_data = prepare_data_for_analysis(competitor_df, twentymin_df)
    df_json = json.dumps(combined_data, ensure_ascii=False, indent=2)

    prompt_template = load_prompt_template(prompt_file)
    prompt = prompt_template.format(df_json=df_json)

    print("\nSending request to Claude API...")
    print(f"Analyzing {len(competitor_df)} competitor articles vs {len(twentymin_df)} 20min articles...")

    try:
        response = client.messages.create(
            model=model,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        )

        # Report an empty reply explicitly instead of via an IndexError
        if not response.content:
            print("\n❌ Error calling Claude API: empty response")
            return None

        analysis = response.content[0].text

        print("\n" + "="*80)
        print("COMPETITOR GAP ANALYSIS FOR 20MIN.CH")
        print("="*80 + "\n")
        print(analysis)

        return analysis

    except Exception as e:
        # Best-effort boundary: the caller checks for None
        print(f"\n❌ Error calling Claude API: {str(e)}")
        return None

# Run the analysis
print("🚀 Starting competitor gap analysis for 20min.ch...")
print(f"📊 Competitor articles: {len(news_df)}")
print(f"📊 20min articles: {len(twentymin_df)}")

# Call the function with both DataFrames
analysis_result = analyze_competitor_gaps(news_df, twentymin_df)

# Store results
if analysis_result:
    print("\n✅ Analysis complete! Check the output above for missing content opportunities.")

    # Build the file name once so the reported path is the path actually
    # written (two separate now() calls can straddle a second boundary)
    analysis_path = f'competitor_analysis_{datetime.now().strftime("%Y%m%d_%H%M%S")}.txt'
    with open(analysis_path, 'w', encoding='utf-8') as f:
        f.write(analysis_result)
    print(f"\n💾 Analysis saved to {analysis_path}")

# In [4]:
import requests
import json
import os
from datetime import datetime
from jinja2 import Template
import re

# Load environment variables; python-dotenv is optional in this cell
try:
    from dotenv import load_dotenv
except ImportError:
    print("📝 dotenv not available - using system environment variables")
else:
    # load_dotenv() reports through its return value whether anything was
    # loaded, so only claim success when it did
    if load_dotenv():
        print("✅ Loaded .env file")
    else:
        print("📝 No .env file found - using system environment variables")

# Zapier webhook URL that receives the email payload
ZAPIER_WEBHOOK_URL = os.environ.get('ZAPIER_WEBHOOK_URL')

# Fail early: nothing can be sent without the webhook
if not ZAPIER_WEBHOOK_URL:
    raise ValueError("ZAPIER_WEBHOOK_URL environment variable not set!")

def load_template(template_file='competitor_email_template.html'):
    """Load the HTML email template from a file.

    Autoescaping is enabled because the template is rendered with scraped
    headlines and model-generated text, neither of which may be trusted to
    be free of HTML markup.

    Args:
        template_file: Path of the UTF-8 encoded Jinja2 template.

    Returns:
        A ``jinja2.Template`` built from the file content.

    Raises:
        FileNotFoundError: If ``template_file`` does not exist.
        Exception: Any other read or template error; it is reported and
            re-raised.
    """
    try:
        with open(template_file, 'r', encoding='utf-8') as f:
            template_content = f.read()
        return Template(template_content, autoescape=True)
    except FileNotFoundError:
        print(f"❌ Template file '{template_file}' not found!")
        raise
    except Exception as e:
        print(f"❌ Error loading template file: {str(e)}")
        raise

def extract_title_and_url(text):
    """Split a ``title - https://...`` line into its title and URL parts.

    Args:
        text: One line of analysis text.

    Returns:
        ``{'text': title, 'url': url}`` when the line contains a hyphen
        followed by an http(s) URL; otherwise ``{'text': text, 'url': None}``.
        Surrounding whitespace and double quotes are stripped from the text
        in both cases.
    """
    found = re.search(r'(.*?)\s*-\s*(https?://[^\s]+)', text)

    if found is None:
        return {'text': text.strip().strip('"'), 'url': None}

    raw_title, raw_url = found.groups()
    return {'text': raw_title.strip().strip('"'), 'url': raw_url.strip()}

def parse_analysis_for_email(analysis_text):
    """Turn the free-text analysis into the structures the email template uses.

    The text is processed line by line. A small state machine tracks which
    section is open (category list, keywords, audience group, final
    recommendations) and files each recognised line accordingly; all other
    lines are ignored. The order of the checks below is significant.

    Args:
        analysis_text: The analysis as one newline-separated string.

    Returns:
        A dict with ``categories`` (first 5), ``keywords`` (first 10),
        ``recommendations`` (first 5) and ``audience_recommendations``.
    """
    category_words = (
        'POLITIK', 'WIRTSCHAFT', 'GESELLSCHAFT', 'LIFESTYLE', 'WISSENSCHAFT',
        'TECHNOLOGIE', 'DIGITAL', 'BUSINESS', 'SPORT', 'HEALTH',
    )
    reset_markers = (
        "**Zielgruppenspezifische Empfehlungen:**",
        "**Top-Themen nach Kategorien",
    )

    def looks_numbered(entry):
        # e.g. "1. Some headline"
        return entry[0].isdigit() and '. ' in entry

    found_categories = []
    found_keywords = []
    top_picks = []
    audience_groups = []

    section = None
    open_category = None
    open_audience = None

    for raw_line in analysis_text.split('\n'):
        entry = raw_line.strip()
        if not entry:
            continue

        if entry.endswith(':') and any(word in entry.upper() for word in category_words):
            # Heading of a main category
            open_category = {'name': entry.replace(':', '').strip(), 'stories': []}
            found_categories.append(open_category)
            section = "categories"

        elif section == "categories" and open_category and looks_numbered(entry):
            # Numbered story inside the open category
            open_category['stories'].append(extract_title_and_url(entry))

        elif entry in ("Keywords:", "Schlüsselwörter:") or "**Keywords:**" in entry:
            section = "keywords"

        elif section == "keywords" and entry.startswith('- '):
            found_keywords.append(entry[2:].strip())

        elif entry.startswith('Für ') and ('Gruppe' in entry or 'Nutzer' in entry) and entry.endswith(':'):
            # Heading of an audience-specific group
            section = "audience"
            open_audience = {'name': entry.replace(':', '').strip(), 'items': []}
            audience_groups.append(open_audience)

        elif section == "audience" and open_audience and entry.startswith('- '):
            open_audience['items'].append(extract_title_and_url(entry[2:]))

        elif "Finale Top-Empfehlungen:" in entry:
            section = "recommendations"
            open_category = None
            open_audience = None

        elif section == "recommendations" and looks_numbered(entry):
            top_picks.append(extract_title_and_url(entry))

        elif any(marker in entry for marker in reset_markers):
            # A new major section starts: close whatever was open
            section = None
            open_category = None
            open_audience = None

    return {
        'categories': found_categories[:5],
        'keywords': found_keywords[:10],
        'recommendations': top_picks[:5],
        'audience_recommendations': audience_groups
    }

def create_email_html(analysis_text, competitor_df, twentymin_df):
    """Render the competitor analysis email as HTML.

    Loads ``competitor_email_template.html``, parses the analysis text into
    structured sections and renders the template with those sections plus
    the current date/time and article counts.

    Args:
        analysis_text: The analysis as free text.
        competitor_df: DataFrame of competitor articles.
        twentymin_df: DataFrame of 20min articles.

    Returns:
        The rendered HTML string.

    Raises:
        Exception: Any failure while loading, parsing or rendering is
            reported and re-raised.
    """
    try:
        email_template = load_template('competitor_email_template.html')
        sections = parse_analysis_for_email(analysis_text)

        category_list = sections['categories']
        keyword_list = sections['keywords']
        recommendation_list = sections['recommendations']
        audience_list = sections['audience_recommendations']

        print(f"📊 Parsed {len(category_list)} categories")
        print(f"🔑 Parsed {len(keyword_list)} keywords")
        print(f"💡 Parsed {len(recommendation_list)} recommendations")
        print(f"🎯 Parsed {len(audience_list)} audience groups")

        rendered = email_template.render(
            date=datetime.now().strftime('%d.%m.%Y'),
            time=datetime.now().strftime('%H:%M CET'),
            stats={
                'competitor_articles': len(competitor_df),
                'twentymin_articles': len(twentymin_df),
                'competitor_sources': competitor_df['source'].nunique() if 'source' in competitor_df.columns else 0
            },
            categories=category_list,
            keywords=keyword_list,
            recommendations=recommendation_list,
            audience_recommendations=audience_list
        )

        print("✅ Generated email content using template")
        return rendered

    except Exception as err:
        print(f"❌ Error creating email HTML: {str(err)}")
        raise

def send_to_zapier(
    analysis_text,
    competitor_df,
    twentymin_df,
    *,
    recipient_email="tom.vaillant@20minuten.ch",
    timeout=30,
):
    """Send the analysis to the Zapier webhook for email delivery.

    Args:
        analysis_text: The analysis as free text.
        competitor_df: DataFrame of competitor articles.
        twentymin_df: DataFrame of 20min articles.
        recipient_email: Address placed in the payload as the recipient.
        timeout: Seconds to wait for the webhook before giving up; without
            a timeout the request could block indefinitely.

    Returns:
        ``True`` when the webhook answers with HTTP 200, ``False`` on any
        other status or when the request itself fails.

    Raises:
        Exception: Errors from building the email HTML are not caught here;
            that step runs before the guarded request.
    """
    print("📤 Sending data to Zapier...")

    payload = {
        "date": datetime.now().strftime("%d.%m.%Y"),
        "time": datetime.now().strftime("%H:%M CET"),
        "email_content_html": create_email_html(analysis_text, competitor_df, twentymin_df),
        "stats": {
            "competitor_articles": len(competitor_df),
            "twentymin_articles": len(twentymin_df),
            "competitor_sources": competitor_df['source'].nunique() if 'source' in competitor_df.columns else 0
        },
        "recipient_email": recipient_email,
        "email_subject": f"Competitor Gap Analysis - {datetime.now().strftime('%d.%m.%Y')}"
    }

    try:
        response = requests.post(ZAPIER_WEBHOOK_URL, json=payload, timeout=timeout)

        if response.status_code == 200:
            print("✅ Successfully sent to Zapier!")
            print(f"📧 Email will be sent to: {payload['recipient_email']}")
            print(f"📋 Subject: {payload['email_subject']}")
            return True
        else:
            print(f"❌ Error sending to Zapier: {response.status_code}")
            print(f"Response: {response.text}")
            return False

    except Exception as e:
        # Includes timeouts and connection errors; the caller gets False
        print(f"❌ Exception occurred: {str(e)}")
        return False

# Main execution
# analysis_result is None when the API call failed, so checking that the name
# exists is not enough: it must also hold actual analysis text.
if globals().get('analysis_result') and 'news_df' in globals() and 'twentymin_df' in globals():
    success = send_to_zapier(analysis_result, news_df, twentymin_df)

    if success:
        print("\n🎉 All done! Check your email for the competitor gap analysis.")
else:
    print("❌ Missing required data. Please ensure all previous cells have been run:")
    print("   - news_df (competitor articles)")
    print("   - twentymin_df (20min articles)")
    print("   - analysis_result (Claude's analysis)")