In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
from typing import List, Dict
import re

# Load competitor sources from CSV file
def load_competitor_sources_from_csv(csv_path='competitor_sources.csv'):
    """Read the competitor feed list from a CSV file.

    Args:
        csv_path: Path of the CSV file; it must provide 'name' and 'url' columns.

    Returns:
        A list with one ``{'name': ..., 'url': ...}`` dict per CSV row.

    Raises:
        Exception: Any error raised while reading or converting the file is
            reported on stdout and then re-raised unchanged.
    """
    try:
        sources_df = pd.read_csv(csv_path)
        print(f"✅ Loaded {len(sources_df)} competitor sources")

        # Keep only the two fields the scraper needs, one dict per row
        return sources_df[['name', 'url']].to_dict('records')

    except Exception as e:
        print(f"❌ Error loading CSV: {str(e)}")
        raise

# Function to extract time ago from span text

# A number followed (optionally after whitespace) by a unit word, e.g. "3h", "45 minutes"
_TIME_AGO_TOKEN = re.compile(r'(\d+)\s*([a-z]+)')

# Minutes represented by one of each recognised unit spelling
_MINUTES_PER_UNIT = {
    'm': 1, 'min': 1, 'mins': 1, 'minute': 1, 'minutes': 1,
    'h': 60, 'hr': 60, 'hrs': 60, 'hour': 60, 'hours': 60,
    'd': 1440, 'day': 1440, 'days': 1440,
    'w': 10080, 'wk': 10080, 'wks': 10080, 'week': 10080, 'weeks': 10080,
    # Months and years are approximated as 30 and 365 days
    'mo': 43200, 'mos': 43200, 'month': 43200, 'months': 43200,
    'y': 525600, 'yr': 525600, 'yrs': 525600, 'year': 525600, 'years': 525600,
}

# Sort key used when the age cannot be determined (sorts after real ages)
_UNKNOWN_AGE_MINUTES = 999999


def extract_time_ago(time_text):
    """Convert a relative age such as "3h" or "45 minutes ago" to minutes.

    The unit is read from the word directly following the number rather than
    from letters appearing anywhere in the text, so "2 months ago" is not
    mistaken for two minutes and a prefix like "Morning Herald · 3h" does
    not turn hours into minutes.

    Args:
        time_text: Text taken from the time element; may be empty or None.

    Returns:
        The age in minutes as an int, or 999999 when no number with a
        recognised unit is present.
    """
    if not time_text:
        return _UNKNOWN_AGE_MINUTES

    # Use the first number whose following word is a known unit
    for token in _TIME_AGO_TOKEN.finditer(time_text.lower()):
        minutes_per_unit = _MINUTES_PER_UNIT.get(token.group(2))
        if minutes_per_unit is not None:
            return int(token.group(1)) * minutes_per_unit

    return _UNKNOWN_AGE_MINUTES

# Function to scrape Bing News results
def scrape_bing_news(url, source_name, max_articles=10):
    """Scrape news articles from a Bing News search results page.

    The function is best-effort: request, parsing and per-card failures are
    reported on stdout and never raised, so one broken feed does not stop a
    run over many feeds.

    Args:
        url: Address of the Bing News results page to fetch.
        source_name: Label stored in each article's 'source' field; also used
            as 'original_source' when the card names no publisher.
        max_articles: Maximum number of result cards to inspect.

    Returns:
        A list of dicts with the keys 'title', 'link', 'description',
        'time_ago', 'minutes_ago', 'original_source', 'source' and
        'timestamp', sorted by 'minutes_ago' ascending. The list is empty
        when the page could not be fetched or contained no usable cards.
    """
    articles = []

    try:
        # Headers to mimic a real browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }

        # Make request
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        # Parse HTML
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all news cards, falling back to alternative class names
        news_cards = soup.find_all('div', class_='news-card')

        if not news_cards:
            news_cards = soup.find_all('div', {'class': re.compile('newsitem|news-card-body')})

        for card in news_cards[:max_articles]:
            try:
                article_data = {}

                # Extract title; a card without a title link is skipped
                title_elem = card.find('a', class_='title')
                if not title_elem:
                    title_elem = card.find('a', {'class': re.compile('title|headline')})

                if not title_elem:
                    continue

                article_data['title'] = title_elem.get_text(strip=True)
                article_data['link'] = title_elem.get('href', '')

                # Extract snippet/description
                snippet_elem = card.find('div', class_='snippet')
                if not snippet_elem:
                    snippet_elem = card.find('div', {'class': re.compile('snippet|summary|description')})

                article_data['description'] = snippet_elem.get_text(strip=True) if snippet_elem else ''

                # Extract time
                time_elem = card.find('span', {'aria-label': re.compile('ago|minutes|hours|days')})
                if not time_elem:
                    # 'string' is the current name of the deprecated 'text' argument
                    time_elem = card.find('span', string=re.compile(r'\d+[mhd]'))

                time_text = time_elem.get_text(strip=True) if time_elem else 'Unknown'
                article_data['time_ago'] = time_text
                article_data['minutes_ago'] = extract_time_ago(time_text)

                # Extract source
                source_elem = card.find('a', {'aria-label': re.compile('Search news from')})
                if source_elem:
                    article_data['original_source'] = source_elem.get_text(strip=True)
                else:
                    article_data['original_source'] = source_name

                article_data['source'] = source_name
                article_data['timestamp'] = datetime.now().isoformat()

                articles.append(article_data)

            except Exception as e:
                # One malformed card must not discard the rest of the page
                print(f"❌ Error parsing article from {source_name}: {str(e)}")
                continue

        # Sort by recency
        articles.sort(key=lambda x: x['minutes_ago'])

    except Exception as e:
        # Best-effort: report the failure and return whatever was collected
        print(f"❌ Error scraping {source_name}: {str(e)}")

    return articles

# Main execution
print("🚀 Starting competitor scraper...")

# Feed definitions (name + url) come from the CSV file
NEWS_FEEDS = load_competitor_sources_from_csv()

all_results = []

# Scrape the feeds one after another
for feed in NEWS_FEEDS:
    feed_url = feed['url']
    articles = scrape_bing_news(feed_url, feed['name'])

    # Remember which feed every article came from, then collect the batch
    for article in articles:
        article['feed_url'] = feed_url
    all_results.extend(articles)

    # Pause between requests to be respectful to the server
    time.sleep(2)

# One row per collected article
news_df = pd.DataFrame(all_results)

if news_df.empty:
    print("❌ No articles collected!")
else:
    print(f"✅ Collected {len(news_df)} articles from {news_df['source'].nunique()} sources")

print(f"✨ DataFrame 'news_df' ready with {len(news_df)} articles")

🚀 Starting competitor scraper...
✅ Loaded 13 competitor sources


✅ Collected 130 articles from 13 sources
✨ DataFrame 'news_df' ready with 130 articles


In [2]:
import requests
import pandas as pd
import json
from datetime import datetime

def fetch_20min_titles():
    """Fetch article titles from the 20min.ch content API.

    Returns:
        A DataFrame with the columns 'title', 'source' and 'timestamp', one
        row per element that carries a title. An empty DataFrame is returned
        when the request or JSON decoding fails (the error is printed) or
        when the response holds no titled elements.
    """

    url = "https://api.20min.ch/content/6/category/1"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'de-CH,de;q=0.9,en;q=0.8',
        'Origin': 'https://www.20min.ch',
        'Referer': 'https://www.20min.ch/',
    }

    try:
        print("📰 Fetching 20min.ch articles...")

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        data = response.json()
        titles = []

        # The titles are read from data['content']['elements'][i]['content']['title'];
        # every level is type-checked so an unexpected shape is skipped instead of
        # being hidden behind a bare except.
        content = data.get('content') if isinstance(data, dict) else None
        elements = content.get('elements') if isinstance(content, dict) else None

        for element in elements or []:
            element_content = element.get('content') if isinstance(element, dict) else None
            if isinstance(element_content, dict) and 'title' in element_content:
                titles.append({
                    'title': element_content['title'],
                    'source': '20min.ch',
                    'timestamp': datetime.now().isoformat()
                })

        twentymin_df = pd.DataFrame(titles)
        print(f"✅ Extracted {len(titles)} article titles")

        return twentymin_df

    except Exception as e:
        # Best-effort: downstream cells receive an empty frame instead of a crash
        print(f"❌ Error: {e}")
        return pd.DataFrame()

# Execute the function
# The result is bound at cell level so that later cells can read 'twentymin_df'.
twentymin_df = fetch_20min_titles()
print(f"✨ DataFrame 'twentymin_df' ready with {len(twentymin_df)} titles")

📰 Fetching 20min.ch articles...


✅ Extracted 149 article titles
✨ DataFrame 'twentymin_df' ready with 149 titles


In [3]:
import anthropic
import json
import os
from datetime import datetime
from pathlib import Path

# Load environment variables
# python-dotenv is required in this cell: when the import fails a hint is
# printed and the ImportError is re-raised, so execution stops here.
try:
    from dotenv import load_dotenv
    load_dotenv()
    print("✅ Environment loaded")
except ImportError:
    print("❌ python-dotenv not installed")
    raise

# Get API key
API_KEY = os.environ.get('ANTHROPIC_API_KEY')

# Fail fast with a clear message instead of creating a client without a key
if not API_KEY:
    raise ValueError("ANTHROPIC_API_KEY environment variable not set!")

# Initialize Claude client
client = anthropic.Anthropic(api_key=API_KEY)

def load_prompt_template(prompt_file='analysis_prompt.txt'):
    """Return the full text of the prompt template file.

    Args:
        prompt_file: Path of the UTF-8 encoded template file.

    Returns:
        The file content as a string.

    Raises:
        Exception: Any error raised while reading the file is reported on
            stdout and then re-raised unchanged.
    """
    try:
        return Path(prompt_file).read_text(encoding='utf-8')
    except Exception as e:
        print(f"❌ Error loading prompt file: {str(e)}")
        raise

def prepare_data_for_analysis(competitor_df, twentymin_df):
    """Bundle both article tables and summary counts into one dict.

    Args:
        competitor_df: DataFrame of competitor articles.
        twentymin_df: DataFrame of 20min articles.

    Returns:
        A dict with 'competitor_articles' and 'twentymin_articles' (each a
        list of row dicts) and 'statistics' (row counts, number of distinct
        competitor sources, and the current time as an ISO string).
    """
    # Without a 'source' column the number of sources is reported as zero
    if 'source' in competitor_df.columns:
        source_count = competitor_df['source'].nunique()
    else:
        source_count = 0

    statistics = {
        "total_competitor_articles": len(competitor_df),
        "total_twentymin_articles": len(twentymin_df),
        "competitor_sources": source_count,
        "timestamp": datetime.now().isoformat(),
    }

    return {
        "competitor_articles": competitor_df.to_dict(orient='records'),
        "twentymin_articles": twentymin_df.to_dict(orient='records'),
        "statistics": statistics,
    }

def analyze_competitor_gaps(competitor_df, twentymin_df):
    """Ask the Claude API for a gap analysis of both article tables.

    The combined data is serialised to JSON and inserted into the prompt
    template through its ``{df_json}`` placeholder.

    Args:
        competitor_df: DataFrame of competitor articles.
        twentymin_df: DataFrame of 20min articles.

    Returns:
        The text of the first content block of the response, or None when
        the API call or reading the response fails (the error is printed).
    """
    payload = prepare_data_for_analysis(competitor_df, twentymin_df)
    df_json = json.dumps(payload, ensure_ascii=False, indent=2)

    prompt = load_prompt_template().format(df_json=df_json)

    print(f"🤖 Analyzing {len(competitor_df)} vs {len(twentymin_df)} articles...")

    # A single user turn carrying the complete prompt
    conversation = [{"role": "user", "content": prompt}]

    try:
        reply = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=4000,
            temperature=0.7,
            messages=conversation,
        )

        analysis_text = reply.content[0].text
        print("✅ Analysis complete")
        return analysis_text

    except Exception as e:
        print(f"❌ Error calling Claude API: {str(e)}")
        return None

# Run the analysis
# 'news_df' and 'twentymin_df' are not defined in this cell; they must already
# exist as globals when it runs.
print("🚀 Starting competitor analysis...")
analysis_result = analyze_competitor_gaps(news_df, twentymin_df)

# analyze_competitor_gaps returns None on failure, so the confirmation is
# printed only when an analysis text was actually produced
if analysis_result:
    print("✨ Analysis stored in 'analysis_result' variable")

✅ Environment loaded
🚀 Starting competitor analysis...
🤖 Analyzing 130 vs 149 articles...


✅ Analysis complete
✨ Analysis stored in 'analysis_result' variable


In [4]:
import requests
import json
import os
from datetime import datetime
from jinja2 import Template
import re

# Load environment variables
# python-dotenv is optional in this cell: when it is missing, a note is printed
# and the variables already present in the process environment are used.
try:
    from dotenv import load_dotenv
    load_dotenv()
    print("✅ Environment loaded")
except ImportError:
    print("📝 Using system environment variables")

# Webhook address that send_to_zapier posts the payload to
ZAPIER_WEBHOOK_URL = os.environ.get('ZAPIER_WEBHOOK_URL')

# Fail fast with a clear message instead of posting to a missing URL later
if not ZAPIER_WEBHOOK_URL:
    raise ValueError("ZAPIER_WEBHOOK_URL environment variable not set!")

def load_template(template_file='competitor_email_template.html'):
    """Read an HTML file and wrap its content in a jinja2 Template.

    Args:
        template_file: Path of the UTF-8 encoded template file.

    Returns:
        A ``Template`` built from the file content.

    Raises:
        Exception: Any error raised while reading the file or building the
            template is reported on stdout and then re-raised unchanged.
    """
    try:
        with open(template_file, 'r', encoding='utf-8') as template_handle:
            html_source = template_handle.read()
    except Exception as e:
        print(f"❌ Error loading template: {str(e)}")
        raise

    try:
        return Template(html_source)
    except Exception as e:
        print(f"❌ Error loading template: {str(e)}")
        raise

# Leading list number such as "1. " or "12. "
_LIST_NUMBER_PREFIX = re.compile(r'\d+\.\s+')

# "<title> - <url>"; the separator may be a hyphen, an en dash or an em dash
_TITLE_URL_PATTERN = re.compile(r'(.*?)\s*[-–—]\s*(https?://[^\s]+)')


def extract_title_and_url(text):
    """Split a line of the analysis into its title and an optional URL.

    Accepted shapes are ``Title - https://...`` (hyphen, en dash or em dash
    as separator) and plain text without a URL. A leading bullet is dropped.
    A leading list number ("1. ") is kept as part of the returned text, but
    it is set aside while the double quotes around the title are removed, so
    ``1. "Title" - url`` yields ``1. Title`` instead of an unbalanced quote.

    Args:
        text: One line (or bullet item) of the analysis text.

    Returns:
        A dict ``{'text': <title>, 'url': <url or None>}``.
    """
    text = text.strip().lstrip('- ')

    # Set the list number aside so quote stripping sees the quoted title itself
    number_prefix = ''
    numbered = _LIST_NUMBER_PREFIX.match(text)
    if numbered:
        number_prefix = numbered.group(0)
        text = text[numbered.end():]

    text = text.strip('"')

    match = _TITLE_URL_PATTERN.search(text)
    if match:
        title = match.group(1).strip().strip('"')
        # Sentence punctuation or a closing quote glued to the URL is not part of it
        url = match.group(2).strip().rstrip('.,;"\'')
        return {'text': (number_prefix + title).strip(), 'url': url}

    clean_text = text.strip().strip('"')
    return {'text': (number_prefix + clean_text).strip(), 'url': None}

def parse_analysis_for_email(analysis_text):
    """Parse the Claude analysis text into structured data for the email template.

    The text is processed line by line as a small state machine. Header
    lines switch the current section; item lines are added to whichever
    section is active:

    * ``Für ...:`` starts an audience block whose ``- `` bullets become items.
      This is tested before the category rule so that an audience header
      naming a topic (e.g. "Für Sport-Fans:") is not mistaken for a category.
    * A line ending in ``:`` that contains a category keyword starts a
      category whose numbered lines ("1. ...") become stories.
    * ``Keywords:`` / ``Schlüsselwörter:`` / ``**Keywords:**`` starts the
      keyword list (``- `` bullets).
    * ``TOP 5 STORY-EMPFEHLUNGEN:`` / ``Finale Top-Empfehlungen:`` starts the
      recommendation list (numbered lines).
    * The markdown headings "**Zielgruppenspezifische Empfehlungen:**" and
      "**Top-Themen nach Kategorien" clear the current section.

    Args:
        analysis_text: Full analysis text as returned by the model.

    Returns:
        A dict with 'categories' (at most 5, each ``{'name', 'stories'}``),
        'keywords' (at most 10 strings), 'recommendations' (at most 5) and
        'audience_recommendations' (each ``{'name', 'items'}``). Stories,
        recommendations and items are the dicts produced by
        ``extract_title_and_url``.
    """
    # Loop-invariant lookup data, built once instead of on every line
    category_keywords = (
        'POLITIK', 'WIRTSCHAFT', 'GESELLSCHAFT', 'LIFESTYLE', 'WISSENSCHAFT',
        'TECHNOLOGIE', 'DIGITAL', 'BUSINESS', 'SPORT', 'HEALTH',
    )
    section_reset_markers = (
        "**Zielgruppenspezifische Empfehlungen:**",
        "**Top-Themen nach Kategorien",
    )

    categories = []
    keywords = []
    recommendations = []
    audience_recommendations = []

    current_section = None
    current_category = None
    current_audience = None

    for raw_line in analysis_text.split('\n'):
        line = raw_line.strip()

        if not line:
            continue

        is_numbered_item = line[0].isdigit() and '. ' in line
        is_bullet_item = line.startswith('- ')

        # Audience header ("Für ...:") — must win over the category rule below
        if line.startswith('Für ') and line.endswith(':'):
            current_section = "audience"
            current_audience = {
                'name': line.replace(':', '').strip(),
                'items': []
            }
            audience_recommendations.append(current_audience)
            continue

        # Main category header
        if line.endswith(':') and any(word in line.upper() for word in category_keywords):
            current_category = {
                'name': line.replace(':', '').strip(),
                'stories': []
            }
            categories.append(current_category)
            current_section = "categories"
            continue

        # Numbered story inside the current category
        if current_section == "categories" and current_category is not None and is_numbered_item:
            current_category['stories'].append(extract_title_and_url(line))
            continue

        # Keywords header and its bullet items
        if line == "Keywords:" or line == "Schlüsselwörter:" or "**Keywords:**" in line:
            current_section = "keywords"
            continue
        if current_section == "keywords" and is_bullet_item:
            keywords.append(line[2:].strip())
            continue

        # Top recommendations header and its numbered items
        if "TOP 5 STORY-EMPFEHLUNGEN:" in line or "Finale Top-Empfehlungen:" in line:
            current_section = "recommendations"
            current_category = None
            current_audience = None
            continue
        if current_section == "recommendations" and is_numbered_item:
            recommendations.append(extract_title_and_url(line))
            continue

        # Bullet item inside the current audience block
        if current_section == "audience" and current_audience is not None and is_bullet_item:
            current_audience['items'].append(extract_title_and_url(line[2:]))
            continue

        # A new major markdown heading ends whatever section was active
        if any(marker in line for marker in section_reset_markers):
            current_section = None
            current_category = None
            current_audience = None

    return {
        'categories': categories[:5],
        'keywords': keywords[:10],
        'recommendations': recommendations[:5],
        'audience_recommendations': audience_recommendations
    }

def create_email_html(analysis_text, competitor_df, twentymin_df):
    """Render the competitor analysis email as HTML.

    Args:
        analysis_text: Analysis text to be parsed into template sections.
        competitor_df: DataFrame of competitor articles (used for counts).
        twentymin_df: DataFrame of 20min articles (used for counts).

    Returns:
        The rendered HTML string.

    Raises:
        Exception: Any error raised while loading the template, parsing the
            analysis or rendering is reported on stdout and re-raised.
    """
    try:
        template = load_template('competitor_email_template.html')
        sections = parse_analysis_for_email(analysis_text)

        # NOTE(review): the "CET" suffix is a fixed label appended to the local
        # time returned by datetime.now() — confirm the host runs on CET.
        context = {
            'date': datetime.now().strftime('%d.%m.%Y'),
            'time': datetime.now().strftime('%H:%M CET'),
            'stats': {
                'competitor_articles': len(competitor_df),
                'twentymin_articles': len(twentymin_df),
                'competitor_sources': competitor_df['source'].nunique() if 'source' in competitor_df.columns else 0
            },
        }
        # Hand the parsed sections to the template under the same names
        for section_name in ('categories', 'keywords', 'recommendations', 'audience_recommendations'):
            context[section_name] = sections[section_name]

        return template.render(**context)

    except Exception as e:
        print(f"❌ Error creating email HTML: {str(e)}")
        raise

def send_to_zapier(analysis_text, competitor_df, twentymin_df,
                   recipient_email="tom.vaillant@20minuten.ch", timeout=30):
    """Send the analysis to the Zapier webhook for email delivery.

    Args:
        analysis_text: Analysis text that is rendered into the email body.
        competitor_df: DataFrame of competitor articles (used for counts).
        twentymin_df: DataFrame of 20min articles (used for counts).
        recipient_email: Address placed in the payload's 'recipient_email'
            field; defaults to the previously hard-coded recipient.
        timeout: Seconds to wait for the webhook before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        True when the webhook answers with HTTP 200, otherwise False
        (including when the request itself fails; the error is printed).

    Raises:
        Exception: Errors from rendering the email HTML are not caught here
            and propagate to the caller.
    """
    print("📤 Sending to Zapier...")

    # One timestamp for date, time and subject so they can never disagree
    now = datetime.now()

    payload = {
        "date": now.strftime("%d.%m.%Y"),
        "time": now.strftime("%H:%M CET"),
        "email_content_html": create_email_html(analysis_text, competitor_df, twentymin_df),
        "stats": {
            "competitor_articles": len(competitor_df),
            "twentymin_articles": len(twentymin_df),
            "competitor_sources": competitor_df['source'].nunique() if 'source' in competitor_df.columns else 0
        },
        "recipient_email": recipient_email,
        "email_subject": f"Konkurrenz-Lückenanalyse - {now.strftime('%d.%m.%Y %H:%M CET')}"
    }

    try:
        response = requests.post(ZAPIER_WEBHOOK_URL, json=payload, timeout=timeout)

        if response.status_code == 200:
            print("✅ Email sent successfully!")
            return True
        else:
            print(f"❌ Error: {response.status_code}")
            return False

    except Exception as e:
        # Best-effort delivery: report the failure instead of raising
        print(f"❌ Exception: {str(e)}")
        return False

# Main execution
# 'analysis_result' exists but is None when the analysis cell hit an API error,
# so its value is checked rather than only its presence; sending None would
# fail inside the parser with an AttributeError.
if globals().get('analysis_result') and 'news_df' in globals() and 'twentymin_df' in globals():
    success = send_to_zapier(analysis_result, news_df, twentymin_df)

    if success:
        print("🎉 Analysis complete!")
else:
    print("❌ Missing required data from previous cells")

✅ Environment loaded
📤 Sending to Zapier...
✅ Email sent successfully!
🎉 Analysis complete!
