In [None]:
import requests
from newspaper import Article
from crawl4ai import *
import scrapy

class ArticleSpider(scrapy.Spider):
    """Spider that collects the longer text fragments of a single page.

    The page to crawl is handed in through the ``url`` argument. Every parsed
    response yields one item of the form ``{"text": <joined fragments>}``.
    """

    name = "article"
    start_urls = []  # replaced per instance when a url is supplied

    def __init__(self, url=None, *args, **kwargs):
        """Set up the spider; a truthy ``url`` becomes its only start URL."""
        super().__init__(*args, **kwargs)
        if url:
            self.start_urls = [url]

    def parse(self, response):
        """Yield one item holding the page's text fragments joined by spaces."""
        # Pull text from paragraph, container and heading elements.
        selector = "p::text, div::text, span::text, h1::text, h2::text, h3::text"

        kept = []
        for raw in response.css(selector).getall():
            if not raw:
                continue
            fragment = raw.strip()
            # Fragments of 30 characters or fewer are dropped as a filter
            # against navigation entries and similar noise.
            if len(fragment) > 30:
                kept.append(fragment)

        yield {"text": ' '.join(kept)}


def search_news(stock_symbol, NEWS_API_KEY, max_results=5, timeout=10):
    """Return the URLs of recent English-language news articles for a query.

    Sends a GET request to the NewsAPI ``/v2/everything`` endpoint, asking for
    results sorted by ``publishedAt``.

    Args:
        stock_symbol: Search term sent as the ``q`` parameter.
        NEWS_API_KEY: API key sent as the ``apiKey`` parameter.
        max_results: Value sent as the ``pageSize`` parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with the ``url`` of every returned article that has one. An
        empty list is returned when the request fails, the status code is not
        200, or the response body is not valid JSON.
    """
    url = "https://newsapi.org/v2/everything"
    params = {
        "q": stock_symbol,
        "sortBy": "publishedAt",
        "language": "en",
        "pageSize": max_results,
        "apiKey": NEWS_API_KEY,
    }

    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Connection problems are reported the same way as API errors.
        print("Fehler beim Abrufen der News:", exc)
        return []
    print(response)

    if response.status_code != 200:
        # Error bodies are not guaranteed to be JSON (e.g. a proxy error page),
        # so fall back to the raw text instead of raising.
        try:
            details = response.json()
        except ValueError:
            details = response.text
        print("Fehler beim Abrufen der News:", details)
        return []

    try:
        data = response.json()
    except ValueError as exc:
        print("Fehler beim Abrufen der News:", exc)
        return []

    # Skip entries without a usable URL rather than failing with KeyError.
    articles = data.get("articles") or []
    return [article["url"] for article in articles if article.get("url")]


import requests
from newspaper import Article

def extract_article_text(url):
    """Extract the full article text from a page fetched with custom headers.

    The HTML is downloaded with ``requests`` using a browser-like User-Agent
    and then handed to ``newspaper.Article`` for parsing.

    Args:
        url: Address of the article page.

    Returns:
        The parsed article text, or an empty string if fetching or parsing
        fails for any reason (the error is printed).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/114.0 Safari/537.36"
    }
    try:
        # Bound the wait so one unresponsive site cannot stall the caller.
        page = requests.get(url, headers=headers, timeout=10)
        # Treat HTTP errors as failures; otherwise the body of a 403/404 page
        # would be parsed and returned as if it were the article.
        page.raise_for_status()

        article = Article(url)
        article.download(input_html=page.text)
        article.parse()
        return article.text
    except Exception as e:
        # Deliberate best-effort: any failure is reported and yields "".
        print(f"Error extracting article from {url}: {e}")
        return ""
