# This is the base code working to fetch one news. Never delete!

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import csv
import json
from urllib.parse import urljoin, urlparse
import re
from datetime import datetime
import logging
import requests

In [2]:
def try_request(url, max_retries=3):
    """Try to make a request with retries"""
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive'
    }
    
    for attempt in range(max_retries):
        try:
            print(f"Attempt {attempt + 1}/{max_retries}: {url}")
            
            # Try with shorter timeout first
            timeout = 10 + (attempt * 5)  # 10, 15, 20 seconds
            
            response = requests.get(url, headers=headers, timeout=timeout)
            
            if response.status_code == 200:
                print(f"Success! Response length: {len(response.content)}")
                return response
            else:
                print(f"Status code: {response.status_code}")
                
        except requests.exceptions.Timeout:
            print(f"Timeout on attempt {attempt + 1}")
            if attempt < max_retries - 1:
                wait_time = (attempt + 1) * 3
                print(f"Waiting {wait_time} seconds...")
                time.sleep(wait_time)
                
        except requests.exceptions.RequestException as e:
            print(f"Request error: {e}")
            if attempt < max_retries - 1:
                time.sleep(2)
    
    print("All attempts failed")
    return None

def get_first_link():
    """Get the first news link"""
    url = "https://www.prnewswire.com/news-releases/all-public-company-news/?page=1&pagesize=100"
    
    response = try_request(url)
    if not response:
        return None, None
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find first news link
    link = soup.find('a', href=re.compile(r'/news-releases/.*\.html'))
    if link:
        href = link.get('href')
        full_url = "https://www.prnewswire.com" + href
        title = link.get_text(strip=True)
        return full_url, title
    
    return None, None

def get_content(url):
    """Get article content"""
    response = try_request(url)
    if not response:
        return None
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Try to find content
    content_text = ""
    
    # Strategy 1: Look for paragraphs
    paragraphs = soup.find_all('p')
    good_paragraphs = []
    
    for p in paragraphs:
        text = p.get_text(strip=True)
        # Filter out short paragraphs and navigation text
        if (len(text) > 50 and 
            'search' not in text.lower() and 
            'menu' not in text.lower() and
            'navigation' not in text.lower() and
            'copyright' not in text.lower()):
            good_paragraphs.append(text)
    
    if good_paragraphs:
        content_text = '\n\n'.join(good_paragraphs[:10])  # First 10 good paragraphs
    
    # Strategy 2: If no good paragraphs, get title at least
    title_elem = soup.find('h1') or soup.find('title')
    title = title_elem.get_text(strip=True) if title_elem else "No title"
    
    return {
        'title': title,
        'content': content_text,
        'url': url
    }

def main():
    print("=== Simple PR Newswire Scraper ===")
    print("Trying to get one article...")
    
    # Step 1: Get first news link
    print("\nStep 1: Getting news listing page...")
    article_url, link_title = get_first_link()
    
    if not article_url:
        print("❌ Failed to get news listing page")
        return
    
    print(f"✅ Found article: {link_title}")
    print(f"URL: {article_url}")
    
    # Step 2: Get article content
    print(f"\nStep 2: Getting article content...")
    article = get_content(article_url)
    
    if not article:
        print("❌ Failed to get article content")
        return
    
    print(f"✅ Article title: {article['title']}")
    print(f"Content length: {len(article['content'])} characters")
    
    if article['content']:
        print(f"\nFirst 300 characters:")
        print(article['content'][:] + "...")
        
        # Save to file
        with open('article.txt', 'w', encoding='utf-8') as f:
            f.write(f"Title: {article['title']}\n")
            f.write(f"URL: {article['url']}\n")
            f.write(f"\nContent:\n{article['content']}")
        
        print(f"\n✅ Saved to 'article.txt'")
    else:
        print("❌ No content extracted")

# if __name__ == "__main__":
main()

=== Simple PR Newswire Scraper ===
Trying to get one article...

Step 1: Getting news listing page...
Attempt 1/3: https://www.prnewswire.com/news-releases/all-public-company-news/?page=1&pagesize=100
Success! Response length: 401699
✅ Found article: 09:32 ETTreasury Revolution Sparks 150% Stock Surges as Corporate Bitcoin Holdings Hit $113 BillionCorporate treasury companies surge an average of 150% within 24 hours of announcing crypto adoption strategies, according to a 2025 Animoca Brands...
URL: https://www.prnewswire.com/news-releases/treasury-revolution-sparks-150-stock-surges-as-corporate-bitcoin-holdings-hit-113-billion-302552628.html

Step 2: Getting article content...
Attempt 1/3: https://www.prnewswire.com/news-releases/treasury-revolution-sparks-150-stock-surges-as-corporate-bitcoin-holdings-hit-113-billion-302552628.html
Success! Response length: 223174
✅ Article title: Treasury Revolution Sparks 150% Stock Surges as Corporate Bitcoin Holdings Hit $113 Billion
Content leng