In [2]:
import requests
from bs4 import BeautifulSoup
import urllib3
from tqdm import tqdm
import json
import time

# Silence urllib3's InsecureRequestWarning, which would otherwise be emitted
# for requests made with verify=False (as extract_articles_from_archive_page does).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [None]:
import requests
from bs4 import BeautifulSoup
import urllib3

# Silence urllib3's InsecureRequestWarning, which would otherwise be emitted
# for requests made with verify=False (as extract_articles_from_archive_page does).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Only links under this prefix are kept as articles.
_QUANTA_URL_PREFIX = "https://www.quantamagazine.org/"


def _tag_text(tag, default):
    """Return the whitespace-stripped text of ``tag``, or ``default`` if the tag is missing."""
    return tag.get_text(strip=True) if tag is not None else default


def _make_article(title, url, author_tag, topic_tag):
    """Build one article record, or return None when the URL is missing or off-site."""
    if not url or not url.startswith(_QUANTA_URL_PREFIX):
        return None
    return {
        "title": title,
        "url": url,
        "author": _tag_text(author_tag, "Unknown"),
        "topic": _tag_text(topic_tag, "Unknown"),
    }


def extract_articles_from_archive_page(archive_url, timeout=30):
    """Fetch one archive listing page and extract the articles linked from it.

    Two kinds of markup are scanned: the featured "hero" card
    (``h3.hero__card__title``) and the standard article cards
    (``div.card__content``).

    Args:
        archive_url: URL of the archive listing page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout, so that a
            stalled connection raises instead of hanging forever.

    Returns:
        A list of dicts with the keys ``"title"``, ``"url"``, ``"author"`` and
        ``"topic"``. ``"author"`` and ``"topic"`` fall back to ``"Unknown"``
        when the corresponding tag is absent. Entries whose link is missing or
        does not start with ``https://www.quantamagazine.org/`` are dropped.
        An empty list is returned when the server answers 404.

    Raises:
        Whatever ``requests.get`` raises for connection problems and timeouts,
        and the error raised by ``Response.raise_for_status()`` for any error
        status other than 404. Previously such error pages were parsed as an
        empty listing, which callers could mistake for the end of the archive.
    """
    # NOTE(review): verify=False disables TLS certificate verification, which
    # leaves the download open to tampering. It is kept so existing setups keep
    # working; prefer verify=True once the local CA bundle is sorted out.
    response = requests.get(archive_url, verify=False, timeout=timeout)

    if response.status_code == 404:
        # Presumably what the site returns when paging past the last archive
        # page (TODO confirm); report it as "no articles", not as a failure.
        return []
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    articles = []

    # 1. Featured hero article (only on first page)
    hero = soup.find("h3", class_="hero__card__title")
    if hero is not None and hero.a is not None:
        try:
            # Byline and kicker live in the hero's outer container, not in the <h3>.
            container = hero.find_parent("div", class_="hero__card__inner")
            author_tag = container.find("span", class_="byline__author") if container is not None else None
            topic_tag = container.find("a", class_="kicker") if container is not None else None

            article = _make_article(
                hero.a.get_text(strip=True),
                hero.a.get("href"),
                author_tag,
                topic_tag,
            )
            if article is not None:
                articles.append(article)
        except Exception as e:
            # Best effort: a malformed hero card must not lose the rest of the page.
            print(f"Error extracting hero article: {e}")

    # 2. Standard article cards
    for card in soup.find_all("div", class_="card__content"):
        try:
            title_tag = card.find("h3", class_="card__title")
            # The card's link is the <a> element wrapping the title.
            link_tag = title_tag.find_parent("a") if title_tag is not None else None

            article = _make_article(
                _tag_text(title_tag, ""),
                link_tag.get("href") if link_tag is not None else None,
                card.find("span", class_="byline__author"),
                card.find("div", class_="card__kicker"),
            )
            if article is not None:
                articles.append(article)
        except Exception as e:
            # Best effort: skip the broken card and keep parsing the others.
            print(f" Error parsing card: {e}")
            continue

    return articles


In [None]:
def crawl_all_archive_pages(max_pages=250, delay=0.5):
    """Walk the archive listing pages in order and gather unique articles.

    Page 1 is the bare archive URL; later pages use the ``/archive/page/<n>/``
    form. Each page is handed to ``extract_articles_from_archive_page``.

    Args:
        max_pages: Highest page number that will be requested.
        delay: Seconds to sleep after each page that yielded articles.

    Returns:
        A list of article dicts in first-seen order, de-duplicated on their
        ``"url"`` value. The crawl ends early at the first page that yields no
        articles or raises an exception (a message is printed in both cases),
        and whatever was collected so far is returned.
    """
    collected = []
    known_urls = set()

    for page in tqdm(range(1, max_pages + 1), desc="Crawling archive pages"):
        if page == 1:
            url = "https://www.quantamagazine.org/archive/"
        else:
            url = f"https://www.quantamagazine.org/archive/page/{page}/"

        try:
            page_articles = extract_articles_from_archive_page(url)
            if not page_articles:
                print(f" No articles found on page {page}. Stopping.")
                break

            for entry in page_articles:
                link = entry["url"]
                if link in known_urls:
                    # Already collected from an earlier page (or earlier on this one).
                    continue
                collected.append(entry)
                known_urls.add(link)

            # Pause between requests so the server is not hammered.
            time.sleep(delay)
        except Exception as e:
            print(f"Failed on page {page}: {e}")
            break

    return collected

In [67]:
# archive_url = "https://www.quantamagazine.org/archive/"
# articles = extract_articles_from_archive_page(archive_url)

# for idx, article in enumerate(articles, 1):
#     print(f"{idx}. {article['title']}")
#     print(f"   Author: {article['author']}")
#     print(f"   Topic: {article['topic']}")
#     print(f"   URL: {article['url']}\n")


In [None]:
if __name__ == "__main__":
    # Crawl up to 250 archive pages and collect the de-duplicated article records.
    articles = crawl_all_archive_pages(max_pages=250)
    print(f"\n Total unique articles found: {len(articles)}")

    # Save everything as pretty-printed UTF-8 JSON; ensure_ascii=False keeps
    # non-ASCII characters readable instead of \u-escaping them.
    with open("quanta_all_articles_with_authors.json", "w", encoding="utf-8") as f:
        json.dump(articles, f, indent=2, ensure_ascii=False)

    # Show the first five records as a quick sanity check of the crawl.
    preview = articles[:5]
    for i, article in enumerate(preview, start=1):
        print(f"{i}. {article['title']}")
        print(f"   {article['url']}")
