In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import os
from urllib.parse import urljoin

WIKI_BASE_URL = "https://en.wikipedia.org"
OUTPUT_DIR = "../data/wiki/pittsburgh/"

# Create output directory if not exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

def get_wikipedia_content(url):
    """Fetch and extract main text and tables from a Wikipedia page."""
    response = requests.get(url)
    if response.status_code != 200:
        return None, None
    
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract page title
    title = soup.find("h1", {"id": "firstHeading"}).text.strip()

    # Extract main text content
    content_div = soup.find("div", {"id": "bodyContent"})
    if not content_div:
        return title, None

    paragraphs = content_div.find_all("p", recursive=True)
    text = "\n".join(p.get_text() for p in paragraphs if p.get_text().strip())

    # Remove reference markers like [1], [2], etc.
    cleaned_text = re.sub(r'\[\d+\]', '', text)

    # Extract tables
    tables = soup.find_all("table", {"class": "wikitable"})
    table_texts = []

    for i, table in enumerate(tables):
        df = pd.read_html(str(table))[0]  # Read table into DataFrame
        table_filename = os.path.join(OUTPUT_DIR, f"{title}_table_{i+1}.csv")
        df.to_csv(table_filename, index=False)  # Save table as CSV
        # table_texts.append(f"\n[Table {i+1} saved as {table_filename}]\n")

    # Append table info to content
    final_content = cleaned_text + "\n\n" + "\n".join(table_texts)
    
    return title, final_content

def save_content(title, content):
    """Save Wikipedia content to a text file."""
    if not content:
        return
    
    filename = re.sub(r'[\\/*?:"<>|]', "", title) + ".txt"
    filepath = os.path.join(OUTPUT_DIR, filename)

    with open(filepath, "w", encoding="utf-8") as file:
        file.write(content)
    
    print(f"Saved: {filepath}")

def find_wiki_links(url, max_links=5):
    """Find internal Wikipedia links from a given page."""
    response = requests.get(url)
    if response.status_code != 200:
        return []
    
    soup = BeautifulSoup(response.text, 'html.parser')
    links = set()

    for link in soup.select("div#bodyContent a[href^='/wiki/']"):
        full_url = urljoin(WIKI_BASE_URL, link['href'])
        if ":" not in link['href']:  # Avoid special pages
            links.add(full_url)
        if len(links) >= max_links:
            break

    return list(links)

def crawl_wikipedia(start_url, depth=2):
    """Crawl Wikipedia starting from `start_url` up to a given depth."""
    visited = set()
    to_visit = [(start_url, 0)]

    while to_visit:
        url, level = to_visit.pop(0)
        if url in visited or level > depth:
            continue
        
        print(f"Crawling: {url} (Depth {level})")
        visited.add(url)

        title, content = get_wikipedia_content(url)
        if content:
            save_content(title, content)

        if level < depth:
            new_links = find_wiki_links(url)
            to_visit.extend((link, level + 1) for link in new_links)

        time.sleep(1)  # Be polite to Wikipedia servers

# Example usage: Crawl Wikipedia starting from the "Pittsburgh" page
start_url = "https://en.wikipedia.org/wiki/Pittsburgh"
crawl_wikipedia(start_url, depth=0)


Crawling: https://en.wikipedia.org/wiki/Pittsburgh (Depth 0)


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


Saved: wiki_pages/Pittsburgh.txt


In [11]:
df = pd.read_csv("wiki_pages/Pittsburgh_table_3.csv")

In [12]:
df

Unnamed: 0,Race / Ethnicity (NH = Non-Hispanic),Pop 1980[110],Pop 1990[111],Pop 2000[112],Pop 2010[113],Pop 2020[114],% 1980,% 1990,% 2000,% 2010,% 2020
0,White alone (NH),316262.0,264722.0,223982,198186,187099,74.60%,71.57%,66.95%,64.83%,61.75%
1,Black or African American alone (NH),100734.0,94743.0,90183,78847,68314,23.76%,25.61%,26.96%,25.79%,22.55%
2,Native American or Alaska Native alone (NH),552.0,583.0,561,505,475,0.13%,0.16%,0.17%,0.17%,0.16%
3,Asian alone (NH),2778.0,5865.0,9160,13393,19745,0.66%,1.59%,2.74%,4.38%,6.52%
4,Pacific Islander alone (NH),,,100,76,96,,,0.03%,0.02%,0.03%
5,Other race alone (NH),242.0,498.0,1217,843,2081,0.06%,0.13%,0.36%,0.28%,0.69%
6,Mixed race or Multiracial (NH),,,4935,6890,13541,,,1.48%,2.25%,4.47%
7,Hispanic or Latino (any race),3370.0,3468.0,4425,6964,11620,0.79%,0.94%,1.32%,2.28%,3.84%
8,Total,423938.0,369879.0,334563,305704,302971,100.00%,100.00%,100.00%,100.00%,100.00%
