# WIKI scraper

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import os
from urllib.parse import urljoin

WIKI_BASE_URL = "https://en.wikipedia.org"
OUTPUT_DIR = "../data/zianp/wiki_test/"

# Create output directory if not exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

def get_wikipedia_content(OUTPUT_DIR, url):
    """Fetch and extract main text and tables from a Wikipedia page."""
    response = requests.get(url)
    if response.status_code != 200:
        return None, None
    
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract page title
    title = soup.find("h1", {"id": "firstHeading"}).text.strip()

    OUTPUT_DIR = os.path.join(OUTPUT_DIR, title)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Extract main text content
    content_div = soup.find("div", {"id": "bodyContent"})
    if not content_div:
        return title, None

    paragraphs = content_div.find_all("p", recursive=True)
    text = "\n".join(p.get_text() for p in paragraphs if p.get_text().strip())

    # Remove reference markers like [1], [2], etc.
    cleaned_text = re.sub(r'\[\d+\]', '', text)

    # Extract tables
    tables = soup.find_all("table", {"class": "wikitable"})
    table_texts = []
    

    for i, table in enumerate(tables):
        df = pd.read_html(str(table))[0]  # Read table into DataFrame
        # makedir OUTPUT_DIR + tables
        os.makedirs(os.path.join(OUTPUT_DIR, "tables"), exist_ok=True)
        table_filename = os.path.join(OUTPUT_DIR, "tables", f"{title}_table_{i+1}.csv")
        df.to_csv(table_filename, index=False)  # Save table as CSV
        # table_texts.append(f"\n[Table {i+1} saved as {table_filename}]\n")

    # Append table info to content
    final_content = cleaned_text + "\n\n" + "\n".join(table_texts)
    
    return title, final_content, OUTPUT_DIR

def save_content(title, content, new_out):
    """Save Wikipedia content to a text file."""
    if not content:
        return
    
    filename = re.sub(r'[\\/*?:"<>|]', "", title) + ".txt"
    filepath = os.path.join(new_out, filename)

    with open(filepath, "w", encoding="utf-8") as file:
        file.write(content)
    
    # print(f"Saved: {filepath}")

def find_wiki_links(url, max_links=5):
    """Find internal Wikipedia links from a given page."""
    response = requests.get(url)
    if response.status_code != 200:
        return []
    
    soup = BeautifulSoup(response.text, 'html.parser')
    links = set()

    # find all the links in the bodyContent div
    for link in soup.select("div#bodyContent a[href^='/wiki/']"):
        if len(links) >= max_links:
            break
        full_url = urljoin(WIKI_BASE_URL, link['href'])
        if ":" not in link['href']:  # Avoid special pages
            links.add(full_url)
        
    
    return list(links)

def crawl_wikipedia(start_url, depth=2, max_links = 5):
    """Crawl Wikipedia starting from `start_url` up to a given depth."""
    to_visit = [(start_url, 0)]

    while to_visit:
        print(len(to_visit))
        url, level = to_visit.pop(0)
        if url in visited or level > depth:
            continue
        
        # print(f"Crawling: {url} (Depth {level})")
        visited.add(url)

        title, content, new_out = get_wikipedia_content(OUTPUT_DIR, url)
        if content:
            save_content(title, content, new_out)

        if level < depth:
            new_links = find_wiki_links(url, max_links=max_links)
            to_visit.extend((link, level + 1) for link in new_links)

        time.sleep(1)  # Be polite to Wikipedia servers

# Example usage: Crawl Wikipedia starting from the "Pittsburgh" page
    """Crawl Wikipedia starting from `start_url` up to a given depth."""
visited = set()
# start_url = ["https://en.wikipedia.org/wiki/History_of_Pittsburgh", "https://en.wikipedia.org/wiki/Pittsburgh"]
# start_url = ["https://en.wikipedia.org/wiki/List_of_museums_in_Pittsburgh"]
start_url = ["https://en.wikipedia.org/wiki/Carnegie_Mellon_University"]
for url in start_url:
    crawl_wikipedia(url, depth=1, max_links= 100)


1


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


100


  df = pd.read_html(str(table))[0]  # Read table into DataFrame


99


  df = pd.read_html(str(table))[0]  # Read table into DataFrame


98
97


  df = pd.read_html(str(table))[0]  # Read table into DataFrame


96
95


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


94
93
92
91
90
89


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


88
87
86
85


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


84
83


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


82
81
80
79
78
77


  df = pd.read_html(str(table))[0]  # Read table into DataFrame


76
75


  df = pd.read_html(str(table))[0]  # Read table into DataFrame


74


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


73
72


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


71
70
69
68
67
66
65
64


  df = pd.read_html(str(table))[0]  # Read table into DataFrame


63


  df = pd.read_html(str(table))[0]  # Read table into DataFrame


62
61
60
59
58
57


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


56


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


55
54


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


53


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


52


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


51
50
49
48
47


  df = pd.read_html(str(table))[0]  # Read table into DataFrame


46


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


45
44
43
42
41
40
39
38
37
36
35
34


  df = pd.read_html(str(table))[0]  # Read table into DataFrame


33
32
31
30


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


29


  df = pd.read_html(str(table))[0]  # Read table into DataFrame


28
27
26
25
24
23


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


22


  df = pd.read_html(str(table))[0]  # Read table into DataFrame


21


  df = pd.read_html(str(table))[0]  # Read table into DataFrame


20


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


19
18
17
16


  df = pd.read_html(str(table))[0]  # Read table into DataFrame


15


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


14


  df = pd.read_html(str(table))[0]  # Read table into DataFrame


13


  df = pd.read_html(str(table))[0]  # Read table into DataFrame


12
11


  df = pd.read_html(str(table))[0]  # Read table into DataFrame


10
9
8
7


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


6


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


5


  df = pd.read_html(str(table))[0]  # Read table into DataFrame


4
3


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


2


  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame
  df = pd.read_html(str(table))[0]  # Read table into DataFrame


1


In [11]:
df = pd.read_csv("wiki_pages/Pittsburgh_table_3.csv")

In [16]:
s  = """
Spain Bilbao, Spain
Vietnam Da Nang, Vietnam
Paraguay Fernando de la Mora, Paraguay
Turkey Gaziantep, Turkey
Scotland Glasgow, Scotland
Israel Karmiel, Israel
Cuba Matanzas, Cuba
Israel Misgav, Israel
Mexico Naucalpan, Mexico
Czech Republic Ostrava, Czech Republic
Slovakia Prešov, Slovakia
Germany Saarbrücken, Germany
Japan Saitama, Japan
Nicaragua San Isidro, Nicaragua
England Sheffield, England[g]
North Macedonia Skopje, North Macedonia
Bulgaria Sofia, Bulgaria
China Wuhan, China
Croatia Zagreb, Croatia

"""

In [17]:
s = s.split("\n")

In [21]:
print('|'.join(s))

|Spain Bilbao, Spain|Vietnam Da Nang, Vietnam|Paraguay Fernando de la Mora, Paraguay|Turkey Gaziantep, Turkey|Scotland Glasgow, Scotland|Israel Karmiel, Israel|Cuba Matanzas, Cuba|Israel Misgav, Israel|Mexico Naucalpan, Mexico|Czech Republic Ostrava, Czech Republic|Slovakia Prešov, Slovakia|Germany Saarbrücken, Germany|Japan Saitama, Japan|Nicaragua San Isidro, Nicaragua|England Sheffield, England[g]|North Macedonia Skopje, North Macedonia|Bulgaria Sofia, Bulgaria|China Wuhan, China|Croatia Zagreb, Croatia||
