In [None]:
from datetime import datetime
import csv
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup

In [None]:
BASE_URL = "https://horizon.fandom.com"
TARGET_URL = f"{BASE_URL}/wiki/Special:AllPages"
HEADERS = {'User-Agent': 'Mozilla/5.0'}

In [15]:
def get_content(soup):
    infobox = '[ Infobox source ]'
    content_parts = []
    content_div = soup.find("div", class_="mw-parser-output")
    if content_div:
        for p in content_div.find_all("p", recursive=False):
            text = p.get_text(" ", strip=True)
            if text:
                content_parts.append(text)
    content = "\n".join(content_parts)
    idx = content.find(infobox)
    return content[idx+len(infobox):]

In [16]:
def get_location(soup):
    location = None
    loc_div = soup.find(attrs={"data-source": "location"})
    if loc_div:
        a = loc_div.find("a")
        location = a.get_text(strip=True) if a else loc_div.get_text(strip=True)
    return location


In [17]:
def get_category(soup):
    cat_div = soup.find(attrs={"data-source": "category"})
    if cat_div:
        val = cat_div.find(class_="pi-data-value") or cat_div
        a = val.find("a")
        if a and a.get_text(strip=True):
            return a.get_text(strip=True)
        return val.get_text(" ", strip=True).replace("Category", "", 1).strip()
    foot = soup.select_one("#mw-normal-catlinks ul li a")
    return foot.get_text(strip=True) if foot else ""

In [None]:
def scrape_data():
    # Get the current datetime object
    current_datetime = datetime.now()

    # Format the datetime object into a string
    date_time_string = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
    main_page = requests.get(f"{TARGET_URL}", headers=HEADERS)
    main_soup = BeautifulSoup(main_page.text, 'html.parser')
    next_page = main_soup.find("a", string=lambda text: text and text.startswith("Next page"))

    with open(f"horizon_data_{date_time_string}.csv", 'a', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['address', 'category', 'location', 'content'])
        while not next_page is None :
            page_links = main_soup.find_all('div', attrs="mw-allpages-body")[0].find_all('a')
            for link in tqdm(page_links):
                link = link.get('href')
                new_page = requests.get(f"{BASE_URL}/{link}", headers=HEADERS)
                new_soup = BeautifulSoup(new_page.text, 'html.parser')
                writer.writerow([link, get_category(new_soup), get_location(new_soup), get_content(new_soup)])
        
            main_page = requests.get(f"{BASE_URL}/{next_page.get('href')}", headers=HEADERS)
            main_soup = BeautifulSoup(main_page.text, 'html.parser')
            next_page = main_soup.find("a", string=lambda text: text and text.startswith("Next page"))

100%|██████████| 345/345 [01:52<00:00,  3.05it/s]
100%|██████████| 345/345 [01:44<00:00,  3.31it/s]
100%|██████████| 345/345 [02:20<00:00,  2.45it/s]
100%|██████████| 345/345 [02:04<00:00,  2.77it/s]
100%|██████████| 345/345 [01:59<00:00,  2.89it/s]
100%|██████████| 345/345 [02:20<00:00,  2.46it/s]
100%|██████████| 345/345 [02:22<00:00,  2.42it/s]
100%|██████████| 345/345 [02:16<00:00,  2.53it/s]
100%|██████████| 345/345 [02:20<00:00,  2.46it/s]
100%|██████████| 345/345 [02:14<00:00,  2.56it/s]
