# General

This notebook is used to scrape the data used for this tutorial from the HPC wiki.  

# Imports

In [7]:
from pathlib import Path

import re
import requests

from datetime import datetime
import os

# Env config

In [2]:
t0 = datetime.now()
print(f"Last execution {t0}")

# Endpoints
BASE_URL = "https://docs.hpc.cineca.it/_sources"

# Paths
DUMP_POSITION = "../data/input"

Path(DUMP_POSITION).mkdir(exist_ok=True, parents=True)

Last execution 2025-07-22 11:19:25.577657


# Scraping

In [3]:
# Get the content of the index of the wiki
res = requests.get(BASE_URL + "/index.rst.txt")

In [9]:
def extract_urls_from_page(text:str):
    # Index items are preceeded by 3 spaces, then we search for letters + / + other chars + /n
    urls = re.findall(r"(?:\.\.\stoctree::\n.*?)((\s{3}[\w\/\-]+\n)+)", text, flags=re.S)

    cleaned_pages = []

    for pages in urls:
        for page in pages:
            for item in page.split("\n"):
                if len(item):
                    cleaned_pages.append(item.strip())
    return list(set(cleaned_pages))

In [10]:
index_pages = extract_urls_from_page(res.text)
index_pages

['hpc/hpc_data_storage',
 'cloud/systems/index_system_specifics',
 'general/users_account',
 'specific_users/specific_users',
 'general/access',
 'cloud/os_overview/index_openstack_overview',
 'hpc/hpc_clusters',
 'hpc/hpc_scheduler',
 'cloud/tutorials/index_tutorials_and_repos',
 'cloud/tenant_adm/index_tenants_administration',
 'cloud/operative/index_operative_manual',
 'faq',
 'hpc/hpc_intro',
 'general/getting_started',
 'hpc/hpc_software',
 'services/services_and_tools',
 'hpc/hpc_enviroment',
 'cloud/general/general_info',
 'general/general_info']

# Download data

Download data from each page. Check on each page if there are other links which were not in index, and add them to the list of pages we must scrape.

In [11]:
pages_to_be_visited = index_pages.copy()

while pages_to_be_visited:
    page_url:str = pages_to_be_visited.pop()
    page_url = BASE_URL + "/" +  page_url + ".rst.txt"
    print(f"Visiting {page_url}")
    
    page = requests.get(page_url)
    
    # Search for additional urls in the page
    additional_pages = extract_urls_from_page(page.text)
    
    # ADD THE HEAD OF THE URL, THE URLS ARE "RELATIVE" (? CHECK)
    for item in additional_pages:
        item = "/".join(page.url.split("/")[4:-1]) + "/" + item
        if item not in index_pages:
            # Do not include a page if this is already in the index. This is probably a loop.
            index_pages.append(item)
            pages_to_be_visited.append(item)
            print(f"Page {item} was not in index. Adding it to the list of pages to be dumped.")
    
    with open(os.path.join(DUMP_POSITION, page_url.split("/")[-1]), mode="w") as f:
        f.write(page.text)

Visiting https://docs.hpc.cineca.it/_sources/general/general_info.rst.txt
Visiting https://docs.hpc.cineca.it/_sources/cloud/general/general_info.rst.txt
Page cloud/general/budget_accounting was not in index. Adding it to the list of pages to be dumped.
Page cloud/general/cineca_cloud_model was not in index. Adding it to the list of pages to be dumped.
Page cloud/general/what_is_cloud was not in index. Adding it to the list of pages to be dumped.
Visiting https://docs.hpc.cineca.it/_sources/cloud/general/what_is_cloud.rst.txt
Visiting https://docs.hpc.cineca.it/_sources/cloud/general/cineca_cloud_model.rst.txt
Visiting https://docs.hpc.cineca.it/_sources/cloud/general/budget_accounting.rst.txt
Visiting https://docs.hpc.cineca.it/_sources/hpc/hpc_enviroment.rst.txt
Page hpc/hpc_cineca-ai-hpyc was not in index. Adding it to the list of pages to be dumped.
Visiting https://docs.hpc.cineca.it/_sources/hpc/hpc_cineca-ai-hpyc.rst.txt
Visiting https://docs.hpc.cineca.it/_sources/services/serv