In [1]:
# Scrape the RIPE Atlas ping data for a given date range

import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

def get_date_dirs(index_url):
    resp = requests.get(index_url)
    soup = BeautifulSoup(resp.text, "html.parser")
    dirs = []
    for link in soup.find_all("a"):
        href = link.get("href")
        if href and href.endswith("/") and href[:4].isdigit():
            dirs.append(href.strip("/"))
    return sorted(dirs)

def get_ping_files_for_date(date_url, date_str, start_hour=None, end_hour=None, start_dt=None, end_dt=None):
    resp = requests.get(date_url)
    soup = BeautifulSoup(resp.text, "html.parser")
    files = []
    for link in soup.find_all("a"):
        href = link.get("href")
        if href and href.startswith("ping") and href.endswith(".bz2"):
            # Extract hour from filename: ping-YYYY-MM-DDTHHMM.bz2
            try:
                file_dt = datetime.strptime(href[5:18], "%Y-%m-%dT%H%M")
            except Exception:
                continue
            if start_dt and end_dt:
                if start_dt <= file_dt <= end_dt:
                    files.append(f"{date_str}/{href}")
            elif start_hour is not None and end_hour is not None:
                hour = int(href[16:18])
                if start_hour <= hour <= end_hour:
                    files.append(f"{date_str}/{href}")
            else:
                files.append(f"{date_str}/{href}")
    return files

def get_ping_files_between(start, end):
    # start, end: "YYYY-MM-DDTHH"
    start_dt = datetime.strptime(start, "%Y-%m-%dT%H")
    end_dt = datetime.strptime(end, "%Y-%m-%dT%H")
    index_url = "https://data-store.ripe.net/datasets/atlas-daily-dumps/"
    date_dirs = get_date_dirs(index_url)
    results = []
    for date_str in date_dirs:
        date_dt = datetime.strptime(date_str, "%Y-%m-%d")
        # Only process dates in range
        if start_dt.date() <= date_dt.date() <= end_dt.date():
            date_url = f"{index_url}{date_str}/"
            results.extend(get_ping_files_for_date(
                date_url, date_str, start_dt=start_dt, end_dt=end_dt
            ))
    return sorted(results)

# Example usage:
start = "2025-06-07T00"
end = "2025-06-14T00"
ping_files = get_ping_files_between(start, end)
# Example output:


In [4]:
# Do the same thing as above but generating the pattern without scraping the website

from datetime import datetime, timedelta

def generate_ping_files(start, end):
    # start, end: "YYYY-MM-DDTHH"
    start_dt = datetime.strptime(start, "%Y-%m-%dT%H")
    end_dt = datetime.strptime(end, "%Y-%m-%dT%H")
    results = []
    current = start_dt
    while current <= end_dt:
        date_str = current.strftime("%Y-%m-%d")
        hour_str = current.strftime("%H%M")
        results.append(f"{date_str}/ping-{date_str}T{hour_str}.bz2")
        current += timedelta(hours=1)
    return results

# Example usage:
start = "2025-06-24T00"
end = "2025-07-24T00"
ping_files = generate_ping_files(start, end)
print(ping_files)


['2025-06-24/ping-2025-06-24T0000.bz2', '2025-06-24/ping-2025-06-24T0100.bz2', '2025-06-24/ping-2025-06-24T0200.bz2', '2025-06-24/ping-2025-06-24T0300.bz2', '2025-06-24/ping-2025-06-24T0400.bz2', '2025-06-24/ping-2025-06-24T0500.bz2', '2025-06-24/ping-2025-06-24T0600.bz2', '2025-06-24/ping-2025-06-24T0700.bz2', '2025-06-24/ping-2025-06-24T0800.bz2', '2025-06-24/ping-2025-06-24T0900.bz2', '2025-06-24/ping-2025-06-24T1000.bz2', '2025-06-24/ping-2025-06-24T1100.bz2', '2025-06-24/ping-2025-06-24T1200.bz2', '2025-06-24/ping-2025-06-24T1300.bz2', '2025-06-24/ping-2025-06-24T1400.bz2', '2025-06-24/ping-2025-06-24T1500.bz2', '2025-06-24/ping-2025-06-24T1600.bz2', '2025-06-24/ping-2025-06-24T1700.bz2', '2025-06-24/ping-2025-06-24T1800.bz2', '2025-06-24/ping-2025-06-24T1900.bz2', '2025-06-24/ping-2025-06-24T2000.bz2', '2025-06-24/ping-2025-06-24T2100.bz2', '2025-06-24/ping-2025-06-24T2200.bz2', '2025-06-24/ping-2025-06-24T2300.bz2', '2025-06-25/ping-2025-06-25T0000.bz2', '2025-06-25/ping-2025-06

In [5]:
# Download the files
ping_file_urls = [f"https://data-store.ripe.net/datasets/atlas-daily-dumps/{f}" for f in ping_files]
import os
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed

os.makedirs("data/ping/", exist_ok=True)
import threading

in_progress = set()
in_progress_lock = threading.Lock()

def download_file(url):
    filename = os.path.basename(url)
    paths_to_check = [
        os.path.join("data/raw_ping", filename),
        os.path.join("data/decomp_ping", filename),
        os.path.join("data/ping", filename),
    ]
    for path in paths_to_check:
        if os.path.exists(path):
            print(f"File {path} already exists, skipping.")
            return
    
    dest_path = os.path.join("data/raw_ping", filename)
    print(f"Downloading {url} to {dest_path}...")
    with in_progress_lock:
        in_progress.add(dest_path)
    try:
        with urllib.request.urlopen(url) as response, open(dest_path, "wb") as out_file:
            while True:
                chunk = response.read(1024 * 1024)
                if not chunk:
                    break
                out_file.write(chunk)
    except Exception as e:
        print(f"Failed to download {url}: {e}")
    finally:
        with in_progress_lock:
            in_progress.discard(dest_path)

try:
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(download_file, url) for url in ping_file_urls]
        for future in as_completed(futures):
            future.result()
except KeyboardInterrupt:
    print("\nKeyboardInterrupt detected! Cleaning up unfinished downloads...")
    with in_progress_lock:
        for f in list(in_progress):
            if os.path.exists(f):
                try:
                    os.remove(f)
                    print(f"  Deleted unfinished file: {f}")
                except Exception as e:
                    print(f"  Could not delete {f}: {e}")

File data/raw_ping/ping-2025-06-24T0100.bz2 already exists, skipping.
File data/raw_ping/ping-2025-06-24T0000.bz2 already exists, skipping.
File data/raw_ping/ping-2025-06-24T0200.bz2 already exists, skipping.
File data/raw_ping/ping-2025-06-24T0300.bz2 already exists, skipping.
File data/raw_ping/ping-2025-06-24T0400.bz2 already exists, skipping.
File data/raw_ping/ping-2025-06-24T0600.bz2 already exists, skipping.
File data/raw_ping/ping-2025-06-24T0500.bz2 already exists, skipping.
File data/raw_ping/ping-2025-06-24T0800.bz2 already exists, skipping.
File data/raw_ping/ping-2025-06-24T0900.bz2 already exists, skipping.
File data/raw_ping/ping-2025-06-24T0700.bz2 already exists, skipping.
File data/raw_ping/ping-2025-06-24T1000.bz2 already exists, skipping.
File data/raw_ping/ping-2025-06-24T1100.bz2 already exists, skipping.
File data/raw_ping/ping-2025-06-24T1200.bz2 already exists, skipping.
File data/raw_ping/ping-2025-06-24T1300.bz2 already exists, skipping.
File data/raw_ping/p

Downloading https://data-store.ripe.net/datasets/atlas-daily-dumps/2025-07-23/ping-2025-07-23T0900.bz2 to data/raw_ping/ping-2025-07-23T0900.bz2...
Downloading https://data-store.ripe.net/datasets/atlas-daily-dumps/2025-07-23/ping-2025-07-23T1000.bz2 to data/raw_ping/ping-2025-07-23T1000.bz2...
Downloading https://data-store.ripe.net/datasets/atlas-daily-dumps/2025-07-23/ping-2025-07-23T1100.bz2 to data/raw_ping/ping-2025-07-23T1100.bz2...
Downloading https://data-store.ripe.net/datasets/atlas-daily-dumps/2025-07-23/ping-2025-07-23T1200.bz2 to data/raw_ping/ping-2025-07-23T1200.bz2...
Downloading https://data-store.ripe.net/datasets/atlas-daily-dumps/2025-07-23/ping-2025-07-23T1300.bz2 to data/raw_ping/ping-2025-07-23T1300.bz2...
Downloading https://data-store.ripe.net/datasets/atlas-daily-dumps/2025-07-23/ping-2025-07-23T1400.bz2 to data/raw_ping/ping-2025-07-23T1400.bz2...
Downloading https://data-store.ripe.net/datasets/atlas-daily-dumps/2025-07-23/ping-2025-07-23T1500.bz2 to data/r