In [None]:
import requests
from urllib.parse import urlparse
from requests.exceptions import RequestException

def retrieve_archive_html(original_url: str, timestamp: str) -> str:
    """
    Downloads the HTML content of a URL from the Wayback Machine at a specific timestamp.

    :param original_url: The original URL whose archived capture is requested.
    :param timestamp: The timestamp in the format 'YYYYMMDDhhmmss'.
    :return: The HTML content of the archived page, decoded to text.
    :raises RuntimeError: If the request fails, times out, or returns an
        HTTP error status. The underlying requests exception is chained
        as ``__cause__``.
    """

    # NOTE(review): the "id_" suffix after the timestamp is understood to
    # request the capture as originally archived (no Wayback toolbar or
    # link rewriting) -- confirm against the Wayback Machine documentation.
    wayback_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}"

    try:
        response = requests.get(wayback_url, timeout=10)
        response.raise_for_status()
        # When a text response declares no charset, requests decodes
        # `.text` as ISO-8859-1, which garbles pages that only state their
        # encoding in the HTML itself. Use the encoding detected from the
        # body in that case.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = response.apparent_encoding
        return response.text
    except RequestException as e:
        # Chain the original exception so the root cause stays in the traceback.
        raise RuntimeError(f"Failed to download archive: {e}") from e



In [2]:
from pathlib import Path
import random
import csv

# Go through each folder in data/, read its cdx.csv file, and download up to
# 10 randomly selected captures whose HTML file has not been saved yet.
#
# Columns used from each cdx.csv row:
#   row[0] -> base name of the saved HTML file ("<row[0]>.html")
#   row[1] -> timestamp passed to retrieve_archive_html
#   row[2] -> URL passed to retrieve_archive_html
#   row[4] -> status field; only rows equal to '200' are considered

for folder in Path("data").iterdir():
    if not folder.is_dir():
        continue
    print(f"Processing folder: {folder}")

    cdx_file = folder / "cdx.csv"
    if not cdx_file.exists():
        print(f"No cdx.csv found in {folder}")
        continue

    # Names of the HTML files already present in this folder. Captures are
    # written as "<row[0]>.html" below, so an exact name match is what
    # identifies a row that has already been downloaded.
    existing_html_names = {
        path.name for path in folder.glob("*.html") if path.is_file()
    }

    # newline='' is the csv module's recommended way to open files so that
    # newlines embedded in quoted fields are parsed correctly.
    with open(cdx_file, 'r', encoding='utf-8', newline='') as cdx_handle:
        pending_rows = [
            row
            for row in csv.reader(cdx_handle)
            if len(row) > 4
            and row[4] == '200'
            and f"{row[0]}.html" not in existing_html_names
        ]

    if not pending_rows:
        continue

    # Take a random sample of at most 10 rows that still need downloading.
    for row in random.sample(pending_rows, min(10, len(pending_rows))):
        url = row[2]
        timestamp = row[1]
        try:
            html_content = retrieve_archive_html(url, timestamp)
            # Save the HTML content to a file
            html_file_path = folder / f"{row[0]}.html"
            with open(html_file_path, 'w', encoding='utf-8') as html_file:
                html_file.write(html_content)
            print(f"Downloaded {url} at {timestamp} to {html_file_path}")
        except RuntimeError as e:
            # A failed download is reported and skipped so the remaining
            # rows and folders are still processed.
            print(f"Error downloading {url} at {timestamp}: {e}")

Processing folder: data/1
Processing folder: data/3
Processing folder: data/4
Processing folder: data/5
Processing folder: data/6
Processing folder: data/7
Processing folder: data/8
Processing folder: data/10
Processing folder: data/11
Processing folder: data/12
Processing folder: data/13
Processing folder: data/14
Processing folder: data/15
Processing folder: data/16
Processing folder: data/17
Processing folder: data/18
Processing folder: data/19
Processing folder: data/20
Processing folder: data/21
Processing folder: data/22
Processing folder: data/23
Processing folder: data/24
Processing folder: data/25
Processing folder: data/26
Processing folder: data/27
Processing folder: data/28
Processing folder: data/29
Processing folder: data/30
Processing folder: data/31
Processing folder: data/32
Processing folder: data/33
Processing folder: data/34
Processing folder: data/35
Processing folder: data/36
Processing folder: data/37
Processing folder: data/38
Processing folder: data/39
Processi