In [8]:
import requests
from urllib.parse import urlparse
from requests.exceptions import RequestException

def retrieve_archive_html(original_url: str, timestamp: str):
    """
    Downloads the HTML content of a URL from the Wayback Machine at a specific timestamp.

    :param original_url: The original URL to be archived.
    :param timestamp: The timestamp in the format 'YYYYMMDDhhmmss'.
    :return: The HTML content of the archived page.
    """

    wayback_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}"
    
    try:
        response = requests.get(wayback_url, timeout=10)
        response.raise_for_status()
        return response.text
    except RequestException as e:
        raise RuntimeError(f"Failed to download archive: {e}") 



In [None]:
from pathlib import Path
import random
import csv

# Go through each folder in data/, check the content of the cdx.csv file, select 10 URLs to download. 

for folder in Path("data").iterdir():
    if folder.is_dir():
        print(f"Processing folder: {folder}")
        cdx_file = folder / "cdx.csv"
        downloaded_files = [file for file in folder.glob("*.html") if file.is_file()]
        if cdx_file.exists():
            with open(cdx_file, 'r', encoding='utf-8') as file:
                reader = csv.reader(file, delimiter=' ')
                # make reader into a list and filter rows
                url_rows = []
                for row in reader:
                    print(f"Processing row: {row}")
                    # Check if the row has enough columns and the status code is 200
                    if len(row) > 4 and row[4] == '200':
                        url_rows.append(row)
                print(f"Found {len(url_rows)} valid URLs in {cdx_file}.")
            # Filter out URLs that have already been downloaded
            url_rows = [row for row in url_rows if not any(file.stem == row[0] for file in downloaded_files)]
            print(f"Filtered down to {len(url_rows)} URLs that have not been downloaded yet.")

            if url_rows:
                selected_rows = random.sample(url_rows, min(10, len(url_rows)))
                for row in selected_rows:
                    url = row[2]
                    timestamp = row[1]
                    try:
                        html_content = retrieve_archive_html(url, timestamp)
                        # Save the HTML content to a file
                        html_file_path = folder / f"{row[0]}.html"
                        with open(f'{folder}/{timestamp}.html', 'w', encoding='utf-8') as html_file:
                            html_file.write(html_content)
                        print(f"Downloaded {url} at {timestamp} to {html_file_path}")
                    except RuntimeError as e:
                        print(f"Error downloading {url} at {timestamp}: {e}")
        else:
            print(f"No cdx.csv found in {folder}")

Processing folder: data/1
Processing row: ['jp,co,yahoo)/', '20000229163149', 'http://www121.yahoo.co.jp:80/', 'text/html', '200', '7OY5HCVOJALTGSUPICD5HN5AYQMVZAXI', '5282']
Processing row: ['jp,co,yahoo)/', '20000229171208', 'http://www.yahoo.co.jp:80/?', 'text/plain', '302', 'VXEDWGPHSNERWHDOUD6YWRWNT4ZOLEX4', '120']
Processing row: ['jp,co,yahoo)/', '20000301054711', 'http://www119.yahoo.co.jp:80/', 'text/html', '200', 'DQCTETZ3VATNVTTOBLXJVWNUYCFAY4V4', '5285']
Processing row: ['jp,co,yahoo)/', '20000301060532', 'http://www.yahoo.co.jp:80/', 'text/html', '200', 'DQCTETZ3VATNVTTOBLXJVWNUYCFAY4V4', '5279']
Processing row: ['jp,co,yahoo)/', '20000301070501', 'http://www.yahoo.co.jp:80/', 'text/html', '200', 'FRPHCDHKU33VGEG4A6D3FLPSDMBG3HP5', '5244']
Processing row: ['jp,co,yahoo)/', '20000301081906', 'http://www.yahoo.co.jp:80/', 'text/html', '200', 'DA7FCQ6NDKG5BL6EUPY65N6SFREKHCJ6', '5243']
Processing row: ['jp,co,yahoo)/', '20000301143405', 'http://www.yahoo.co.jp:80/', 'text/htm

FileNotFoundError: [Errno 2] No such file or directory: 'data/1/jp,co,yahoo)/.html'