In [1]:
# This block builds query URLs for the Wayback Machine CDX API, which returns a list of the available snapshots of a URL.
# It uses the requests library to assemble the query URL; the json library is imported for parsing JSON responses, although nothing in this cell calls it.
import requests
import json

# Function to get the CDX API URL for a given URL
def make_wm_cdx_url(url, from_time="19960101", to_time="20051231"):
    """
    Build the Wayback Machine CDX API query URL for `url`.

    `from_time` and `to_time` are passed through as the "from" and "to"
    query parameters that bound the time range of the search.
    Returns the complete URL, query string included, as a string.
    """
    query = {"url": url, "from": from_time, "to": to_time}

    # Prepare (but never send) a GET request so that requests takes care of
    # encoding the query string,
    # eg. https://web.archive.org/cdx/search/cdx?url=example.com&from=19960101&to=20051231
    cdx_request = requests.Request(
        "GET",
        "https://web.archive.org/cdx/search/cdx",
        params=query,
    )
    return cdx_request.prepare().url


In [None]:
# Download CDX data for every URL listed in a CSV file.
import csv
# load csv file nikkeibp-may2000-ja_only.csv 
def load_urls_from_csv(file_path):
    """
    Read every non-empty row of a UTF-8 encoded CSV file.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list of rows, each row being the list of all column values (strings)
        as produced by csv.reader. The whole row is kept, not only one column;
        the download loop later in this notebook reads index 0 of each row as
        the ranking and index 1 as the URL.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' hands line-ending handling to the csv module, as the csv
    # documentation requires; without it, line breaks embedded in quoted
    # fields are altered by universal-newline translation.
    with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
        # Blank lines come back from csv.reader as empty lists; drop them.
        return [row for row in csv.reader(csvfile) if row]


# Load all non-empty rows of the ranking CSV. The download loop below reads
# index 0 of each row as the ranking number and index 1 as the URL.
urls = load_urls_from_csv('nikkeibp-may2000-ja_only.csv')


import os

# Download the CDX listing for every (ranking, URL) row and cache it as
# data/<ranking>/cdx.csv, so re-running this cell only fetches what is missing.

# Seconds to wait on the server before giving up. requests applies no timeout
# by default, so without one a stalled connection would hang the loop forever.
CDX_REQUEST_TIMEOUT = 60

for row in urls:
    # Keep the row under its own name so the loop variable is not rebound
    # from a list to a string in the middle of an iteration.
    url_ranking = row[0]
    url = row[1]
    output_dir = f"data/{url_ranking}"
    output_path = f"{output_dir}/cdx.csv"

    # Check the cache first, so "Requesting" is only logged when a request
    # is actually going to be made.
    if os.path.exists(output_path):
        print(f"CDX data already exists for URL: {url_ranking} - {url}. Skipping download.")
        continue  # Skip to the next URL if the file already exists

    cdx_api_url = make_wm_cdx_url(url, from_time="20000101", to_time="20010101")
    print(f"Requesting CDX data for URL: {url_ranking} - {url}")

    try:
        response = requests.get(cdx_api_url, timeout=CDX_REQUEST_TIMEOUT)
        response.raise_for_status()  # Raise an error for bad responses

        # Store the response in a subdirectory named after the URL ranking number,
        # creating the directory if it does not exist
        os.makedirs(output_dir, exist_ok=True)

        # Write to a temporary name and rename afterwards: the existence of
        # cdx.csv is what marks a URL as done, so an interrupted write must
        # not leave a truncated cdx.csv behind.
        partial_path = f"{output_path}.part"
        with open(partial_path, 'w', encoding='utf-8') as f:
            f.write(response.text)
        os.replace(partial_path, output_path)
        print(f"CDX data saved for URL: {url_ranking} - {url}")
    except requests.RequestException as e:
        # Network / HTTP failures (including timeouts): report and move on;
        # no cdx.csv is created, so the next run retries this URL.
        print(f"Error fetching CDX data for URL: {url_ranking} - {url}. Error: {e}")
    except Exception as e:
        # Anything else (e.g. file system errors) is reported without aborting the batch.
        print(f"An unexpected error occurred for URL: {url_ranking} - {url}. Error: {e}")
    



Requesting CDX data for URL: 1 - yahoo.co.jp
CDX data saved for URL: 1 - yahoo.co.jp
Requesting CDX data for URL: 3 - biglobe.ne.jp
CDX data saved for URL: 3 - biglobe.ne.jp
Requesting CDX data for URL: 4 - geocities.co.jp
CDX data saved for URL: 4 - geocities.co.jp
Requesting CDX data for URL: 5 - nifty.com
CDX data saved for URL: 5 - nifty.com
Requesting CDX data for URL: 6 - nifty.ne.jp
CDX data saved for URL: 6 - nifty.ne.jp
Requesting CDX data for URL: 7 - dti.ne.jp
CDX data saved for URL: 7 - dti.ne.jp
Requesting CDX data for URL: 8 - so-net.ne.jp
CDX data saved for URL: 8 - so-net.ne.jp
Requesting CDX data for URL: 10 - hi-ho.ne.jp
CDX data saved for URL: 10 - hi-ho.ne.jp
Requesting CDX data for URL: 11 - infoweb.ne.jp
CDX data saved for URL: 11 - infoweb.ne.jp
Requesting CDX data for URL: 12 - mbn.or.jp
CDX data saved for URL: 12 - mbn.or.jp
Requesting CDX data for URL: 13 - lycos.co.jp
CDX data saved for URL: 13 - lycos.co.jp
Requesting CDX data for URL: 14 - goo.ne.jp
CDX dat