# Case study notebook
This notebook contains the executable code snippets used in the lesson.

In [1]:
# Downloading CDX data for each URL

import tenacity

# Shared retry policy for the Wayback Machine requests below: stop after 10
# attempts, back off exponentially between attempts (wait clamped to 2-32
# seconds), and sleep a further 2 seconds in tenacity's `after` hook.
def _pause_after_attempt(_retry_state):
    """`after` hook for the retry policy: pause for two extra seconds."""
    time.sleep(2)


retry = tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=2, max=32),
    stop=tenacity.stop_after_attempt(10),
    after=_pause_after_attempt,
)
import time
import requests

@retry
def download_cdx_data(url):
    """Fetch the Wayback Machine CDX index for ``url`` and return it as text.

    The query is limited to captures between 2000-04-01 00:00:00 and
    2000-06-30 23:59:59 whose status code is 200, collapsed on digest.

    The function is wrapped by the module-level ``retry`` policy, so any
    exception raised here (timeout, HTTP error, unexpected status) triggers
    another attempt according to that policy.

    Args:
        url: The URL (or bare domain) to look up in the CDX index.

    Returns:
        The raw response body of the CDX server as a string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        Exception: If the server answers with a non-error status other than 200.
    """
    time.sleep(1.5) # Observing the CDX Server rate limit as stipulated in the GitHub post cited above.

    # Let requests build the query string so that `url` is percent-encoded;
    # interpolating it into the URL by hand breaks on characters such as
    # '&', '#', '?' or spaces.
    query = {
        "url": url,
        "from": "20000401000000",
        "to": "20000630235959",
        "filter": "statuscode:200",
        "collapse": "digest",
    }
    print(f"Fetching CDX data for: {url}")
    response = requests.get(
        "https://web.archive.org/cdx/search/cdx",
        params=query,
        timeout=10,
    )
    response.raise_for_status()

    # raise_for_status() only covers 4xx/5xx; reject any other non-200 answer too.
    if response.status_code != 200:
        raise Exception(f"Failed to fetch CDX data for {url}: {response.status_code}")
    return response.text


In [None]:
import csv
import hashlib
from pathlib import Path

csv_file = "nikkeibp-may2000-abridged.csv"

urls_data = []

# Load the CSV file into a list of dictionaries (one per row, keyed by the header).
# newline='' is what the csv module requires of the file object so that it can
# handle line endings (including newlines inside quoted fields) itself.
with open(csv_file, mode='r', encoding='utf-8', newline='') as file:
    reader = csv.DictReader(file)
    urls_data = list(reader)

# Generate an MD5 hash for each URL.
# We will use this hash for saving the CDX data and HTML data.
# (MD5 serves only as a stable directory name here, not for security.)

for url in urls_data:
    url['md5'] = hashlib.md5(url['url'].encode('utf-8')).hexdigest()

print(urls_data)

# Expected output: 
# [{'rank': '1', 'url': 'yahoo.co.jp', 'md5': 'e5fd6205dda399e39d9c3b055734f83f'}, ...


[{'rank': '1', 'url': 'yahoo.co.jp', 'md5': 'e5fd6205dda399e39d9c3b055734f83f'}, {'rank': '3', 'url': 'biglobe.ne.jp', 'md5': '598ca1bee5d257ac6f3c8477a61c14a2'}, {'rank': '4', 'url': 'geocities.co.jp', 'md5': '03e55e791b0c30a0e70b7fcb73d01894'}, {'rank': '5', 'url': 'nifty.com', 'md5': '02efb45e70adeb94cccd008670559250'}, {'rank': '6', 'url': 'nifty.ne.jp', 'md5': '245505630037d8a520036ff590e3cd2e'}, {'rank': '7', 'url': 'dti.ne.jp', 'md5': '5d5f85669bd24bde374e4e2da080b205'}, {'rank': '8', 'url': 'so-net.ne.jp', 'md5': '535bc4bb48fda288effb74883d8ab222'}, {'rank': '10', 'url': 'hi-ho.ne.jp', 'md5': 'bef081a841799df11166f1d69851d534'}, {'rank': '11', 'url': 'infoweb.ne.jp', 'md5': 'b53ae61a77ead6d7949a01a8b562d704'}, {'rank': '12', 'url': 'mbn.or.jp', 'md5': '4c46800776c8bb778889fda89a6a6b90'}, {'rank': '13', 'url': 'lycos.co.jp', 'md5': '425b80310c0dcf16ac09e4c76a273734'}, {'rank': '14', 'url': 'goo.ne.jp', 'md5': '9cec14420da076836a3fdc6e832b9b92'}, {'rank': '15', 'url': 'odn.ne.jp'

In [4]:
# Download the CDX index for every URL in the dataset.
# A URL whose CDX file is already on disk is skipped; otherwise the index is
# fetched from the Internet Archive's CDX API and stored under data/<md5>/cdx.csv.

for url in urls_data:
    cdx_file_path = Path("data") / url['md5'] / "cdx.csv"

    # Fetched on an earlier run: nothing to do for this URL.
    if cdx_file_path.exists():
        print(f"CDX data for {url['url']} already exists at {cdx_file_path}. Skipping download.")
        continue

    # Best effort: a failure for one URL is reported and the loop moves on.
    try:
        cdx_data = download_cdx_data(url['url'])
        # The target directory is named after the MD5 hash of the URL.
        cdx_file_path.parent.mkdir(parents=True, exist_ok=True)
        cdx_file_path.write_text(cdx_data, encoding='utf-8')
        print(f"CDX data saved for {url['url']} at {cdx_file_path}")
    except Exception as err:
        print(f"Error fetching CDX data for {url['url']}: {err}")

CDX data for yahoo.co.jp already exists at data/e5fd6205dda399e39d9c3b055734f83f/cdx.csv. Skipping download.
CDX data for biglobe.ne.jp already exists at data/598ca1bee5d257ac6f3c8477a61c14a2/cdx.csv. Skipping download.
CDX data for geocities.co.jp already exists at data/03e55e791b0c30a0e70b7fcb73d01894/cdx.csv. Skipping download.
CDX data for nifty.com already exists at data/02efb45e70adeb94cccd008670559250/cdx.csv. Skipping download.
CDX data for nifty.ne.jp already exists at data/245505630037d8a520036ff590e3cd2e/cdx.csv. Skipping download.
CDX data for dti.ne.jp already exists at data/5d5f85669bd24bde374e4e2da080b205/cdx.csv. Skipping download.
CDX data for so-net.ne.jp already exists at data/535bc4bb48fda288effb74883d8ab222/cdx.csv. Skipping download.
CDX data for hi-ho.ne.jp already exists at data/bef081a841799df11166f1d69851d534/cdx.csv. Skipping download.
CDX data for infoweb.ne.jp already exists at data/b53ae61a77ead6d7949a01a8b562d704/cdx.csv. Skipping download.
CDX data for m

In [13]:
# Analyze each CDX file and choose random snapshots for each URL 
# (if there is only one available snapshot to choose from, choose that one). 
# Make sure that the two snapshots have different digests. 

import random
def choose_random_snapshots_to_download(cdx_file_path, num_snapshots=2):
    """Pick random, digest-distinct snapshots from a CDX file that still need downloading.

    ``*.html`` files in the same directory as the CDX file count as already
    downloaded snapshots; their file stems are matched against the timestamp
    column of the CDX rows. The number of new snapshots requested is
    ``num_snapshots`` minus the number of those files.

    Args:
        cdx_file_path: Path to a space-delimited CDX file.
        num_snapshots: Total number of snapshots wanted for this CDX file,
            including the ones already downloaded.

    Returns:
        A list of CDX rows (each a list of strings). It may be shorter than
        requested, or empty, when too few distinct snapshots are available.
        No two returned rows share a digest, and none has the digest of an
        already downloaded snapshot.
    """
    TIMESTAMP_COL = 1  # 2nd column: capture timestamp
    DIGEST_COL = 5     # 6th column: content digest

    print(f"Choosing {num_snapshots} random snapshots from {cdx_file_path}")

    # check if there are already any downloaded snapshots
    snapshots_dir = Path(cdx_file_path).parent
    # file names without the extension; a set gives O(1) membership tests
    downloaded_htmls = {html_file.stem for html_file in snapshots_dir.glob('*.html')}
    print(f"Already downloaded {len(downloaded_htmls)} snapshots for {cdx_file_path}")
    num_snapshots -= len(downloaded_htmls)
    print(f"Number of new snapshots to download: {num_snapshots}")

    if num_snapshots <= 0:
        print(f"No new snapshots to download for {cdx_file_path}. Already have {len(downloaded_htmls)} downloaded.")
        return []

    # Read the CDX file and collect snapshots. Rows must reach the digest
    # column: shorter rows (blank or truncated lines) would otherwise raise
    # IndexError in the de-duplication step below.
    snapshots = []
    with open(cdx_file_path, 'r', encoding='utf-8', newline='') as cdx_file:
        for row in csv.reader(cdx_file, delimiter=' '):
            if len(row) > DIGEST_COL:
                snapshots.append(row)

    # Digests of the snapshots that are already on disk. Every capture with
    # one of these digests is excluded, not only the downloaded timestamp, so
    # a new pick can never duplicate the content of a downloaded snapshot.
    downloaded_digests = {
        row[DIGEST_COL] for row in snapshots if row[TIMESTAMP_COL] in downloaded_htmls
    }

    # Keep one row per remaining digest (the last occurrence wins).
    unique_snapshots = {}
    for row in snapshots:
        if row[DIGEST_COL] not in downloaded_digests:
            unique_snapshots[row[DIGEST_COL]] = row
    candidates = list(unique_snapshots.values())

    # Randomly select the target number of snapshots
    return random.sample(candidates, min(num_snapshots, len(candidates)))

# Example run against a single CDX file; prints the randomly selected CDX rows.
print(choose_random_snapshots_to_download('data/d91d0c585020a45454ea2f383d3632b2/cdx.csv'))


Choosing 2 random snapshots from data/d91d0c585020a45454ea2f383d3632b2/cdx.csv
Already downloaded 0 snapshots for data/d91d0c585020a45454ea2f383d3632b2/cdx.csv
Number of new snapshots to download: 2
[['jp,ne,freeweb)/', '20000510064154', 'http://www1.freeweb.ne.jp:80/', 'text/html', '200', 'GZAOLGGPTQHFVW7QNA6L33CXPQTIJFVP', '1013'], ['jp,ne,freeweb)/', '20000510125118', 'http://www3.freeweb.ne.jp:80/', 'text/html', '200', 'E6D7TWMPCWJ5R75AXYFUUIZ22UZ2I45Y', '489']]
