# Case study notebook
This notebook contains the executable code snippets featured in the lesson.

In [1]:
# Downloading CDX data for each URL

import tenacity

# Shared retry policy for the download helpers in this notebook: up to 10
# attempts, with exponential backoff between attempts (wait bounded to 2-32 s).
# reraise=True makes tenacity re-raise the last underlying exception (e.g. an
# HTTPError or a Timeout) once the attempts are exhausted, instead of wrapping
# it in a generic RetryError, so error messages printed by callers stay useful.
retry = tenacity.retry(
    stop=tenacity.stop_after_attempt(10),
    wait=tenacity.wait_exponential(multiplier=1, min=2, max=32),
    reraise=True,
)

import time
import requests

@retry
def download_cdx_data(url):
    """Fetch the May 2000 CDX index listing for ``url`` from the Wayback Machine.

    The query is limited to captures between 2000-05-01 00:00:00 and
    2000-05-31 23:59:59 with status code 200, collapsed on digest.

    Args:
        url: The URL (or bare domain) to look up in the CDX index.

    Returns:
        The raw response body as text.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status.
        Exception: If the response status is anything other than 200.

    Any exception raised here triggers the ``retry`` policy; what finally
    propagates after the last attempt depends on how that policy is configured.
    """
    time.sleep(1.5)  # Observe the CDX Server rate limit (see the GitHub post cited in the lesson).
    print(f"Fetching CDX data for: {url}")
    # Pass the query as `params` so requests percent-encodes each value. A target
    # URL containing '&', '?', '#' or '=' would otherwise corrupt the query string.
    response = requests.get(
        "https://web.archive.org/cdx/search/cdx",
        params={
            "url": url,
            "from": "20000501000000",
            "to": "20000531235959",
            "filter": "statuscode:200",
            "collapse": "digest",
        },
        timeout=10,
    )
    response.raise_for_status()

    # raise_for_status() only covers 4xx/5xx; reject any other non-200 answer too.
    if response.status_code != 200:
        raise Exception(f"Failed to fetch CDX data for {url}: {response.status_code}")
    return response.text


In [2]:
import csv
import hashlib
from pathlib import Path

csv_file = "nikkeibp-may2000-abridged.csv"

# Load every CSV row as a dict keyed by the header names.
# newline='' is what the csv module expects of file objects passed to it, so
# that newlines embedded inside quoted fields are parsed correctly.
with open(csv_file, mode='r', encoding='utf-8', newline='') as file:
    urls_data = list(csv.DictReader(file))

# Generate an MD5 hash for each URL.
# The hash names the per-URL directory that holds the CDX data and HTML data.
for url in urls_data:
    url['md5'] = hashlib.md5(url['url'].encode('utf-8')).hexdigest()

print(urls_data)

# Expected output: 
# [{'rank': '1', 'url': 'yahoo.co.jp', 'md5': 'e5fd6205dda399e39d9c3b055734f83f'}, ...


[{'rank': '1', 'url': 'yahoo.co.jp', 'md5': 'e5fd6205dda399e39d9c3b055734f83f'}, {'rank': '3', 'url': 'biglobe.ne.jp', 'md5': '598ca1bee5d257ac6f3c8477a61c14a2'}, {'rank': '4', 'url': 'geocities.co.jp', 'md5': '03e55e791b0c30a0e70b7fcb73d01894'}, {'rank': '5', 'url': 'nifty.com', 'md5': '02efb45e70adeb94cccd008670559250'}, {'rank': '6', 'url': 'nifty.ne.jp', 'md5': '245505630037d8a520036ff590e3cd2e'}, {'rank': '7', 'url': 'dti.ne.jp', 'md5': '5d5f85669bd24bde374e4e2da080b205'}, {'rank': '8', 'url': 'so-net.ne.jp', 'md5': '535bc4bb48fda288effb74883d8ab222'}, {'rank': '10', 'url': 'hi-ho.ne.jp', 'md5': 'bef081a841799df11166f1d69851d534'}, {'rank': '11', 'url': 'infoweb.ne.jp', 'md5': 'b53ae61a77ead6d7949a01a8b562d704'}, {'rank': '12', 'url': 'mbn.or.jp', 'md5': '4c46800776c8bb778889fda89a6a6b90'}, {'rank': '13', 'url': 'lycos.co.jp', 'md5': '425b80310c0dcf16ac09e4c76a273734'}, {'rank': '14', 'url': 'goo.ne.jp', 'md5': '9cec14420da076836a3fdc6e832b9b92'}, {'rank': '15', 'url': 'odn.ne.jp'

In [3]:
# Fetch the CDX listing for every URL in the dataset and cache it on disk at
# data/<md5 of the URL>/cdx.csv. URLs whose cache file is already present are
# skipped, so the cell can be re-run without repeating finished downloads.

for entry in urls_data:
    site = entry['url']
    cdx_file_path = Path("data") / entry['md5'] / "cdx.csv"

    if not cdx_file_path.exists():
        try:
            cdx_text = download_cdx_data(site)
            # Create the per-URL directory on first use, then store the listing.
            cdx_file_path.parent.mkdir(parents=True, exist_ok=True)
            cdx_file_path.write_text(cdx_text, encoding='utf-8')
            print(f"CDX data saved for {site} at {cdx_file_path}")
        except Exception as err:
            # Best effort: report the failure and move on to the next URL.
            print(f"Error fetching CDX data for {site}: {err}")
    else:
        print(f"CDX data for {site} already exists at {cdx_file_path}. Skipping download.")

CDX data for yahoo.co.jp already exists at data/e5fd6205dda399e39d9c3b055734f83f/cdx.csv. Skipping download.
CDX data for biglobe.ne.jp already exists at data/598ca1bee5d257ac6f3c8477a61c14a2/cdx.csv. Skipping download.
CDX data for geocities.co.jp already exists at data/03e55e791b0c30a0e70b7fcb73d01894/cdx.csv. Skipping download.
CDX data for nifty.com already exists at data/02efb45e70adeb94cccd008670559250/cdx.csv. Skipping download.
CDX data for nifty.ne.jp already exists at data/245505630037d8a520036ff590e3cd2e/cdx.csv. Skipping download.
CDX data for dti.ne.jp already exists at data/5d5f85669bd24bde374e4e2da080b205/cdx.csv. Skipping download.
CDX data for so-net.ne.jp already exists at data/535bc4bb48fda288effb74883d8ab222/cdx.csv. Skipping download.
CDX data for hi-ho.ne.jp already exists at data/bef081a841799df11166f1d69851d534/cdx.csv. Skipping download.
CDX data for infoweb.ne.jp already exists at data/b53ae61a77ead6d7949a01a8b562d704/cdx.csv. Skipping download.
CDX data for m

In [None]:
# Define a function to download a web page snapshot.
@retry
def download_snapshot(url, timestamp, request_flag="id_", timeout=30):
    """Download one archived page from the Wayback Machine and return its text.

    Args:
        url: The original URL of the archived page.
        timestamp: The Wayback Machine timestamp of the capture to request.
        request_flag: Modifier appended directly after the timestamp in the
            Wayback URL (defaults to "id_").
        timeout: Seconds passed to ``requests.get`` as its timeout. Without one,
            a stalled connection would block forever and never reach the retry
            policy.

    Returns:
        The response body decoded with the encoding detected from its content.

    Notes:
        HTTP error statuses are not raised here; the body of an error response
        is returned like any other page. Exceptions (including timeouts) are
        handled by the ``retry`` decorator.
    """
    time.sleep(0.5)  # While this establishes a much lower request frequency than 480 per second, being extra conservative here is a good idea.
    # Use a separate name so the `url` argument is not overwritten.
    snapshot_url = f"https://web.archive.org/web/{timestamp}{request_flag}/{url}"
    try:
        response = requests.get(snapshot_url, timeout=timeout)
    # Catch ChunkedEncodingError and retry once with more conservative headers.
    except requests.exceptions.ChunkedEncodingError as e:
        print(f"ChunkedEncodingError encountered for {snapshot_url}: {e}. Retrying...")
        headers = {
            # don't let the server send you gzip/deflate
            "Accept-Encoding": "identity",
            # hint to close the connection after this request
            "Connection": "close",
        }
        response = requests.get(snapshot_url, headers=headers, timeout=timeout)
    # Decode using the encoding detected from the body rather than the one
    # declared in the HTTP headers.
    response.encoding = response.apparent_encoding
    return response.text

# Download snapshots for each URL by iterating through the downloaded CDX files
cdx_files = Path("data").glob("*/cdx.csv")
for cdx_file in cdx_files:
    print(f"Processing CDX file: {cdx_file}")
    snapshots = []
    # For non-adjacent rows with the same digest, we will only keep the first one.
    seen_digests = set()
    with open(cdx_file, mode='r', encoding='utf-8', newline='') as file:
        for row in csv.reader(file, delimiter=" "):
            # Skip blank or truncated lines; indexing row[5] on them would raise
            # IndexError and abort the whole cell.
            if len(row) < 6:
                continue
            digest = row[5]
            if digest not in seen_digests:
                seen_digests.add(digest)
                snapshots.append(row)
    print(f"Found {len(snapshots)} snapshots in {cdx_file}")

    # Check if there are any snapshots already downloaded for this CDX file
    html_files_downloaded = list(cdx_file.parent.glob("*.html"))
    if html_files_downloaded:
        html_file_names = [file.stem for file in html_files_downloaded]
        print(html_file_names)
        # Set membership keeps the filter linear in the number of snapshots.
        downloaded_timestamps = set(html_file_names)
        snapshots = [snapshot for snapshot in snapshots if snapshot[1] not in downloaded_timestamps]
    print(f"Remaining snapshots to download: {len(snapshots)}")

    # Iterate through the snapshots and download each one
    for snapshot in snapshots:
        # Column 1 of a row is used as the timestamp, column 2 as the URL.
        url, timestamp = snapshot[2], snapshot[1]
        print(f"Downloading snapshot for URL: {url} at timestamp: {timestamp}...")

        try:
            html_content = download_snapshot(url, timestamp)
            # Save the HTML content to a file named after the timestamp of the snapshot
            html_file_path = cdx_file.parent / f"{timestamp}.html"
            with open(html_file_path, 'w', encoding='utf-8') as html_file:
                html_file.write(html_content)
            print(f"Snapshot saved at {html_file_path}")
        except Exception as e:
            # Best effort: report the failure and continue with the next snapshot.
            print(f"Error downloading snapshot for {url} at {timestamp}: {e}")

Processing CDX file: data/a65a310ffdf9e798db8143f21de138ef/cdx.csv
Found 6 snapshots in data/a65a310ffdf9e798db8143f21de138ef/cdx.csv
['20000510113800', '20000510063138', '20000510073657', '20000520073937', '20000520061629', '20000511110758']
Remaining snapshots to download: 0
Processing CDX file: data/425b80310c0dcf16ac09e4c76a273734/cdx.csv
Found 12 snapshots in data/425b80310c0dcf16ac09e4c76a273734/cdx.csv
['20000510142936', '20000519223002', '20000510211920', '20000510044421', '20000510181458', '20000510052003', '20000511043728', '20000510044055', '20000510063003', '20000510090616', '20000510071029', '20000511171913']
Remaining snapshots to download: 0
Processing CDX file: data/5d5f85669bd24bde374e4e2da080b205/cdx.csv
Found 0 snapshots in data/5d5f85669bd24bde374e4e2da080b205/cdx.csv
Remaining snapshots to download: 0
Processing CDX file: data/439868a9c18529515a0be3529bfb20aa/cdx.csv
Found 16 snapshots in data/439868a9c18529515a0be3529bfb20aa/cdx.csv
['20000520005747', '20000510043

In [9]:
# Demo: fetch a single page with the helper (passing "2000" as the timestamp)
# and show the returned HTML.
sample_html = download_snapshot("http://www.highway.ne.jp/top.html", "2000")
print(sample_html)


<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=shift_jis">
<title>TOP</title>
<style>
<!--
div          { font-size: 10px ;}
-->
</style>
</head>
<BODY bgcolor="#009966" marginwidth="0" marginheight="0" leftmargin="0" topmargin="0">
<!-- title -->
<table border="0" cellpadding="0" cellspacing="0" width="487">
<tr><!-- row 01 -->
   <td rowspan="1" colspan="3"><img name="Nty2_01_01" src="img/bar.gif" width="487" height="9" border="0"></td>
   <td><img src="img/00.gif" width="1" height="9" border="0"></td>
  </tr>
  <tr><!-- row 02 -->
    <td rowspan="1" colspan="3" background="img/tybg.jpg"><img name="Nty2_01_01" src="img/ty.gif" border="0" width="487" height="121" alt="Highway Internet"></td>
   <td><!-- <img src="img/00.gif" width="1" height="144" border="0"> --></td>
  </tr>
  <tr><!-- row 04 -->
   <td rowspan="1" colspan="3">
<IMG WIDTH=1 HEIGHT=1 SRC="http://www.cyberclick.net/module/track?cid=c00004729">
      <CENTER><!--menu table -->
      <table bo

In [7]:
# Search for all downloaded HTML files and see if they contain the element <frameset>
from bs4 import BeautifulSoup
# Walk every .html file below data/ and collect the paths of those whose parsed
# markup contains a <frameset> element.
html_files = Path("data").glob("*/**/*.html")
html_files_with_frameset = []
for html_file in html_files:
    try:
        markup = html_file.read_text(encoding='utf-8')
        parsed = BeautifulSoup(markup, 'html.parser')
        if not parsed.find('frameset'):
            continue
        html_files_with_frameset.append(html_file)
        print(f"Found <frameset> in {html_file}")
    except Exception as err:
        # A file that cannot be read or parsed is reported and skipped.
        print(f"Error reading {html_file}: {err}")

# For each frameset found, we will download the pages referenced by its <frame> elements and save each one to a file.




Found <frameset> in data/b3bdcb712d38128626fbb0c76fefed0f/20000519230736.html
Found <frameset> in data/3e907a24a0769b56fc27880bc2db17be/20000510010756.html
Found <frameset> in data/58e98dfc94996a1d7f97f79d7f6ba9ed/20000511212508.html
Found <frameset> in data/8114c30ebe6c67f874c6bc24c94aaeb7/20000511101011.html
Found <frameset> in data/d4bffe3267a972738ae687eb26cf43d8/20000511200626.html


In [8]:
# For every page that contains a <frameset>, download the document behind each
# <frame> and store it next to the page as <timestamp>-frame-<index>.html.
for frameset_file in html_files_with_frameset:
    # The file name is the snapshot timestamp; the directory name is the URL's MD5.
    timestamp = frameset_file.stem
    url_md5 = frameset_file.parent.name
    # look for snapshot URL by URL md5
    snapshot_url = next((url['url'] for url in urls_data if url['md5'] == url_md5), None)
    if not snapshot_url:
        raise ValueError(f"No URL found for MD5 {url_md5} in the dataset.")
    print(f"Processing frameset file: {frameset_file} for URL: {snapshot_url} at timestamp: {timestamp}")

    try:
        with open(frameset_file, 'r', encoding='utf-8') as file:
            content = file.read()
        soup = BeautifulSoup(content, 'html.parser')
        for i, frame in enumerate(soup.find_all('frame')):
            frame_src = (frame.get('src') or '').strip()
            if not frame_src:
                # Nothing to download for a frame without a source.
                print(f"Skipping frame {i}: no src attribute.")
                continue

            # Decide whether the source is already absolute. The scheme check is
            # case-insensitive and also accepts https:// and protocol-relative
            # ("//host/path") sources, which must not be glued onto the site URL.
            lowered_src = frame_src.lower()
            if lowered_src.startswith('//'):
                frame_src = f"http:{frame_src}"
            elif not lowered_src.startswith(('http://', 'https://')):
                # Relative source: resolved against the site's root URL from the
                # dataset (not against the path of the page that embeds it).
                frame_src = f"{snapshot_url}/{frame_src.lstrip('/')}"

            print(f"Downloading frame {i} from {frame_src} at timestamp {timestamp}...")
            # Download the frame content
            frame_content = download_snapshot(frame_src, timestamp, request_flag="fw_")
            # Save the frame content to a file named after the timestamp and frame index
            frame_file_path = frameset_file.parent / f"{timestamp}-frame-{i}.html"
            with open(frame_file_path, 'w', encoding='utf-8') as frame_file:
                frame_file.write(frame_content)
            print(f"Frame content saved at {frame_file_path}")
    except Exception as e:
        # Best effort: a failure stops this frameset's remaining frames only;
        # the loop then continues with the next frameset file.
        print(f"Error processing {frameset_file}: {e}")

Processing frameset file: data/b3bdcb712d38128626fbb0c76fefed0f/20000519230736.html for URL: plala.or.jp at timestamp: 20000519230736
Downloading frame 0 from plala.or.jp/index1.html at timestamp 20000519230736...
Frame content saved at data/b3bdcb712d38128626fbb0c76fefed0f/20000519230736-frame-0.html
Downloading frame 1 from plala.or.jp/index2.html at timestamp 20000519230736...
Frame content saved at data/b3bdcb712d38128626fbb0c76fefed0f/20000519230736-frame-1.html
Processing frameset file: data/3e907a24a0769b56fc27880bc2db17be/20000510010756.html for URL: nikkei.co.jp at timestamp: 20000510010756
Downloading frame 0 from nikkei.co.jp/cont.html at timestamp 20000510010756...
Frame content saved at data/3e907a24a0769b56fc27880bc2db17be/20000510010756-frame-0.html
Downloading frame 1 from http://globe.nikkei.co.jp/ad/control/ng01.html at timestamp 20000510010756...
Frame content saved at data/3e907a24a0769b56fc27880bc2db17be/20000510010756-frame-1.html
Processing frameset file: data/58