# Web archive APIs

This notebook explores the API options offered by several web archives, focusing on their CDX endpoints.


## Useful resources

- [Wayback Machine APIs](https://archive.org/help/wayback_api.php)
- [Wayback CDX API](https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server)
- [Archive-It's CDX/C API](https://support.archive-it.org/hc/en-us/articles/115001790023-Access-Archive-It-s-Wayback-index-with-the-CDX-C-API) – includes useful general documentation of the CDX format


In [28]:
# required modules
import requests
import pandas as pd
import json


# Registry of CDX endpoints, keyed by a short code for each web archive.
# Each entry holds the endpoint "url" (the address raw_cdx_query sends its
# request to) and a "type" tag. None of the cells shown in this notebook reads
# "type".
# NOTE(review): "wb" / "pywb" presumably name the server software behind each
# endpoint (Wayback vs. pywb) — confirm before branching on it.
APIS = {
    "ia": {"url": "http://web.archive.org/cdx/search/cdx", "type": "wb"},
    "nla": {"url": "https://web.archive.org.au/awa/cdx", "type": "pywb"},
    "bl": {"url": "https://www.webarchive.org.uk/wayback/archive/cdx", "type": "pywb"},
    "nlnz": {
        "url": "https://ndhadeliver.natlib.govt.nz/webarchive/cdx",
        "type": "pywb",
    },
    "ukgwa": {
        "url": "https://webarchive.nationalarchives.gov.uk/ukgwa/cdx",
        "type": "pywb",
    },
}


def raw_cdx_query(api, url, **kwargs):
    """Query the CDX endpoint registered under ``api`` and return the raw response.

    Args:
        api: Key into ``APIS`` selecting which endpoint to call.
        url: Value sent as the ``url`` query parameter.
        **kwargs: Extra query parameters, forwarded to the endpoint as given.

    Returns:
        The ``requests.Response`` object; the body is not parsed here.

    Raises:
        KeyError: If ``api`` is not a key of ``APIS``.
        requests.HTTPError: If the server answers with an error status.
    """
    # "url" and "output" are always set by this function; "output" overrides
    # any value supplied through kwargs.
    query = {**kwargs, "url": url, "output": "json"}
    endpoint = APIS[api]["url"]
    reply = requests.get(endpoint, params=query, timeout=60)
    reply.raise_for_status()
    return reply

## CDX request


In [14]:
# Query the NLNZ CDX endpoint directly, asking for JSON output limited to 10 rows.
param = {"url": "https://www.metservice.com/", "limit": 10, "output": "json"}

# The timeout keeps an unresponsive server from hanging the kernel, and
# raise_for_status() surfaces HTTP errors here rather than in a later cell
# (same handling as raw_cdx_query).
res = requests.get(
    "https://ndhadeliver.natlib.govt.nz/webarchive/cdx", params=param, timeout=60
)
res.raise_for_status()

# NOTE(review): parsing is left disabled, as in the original cell. The NLNZ
# output shown further down is one JSON object per line, so res.json() and the
# header-row DataFrame construction below may not apply to this endpoint —
# confirm before re-enabling.
# res = res.json()

# df = pd.DataFrame(res[1:], columns=res[0])
# df

In [31]:
# Request one row for the bare domain from the NLNZ endpoint and decode the
# JSON text of the response so it displays as a Python object.
single_capture = raw_cdx_query("nlnz", "metservice.com", limit=1, format="json")
json.loads(single_capture.text)

{'urlkey': 'com,metservice)/',
 'timestamp': '20090912115209',
 'url': 'http://www.metservice.com/',
 'mime': 'text/html',
 'status': '302',
 'digest': 'JIEOSEEJQGNYE7KAP6CF6CYJZ6OJOHQR',
 'redirect': '-',
 'robotflags': '-',
 'length': '0',
 'offset': '51633167',
 'filename': 'V1-FL1277029.arc',
 'load_url': 'http://10.4.1.66:80/nlnzwebarchive_PROD/ap/20090912115209id_/http://www.metservice.com/',
 'source': 'webarchive',
 'source-coll': 'webarchive'}

In [34]:
# Query one exact URL, filtered to status 200 and mimetype text/html,
# then count the lines of the response text.
exact_lines = raw_cdx_query(
    "nlnz",
    "http://www.metservice.com/",
    filter=["status:200", "mimetype:text/html"],
    format="json",
).text.splitlines()
exact = len(exact_lines)
exact

29

In [35]:
# Run the filtered exact-URL query again and display the response text
# split into its individual lines.
exact_response = raw_cdx_query(
    "nlnz",
    "http://www.metservice.com/",
    filter=["status:200", "mimetype:text/html"],
    format="json",
)
exact_response.text.splitlines()

['{"urlkey": "com,metservice)/", "timestamp": "20200501111229", "url": "https://www.metservice.com/", "mime": "text/html", "status": "200", "digest": "FLFBP6WSUUPWXRNO63QFJAAVWPEWXISM", "redirect": "-", "robotflags": "-", "length": "0", "offset": "88300448", "filename": "V1-FL53804551.warc", "load_url": "http://10.4.1.66:80/nlnzwebarchive_PROD/ap/20200501111229id_/https://www.metservice.com/", "source": "webarchive", "source-coll": "webarchive"}',
 '{"urlkey": "com,metservice)/", "timestamp": "20200923034106", "url": "https://www.metservice.com/", "mime": "text/html", "status": "200", "digest": "WPC36SVOJ5B327MIIF3PO2ULLNKGPFPP", "redirect": "-", "robotflags": "-", "length": "0", "offset": "22139601", "filename": "V1-FL58704011.warc", "load_url": "http://10.4.1.66:80/nlnzwebarchive_PROD/ap/20200923034106id_/https://www.metservice.com/", "source": "webarchive", "source-coll": "webarchive"}',
 '{"urlkey": "com,metservice)/", "timestamp": "20201014100509", "url": "https://www.metservice.c

In [50]:
# Import the FastWARC names by hand instead of `import *`, so it is clear where
# each one comes from. ArchiveIterator lives in fastwarc.warc and is needed by
# the record loop that consumes `stream`; importing it here lets the notebook
# run from a fresh kernel.
from fastwarc.stream_io import FileStream, GZipStream
from fastwarc.warc import ArchiveIterator

# Open the gzip-compressed WARC file as a decompressing stream.
# The path is relative to the notebook's working directory.
stream = GZipStream(
    FileStream(
        "../wb_collection/collections/my-web-archive/archive/rec-20230924224718994465-Legion.warc.gz",
        "rb",
    )
)

In [51]:
# Walk through every record that ArchiveIterator yields from `stream`.
# The attribute lines below are bare expressions: each value is looked up and
# then discarded, so they only illustrate what a record exposes.
# `body` is reassigned on every iteration, so after the loop it holds the
# content of the last record only; if no record is yielded, `body` is never
# assigned.
for record in ArchiveIterator(stream):
    record.headers  # Dict-like object containing the WARC headers
    record.record_id  # Shorthand for record.headers['WARC-Record-ID']
    record.record_type  # Shorthand for record.headers['WARC-Type']
    record.record_date  # Parsed record.headers['WARC-Date']
    record.content_length  # Effective record payload length
    record.stream_pos  # Record start offset in the (uncompressed) stream
    record.is_http  # Boolean indicating whether record is an HTTP record
    record.http_headers  # Dict-like object containing the parsed HTTP headers
    record.http_content_type  # Plain HTTP Content-Type without charset
    record.http_charset  # HTTP charset from the Content-Type header (if any)
    record.http_date  # Parsed HTTP Date header
    record.http_last_modified  # Parsed HTTP Last-Modified header
    record.reader  # A BufferedReader for the record content

    # Read and return up to 1024 bytes from the record stream
    body = record.reader.read(1024)

    # Consume and return the remaining record bytes, appending them to the
    # first chunk so `body` holds everything read from this record
    body += record.reader.read()

    # Or: Consume rest of stream without allocating a buffer for it (i.e., skip over)
    # record.reader.consume()

In [52]:
# Display `body`, i.e. the bytes read from the last record visited by the loop.
body

b''