In [None]:
import os

from dotenv import load_dotenv
from elasticsearch_dsl.connections import create_connection

# Load variables from a local `.env` file; `override=True` lets values from
# the file replace variables that are already set in the environment.
load_dotenv(override=True)

# Create the Elasticsearch connection. Requests time out after 60 seconds and
# are retried up to 5 times on gateway errors (502/503/504) and on timeouts.
create_connection(
    hosts="https://elasticsearch.srv.webis.de:9200",
    # Credentials come from the environment (e.g. the `.env` file loaded
    # above) so they never have to be typed into the notebook. Missing
    # variables fall back to empty strings.
    # NOTE(review): the variable names are an assumption -- align them with
    # the names used in the project's `.env` file.
    http_auth=(
        os.environ.get("ELASTICSEARCH_USERNAME", ""),
        os.environ.get("ELASTICSEARCH_PASSWORD", ""),
    ),
    timeout=60,
    max_retries=5,
    retry_on_status=(502, 503, 504),
    retry_on_timeout=True,
)

In [None]:
from archive_query_log.orm import Provider


# Pull every provider document from the `aql_providers` index into memory.
providers: list[Provider] = [*Provider.search(index="aql_providers").scan()]
# Lookup table from each domain to its provider. If a domain is listed by
# several providers, the provider seen last wins.
providers_by_domain = dict(
    (domain, provider) for provider in providers for domain in provider.domains
)

In [None]:
from archive_query_log.orm import Archive

# Pull every archive document from the `aql_archives` index into memory.
archives: list[Archive] = [*Archive.search(index="aql_archives").scan()]
# Lookup table from archive name to archive.
archives_by_name = dict((archive.name, archive) for archive in archives)

# The archive that all converted SERPs are attributed to.
WAYBACK_MACHINE = archives_by_name["Internet Archive"]
WAYBACK_MACHINE

In [None]:
from datetime import datetime
from uuid import uuid5

from pydantic import BaseModel, field_validator, HttpUrl

from archive_query_log.namespaces import NAMESPACE_CAPTURE
from archive_query_log.utils.time import CET, UTC
from archive_query_log.orm import Serp, InnerArchive, InnerProvider, InnerCapture


class _Serp(BaseModel):
    """Legacy SERP description that can be converted to the current `Serp` model.

    Instances are validated from JSON with the fields `timestamp`, `url`,
    and `query`.
    """

    # Capture time; parsed from a POSIX timestamp by `parse_timestamp`.
    timestamp: datetime
    # URL of the captured search engine result page.
    url: HttpUrl
    # Search query; copied to `Serp.url_query` by `to_serp`.
    query: str

    @field_validator("timestamp", mode="before")
    @classmethod
    def parse_timestamp(cls, value) -> datetime:
        """Convert a raw POSIX timestamp to a datetime labelled as UTC.

        The value is first interpreted as seconds since the epoch in UTC,
        then converted to CET, and finally the CET wall-clock time is
        relabelled as UTC (which moves the instant by the CET offset).

        :param value: Raw, not yet validated POSIX timestamp.
        :return: Timezone-aware datetime with `tzinfo` set to UTC.
        """
        timestamp = datetime.fromtimestamp(value, tz=UTC)
        # Bug fix because the AQL-22 data is in CET, but the timestamps are
        # not marked as such.
        timestamp = timestamp.astimezone(CET)
        timestamp = timestamp.replace(tzinfo=UTC)
        return timestamp

    def to_serp(
        self,
        last_modified: datetime,
    ) -> Serp:
        """Build a `Serp` for this legacy record.

        The provider is looked up in `providers_by_domain` by the URL host
        (without a leading `www.`), and the archive is always
        `WAYBACK_MACHINE`. The capture ID is a UUIDv5 derived from the
        archive's CDX API URL, the captured URL, and the capture timestamp;
        the same ID is used for the SERP itself.

        :param last_modified: Value stored as `Serp.last_modified`.
        :return: The converted SERP. Status code (200), digest (empty), and
            MIME type (`text/html`) of the capture are fixed placeholders.
        :raises ValueError: If the URL has no host or no path, if no provider
            is known for the host, or if none of the provider's URL path
            prefixes matches the URL path.
        """
        archive = WAYBACK_MACHINE

        domain = self.url.host
        if domain is None:
            raise ValueError(f"No host found in URL {self.url}")
        domain = domain.removeprefix("www.")

        # Report unknown domains with a readable message instead of letting
        # the dictionary lookup fail with a bare `KeyError`.
        provider = providers_by_domain.get(domain)
        if provider is None:
            raise ValueError(f"No provider found for domain {domain}")

        path = self.url.path
        if path is None:
            raise ValueError(f"No path found in URL {self.url}")
        # Presumably provider prefixes may end with the `?` that starts the
        # query string, so it is appended before matching -- TODO confirm.
        path += "?"
        # Case-fold once; the prefixes themselves are compared as stored.
        folded_path = path.casefold()
        url_path_prefixes = [
            prefix
            for prefix in provider.url_path_prefixes
            if folded_path.startswith(prefix)
        ]
        if len(url_path_prefixes) == 0:
            raise ValueError(
                f"No matching URL path prefix found for URL {self.url} in provider {provider.id} ({path}; {provider.url_path_prefixes})"
            )
        # Prefer the longest matching prefix.
        url_path_prefix = max(url_path_prefixes, key=len)

        capture_id_components = (
            archive.cdx_api_url.encoded_string(),
            self.url.encoded_string(),
            self.timestamp.astimezone(tz=UTC).strftime("%Y%m%d%H%M%S"),
        )
        capture_id = uuid5(
            NAMESPACE_CAPTURE,
            ":".join(capture_id_components),
        )
        return Serp(
            id=capture_id,
            last_modified=last_modified,
            archive=InnerArchive(
                id=archive.id,
                cdx_api_url=archive.cdx_api_url,
                memento_api_url=archive.memento_api_url,
                priority=archive.priority,
            ),
            provider=InnerProvider(
                id=provider.id,
                domain=domain,
                url_path_prefix=url_path_prefix,
            ),
            capture=InnerCapture(
                id=capture_id,
                url=self.url,
                timestamp=self.timestamp,
                status_code=200,
                digest="",
                mimetype="text/html",
            ),
            url_query=self.query,
        )

In [None]:
from pathlib import Path
from typing import Iterator

from warcio import ArchiveIterator
from warcio.recordloader import ArcWarcRecord

# Directory with the manually annotated raw SERP WARC files.
TEST_WARC_PATH = (
    Path("..") / "data" / "manual-annotations" / "archived-raw-serps" / "warcs"
)


def iter_test_serps() -> Iterator[tuple[str, Serp, ArcWarcRecord]]:
    """Iterate over all response records of the WARC files in `TEST_WARC_PATH`.

    For every response record, the JSON in its `Archived-URL` WARC header is
    parsed as a legacy `_Serp` and converted to a `Serp`; the header is then
    removed from the record. The record payload is buffered in memory so that
    the yielded record stays usable after the iterator has advanced to the
    next record and after the source file has been closed.

    :return: Tuples of test case name (the file name up to the first `-`),
        converted SERP, and WARC record. Files are visited in sorted order.
    """
    from io import BytesIO

    # `glob` yields paths in arbitrary order; sort for reproducible output.
    for path in sorted(TEST_WARC_PATH.glob("*.warc.gz")):
        name = path.name.split("-")[0]
        with open(path, "rb") as stream:
            record: ArcWarcRecord
            for record in ArchiveIterator(stream):
                if record.rec_type != "response":
                    continue
                archived_url = record.rec_headers["Archived-URL"]
                del record.rec_headers["Archived-URL"]
                legacy_serp = _Serp.model_validate_json(archived_url)
                serp = legacy_serp.to_serp(
                    last_modified=datetime.fromisoformat(
                        record.rec_headers["WARC-Date"]
                    ),
                )
                # The payload stream reads lazily from the open file and is
                # drained when the archive iterator moves on. Copy the
                # remaining bytes now, so the record can still be written
                # after all records have been collected.
                chunks = []
                while chunk := record.raw_stream.read(65536):
                    chunks.append(chunk)
                record.raw_stream = BytesIO(b"".join(chunks))
                yield name, serp, record

In [None]:
# Materialize all test SERPs, ordered by test case name. The sort is stable,
# so entries with the same name keep their iteration order.
test_serps = sorted(iter_test_serps(), key=lambda entry: entry[0])

In [None]:
# Directory that receives the generated test WARC and JSONL files.
TESTS_PATH = Path("..") / "data" / "tests"

In [None]:
from itertools import groupby

from warcio.recordloader import ArcWarcRecord
from warcio.warcwriter import WARCWriter

from archive_query_log.orm import WarcLocation

# Write one `<name>.warc.gz` and one `<name>.jsonl` file per test case.
# `groupby` only merges adjacent entries, so `test_serps` must already be
# sorted by name.

# Make sure the output directory exists before opening files inside it.
TESTS_PATH.mkdir(parents=True, exist_ok=True)

for name, group in groupby(test_serps, key=lambda x: x[0]):
    print(f"Writing test case {name}")

    # Append the extensions to the name instead of using `with_suffix`,
    # which would replace everything after the last dot of a dotted name.
    warc_path = TESTS_PATH / f"{name}.warc.gz"
    serp_path = TESTS_PATH / f"{name}.jsonl"
    with (
        warc_path.open("wb") as warc_file,
        serp_path.open("wt", encoding="utf-8") as serp_file,
    ):
        writer = WARCWriter(warc_file, gzip=True)
        # Write WARC info record.
        warc_info_record: ArcWarcRecord = writer.create_warcinfo_record(
            filename=warc_path.name, info={}
        )
        writer.write_record(warc_info_record)
        for _, serp, record in group:
            # Record where the record starts and how many bytes it occupies
            # in the output file.
            offset = warc_file.tell()
            writer.write_record(record)
            length = warc_file.tell() - offset
            serp.warc_location = WarcLocation(
                file=warc_path.name,
                offset=offset,
                length=length,
            )
            # One JSON document per line.
            serp_file.write(serp.model_dump_json() + "\n")