In [None]:
from datetime import datetime, UTC
from typing import Literal, Annotated, TypeAlias
from uuid import UUID, uuid4

from annotated_types import Ge
from elasticsearch_dsl import (
    Date as _Date,
    RankFeature as _RankFeature,
    Keyword as _Keyword,
)
from pydantic import Field, AliasChoices

from elasticsearch_pydantic import (
    BaseDocument,
    BaseInnerDocument,
    KeywordField as Keyword,
)


# ``int`` annotated with elasticsearch_dsl's ``Keyword`` field.
# NOTE(review): ``_Keyword`` is passed as a class here, while the aliases below
# pass field *instances* -- confirm that the ORM accepts both forms.
IntKeyword: TypeAlias = Annotated[int, _Keyword]
# ``datetime`` annotated with an elasticsearch_dsl ``Date`` field that is
# configured with a UTC default time zone and the
# ``strict_date_time_no_millis`` format.
Date: TypeAlias = Annotated[
    datetime,
    _Date(
        default_timezone="UTC",
        format="strict_date_time_no_millis",
    ),
]
# ``Date`` that defaults to the current, timezone-aware UTC time when no value
# is given.
DefaultDate: TypeAlias = Annotated[
    Date,
    Field(default_factory=lambda: datetime.now(UTC)),
]
# Non-negative ``float`` annotated with a ``RankFeature`` field with positive
# score impact.
FloatRankFeature: TypeAlias = Annotated[
    float,
    Ge(0),
    _RankFeature(positive_score_impact=True),
]
# Non-negative ``int`` annotated with a ``RankFeature`` field with positive
# score impact.
IntRankFeature: TypeAlias = Annotated[
    int,
    Ge(0),
    _RankFeature(positive_score_impact=True),
]


class UuidBaseDocument(BaseDocument):
    """Base document whose ``id`` is a UUID.

    The ID is accepted under either ``_id`` or ``id`` during validation and
    uses ``_id`` as its serialization alias. If no ID is supplied, a random
    UUID is generated.
    """

    id: UUID = Field(  # type: ignore[override]
        # ``UUID`` itself cannot serve as the factory: calling it without
        # arguments raises ``TypeError``. ``uuid4`` returns a fresh random UUID.
        default_factory=uuid4,
        validation_alias=AliasChoices("_id", "id"),
        serialization_alias="_id",
    )


class InnerProviderId(BaseInnerDocument):
    """Inner document that holds only the UUID of a provider."""

    id: UUID


# Allowed values of ``UrlQueryParser.parser_type``.
UrlQueryParserType = Literal[
    "query_parameter",
    "fragment_parameter",
    "path_segment",
]


class UrlQueryParser(UuidBaseDocument):
    """URL query parser configuration from the ``aql_url_query_parsers`` index."""

    # Defaults to the current UTC time when not given.
    last_modified: DefaultDate
    # Provider this parser is tied to.
    # NOTE(review): ``None`` presumably means "applies to all providers" -- confirm.
    provider: InnerProviderId | None = None
    # Regex source; exported by this notebook as the ``url_pattern`` argument.
    url_pattern_regex: Keyword | None = None
    # Non-negative ranking value (see ``FloatRankFeature``).
    priority: FloatRankFeature | None = None
    # Selects the parsing strategy; one of ``UrlQueryParserType``.
    parser_type: UrlQueryParserType
    # Required by the export code below for the "query_parameter" and
    # "fragment_parameter" parser types.
    parameter: Keyword | None = None
    # Required by the export code below for the "path_segment" parser type.
    segment: IntKeyword | None = None
    # Regex sources; exported as the ``remove_pattern`` / ``space_pattern``
    # arguments.
    remove_pattern_regex: Keyword | None = None
    space_pattern_regex: Keyword | None = None

    class Index:
        # Elasticsearch index name and index-level settings for this document.
        name = "aql_url_query_parsers"
        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 2,
        }


# Allowed values of ``UrlPageParser.parser_type``.
UrlPageParserType = Literal[
    "query_parameter",
    "fragment_parameter",
    "path_segment",
]


class UrlPageParser(UuidBaseDocument):
    """URL page parser configuration from the ``aql_url_page_parsers`` index."""

    # Defaults to the current UTC time when not given.
    last_modified: DefaultDate
    # Provider this parser is tied to.
    # NOTE(review): ``None`` presumably means "applies to all providers" -- confirm.
    provider: InnerProviderId | None = None
    # Regex source; exported by this notebook as the ``url_pattern`` argument.
    url_pattern_regex: Keyword | None = None
    # Non-negative ranking value (see ``FloatRankFeature``).
    priority: FloatRankFeature | None = None
    # Selects the parsing strategy; one of ``UrlPageParserType``.
    parser_type: UrlPageParserType
    # Required by the export code below for the "query_parameter" and
    # "fragment_parameter" parser types.
    parameter: Keyword | None = None
    # Required by the export code below for the "path_segment" parser type.
    segment: IntKeyword | None = None
    # Regex sources; exported as the ``remove_pattern`` / ``space_pattern``
    # arguments.
    remove_pattern_regex: Keyword | None = None
    space_pattern_regex: Keyword | None = None

    class Index:
        # Elasticsearch index name and index-level settings for this document.
        name = "aql_url_page_parsers"
        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 2,
        }


# Allowed values of ``UrlOffsetParser.parser_type``.
UrlOffsetParserType = Literal[
    "query_parameter",
    "fragment_parameter",
    "path_segment",
]


class UrlOffsetParser(UuidBaseDocument):
    """URL offset parser configuration from the ``aql_url_offset_parsers`` index."""

    # Defaults to the current UTC time when not given.
    last_modified: DefaultDate
    # Provider this parser is tied to.
    # NOTE(review): ``None`` presumably means "applies to all providers" -- confirm.
    provider: InnerProviderId | None = None
    # Regex source; exported by this notebook as the ``url_pattern`` argument.
    url_pattern_regex: Keyword | None = None
    # Non-negative ranking value (see ``FloatRankFeature``).
    priority: FloatRankFeature | None = None
    # Selects the parsing strategy; one of ``UrlOffsetParserType``.
    parser_type: UrlOffsetParserType
    # Required by the export code below for the "query_parameter" and
    # "fragment_parameter" parser types.
    parameter: Keyword | None = None
    # Required by the export code below for the "path_segment" parser type.
    segment: IntKeyword | None = None
    # Regex sources; exported as the ``remove_pattern`` / ``space_pattern``
    # arguments.
    remove_pattern_regex: Keyword | None = None
    space_pattern_regex: Keyword | None = None

    class Index:
        # Elasticsearch index name and index-level settings for this document.
        name = "aql_url_offset_parsers"
        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 2,
        }


# Allowed values of ``WarcQueryParser.parser_type``.
WarcQueryParserType = Literal["xpath"]


class WarcQueryParser(UuidBaseDocument):
    """WARC query parser configuration from the ``aql_warc_query_parsers`` index."""

    # Defaults to the current UTC time when not given.
    last_modified: DefaultDate
    # Provider this parser is tied to.
    # NOTE(review): ``None`` presumably means "applies to all providers" -- confirm.
    provider: InnerProviderId | None = None
    # Regex source; exported by this notebook as the ``url_pattern`` argument.
    url_pattern_regex: Keyword | None = None
    # Non-negative ranking value (see ``FloatRankFeature``).
    priority: FloatRankFeature | None = None
    # Selects the parsing strategy; only "xpath" is allowed.
    parser_type: WarcQueryParserType
    # Required by the export code below for the "xpath" parser type.
    xpath: Keyword | None = None
    # Regex sources; exported as the ``remove_pattern`` / ``space_pattern``
    # arguments.
    remove_pattern_regex: Keyword | None = None
    space_pattern_regex: Keyword | None = None

    class Index:
        # Elasticsearch index name and index-level settings for this document.
        name = "aql_warc_query_parsers"
        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 2,
        }


# Allowed values of ``WarcWebSearchResultBlocksParser.parser_type``.
WarcWebSearchResultBlocksParserType = Literal["xpath"]


class WarcWebSearchResultBlocksParser(UuidBaseDocument):
    """Web search result blocks parser configuration.

    Stored in the ``aql_warc_snippets_parsers`` index.
    """

    # Defaults to the current UTC time when not given.
    last_modified: DefaultDate
    # Provider this parser is tied to.
    # NOTE(review): ``None`` presumably means "applies to all providers" -- confirm.
    provider: InnerProviderId | None = None
    # Regex source; exported by this notebook as the ``url_pattern`` argument.
    url_pattern_regex: Keyword | None = None
    # Non-negative ranking value (see ``FloatRankFeature``).
    priority: FloatRankFeature | None = None
    # Selects the parsing strategy; only "xpath" is allowed.
    parser_type: WarcWebSearchResultBlocksParserType
    # Required by the export code below for the "xpath" parser type.
    xpath: Keyword | None = None
    # Optional XPaths, exported as ``url_xpath`` / ``title_xpath`` /
    # ``text_xpath``.
    # NOTE(review): presumably evaluated relative to each block matched by
    # ``xpath`` -- confirm.
    url_xpath: Keyword | None = None
    title_xpath: Keyword | None = None
    text_xpath: Keyword | None = None

    class Index:
        # Elasticsearch index name and index-level settings for this document.
        # NOTE(review): the index is called "snippets" although the class is
        # named after web search result blocks -- presumably a legacy name.
        name = "aql_warc_snippets_parsers"
        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 2,
        }


# Allowed values of ``WarcSpecialContentsResultBlocksParser.parser_type``.
WarcSpecialContentsResultBlocksParserType = Literal["xpath"]


class WarcSpecialContentsResultBlocksParser(UuidBaseDocument):
    """Special contents result blocks parser configuration.

    Stored in the ``aql_warc_direct_answers_parsers`` index.
    """

    # Defaults to the current UTC time when not given.
    last_modified: DefaultDate
    # Provider this parser is tied to.
    # NOTE(review): ``None`` presumably means "applies to all providers" -- confirm.
    provider: InnerProviderId | None = None
    # Regex source; exported by this notebook as the ``url_pattern`` argument.
    url_pattern_regex: Keyword | None = None
    # Non-negative ranking value (see ``FloatRankFeature``).
    priority: FloatRankFeature | None = None
    # Selects the parsing strategy; only "xpath" is allowed.
    parser_type: WarcSpecialContentsResultBlocksParserType
    # Required by the export code below for the "xpath" parser type.
    xpath: Keyword | None = None
    # Optional XPaths, exported as ``url_xpath`` / ``text_xpath``.
    # NOTE(review): presumably evaluated relative to each block matched by
    # ``xpath`` -- confirm.
    url_xpath: Keyword | None = None
    text_xpath: Keyword | None = None

    class Index:
        # Elasticsearch index name and index-level settings for this document.
        # NOTE(review): the index is called "direct_answers" although the class
        # is named after special contents result blocks -- presumably a legacy
        # name.
        name = "aql_warc_direct_answers_parsers"
        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 2,
        }

In [None]:
import os

from dotenv import load_dotenv
from elasticsearch_dsl.connections import create_connection

# Populate the process environment from a local ``.env`` file; values from the
# file take precedence over variables that are already set (``override=True``).
load_dotenv(override=True)

# Credentials must not live in the notebook: read them from the environment
# (for example from the ``.env`` file loaded above).
# NOTE(review): the password that used to be hard-coded here remains in the
# version history and should be rotated.
es_username = os.environ.get("ELASTICSEARCH_USERNAME")
es_password = os.environ.get("ELASTICSEARCH_PASSWORD")
if not es_username or not es_password:
    raise RuntimeError(
        "Set ELASTICSEARCH_USERNAME and ELASTICSEARCH_PASSWORD "
        "(for example in the .env file) before connecting to Elasticsearch."
    )

# Register the default elasticsearch_dsl connection used by all ``search()``
# calls in this notebook; gateway errors and timeouts are retried.
create_connection(
    hosts="https://elasticsearch.srv.webis.de:9200",
    http_auth=(es_username, es_password),
    timeout=60,
    max_retries=5,
    retry_on_status=(502, 503, 504),
    retry_on_timeout=True,
)

In [None]:
from archive_query_log.orm import Provider


# Load every provider from the ``aql_providers`` index once; the lookup tables
# below are all keyed by provider ID.
providers: list[Provider] = list(Provider.search(index="aql_providers").scan())
provider_names = {p.id: p.name for p in providers}
# Providers without a priority are left out; consumers supply their own default.
provider_priorities = {p.id: p.priority for p in providers if p.priority is not None}
# Only the first domain is kept. The truthiness check skips both a missing and
# an empty domain collection, so the ``[0]`` lookup cannot raise ``IndexError``.
provider_domains = {p.id: p.domains[0] for p in providers if p.domains}

In [None]:
import json
from itertools import groupby


def _string_literal(value: str, *, raw: bool = False) -> str:
    """Render ``value`` as a double-quoted Python string literal.

    With ``raw=True`` a raw literal (``r"..."``) is emitted when possible so
    that regexes stay readable. Values a raw literal cannot represent (they
    contain ``"`` or a line break, or end in a backslash) and all non-raw
    values are escaped via ``json.dumps``, whose string escapes are also valid
    Python string escapes.
    """
    raw_unsafe = (
        '"' in value or "\n" in value or "\r" in value or value.endswith("\\")
    )
    if raw and not raw_unsafe:
        return f'r"{value}"'
    return json.dumps(value, ensure_ascii=False)


# Print every URL query parser stored in Elasticsearch as Python constructor
# source code, grouped under one "# Provider: ..." header per provider.
url_query_parsers: list[UrlQueryParser] = list(UrlQueryParser.search().scan())
url_query_parsers.sort(
    # Descending by provider priority, provider ID, parser priority, parser ID.
    # The leading flag keeps the key comparable (``None`` cannot be ordered
    # against numbers or UUIDs) and places provider-less parsers last.
    key=lambda parser: (
        (
            True,
            provider_priorities.get(parser.provider.id, -1),
            parser.provider.id,
        )
        if parser.provider is not None
        else (False,),
        # A missing (or zero) priority ranks below every positive priority.
        parser.priority or -1,
        parser.id,
    ),
    reverse=True,
)
for provider_id, group in groupby(
    url_query_parsers, key=lambda p: p.provider.id if p.provider is not None else None
):
    if provider_id is None:
        header = "# Provider: Global"
    else:
        header = f"# Provider: {provider_names.get(provider_id, 'Global')}"
        # Not every provider has a domain on record; omit the suffix then.
        if provider_id in provider_domains:
            header += f" ({provider_domains[provider_id]})"
    print(header)
    for url_query_parser in group:
        parameters = []
        if url_query_parser.provider is not None:
            parameters.append(f'provider_id=UUID("{url_query_parser.provider.id}")')
        if url_query_parser.url_pattern_regex is not None:
            url_pattern = _string_literal(url_query_parser.url_pattern_regex, raw=True)
            parameters.append(f"url_pattern=re_compile({url_pattern})")
        parser_class: str
        if url_query_parser.parser_type == "query_parameter":
            parser_class = "QueryParameterUrlQueryParser"
            if url_query_parser.parameter is None:
                raise ValueError("Missing parameter for parser.")
            parameters.append(
                f"parameter={_string_literal(url_query_parser.parameter)}"
            )
        elif url_query_parser.parser_type == "fragment_parameter":
            parser_class = "FragmentParameterUrlQueryParser"
            if url_query_parser.parameter is None:
                raise ValueError("Missing parameter for parser.")
            parameters.append(
                f"parameter={_string_literal(url_query_parser.parameter)}"
            )
        elif url_query_parser.parser_type == "path_segment":
            parser_class = "PathSegmentUrlQueryParser"
            if url_query_parser.segment is None:
                raise ValueError("Missing segment for parser.")
            parameters.append(f"segment={url_query_parser.segment}")
        else:
            raise ValueError(f"Unknown parser type: {url_query_parser.parser_type}")
        if url_query_parser.remove_pattern_regex is not None:
            remove_pattern = _string_literal(
                url_query_parser.remove_pattern_regex, raw=True
            )
            parameters.append(f"remove_pattern=re_compile({remove_pattern})")
        if url_query_parser.space_pattern_regex is not None:
            space_pattern = _string_literal(
                url_query_parser.space_pattern_regex, raw=True
            )
            parameters.append(f"space_pattern=re_compile({space_pattern})")
        # Join outside the f-string: a backslash inside an f-string expression
        # is a syntax error before Python 3.12.
        arguments = ",\n    ".join(parameters)
        print(f"{parser_class}(\n    {arguments},\n),")

In [None]:
import json
from itertools import groupby


def _string_literal(value: str, *, raw: bool = False) -> str:
    """Render ``value`` as a double-quoted Python string literal.

    With ``raw=True`` a raw literal (``r"..."``) is emitted when possible so
    that regexes stay readable. Values a raw literal cannot represent (they
    contain ``"`` or a line break, or end in a backslash) and all non-raw
    values are escaped via ``json.dumps``, whose string escapes are also valid
    Python string escapes.
    """
    raw_unsafe = (
        '"' in value or "\n" in value or "\r" in value or value.endswith("\\")
    )
    if raw and not raw_unsafe:
        return f'r"{value}"'
    return json.dumps(value, ensure_ascii=False)


# Print every URL page parser stored in Elasticsearch as Python constructor
# source code, grouped under one "# Provider: ..." header per provider.
url_page_parsers: list[UrlPageParser] = list(UrlPageParser.search().scan())
url_page_parsers.sort(
    # Descending by provider priority, provider ID, parser priority, parser ID.
    # The leading flag keeps the key comparable (``None`` cannot be ordered
    # against numbers or UUIDs) and places provider-less parsers last.
    key=lambda parser: (
        (
            True,
            provider_priorities.get(parser.provider.id, -1),
            parser.provider.id,
        )
        if parser.provider is not None
        else (False,),
        # A missing (or zero) priority ranks below every positive priority.
        parser.priority or -1,
        parser.id,
    ),
    reverse=True,
)
for provider_id, group in groupby(
    url_page_parsers, key=lambda p: p.provider.id if p.provider is not None else None
):
    if provider_id is None:
        header = "# Provider: Global"
    else:
        header = f"# Provider: {provider_names.get(provider_id, 'Global')}"
        # Not every provider has a domain on record; omit the suffix then.
        if provider_id in provider_domains:
            header += f" ({provider_domains[provider_id]})"
    print(header)
    for url_page_parser in group:
        parameters = []
        if url_page_parser.provider is not None:
            parameters.append(f'provider_id=UUID("{url_page_parser.provider.id}")')
        if url_page_parser.url_pattern_regex is not None:
            url_pattern = _string_literal(url_page_parser.url_pattern_regex, raw=True)
            parameters.append(f"url_pattern=re_compile({url_pattern})")
        parser_class: str
        if url_page_parser.parser_type == "query_parameter":
            parser_class = "QueryParameterUrlPageParser"
            if url_page_parser.parameter is None:
                raise ValueError("Missing parameter for parser.")
            parameters.append(
                f"parameter={_string_literal(url_page_parser.parameter)}"
            )
        elif url_page_parser.parser_type == "fragment_parameter":
            parser_class = "FragmentParameterUrlPageParser"
            if url_page_parser.parameter is None:
                raise ValueError("Missing parameter for parser.")
            parameters.append(
                f"parameter={_string_literal(url_page_parser.parameter)}"
            )
        elif url_page_parser.parser_type == "path_segment":
            parser_class = "PathSegmentUrlPageParser"
            if url_page_parser.segment is None:
                raise ValueError("Missing segment for parser.")
            parameters.append(f"segment={url_page_parser.segment}")
        else:
            raise ValueError(f"Unknown parser type: {url_page_parser.parser_type}")
        if url_page_parser.remove_pattern_regex is not None:
            remove_pattern = _string_literal(
                url_page_parser.remove_pattern_regex, raw=True
            )
            parameters.append(f"remove_pattern=re_compile({remove_pattern})")
        if url_page_parser.space_pattern_regex is not None:
            space_pattern = _string_literal(
                url_page_parser.space_pattern_regex, raw=True
            )
            parameters.append(f"space_pattern=re_compile({space_pattern})")
        # Join outside the f-string: a backslash inside an f-string expression
        # is a syntax error before Python 3.12.
        arguments = ",\n    ".join(parameters)
        print(f"{parser_class}(\n    {arguments},\n),")

In [None]:
import json
from itertools import groupby


def _string_literal(value: str, *, raw: bool = False) -> str:
    """Render ``value`` as a double-quoted Python string literal.

    With ``raw=True`` a raw literal (``r"..."``) is emitted when possible so
    that regexes stay readable. Values a raw literal cannot represent (they
    contain ``"`` or a line break, or end in a backslash) and all non-raw
    values are escaped via ``json.dumps``, whose string escapes are also valid
    Python string escapes.
    """
    raw_unsafe = (
        '"' in value or "\n" in value or "\r" in value or value.endswith("\\")
    )
    if raw and not raw_unsafe:
        return f'r"{value}"'
    return json.dumps(value, ensure_ascii=False)


# Print every URL offset parser stored in Elasticsearch as Python constructor
# source code, grouped under one "# Provider: ..." header per provider.
url_offset_parsers: list[UrlOffsetParser] = list(UrlOffsetParser.search().scan())
url_offset_parsers.sort(
    # Descending by provider priority, provider ID, parser priority, parser ID.
    # The leading flag keeps the key comparable (``None`` cannot be ordered
    # against numbers or UUIDs) and places provider-less parsers last.
    key=lambda parser: (
        (
            True,
            provider_priorities.get(parser.provider.id, -1),
            parser.provider.id,
        )
        if parser.provider is not None
        else (False,),
        # A missing (or zero) priority ranks below every positive priority.
        parser.priority or -1,
        parser.id,
    ),
    reverse=True,
)
for provider_id, group in groupby(
    url_offset_parsers, key=lambda p: p.provider.id if p.provider is not None else None
):
    if provider_id is None:
        header = "# Provider: Global"
    else:
        header = f"# Provider: {provider_names.get(provider_id, 'Global')}"
        # Not every provider has a domain on record; omit the suffix then.
        if provider_id in provider_domains:
            header += f" ({provider_domains[provider_id]})"
    print(header)
    for url_offset_parser in group:
        parameters = []
        if url_offset_parser.provider is not None:
            parameters.append(f'provider_id=UUID("{url_offset_parser.provider.id}")')
        if url_offset_parser.url_pattern_regex is not None:
            url_pattern = _string_literal(
                url_offset_parser.url_pattern_regex, raw=True
            )
            parameters.append(f"url_pattern=re_compile({url_pattern})")
        parser_class: str
        if url_offset_parser.parser_type == "query_parameter":
            parser_class = "QueryParameterUrlOffsetParser"
            if url_offset_parser.parameter is None:
                raise ValueError("Missing parameter for parser.")
            parameters.append(
                f"parameter={_string_literal(url_offset_parser.parameter)}"
            )
        elif url_offset_parser.parser_type == "fragment_parameter":
            parser_class = "FragmentParameterUrlOffsetParser"
            if url_offset_parser.parameter is None:
                raise ValueError("Missing parameter for parser.")
            parameters.append(
                f"parameter={_string_literal(url_offset_parser.parameter)}"
            )
        elif url_offset_parser.parser_type == "path_segment":
            parser_class = "PathSegmentUrlOffsetParser"
            if url_offset_parser.segment is None:
                raise ValueError("Missing segment for parser.")
            parameters.append(f"segment={url_offset_parser.segment}")
        else:
            raise ValueError(f"Unknown parser type: {url_offset_parser.parser_type}")
        if url_offset_parser.remove_pattern_regex is not None:
            remove_pattern = _string_literal(
                url_offset_parser.remove_pattern_regex, raw=True
            )
            parameters.append(f"remove_pattern=re_compile({remove_pattern})")
        if url_offset_parser.space_pattern_regex is not None:
            space_pattern = _string_literal(
                url_offset_parser.space_pattern_regex, raw=True
            )
            parameters.append(f"space_pattern=re_compile({space_pattern})")
        # Join outside the f-string: a backslash inside an f-string expression
        # is a syntax error before Python 3.12.
        arguments = ",\n    ".join(parameters)
        print(f"{parser_class}(\n    {arguments},\n),")

In [None]:
import json
from itertools import groupby


def _string_literal(value: str, *, raw: bool = False) -> str:
    """Render ``value`` as a double-quoted Python string literal.

    With ``raw=True`` a raw literal (``r"..."``) is emitted when possible so
    that regexes stay readable. Values a raw literal cannot represent (they
    contain ``"`` or a line break, or end in a backslash) and all non-raw
    values are escaped via ``json.dumps``, whose string escapes are also valid
    Python string escapes.
    """
    raw_unsafe = (
        '"' in value or "\n" in value or "\r" in value or value.endswith("\\")
    )
    if raw and not raw_unsafe:
        return f'r"{value}"'
    return json.dumps(value, ensure_ascii=False)


# Print every WARC query parser stored in Elasticsearch as Python constructor
# source code, grouped under one "# Provider: ..." header per provider.
warc_query_parsers: list[WarcQueryParser] = list(WarcQueryParser.search().scan())
warc_query_parsers.sort(
    # Descending by provider priority, provider ID, parser priority, parser ID.
    # The leading flag keeps the key comparable (``None`` cannot be ordered
    # against numbers or UUIDs) and places provider-less parsers last.
    key=lambda parser: (
        (
            True,
            provider_priorities.get(parser.provider.id, -1),
            parser.provider.id,
        )
        if parser.provider is not None
        else (False,),
        # A missing (or zero) priority ranks below every positive priority.
        parser.priority or -1,
        parser.id,
    ),
    reverse=True,
)
for provider_id, group in groupby(
    warc_query_parsers, key=lambda p: p.provider.id if p.provider is not None else None
):
    if provider_id is None:
        header = "# Provider: Global"
    else:
        header = f"# Provider: {provider_names.get(provider_id, 'Global')}"
        # Not every provider has a domain on record; omit the suffix then.
        if provider_id in provider_domains:
            header += f" ({provider_domains[provider_id]})"
    print(header)
    for warc_query_parser in group:
        parameters = []
        if warc_query_parser.provider is not None:
            parameters.append(f'provider_id=UUID("{warc_query_parser.provider.id}")')
        if warc_query_parser.url_pattern_regex is not None:
            url_pattern = _string_literal(
                warc_query_parser.url_pattern_regex, raw=True
            )
            parameters.append(f"url_pattern=re_compile({url_pattern})")
        parser_class: str
        if warc_query_parser.parser_type == "xpath":
            parser_class = "XpathWarcQueryParser"
            if warc_query_parser.xpath is None:
                raise ValueError("Missing xpath for parser.")
            # XPath expressions may contain double quotes, so the value is
            # escaped rather than pasted between quotes verbatim.
            parameters.append(f"xpath={_string_literal(warc_query_parser.xpath)}")
        else:
            raise ValueError(f"Unknown parser type: {warc_query_parser.parser_type}")
        if warc_query_parser.remove_pattern_regex is not None:
            remove_pattern = _string_literal(
                warc_query_parser.remove_pattern_regex, raw=True
            )
            parameters.append(f"remove_pattern=re_compile({remove_pattern})")
        if warc_query_parser.space_pattern_regex is not None:
            space_pattern = _string_literal(
                warc_query_parser.space_pattern_regex, raw=True
            )
            parameters.append(f"space_pattern=re_compile({space_pattern})")
        # Join outside the f-string: a backslash inside an f-string expression
        # is a syntax error before Python 3.12.
        arguments = ",\n    ".join(parameters)
        print(f"{parser_class}(\n    {arguments},\n),")

In [None]:
import json
from itertools import groupby


def _string_literal(value: str, *, raw: bool = False) -> str:
    """Render ``value`` as a double-quoted Python string literal.

    With ``raw=True`` a raw literal (``r"..."``) is emitted when possible so
    that regexes stay readable. Values a raw literal cannot represent (they
    contain ``"`` or a line break, or end in a backslash) and all non-raw
    values are escaped via ``json.dumps``, whose string escapes are also valid
    Python string escapes.
    """
    raw_unsafe = (
        '"' in value or "\n" in value or "\r" in value or value.endswith("\\")
    )
    if raw and not raw_unsafe:
        return f'r"{value}"'
    return json.dumps(value, ensure_ascii=False)


# Print every web search result blocks parser stored in Elasticsearch as Python
# constructor source code, grouped under one "# Provider: ..." header per
# provider.
warc_web_search_result_blocks_parsers: list[WarcWebSearchResultBlocksParser] = list(
    WarcWebSearchResultBlocksParser.search().scan()
)
warc_web_search_result_blocks_parsers.sort(
    # Descending by provider priority, provider ID, parser priority, parser ID.
    # The leading flag keeps the key comparable (``None`` cannot be ordered
    # against numbers or UUIDs) and places provider-less parsers last.
    key=lambda parser: (
        (
            True,
            provider_priorities.get(parser.provider.id, -1),
            parser.provider.id,
        )
        if parser.provider is not None
        else (False,),
        # A missing (or zero) priority ranks below every positive priority.
        parser.priority or -1,
        parser.id,
    ),
    reverse=True,
)
for provider_id, group in groupby(
    warc_web_search_result_blocks_parsers,
    key=lambda p: p.provider.id if p.provider is not None else None,
):
    if provider_id is None:
        header = "# Provider: Global"
    else:
        header = f"# Provider: {provider_names.get(provider_id, 'Global')}"
        # Not every provider has a domain on record; omit the suffix then.
        if provider_id in provider_domains:
            header += f" ({provider_domains[provider_id]})"
    print(header)
    for warc_web_search_result_blocks_parser in group:
        parameters = []
        if warc_web_search_result_blocks_parser.provider is not None:
            parameters.append(
                f'provider_id=UUID("{warc_web_search_result_blocks_parser.provider.id}")'
            )
        if warc_web_search_result_blocks_parser.url_pattern_regex is not None:
            url_pattern = _string_literal(
                warc_web_search_result_blocks_parser.url_pattern_regex, raw=True
            )
            parameters.append(f"url_pattern=re_compile({url_pattern})")
        parser_class: str
        if warc_web_search_result_blocks_parser.parser_type == "xpath":
            parser_class = "XpathWarcWebSearchResultBlocksParser"
            if warc_web_search_result_blocks_parser.xpath is None:
                raise ValueError("Missing xpath for parser.")
            # XPath expressions may contain double quotes, so the values are
            # escaped rather than pasted between quotes verbatim.
            parameters.append(
                f"xpath={_string_literal(warc_web_search_result_blocks_parser.xpath)}"
            )
            if warc_web_search_result_blocks_parser.url_xpath is not None:
                parameters.append(
                    f"url_xpath={_string_literal(warc_web_search_result_blocks_parser.url_xpath)}"
                )
            if warc_web_search_result_blocks_parser.title_xpath is not None:
                parameters.append(
                    f"title_xpath={_string_literal(warc_web_search_result_blocks_parser.title_xpath)}"
                )
            if warc_web_search_result_blocks_parser.text_xpath is not None:
                parameters.append(
                    f"text_xpath={_string_literal(warc_web_search_result_blocks_parser.text_xpath)}"
                )
        else:
            raise ValueError(
                f"Unknown parser type: {warc_web_search_result_blocks_parser.parser_type}"
            )
        # Join outside the f-string: a backslash inside an f-string expression
        # is a syntax error before Python 3.12.
        arguments = ",\n    ".join(parameters)
        print(f"{parser_class}(\n    {arguments},\n),")

In [None]:
import json
from itertools import groupby


def _string_literal(value: str, *, raw: bool = False) -> str:
    """Render ``value`` as a double-quoted Python string literal.

    With ``raw=True`` a raw literal (``r"..."``) is emitted when possible so
    that regexes stay readable. Values a raw literal cannot represent (they
    contain ``"`` or a line break, or end in a backslash) and all non-raw
    values are escaped via ``json.dumps``, whose string escapes are also valid
    Python string escapes.
    """
    raw_unsafe = (
        '"' in value or "\n" in value or "\r" in value or value.endswith("\\")
    )
    if raw and not raw_unsafe:
        return f'r"{value}"'
    return json.dumps(value, ensure_ascii=False)


# Print every special contents result blocks parser stored in Elasticsearch as
# Python constructor source code, grouped under one "# Provider: ..." header
# per provider.
warc_special_contents_result_blocks_parsers: list[WarcSpecialContentsResultBlocksParser] = list(
    WarcSpecialContentsResultBlocksParser.search().scan()
)
warc_special_contents_result_blocks_parsers.sort(
    # Descending by provider priority, provider ID, parser priority, parser ID.
    # The leading flag keeps the key comparable (``None`` cannot be ordered
    # against numbers or UUIDs) and places provider-less parsers last.
    key=lambda parser: (
        (
            True,
            provider_priorities.get(parser.provider.id, -1),
            parser.provider.id,
        )
        if parser.provider is not None
        else (False,),
        # A missing (or zero) priority ranks below every positive priority.
        parser.priority or -1,
        parser.id,
    ),
    reverse=True,
)
for provider_id, group in groupby(
    warc_special_contents_result_blocks_parsers,
    key=lambda p: p.provider.id if p.provider is not None else None,
):
    if provider_id is None:
        header = "# Provider: Global"
    else:
        header = f"# Provider: {provider_names.get(provider_id, 'Global')}"
        # Not every provider has a domain on record; omit the suffix then.
        if provider_id in provider_domains:
            header += f" ({provider_domains[provider_id]})"
    print(header)
    for warc_special_contents_result_blocks_parser in group:
        parameters = []
        if warc_special_contents_result_blocks_parser.provider is not None:
            parameters.append(
                f'provider_id=UUID("{warc_special_contents_result_blocks_parser.provider.id}")'
            )
        if warc_special_contents_result_blocks_parser.url_pattern_regex is not None:
            url_pattern = _string_literal(
                warc_special_contents_result_blocks_parser.url_pattern_regex, raw=True
            )
            parameters.append(f"url_pattern=re_compile({url_pattern})")
        parser_class: str
        if warc_special_contents_result_blocks_parser.parser_type == "xpath":
            parser_class = "XpathWarcSpecialContentsResultBlocksParser"
            if warc_special_contents_result_blocks_parser.xpath is None:
                raise ValueError("Missing xpath for parser.")
            # XPath expressions may contain double quotes, so the values are
            # escaped rather than pasted between quotes verbatim.
            parameters.append(
                f"xpath={_string_literal(warc_special_contents_result_blocks_parser.xpath)}"
            )
            if warc_special_contents_result_blocks_parser.url_xpath is not None:
                parameters.append(
                    f"url_xpath={_string_literal(warc_special_contents_result_blocks_parser.url_xpath)}"
                )
            if warc_special_contents_result_blocks_parser.text_xpath is not None:
                parameters.append(
                    f"text_xpath={_string_literal(warc_special_contents_result_blocks_parser.text_xpath)}"
                )
        else:
            raise ValueError(
                f"Unknown parser type: {warc_special_contents_result_blocks_parser.parser_type}"
            )
        # Join outside the f-string: a backslash inside an f-string expression
        # is a syntax error before Python 3.12.
        arguments = ",\n    ".join(parameters)
        print(f"{parser_class}(\n    {arguments},\n),")