# Onboarding Notebook

The aim of this notebook is to import web pages from a list of URLs. The list may come from any source; typically it is taken from a sitemap.



In [None]:
import sys

# Install the notebook dependencies only when running inside Google Colab,
# detected by the `google.colab` module being already loaded.
if "google.colab" in sys.modules:
    # IPython shell escape (`!`): a single `pip install` with pinned version
    # ranges; `wordlift-sdk` is installed straight from its GitHub repository.
    !pip install \
    "advertools>=0.16.4,<0.17.0" \
    "wordlift-client>=1.75.0,<2.0.0" \
    "tqdm>=4.67.1,<5.0.0" \
    "gql[aiohttp]>=3.5.1,<4.0.0" \
    "beautifulsoup4>=4.13.3,<5.0.0" \
    "rdflib>=7.1.3,<8.0.0" \
    "tenacity>=9.0.0,<10.0.0" \
    "wordlift-sdk @ git+https://github.com/wordlift/python-sdk.git"


In [None]:

import pandas as pd
import logging

import advertools as adv
import wordlift_sdk.entity as entity
import wordlift_sdk.graphql as graphql
from bs4 import BeautifulSoup
from rdflib import URIRef, Literal
from tenacity import retry, stop_after_attempt, wait_fixed
from tqdm.asyncio import tqdm
from wordlift_client import SitemapImportsApi, SitemapImportRequest, EmbeddingRequest, EntityPatchRequest
from wordlift_sdk.client import ClientConfigurationFactory
from wordlift_sdk.utils import create_entity_patch_request, delayed

from types import SimpleNamespace

# Resolve the notebook settings into a single `config` object.
# A local `config/default.py` module takes precedence; when it is missing and
# the notebook runs in Google Colab, the settings are built from Colab secrets
# and the constants below.
config = None

try:
    # Configuration is in the `config/default.py` file.
    from config import default as config
except ImportError:
    logging.warning("Cannot import configuration from local `config/default.py` file.")

try:
    from google.colab import userdata
except ImportError:
    logging.warning("Cannot import configuration from google.colab.userdata.")
else:
    WORDLIFT_KEY = userdata.get('WORDLIFT_KEY')
    OUTPUT_TYPE = 'http://schema.org/WebPage'
    SITEMAP_URL = '...set here the sitemap url...'

    if config is None:
        # Expose the Colab values through the same attribute names the rest of
        # the notebook reads from `config`.
        config = SimpleNamespace(
            WORDLIFT_KEY=WORDLIFT_KEY,
            OUTPUT_TYPE=OUTPUT_TYPE,
            SITEMAP_URL=SITEMAP_URL,
        )

if config is None:
    # Fail with an actionable message instead of a NameError further down.
    raise RuntimeError(
        "No configuration available: provide a `config/default.py` file or run in Google Colab."
    )

logging.basicConfig(level=logging.WARNING, force=True)
logger = logging.getLogger(__name__)

# Defining the host is optional and defaults to https://api.wordlift.io
# See configuration.py for a list of all supported configuration parameters.
api_url = 'https://api.wordlift.io'
# Fall back to WebPage when the configuration does not define (or leaves empty) OUTPUT_TYPE.
output_type = getattr(config, 'OUTPUT_TYPE', None) or 'http://schema.org/WebPage'
configuration = ClientConfigurationFactory(key=config.WORDLIFT_KEY).create()


@retry(
    stop=stop_after_attempt(5),
    wait=wait_fixed(2),
    reraise=True
)
async def _create_sitemap_import(url_list: list[str]) -> None:
    """Submit one sitemap import request for `url_list`.

    Exceptions are allowed to propagate so that the `retry` decorator can
    re-run the call: up to 5 attempts, waiting 2 seconds between attempts.
    With `reraise=True` the last exception is raised once attempts run out.
    """
    import wordlift_client

    async with wordlift_client.ApiClient(configuration) as api_client:
        imports_api = SitemapImportsApi(api_client)
        request = SitemapImportRequest(
            embedding=EmbeddingRequest(
                properties=["http://schema.org/headline", "http://schema.org/abstract", "http://schema.org/text"]
            ),
            output_types=[output_type],
            urls=url_list,
            overwrite=True,
            id_generator="headline-with-url-hash"
        )

        await imports_api.create_sitemap_import(sitemap_import_request=request)


async def import_url(url_list: list[str]) -> None:
    """Import the given URLs, retrying on failure.

    The import is best-effort: when every retry attempt fails, the error is
    logged and the function returns normally, so one failing URL does not
    abort the other imports gathered alongside it.

    Args:
        url_list: The URLs to send in a single sitemap import request.
    """
    try:
        await _create_sitemap_import(url_list)
    except Exception as e:
        # Only reached after the retries in `_create_sitemap_import` are exhausted.
        logger.error("Error importing URLs: %s", e)


async def parse_html(entity_id: str, html: str) -> list[EntityPatchRequest]:
    """Build `schema:keywords` patch requests from a page's `mz:*` meta tags.

    The `content` attributes of the `mz:section` and `mz:subsection` meta tags
    are read in that order, split on commas and trimmed. Blank items (empty
    `content`, trailing or doubled commas) are skipped so that no empty
    keyword is written.

    Args:
        entity_id: IRI of the entity the keywords are attached to.
        html: The HTML source of the page.

    Returns:
        One patch request per keyword found; an empty list when neither meta
        tag yields a keyword.
    """
    soup = BeautifulSoup(html, 'html.parser')

    keywords = []
    for meta_property in ('mz:section', 'mz:subsection'):
        meta = soup.find('meta', attrs={'property': meta_property})
        if meta is None or 'content' not in meta.attrs:
            continue
        stripped_items = (item.strip() for item in meta['content'].split(','))
        keywords.extend(item for item in stripped_items if item)

    resource = URIRef(entity_id)
    keywords_predicate = URIRef('http://schema.org/keywords')

    return [
        create_entity_patch_request(resource, keywords_predicate, Literal(value))
        for value in keywords
    ]


async def kg() -> pd.DataFrame:
    """Query the graph for the entities of type `output_type`.

    Requests the `id`, `keywords` and `url` columns for every entity matching
    the type constraint and returns whatever `graphql.query` produces for them.
    """
    entities_query = """
            query getEntities($type_constraint: String!) {
              entities(
                query: { typeConstraint: { in: [$type_constraint] } }
              ) {
                id: iri
                keywords: string(name: "schema:keywords")
                url: string(name: "schema:url")
              }
            }
        """
    query_variables = {"type_constraint": output_type}
    result_columns = ['id', 'keywords', 'url']

    return await graphql.query(
        key=config.WORDLIFT_KEY,
        query_string=entities_query,
        root_element="entities",
        columns=result_columns,
        variable_values=query_variables
    )


async def main() -> None:
    """Import the sitemap URLs missing from the graph, then enrich entities lacking keywords.

    Steps:
      1. Read the sitemap and the current graph contents.
      2. Import every sitemap URL that has no entity in the graph yet.
      3. Reload the graph and enrich the entities without `keywords`, using
         `parse_html` to produce the patch requests.
    """
    # Get the list of URLs from the sitemap (`loc` column)
    sitemap_df = adv.sitemap_to_df(config.SITEMAP_URL)

    # Get the data from the KG to determine which URLs are already imported and which not.
    # `kg` is a coroutine function: it must be awaited to obtain the DataFrame.
    kg_df = await kg()

    # Get the list of missing URLs, these are the URLs we'll import.
    missing_url_list = list(set(sitemap_df['loc']) - set(kg_df['url']))

    # Import the URLs by calling the `import_url` method. We use `delayed` to parallelize work.
    await tqdm.gather(*[delayed(import_url)([url]) for url in missing_url_list], total=len(missing_url_list))

    # Reload the Kg after the import to get the list of URLs that are missing the `keywords` field.
    # @@TODO we can call a different graphql query that filters already by keywords not present or empty instead of filtering client-side.
    kg_df = await kg()

    # Filter the KG to list only the URLs without `keywords`.
    no_keywords_df = kg_df[kg_df['keywords'].isna()]

    # Enrich the Graph, notice that here we pass our callback `parse_html` which will return Patch requests, no need to deal with the actual API.
    await tqdm.gather(
        *[delayed(entity.enrich(configuration, parse_html))(row) for _, row in no_keywords_df.iterrows()],
        total=len(no_keywords_df)
    )


# Run the whole workflow. Top-level `await` is used here, which relies on the
# notebook kernel (IPython/Colab) executing cells inside a running event loop.
await main()

