# Google Search Console data

## Configuration

There are two configuration sources. At least one of the two is needed, and they are applied in order, so a later source overrides an earlier one:

1. A file config/default.py
2. Local constants and WordLift Key in Google Colab Secrets

There is only one configuration setting:

* `WORDLIFT_KEY`, holding the WordLift Key. When using Google Colab, it can be set in the Colab Secrets.

In [None]:
import logging

from wordlift_client import AnalyticsImportRequest

logging.basicConfig(level=logging.WARNING, force=True)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Start from "not configured". Without this, when neither configuration source
# can be imported the final check fails with a NameError instead of the
# intended ValueError.
WORDLIFT_KEY = None

# Source 1: the local `config/default.py` file.
# Only the import is guarded, so an unrelated error while reading the settings
# is not mistaken for a missing configuration file.
try:
    from config import default as config
except ImportError:
    logging.warning("Cannot import configuration from local `config/default.py` file.")
else:
    WORDLIFT_KEY = config.WORDLIFT_KEY
    OUTPUT_TYPE = config.OUTPUT_TYPE or 'http://schema.org/WebPage'
    URLS = config.URLS

# Source 2: Google Colab Secrets. Applied last, so when it is available it
# overrides the values read from `config/default.py`.
try:
    from google.colab import userdata
except ImportError:
    logging.warning("Cannot import configuration from google.colab.userdata.")
else:
    WORDLIFT_KEY = userdata.get('WORDLIFT_KEY')
    OUTPUT_TYPE = 'http://schema.org/WebPage'
    URLS = []

# An empty key is as unusable as a missing one, so both are rejected.
if not WORDLIFT_KEY:
    raise ValueError('Configuration not set')

# Dependencies

This part is only for Google Colab. When the notebook is used locally, we recommend installing the dependencies with `poetry install`.

In [None]:
import sys

# Install the dependencies only when the `google.colab` module has been loaded,
# i.e. when this notebook is running inside Google Colab.
# `!pip ...` is IPython shell syntax, so this cell is only valid in a notebook.
if "google.colab" in sys.modules:
    !pip install \
    "tenacity>=9.0.0,<10.0.0" \
    "tqdm>=4.67.1,<5.0.0" \
    "wordlift-sdk @ git+https://github.com/wordlift/python-sdk.git"

# Imports

This section provides general imports and basic configuration; there is no need to change anything here.

In [None]:

from wordlift_client import SitemapImportsApi, SitemapImportRequest, EmbeddingRequest
from wordlift_sdk.client import ClientConfigurationFactory
import wordlift_client
from tenacity import retry, stop_after_attempt, wait_fixed
from wordlift_sdk.utils import delayed, create_dataframe_of_url_id
from tqdm.asyncio import tqdm
from pandas import Series

# Defining the host is optional and defaults to https://api.wordlift.io
# See configuration.py for a list of all supported configuration parameters.
# NOTE(review): `api_url` is assigned here but is not passed to the factory
# below and is not referenced anywhere else in this notebook; confirm whether
# it should be forwarded to `ClientConfigurationFactory`.
api_url = 'https://api.wordlift.io'
# Client configuration built from the WordLift Key; it is handed to
# `wordlift_client.ApiClient` by the functions defined further down.
configuration = ClientConfigurationFactory(key=WORDLIFT_KEY).create()

In [None]:


def _log_import_failure(retry_state) -> None:
    """Log the last error once every retry attempt of `import_url` has failed.

    Used as tenacity's `retry_error_callback`: its return value (`None`)
    becomes the result of `import_url`, so a URL that cannot be imported is
    reported but does not abort the other imports running alongside it.
    """
    logger.error("Error importing URLs: %s", retry_state.outcome.exception())


@retry(
    stop=stop_after_attempt(5),
    wait=wait_fixed(2),
    retry_error_callback=_log_import_failure
)
async def import_url(url_list: list[str]) -> None:
    """Request the import of the given URLs into the graph.

    A sitemap import request is created for `url_list`, producing entities of
    type `OUTPUT_TYPE`, overwriting existing data and requesting embeddings
    for the headline, abstract and text properties.

    Errors are deliberately NOT caught inside the function: they must reach
    the `@retry` decorator, otherwise a failed request is never retried. The
    call is attempted up to 5 times, 2 seconds apart; if every attempt fails
    the error is logged and the function returns `None` (best effort).

    Args:
        url_list: The URLs to import.
    """
    async with wordlift_client.ApiClient(configuration) as api_client:
        imports_api = SitemapImportsApi(api_client)
        request = SitemapImportRequest(
            embedding=EmbeddingRequest(
                properties=["http://schema.org/headline", "http://schema.org/abstract", "http://schema.org/text"]
            ),
            output_types=[OUTPUT_TYPE],
            urls=url_list,
            overwrite=True,
            id_generator="headline-with-url-hash"
        )

        await imports_api.create_sitemap_import(sitemap_import_request=request)

# Main Function

This is the notebook's main function.

## How does it work



In [None]:

@retry(wait=wait_fixed(2), stop=stop_after_attempt(5))
async def process(url: str) -> None:
    """Create an analytics import for a single URL.

    The call is attempted up to 5 times, waiting 2 seconds between attempts.

    Args:
        url: The URL to create the analytics import for.
    """
    async with wordlift_client.ApiClient(configuration) as client:
        analytics_api = wordlift_client.AnalyticsImportsApi(client)
        # The API takes a list of URLs; this function sends one URL per call.
        payload = AnalyticsImportRequest(urls=[url])
        await analytics_api.create_analytics_import(payload)


async def main(max_import_rounds: int = 3) -> None:
    """Import the configured URLs that are missing, then process every known URL.

    1. Look up which of the configured `URLS` are already in the graph.
    2. Import the missing ones with `import_url`, then look them up again.
       This is repeated at most `max_import_rounds` times, so URLs that can
       never be imported do not keep the notebook looping forever.
    3. Call `process` for every URL found in the graph.

    Args:
        max_import_rounds: Maximum number of import-and-recheck rounds.
    """
    # De-duplicate the configured URLs (keeping their order) so that a repeated
    # URL is neither reported as missing nor imported twice.
    requested_urls = list(dict.fromkeys(URLS))

    url_id_df = await create_dataframe_of_url_id(key=WORDLIFT_KEY, url_list=URLS)
    unique_urls = list(url_id_df['url'].unique())
    known_urls = set(unique_urls)
    missing_url_list = [url for url in requested_urls if url not in known_urls]

    import_round = 0
    while missing_url_list and import_round < max_import_rounds:
        import_round += 1

        # The module `logger` is used because it is set to INFO; the root
        # logger is configured at WARNING and would drop the info messages.
        logger.warning(
            "You provided %d URLs, %d URLs found in graph.\nMissing URLs:\n%s",
            len(URLS),
            len(unique_urls),
            '\n'.join(missing_url_list)
        )

        logger.info("Importing %d URLs", len(missing_url_list))

        # Import the URLs by calling the `import_url` method. We use `delayed` to parallelize work.
        await tqdm.gather(*[delayed(import_url)([url]) for url in missing_url_list], total=len(missing_url_list))

        # Refresh everything derived from the lookup, so the next round (and the
        # processing step below) sees the URLs that were just imported.
        url_id_df = await create_dataframe_of_url_id(key=WORDLIFT_KEY, url_list=URLS)
        unique_urls = list(url_id_df['url'].unique())
        known_urls = set(unique_urls)
        missing_url_list = [url for url in requested_urls if url not in known_urls]

    if missing_url_list:
        logger.warning(
            "%d URLs are still missing after %d import rounds and will be skipped:\n%s",
            len(missing_url_list),
            import_round,
            '\n'.join(missing_url_list)
        )

    logger.info("Loading Google Search Console data")
    # We're polite and not making more than 2 concurrent reqs.
    await tqdm.gather(
        *[delayed(process, 2)(url) for url in unique_urls],
        total=len(unique_urls)
    )


# Run the notebook's entry point. A top-level `await` is notebook (IPython)
# syntax; NOTE(review): a plain Python script would need `asyncio.run(main())`.
await main()
