# Google Search Console data

## Configuration

There are two configuration sources. At least one of them is required, and they are applied in the following order:

1. A file config/default.py
2. Local constants and WordLift Key in Google Colab Secrets

There's only one configuration setting:

* `WORDLIFT_KEY`, holding the WordLift Key. When using Google Colab, it can be set in the Colab Secrets.

In [None]:
import logging

from pandas import Series, DataFrame
from wordlift_client import (
    AnalyticsImportRequest,
    AccountApi,
    EntityGapsApi,
    EntityGapRequest,
)

logging.basicConfig(level=logging.WARNING, force=True)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Defaults, so that every setting is bound even when a configuration source is
# unavailable. Without them, the final check below would fail with a `NameError`
# instead of the intended `ValueError` when neither source can be imported.
WORDLIFT_KEY = None
OUTPUT_TYPE = "http://schema.org/WebPage"
URLS = []

# Configuration from config/default.py file.
try:
    # Configuration is in the `config/default.py` file.
    from config import default as config
except ImportError:
    logging.warning("Cannot import configuration from local `config/default.py` file.")
else:
    WORDLIFT_KEY = config.WORDLIFT_KEY
    OUTPUT_TYPE = config.OUTPUT_TYPE or "http://schema.org/WebPage"
    URLS = config.URLS

# Configuration from Google Colab Secrets. This source is applied last, so when it is
# available its values replace the ones read from the configuration file.
try:
    from google.colab import userdata
except ImportError:
    logging.warning("Cannot import configuration from google.colab.userdata.")
else:
    WORDLIFT_KEY = userdata.get("WORDLIFT_KEY")
    # Local constants: edit these when running the notebook in Google Colab.
    OUTPUT_TYPE = "http://schema.org/WebPage"
    URLS = []

# The WordLift Key is mandatory: stop here when no source provided it.
if WORDLIFT_KEY is None:
    raise ValueError("Configuration not set")

# Dependencies

This part is only for Google Colab. When the notebook is used locally, we recommend using `poetry install`.

In [None]:
import sys

# Install the notebook dependencies, but only when the `google.colab` module is already
# loaded in this interpreter (i.e. `"google.colab"` is a key of `sys.modules`).
# NOTE: the `!pip ...` line is IPython/Jupyter shell syntax, not plain Python, so this
# cell only runs inside a notebook.
if "google.colab" in sys.modules:
    !pip install \
    "tenacity>=9.0.0,<10.0.0" \
    "tqdm>=4.67.1,<5.0.0" \
    "wordlift-sdk @ git+https://github.com/wordlift/python-sdk.git"

# Imports

This section provides general imports and basic configuration; there is no need to change anything here.

In [None]:
from wordlift_client import SitemapImportsApi, SitemapImportRequest, EmbeddingRequest
from wordlift_sdk.client import ClientConfigurationFactory
import wordlift_client
from tenacity import retry, stop_after_attempt, wait_fixed
from wordlift_sdk.utils import delayed, create_dataframe_of_url_id
from tqdm.asyncio import tqdm
from datetime import datetime, timedelta
from dataclasses import dataclass, field
from typing import Optional, Awaitable
import pandas as pd
from wordlift_sdk.graphql import GraphQLClientFactory

# Defining the host is optional and defaults to https://api.wordlift.io
# See configuration.py for a list of all supported configuration parameters.
# NOTE(review): `api_url` is not passed to the factory below and is not referenced
# anywhere else in the code visible here - confirm whether it is still needed.
api_url = "https://api.wordlift.io"
# Shared client configuration, built from the WordLift Key; the functions below pass it
# to `wordlift_client.ApiClient`.
configuration = ClientConfigurationFactory(key=WORDLIFT_KEY).create()

In [None]:
def _log_failed_url_import(retry_state) -> None:
    """Log the last error once every attempt of `import_url` has failed.

    Used as the tenacity `retry_error_callback`: its return value (`None`) becomes the
    result of `import_url`, so a failed import is reported in the log without raising.
    """
    logger.error("Error importing URLs: %s", retry_state.outcome.exception())


@retry(
    stop=stop_after_attempt(5),
    wait=wait_fixed(2),
    retry_error_callback=_log_failed_url_import,
)
async def import_url(url_list: list[str]) -> None:
    """Ask WordLift to import the given URLs as entities of type `OUTPUT_TYPE`.

    The import overwrites existing data, generates ids with the
    `headline-with-url-hash` strategy and requests embeddings for the headline,
    abstract and text properties.

    A failing request is retried (up to 5 attempts, 2 seconds apart). When all the
    attempts fail, the last error is logged and the function returns `None` instead
    of raising.

    Args:
        url_list: The URLs to import.
    """
    import wordlift_client

    async with wordlift_client.ApiClient(configuration) as api_client:
        imports_api = SitemapImportsApi(api_client)
        request = SitemapImportRequest(
            embedding=EmbeddingRequest(
                properties=[
                    "http://schema.org/headline",
                    "http://schema.org/abstract",
                    "http://schema.org/text",
                ]
            ),
            output_types=[OUTPUT_TYPE],
            urls=url_list,
            overwrite=True,
            id_generator="headline-with-url-hash",
        )

        # Errors are deliberately not caught here: catching them inside the function
        # would hide the failure from `@retry`, which then would never retry.
        await imports_api.create_sitemap_import(sitemap_import_request=request)

# Main Function

This is the main notebook function code.

## How does it work?



In [None]:
from typing import Callable
from wordlift_client import AccountInfo, AnalysesResponse
from pycountry import countries


@retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
async def process(row: Series) -> None:
    """Request an analytics import for the URL stored in `row["url"]`.

    The call is retried up to 5 times, waiting 2 seconds between attempts.

    Args:
        row: A row holding the page URL in its `url` entry.
    """
    page_url = row["url"]
    async with wordlift_client.ApiClient(configuration) as client:
        analytics_imports_api = wordlift_client.AnalyticsImportsApi(client)
        # One import request per URL.
        import_request = AnalyticsImportRequest(urls=[page_url])
        await analytics_imports_api.create_analytics_import(import_request)


@dataclass
class EntityTopQuery:
    """An entity together with the first of its `top_query` results, if any.

    The `top_query_*` attributes stay `None` when the entity has no top query.
    """

    # Entity attributes, copied as-is from the GraphQL entity data.
    iri: str
    url: str
    name: str
    headline: str
    title: str
    # Attributes of the first top query, `None` when not available.
    top_query_iri: Optional[str] = None
    top_query_name: Optional[str] = None
    top_query_impressions: Optional[int] = None
    top_query_clicks: Optional[int] = None
    top_query_date_created: Optional[str] = None

    @staticmethod
    def from_graphql_response(entity_data: dict) -> "EntityTopQuery":
        """Build an instance from one item of the GraphQL `entities` response.

        Args:
            entity_data: A mapping with the `iri`, `url`, `name`, `headline` and
                `title` keys, and an optional `top_query` list.

        Returns:
            The entity, with the `top_query_*` attributes taken from the first item of
            `top_query` (missing keys become `None`).

        Raises:
            KeyError: If one of the entity keys is missing from `entity_data`.
        """
        top_query_keys = ("iri", "name", "impressions", "clicks", "date_created")

        # Only the first top query is used; an absent or empty list yields no values.
        top_queries = entity_data.get("top_query")
        first_top_query = top_queries[0] if top_queries else {}
        top_query_values = {
            f"top_query_{key}": first_top_query.get(key) for key in top_query_keys
        }

        entity_keys = ("iri", "url", "name", "headline", "title")
        entity_values = {key: entity_data[key] for key in entity_keys}

        return EntityTopQuery(**entity_values, **top_query_values)


@retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
async def entity_with_top_query(url: str) -> Optional[EntityTopQuery]:
    """Load the entity matching `url` along with its top query.

    The GraphQL query asks for a single top query, sorted by
    `seovoc:impressions3Months` in descending order. The call is retried up to 5
    times, waiting 2 seconds between attempts.

    Args:
        url: The URL used as constraint in the GraphQL query.

    Returns:
        The first matching entity, or `None` when no entity is returned.
    """
    from gql import gql

    # The GraphQL document, parameterized on the URL.
    document = gql("""
        query($url: String!) {
          entities(query: { urlConstraint: { in: [$url] } }) {
            iri
            url: string(name: "schema:url")
            name: string(name: "schema:name")
            headline: string(name: "schema:headline")
            title: string(name: "schema:title")
            top_query: topN(
              name: "seovoc:hasQuery"
              sort: { field: "seovoc:impressions3Months", direction: DESC }
              limit: 1
            ) {
              iri
              name: string(name: "seovoc:name")
              impressions: int(name: "seovoc:impressions3Months")
              clicks: int(name: "seovoc:clicks3Months")
              date_created: date(name: "seovoc:dateCreated")
            }
          }
        }
    """)

    # A new GraphQL client is created for each call.
    graphql_client = GraphQLClientFactory(key=WORDLIFT_KEY).create()

    async with graphql_client as session:
        result = await session.execute(document, variable_values={"url": url})
        matches = result["entities"]

        # Only the first match is used.
        if len(matches) > 0:
            return EntityTopQuery.from_graphql_response(matches[0])

        return None


async def error_if_requirements_unsatisfied() -> Optional[AccountInfo]:
    """Load the account and check that it satisfies the notebook requirements.

    The account must have a Google Search Console site URL and a country code. The
    first missing one is logged as an error.

    Returns:
        The account when both values are set, otherwise `None`.
    """
    # Checked in order: (account attribute, error message logged when it is `None`).
    requirements = (
        (
            "google_search_console_site_url",
            "%s is not connected to Google Search Console, open https://my.wordlift.io to connect it.",
        ),
        (
            "country_code",
            "%s country code not configured, open https://my.wordlift.io to configure it.",
        ),
    )

    async with wordlift_client.ApiClient(configuration) as api_client:
        account = await AccountApi(api_client).get_me()

        for attribute_name, error_message in requirements:
            if getattr(account, attribute_name) is None:
                logger.error(error_message, account.dataset_uri)
                return None

        return account


async def create_entities_with_top_query_dataframe(url_list: list[str]) -> DataFrame:
    """Load the entities for the given URLs, with their top query, into a DataFrame.

    URLs for which `entity_with_top_query` returns `None` are skipped (a warning
    reports how many), so that they cannot break the DataFrame construction.

    Args:
        url_list: The URLs of the entities to load.

    Returns:
        A DataFrame with one row per entity found and one column per
        `EntityTopQuery` field, plus `calc_name`: the first non-null value among
        `name`, `headline`, `title` and `url`. The `top_query_date_created` column is
        converted to datetime, unparseable values become `NaT`. The columns exist
        even when no entity is found.
    """
    from dataclasses import asdict, fields

    # Get the entities data with the top query.
    logger.info("Loading entities with top query...")

    # Wrap the function once and share the wrapper among all the calls.
    # NOTE(review): presumably the second argument of `delayed` is a concurrency
    # limit - confirm against wordlift_sdk.utils.
    load_entity = delayed(entity_with_top_query, 4)
    results = await tqdm.gather(
        *[load_entity(url) for url in url_list],
        total=len(url_list),
    )

    entities_found = [entity for entity in results if entity is not None]
    if len(entities_found) < len(results):
        logger.warning(
            "%d of %d URLs have no matching entity and have been skipped.",
            len(results) - len(entities_found),
            len(results),
        )

    # The explicit columns keep the DataFrame well-formed when there are no rows.
    entities_with_top_query_df = pd.DataFrame(
        [asdict(entity) for entity in entities_found],
        columns=[entity_field.name for entity_field in fields(EntityTopQuery)],
    )

    # Back-filling along the columns moves the first non-null value to the first
    # column, which then becomes the calculated name.
    entities_with_top_query_df["calc_name"] = (
        entities_with_top_query_df[["name", "headline", "title", "url"]]
        .bfill(axis=1)
        .iloc[:, 0]
    )
    entities_with_top_query_df["top_query_date_created"] = pd.to_datetime(
        entities_with_top_query_df["top_query_date_created"], errors="coerce"
    )

    return entities_with_top_query_df


async def create_url_id_dataframe_importing_missing_urls(
    key: str, url_list: list[str]
) -> DataFrame:
    """Create the URL/id DataFrame, importing the URLs that are not in the graph yet.

    The DataFrame is loaded with `create_dataframe_of_url_id`. While it has fewer rows
    than the URLs provided, the URLs missing from its `url` column are imported with
    `import_url` and the DataFrame is loaded again.

    Args:
        key: The WordLift Key used to load the DataFrame.
        url_list: The URLs that should be available in the graph.

    Returns:
        The DataFrame returned by the last `create_dataframe_of_url_id` call.
    """
    # Duplicates removed, order preserved: used to list the missing URLs.
    requested_urls = list(dict.fromkeys(url_list))

    # Wrap the function once and share the wrapper among all the calls.
    import_one = delayed(import_url)

    url_id_df = await create_dataframe_of_url_id(key=key, url_list=url_list)

    # Try importing the missing URLs.
    # NOTE(review): there is no limit on the number of rounds, a URL that can never be
    # imported keeps this loop running - confirm whether a limit is needed.
    while len(url_id_df) < len(url_list):
        # Get the list of missing URLs, these are the URLs we'll import. It is
        # calculated at each round, so URLs already imported aren't imported again.
        found_urls = set(url_id_df["url"].unique())
        missing_url_list = [url for url in requested_urls if url not in found_urls]

        # Fewer rows than URLs but nothing missing (e.g. duplicated URLs in input):
        # there's nothing left to import.
        if not missing_url_list:
            break

        logger.warning(
            "You provided %d URLs, %d URLs found in graph.\nMissing URLs:\n%s",
            len(url_list),
            len(found_urls),
            "\n".join(missing_url_list),
        )

        logger.info("Importing %d URLs", len(missing_url_list))

        # Import the URLs by calling the `import_url` method. We use `delayed` to parallelize work.
        await tqdm.gather(
            *[import_one([url]) for url in missing_url_list],
            total=len(missing_url_list),
        )

        url_id_df = await create_dataframe_of_url_id(key=key, url_list=url_list)

    return url_id_df


async def create_entity_gaps_factory(
    query_location_name: str,
) -> Callable[[Series], Awaitable[Optional[AnalysesResponse]]]:
    """Create the function that requests the entity gaps for a row.

    Args:
        query_location_name: The location name set on every entity gap request.

    Returns:
        An async function that takes a row with the `url` and `top_query_name`
        entries and returns the API response, or `None` when the row has no top
        query. The API call is retried up to 10 times, 2 seconds apart.
    """

    @retry(stop=stop_after_attempt(10), wait=wait_fixed(2))
    async def create_entity_gaps(row: Series) -> Optional[AnalysesResponse]:
        page_url, top_query = row["url"], row["top_query_name"]

        # Without a top query there's nothing to analyze.
        if top_query is None:
            return None

        async with wordlift_client.ApiClient(configuration) as client:
            entity_gaps_api = EntityGapsApi(client)
            gap_request = EntityGapRequest(
                url=page_url,
                query=top_query,
                query_location_name=query_location_name,
            )
            return await entity_gaps_api.create_entity_gap(gap_request)

    return create_entity_gaps


async def append_entity_gaps_response_to_row_factory(
    create_entity_gaps: Callable[[Series], Awaitable[Optional[AnalysesResponse]]],
) -> Callable[[Series], Awaitable[Series]]:
    """Create the function that stores the entity gaps in a row.

    Args:
        create_entity_gaps: The async function that provides the response for a row.

    Returns:
        An async function that awaits `create_entity_gaps` for the row and, when the
        response is truthy, sets the row `entity_gaps` entry to the response `items`.
        The row is returned in both cases.
    """

    async def append_entity_gaps_response_to_row(row: Series) -> Series:
        analyses = await create_entity_gaps(row)

        # No response: hand the row back untouched.
        if not analyses:
            return row

        row["entity_gaps"] = analyses.items
        return row

    return append_entity_gaps_response_to_row


async def main() -> None:
    """Run the notebook workflow for the configured `URLS`.

    Steps:

    1. Check the account requirements and resolve the country from its country code.
    2. Load the URL/id DataFrame, importing the missing URLs.
    3. Load the entities with their top query.
    4. Request an analytics import for the entities whose top query data is missing
       or older than 7 days, then load the entities again.
    5. Request the entity gaps for each entity and log them.

    Returns early, after logging an error, when the requirements aren't satisfied or
    the country code is invalid.
    """
    # Exit if Google Search Console isn't connected.
    account = await error_if_requirements_unsatisfied()
    if account is None:
        return

    # Get the country name
    country = countries.get(alpha_2=account.country_code.upper())
    if country is None:
        # The country code is the argument for the `%s` placeholder.
        logger.error(
            "Country code %s is invalid, open https://my.wordlift.io to reconfigure it.",
            account.country_code,
        )
        return

    # Get the list of URLs in the dataframe.
    url_id_df = await create_url_id_dataframe_importing_missing_urls(
        key=WORDLIFT_KEY, url_list=URLS
    )
    unique_urls = url_id_df["url"].unique()

    # Get the entities data with the top query.
    entities_with_top_query_df = await create_entities_with_top_query_dataframe(
        unique_urls
    )

    # Calculate the date 7 days ago from today
    seven_days_ago = datetime.now() - timedelta(days=7)

    # Filter the DataFrame: keep the rows without a date or with a date older than 7 days.
    entities_with_stale_data_df = entities_with_top_query_df[
        entities_with_top_query_df["top_query_date_created"].isna()
        | (entities_with_top_query_df["top_query_date_created"] < seven_days_ago)
    ]

    if len(entities_with_stale_data_df) > 0:
        logger.info("Updating missing or stale Google Search Console data...")
        # We're polite and not making more than 2 concurrent reqs. The wrapper is
        # created once and shared among all the rows.
        # NOTE(review): presumably `delayed` limits the concurrency per wrapper -
        # confirm against wordlift_sdk.utils.
        refresh_analytics = delayed(process, 2)
        await tqdm.gather(
            *[
                refresh_analytics(row)
                for _, row in entities_with_stale_data_df.iterrows()
            ],
            total=len(entities_with_stale_data_df),
        )

        # Load the entities again to get the refreshed top query data.
        entities_with_top_query_df = await create_entities_with_top_query_dataframe(
            unique_urls
        )

    # The factories don't depend on the row: call them once, not once per row.
    append_entity_gaps = delayed(
        await append_entity_gaps_response_to_row_factory(
            create_entity_gaps=await create_entity_gaps_factory(country.name)
        ),
        2,
    )

    # `iterrows` yields a new Series per row, so the entity gaps are available only
    # in the rows returned here, not in the DataFrame.
    rows_with_entity_gaps = await tqdm.gather(
        *[
            append_entity_gaps(row)
            for _, row in entities_with_top_query_df.iterrows()
        ],
        total=len(entities_with_top_query_df),
    )

    for row in rows_with_entity_gaps:
        logger.info("%s, top query '%s':", row["calc_name"], row["top_query_name"])

        # Rows without a top query, or without a response, have no entity gaps.
        if "entity_gaps" not in row or row["entity_gaps"] is None:
            continue

        for entity_gap in row["entity_gaps"]:
            logger.info("  %s", entity_gap)


# Run the workflow. NOTE: top-level `await` works in notebook (IPython) cells; a plain
# Python script would need `asyncio.run(main())` instead.
await main()