In [15]:
import logging

logging.basicConfig(level=logging.WARNING, force=True)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Start from a known state: if neither configuration source below is
# importable, the final check must raise the explicit ValueError rather than
# a NameError on an undefined name.
WORDLIFT_KEY = None

# Source 1: the local `config/default.py` file.
try:
    from config import default as config

    WORDLIFT_KEY = config.WORDLIFT_KEY
except ImportError:
    logger.warning("Cannot import configuration from local `config/default.py` file.")

# Source 2: Google Colab Secrets. When available, this value replaces the one
# read from the local configuration file.
try:
    from google.colab import userdata

    WORDLIFT_KEY = userdata.get('WORDLIFT_KEY')
except ImportError:
    logger.warning("Cannot import configuration from google.colab.userdata.")

if WORDLIFT_KEY is None:
    raise ValueError('Configuration not set')



In [16]:
import sys

if "google.colab" in sys.modules:
    !pip install \
    "wordlift-client>=1.75.0,<2.0.0" \
    "tqdm>=4.67.1,<5.0.0" \
    "gql[aiohttp]>=3.5.1,<4.0.0"
    "tenacity>=9.0.0,<10.0.0" \
    "wordlift-sdk @ git+https://github.com/wordlift/python-sdk.git"

In [17]:

import pandas as pd

from wordlift_client import InternalLinkRequest, Item
from tqdm.asyncio import tqdm
import wordlift_sdk.graphql as graphql
from wordlift_sdk.client import ClientConfigurationFactory
from wordlift_sdk.utils import delayed, create_entity_patch_request
from pandas import Series
from tenacity import retry, stop_after_attempt, wait_fixed
from wordlift_client import InternalLink
from rdflib import Graph, URIRef, RDF, Literal, XSD
from urllib.parse import quote
import re
from wordlift_client import InternalLinksApi, AnchorText, VectorSearchQueryRequest, EntityPatchRequest
import wordlift_sdk.entity as entity

# Client configuration shared by every API call in this notebook, built from
# the WordLift key loaded in the configuration cell.
configuration = ClientConfigurationFactory(key=WORDLIFT_KEY).create()

# Setting the host is optional; it defaults to https://api.wordlift.io.
# See configuration.py for the full list of supported configuration parameters.
# NOTE(review): `api_url` is not referenced by any other visible cell.
api_url = 'https://api.wordlift.io'

In [18]:
import wordlift_client


@retry(
    stop=stop_after_attempt(5),
    wait=wait_fixed(2)
)
async def internal_link(row: Series) -> InternalLink | None:
    """Request internal link suggestions for a single entity.

    Args:
        row: A row providing the entity IRI under ``'iri'`` and the entity
            web page URL under ``'url'``.

    Returns:
        The first suggestion returned by the API, or ``None`` when the API
        returns no suggestions.

    Raises:
        Exception: Any error raised by the API call is logged and re-raised,
            which lets the ``retry`` decorator try again (5 attempts, 2
            seconds apart).
    """
    entity_url = row['url']
    entity_id = row['iri']

    request = InternalLinkRequest(
        anchor_text=AnchorText(
            enabled=True
        ),
        items=[
            Item(
                id=entity_id,
                query=VectorSearchQueryRequest(
                    query_url=entity_url,
                    similarity_top_k=10
                )
            )
        ]
    )

    async with wordlift_client.ApiClient(configuration) as api_client:
        api = InternalLinksApi(api_client)

        try:
            results = await api.create_internal_link_suggestion(
                internal_link_request=request,
                _request_timeout=120
            )
        except Exception as e:
            # `%r` keeps the exception type visible: some errors (timeouts,
            # for instance) have an empty message and would log as blank.
            logger.error("Error creating Internal Links for %s: %r", entity_url, e)
            # Bare `raise` preserves the original traceback for the retry logic.
            raise

    # An empty response is a valid "no suggestions" answer, not a failure:
    # return None instead of raising IndexError and retrying needlessly.
    if not results:
        return None

    return results[0]


class InternalLinkData:
    """Plain container for the data produced from one internal link suggestion."""

    # Identifier of the source entity the link group belongs to.
    source_id: str
    # Patch request to apply to the source entity.
    source_patch_request: EntityPatchRequest
    # RDF graph holding the link group and its links.
    link_group_graph: Graph

    def __init__(self, source_id: str, source_patch_request: EntityPatchRequest, link_group_graph: Graph):
        """Store the three values as-is; no validation or copying is performed."""
        self.link_group_graph = link_group_graph
        self.source_patch_request = source_patch_request
        self.source_id = source_id


async def create_internal_link_data(internal_link: InternalLink, group_id: str) -> InternalLinkData:
    """
    Map an InternalLink object onto RDF data using the SEO vocabulary.

    Two graphs are built:

    * a source graph with a single ``seovoc:hasLinkGroup`` triple pointing
      from the source entity to the link group; it is serialized as JSON-LD
      and wrapped in an ``add`` EntityPatchRequest;
    * a link group graph describing the ``seovoc:LinkGroup`` and one
      ``seovoc:Link`` per destination.

    Args:
        internal_link: InternalLink object from wordlift_client
        group_id: Identifier of the link group. It becomes part of the link
            group IRI (``<source id>/linkgroup_<group_id>``) and may only
            contain alphanumeric characters, hyphens and underscores.

    Returns:
        InternalLinkData holding the source entity id, the patch request for
        the source entity and the link group graph.

    Raises:
        ValueError: If ``group_id`` is empty, is not a string, or contains
            characters other than alphanumerics, hyphens and underscores.
    """

    # This is an example structure (destinations shortened):
    #
    # InternalLink(
    #     destinations=[
    #         InternalLinkDestination(
    #             name='SEO Strategies',
    #             position=1,
    #             url='https://wordlift.io/blog/en/advanced-seo-natural-language-processing/'
    #         ),
    #         InternalLinkDestination(
    #             name='SERP Analysis',
    #             position=2,
    #             url='https://wordlift.io/blog/en/serp-analysis/'
    #         ),
    #         ...
    #         InternalLinkDestination(
    #             name='Knowledge Graphs',
    #             position=9,
    #             url='https://wordlift.io/blog/en/finding-entities-knowledge-graphs/'
    #         )
    #     ],
    #     source=InternalLinkSource(
    #         id='https://data.wordlift.io/wl1505904/title-tag-seo-using-deep-learning-and-tensorflow-3e9202b7c7a6fde83605021a5820ab04',
    #         name=None,
    #         url='https://wordlift.io/blog/en/title-tag-seo-using-ai/'
    #     )
    # )

    seovoc = "https://w3id.org/seovoc/"
    schema = "http://schema.org/"

    # Validate group_id
    if not group_id or not isinstance(group_id, str):
        raise ValueError("group_id must be a non-empty string")

    # Check for valid characters (alphanumeric, hyphen, underscore).
    # `fullmatch` is required here: `match` with a `$` anchor would also accept
    # a value that ends with a newline.
    if not re.fullmatch(r'[a-zA-Z0-9\-_]+', group_id):
        raise ValueError("group_id must contain only alphanumeric characters, hyphens, or underscores")

    # URL encode the group_id for extra safety
    safe_group_id = quote(group_id)

    link_group_graph = Graph()
    source_graph = Graph()

    source_graph.bind("seovoc", seovoc)

    # Define namespaces
    link_group_graph.bind("schema", schema)
    link_group_graph.bind("seovoc", seovoc)
    link_group_graph.bind("xsd", "http://www.w3.org/2001/XMLSchema#")

    # Create source resource
    source = internal_link.source
    source_resource = URIRef(source.id)

    # Create a default link group for the destinations
    link_group_id = f"{source.id}/linkgroup_{safe_group_id}"
    link_group = URIRef(link_group_id)

    # The source graph only carries the pointer from the source entity to the group.
    has_link_group = URIRef(seovoc + "hasLinkGroup")
    source_graph.add((source_resource, has_link_group, link_group))

    link_group_graph.add((link_group, RDF.type, URIRef(seovoc + "LinkGroup")))
    link_group_graph.add((link_group, URIRef(schema + "identifier"), Literal(group_id)))
    link_group_graph.add((link_group, URIRef(schema + "name"), Literal("Related Links")))
    link_group_graph.add((link_group, URIRef(seovoc + "isLinkGroupOf"), source_resource))

    # Add destinations as links
    for dest in internal_link.destinations:
        # Create link resource; the position makes the IRI unique within the group.
        link_resource = URIRef(f"{link_group_id}/link_{dest.position}")
        link_group_graph.add((link_resource, RDF.type, URIRef(seovoc + "Link")))

        # Add link properties
        link_group_graph.add(
            (link_resource, URIRef(schema + "position"), Literal(dest.position, datatype=XSD.integer)))
        link_group_graph.add((link_resource, URIRef(schema + "name"), Literal(dest.name)))
        link_group_graph.add((link_resource, URIRef(seovoc + "anchorText"), Literal(dest.name)))
        link_group_graph.add((link_resource, URIRef(seovoc + "anchorValue"), URIRef(dest.url)))
        # Link and group reference each other in both directions.
        link_group_graph.add((link_resource, URIRef(seovoc + "isLinkOf"), link_group))
        link_group_graph.add((link_group, URIRef(seovoc + "hasLink"), link_resource))

    source_patch_request = EntityPatchRequest(
        op='add',
        path='/' + str(has_link_group),
        value=source_graph.serialize(format='json-ld', auto_compact=True)
    )

    return InternalLinkData(source.id, source_patch_request, link_group_graph)


async def create_has_link_group_property(internal_link: InternalLink, group_id: str) -> Graph:
    """
    Build a graph linking the source entity to its link group.

    Args:
        internal_link: InternalLink object from wordlift_client; only its
            ``source.id`` is read.
        group_id: Identifier of the link group. It becomes part of the link
            group IRI (``<source id>/linkgroup_<group_id>``) and may only
            contain alphanumeric characters, hyphens and underscores.

    Returns:
        RDFlib Graph containing a single ``seovoc:hasLinkGroup`` triple from
        the source entity to the link group.

    Raises:
        ValueError: If ``group_id`` is empty, is not a string, or contains
            characters other than alphanumerics, hyphens and underscores.
    """
    # Validate group_id
    if not group_id or not isinstance(group_id, str):
        raise ValueError("group_id must be a non-empty string")

    # Check for valid characters (alphanumeric, hyphen, underscore).
    # `fullmatch` is required here: `match` with a `$` anchor would also accept
    # a value that ends with a newline.
    if not re.fullmatch(r'[a-zA-Z0-9\-_]+', group_id):
        raise ValueError("group_id must contain only alphanumeric characters, hyphens, or underscores")

    # URL encode the group_id for extra safety
    safe_group_id = quote(group_id)

    g = Graph()

    # Define namespaces
    g.bind("schema", "http://schema.org/")
    g.bind("seovoc", "https://w3id.org/seovoc/")
    g.bind("xsd", "http://www.w3.org/2001/XMLSchema#")

    # Create source resource
    source = internal_link.source
    source_resource = URIRef(source.id)

    # Create a default link group for the destinations
    link_group_id = f"{source.id}/linkgroup_{safe_group_id}"
    link_group = URIRef(link_group_id)
    g.add((source_resource, URIRef("https://w3id.org/seovoc/hasLinkGroup"), link_group))

    # Hand the graph back to the caller, as the return annotation promises.
    return g


async def handle(row: Series) -> None:
    """Fetch link suggestions for one row and store them.

    Does nothing when `internal_link` yields a falsy result. Otherwise the
    source entity is patched first, then the link group graph is sent as
    Turtle to the entities API.

    Args:
        row: Row forwarded unchanged to `internal_link`.
    """
    suggestion = await internal_link(row)
    if not suggestion:
        return

    link_data = await create_internal_link_data(suggestion, 'onboarding_notebook')

    # Step 1: attach the link group reference to the source entity.
    await entity.patch(configuration, link_data.source_id, [link_data.source_patch_request])

    # Step 2: create or update the link group and its links.
    async with wordlift_client.ApiClient(configuration) as api_client:
        turtle_payload = link_data.link_group_graph.serialize(format="turtle")
        entities_api = wordlift_client.EntitiesApi(api_client)
        await entities_api.create_or_update_entities(turtle_payload, _content_type="text/turtle")



In [19]:
from pprint import pprint


@retry(
    wait=wait_fixed(2),
    stop=stop_after_attempt(5),
)
async def kg() -> pd.DataFrame:
    """Query the graph for entities constrained to have a non-empty embedding value.

    The call is retried up to 5 times, waiting 2 seconds between attempts.

    Returns:
        The result of `graphql.query` for the `entities` root element,
        requested with the columns `iri` and `url` (`url` is read from
        `schema:url`).
    """
    entities_query = """
        query {
          entities(
            query: {
              embeddingValueConstraint: { exists: { exists: true, excludeEmpty: true } }
            }
          ) {
            iri
            url: string(name: "schema:url")
          }
        }
    """

    entities_df = await graphql.query(
        key=WORDLIFT_KEY,
        query_string=entities_query,
        root_element='entities',
        columns=['iri', 'url'],
    )
    return entities_df


async def main() -> None:
    id_url_df = await kg()

    # Enrich the Graph, notice that here we pass our callback `parse_html` which will return Patch requests, no need to deal with the actual API. We're polite and not making more than 2 concurrent reqs.
    sel_df = id_url_df.head(10)
    await tqdm.gather(
        *[delayed(handle, 2)(row) for index, row in sel_df.iterrows()],
        total=len(sel_df)
    )


await main()


 80%|████████  | 8/10 [00:20<00:05,  2.83s/it]ERROR:__main__:Error creating Internal Links: 
ERROR:__main__:Error creating Internal Links: 
100%|██████████| 10/10 [02:44<00:00, 16.40s/it]
