In [1]:
import requests
from dotenv import load_dotenv

import nest_asyncio
nest_asyncio.apply()

from langchain.document_loaders.sitemap import SitemapLoader
import sys
sys.path.append('../')
from SitemapLoaderExtended import SitemapLoaderExtended
from KBSitemapLoaderExtended import KBSitemapLoaderExtended
from bs4 import BeautifulSoup

from langchain.schema import Document
from typing import Any, Callable, Generator, Iterable, List, Optional, Iterator
import json

import re



### Docsite scraping

In [2]:
# Only crawl URLs matching the filter below: English ("/en/") pages that sit
# under a "managed-cloud" path.
sitemap_loader = SitemapLoaderExtended(
    web_path="https://doc.sitecore.com/sitemap-index.xml",
    filter_urls=[r".+/en/.+/managed-cloud/.+"],
)

# Rate limit the crawl.
sitemap_loader.requests_per_second = 2

# lazy_load() hands back an iterator; the pages are pulled in the next cell.
docs = sitemap_loader.lazy_load()


In [3]:
# `docs` is lazy: call next(docs) to pull a single page while experimenting.
# To load everything, drain the iterator into a list. extend() appends page by
# page, so pages fetched before a mid-crawl failure stay in docs_site.
docs_site = []
docs_site.extend(docs)

In [5]:
len(docs_site)

563

In [6]:
# Spot-check one scraped page: print the text and metadata of the
# configure-managed-cloud topic.
target_source = "https://doc.sitecore.com/xp/en/developers/101/managed-cloud/configure-managed-cloud.html"
for page in docs_site:
    if page.metadata["source"] != target_source:
        continue
    print(page.page_content)
    print(page.metadata)

 Edit in Paligo



Current version: 10.1



Abstract
Learn how to configure your Managed Cloud solution to best suit the needs of your organization.




This topic describes how to scale, size, and tune your Managed Cloud container solutions. It describes how to find and work with the configuration files in the infrastructure and application repositories.







Manage the size of Azure components





You might want to customize your Azure Kubernetes Service (AKS) cluster size, SQL database, and/or Azure Container Registry (ACR) tier size.
To manage the size of Azure components:



In the Infrastructure repository, go to: .\config\resources\resources.json





Manage the AKS cluster size, SQL database, and/or ACR tier size by updating the following parameters:


Parameter

Description

windows_vm_size

The Windows VM AKS size.

windows_scaled_vm_size

The Windows Scaled VM Size. Used for more efficient scaling of Content Delivery and XDBCollection pods.

windows_node_count

The amount

In [6]:
#save scraped data as json


def save_docs_to_jsonl(array: "Iterable[Document]", file_path: str) -> None:
    """Write documents to ``file_path`` in JSON Lines format.

    Each document is serialised with its own ``json()`` method and written on
    its own line. An existing file at ``file_path`` is overwritten.

    Args:
        array: Documents to write; every item must provide ``json()``.
        file_path: Destination path of the ``.jsonl`` file.
    """
    # Explicit UTF-8 so non-ASCII text is written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for doc in array:
            jsonl_file.write(doc.json() + '\n')

def load_docs_from_jsonl(file_path) -> "Iterable[Document]":
    """Read a JSON Lines file back into a list of ``Document`` objects.

    Every non-blank line is parsed as one JSON object whose keys are passed
    to ``Document`` as keyword arguments.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one ``Document`` per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly rather than with the locale default. Blank
    # lines are skipped so they do not make json.loads fail.
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        return [
            Document(**json.loads(line))
            for line in jsonl_file
            if line.strip()
        ]
    
# save_docs_to_jsonl(docs_site,'docsites_without_jap.jsonl')

# docsites_doc = load_docs_from_jsonl('docsites_without_jap.jsonl')

In [9]:
# List the title of every scraped docsite page, one per line.
for page in docs_site:
    print(page.metadata["title"])

Sitecore Cloud Services overview
Managed Cloud
Managed Cloud Standard
Sitecore Managed Cloud Standard overview
Topology and tiers for Sitecore Managed Cloud
The Managed Cloud Standard onboarding process
The Sitecore Managed Cloud Standard policies
SLA for Managed Cloud Standard
Request a Managed Cloud Standard environment
Access the Managed Cloud Standard portals
Managed Cloud Premium
Topologies and tiers for Sitecore Managed Cloud Premium
The Managed Cloud Premium onboarding process
Sitecore Experience Platform behavior on Azure
Managed Cloud Premium portals and account management
Using Managed Cloud Premium
Managed Cloud Premium FAQs
Planning for Managed Cloud
Planning: Commerce considerations for Managed Cloud
Capacity planning for Sitecore Managed Cloud
Comparing on-premise, IaaS, and PaaS for Managed Cloud
Sitecore module compatibility with Managed Cloud
Plan your RPO and RTO in case of an outage
Disaster recovery for the Managed Cloud service
Technical specifications for Managed 

In [28]:
# docsites_doc was only ever assigned in a commented-out line, so this cell
# failed on a fresh kernel. Load the saved docsite scrape before counting it.
docsites_doc = load_docs_from_jsonl('docsites_without_jap.jsonl')
len(docsites_doc)

563

### KB scraping

In [2]:
# Loader pointed at the support site's knowledge base (KB) sitemap.
kb_sitemap_loader = KBSitemapLoaderExtended(
    web_path="https://support.sitecore.com/kb_sitemap.do",
)

# Rate limit the crawl.
kb_sitemap_loader.requests_per_second = 2

# lazy_load() hands back an iterator; the articles are pulled in the next cell.
kb_docs = kb_sitemap_loader.lazy_load()

In [3]:
# `kb_docs` is lazy: call next(kb_docs) to pull a single article while
# experimenting. To load everything, drain the iterator into a list. extend()
# appends article by article, so articles fetched before a mid-crawl failure
# stay in kbs.
kbs = []
kbs.extend(kb_docs)


In [4]:
len(kbs)

1554

In [7]:
save_docs_to_jsonl(kbs,'kbs_without_jap.jsonl')

In [9]:
# Preview a few articles only; displaying the full list floods the notebook output.
kbs[:3]

[Document(page_content="Systems infrastructure\r\nSitecore Discover\xa0is designed to automatically scale capacity as the traffic demand increases. However, Sitecore Discover also preemptively ramps up the capacity of all services during this time of the year which includes Thanksgiving, Black Friday, Cyber Monday, Christmas, and New Year.\r\nDuring Sitecore Discover's load testing, the system was able to easily handle\xa05x the daily peak traffic. Based on this experience, the Sitecore Discover engineering and operations teams are confident that the system can comfortably handle the anticipated holiday load.\r\nPersonnel and processes\r\nIn addition to the above, Sitecore Discover will also have additional personnel on duty to monitor and assist with any issues that are reported during the peak holiday traffic. Below is a summary of the holiday support plan in place.\r\nSupport structure\r\n\r\n\r\n\r\n\r\nOn-call Support\r\n\r\n\r\nTechnical Support Solution Engineers are on call 24/

In [11]:
# NOTE(review): this reads 'kbs.jsonl', while the save cell above writes
# 'kbs_without_jap.jsonl' — confirm which file is intended. kb_articles is not
# used again in the visible part of this notebook.
kb_articles = load_docs_from_jsonl('kbs.jsonl')

### Testing KB article scraping

In [7]:
# Request a single KB article from the support portal's page API as JSON.
headers = {
    "X-portal":"45d6680fdb52220099f93691f0b8f5ad",
    "Accept":"application/json"
}

# requests has no default timeout, so set one to keep the cell from hanging on
# an unresponsive server. raise_for_status() surfaces HTTP errors here rather
# than as a confusing failure when the body is parsed later.
response = requests.get(
    "https://support.sitecore.com/api/now/sp/page",
    params={"id": "kb_article_view", "sysparm_article": "KB1003172"},
    headers=headers,
    timeout=30,
)
response.raise_for_status()

In [8]:
a = response.json()

In [9]:
a

{'result': {'$$uiNotification': [],
  'metatags': [],
  'theme': {'footer': {}, 'header': {}},
  'containers': [{'sys_id': '0cc4f1a01b4e281016486392f54bcb8a',
    'bootstrap_alt': False,
    'subheader': True,
    'background': '{"background-size":"initial","background-position":"center center"}',
    'width': 'container',
    'container_class_name': 's_kb-container s_kb-container-1',
    'title': '',
    'rows': [{'sys_id': '8cc4f1a01b4e281016486392f54bcb8d',
      'columns': [{'sys_id': 'c4c4f1a01b4e281016486392f54bcb90',
        'size_classes': 'col-md-12 ',
        'rows': [],
        'widgets': [{'sys_id': '88c4f1a01b4e281016486392f54bcb9f',
          'widget': {'template': '<div class="s_breadcrumbs"> \n <div ng-class="c.data.showSearchBox ? \'col-md-9\' : \'col-md-12\'" class="breadcrumbs hidden-sm hidden-xs"> \n  <ul class="nav nav-pills nav-sm"> \n   <li><a ng-href="?id={{portal.homepage_dv}}">Home</a></li> \n   <li><i class="fa fa-chevron-right"></i></li> \n   <li ng-if="!c.b

In [10]:
a["result"]["containers"]

[{'sys_id': '0cc4f1a01b4e281016486392f54bcb8a',
  'bootstrap_alt': False,
  'subheader': True,
  'background': '{"background-size":"initial","background-position":"center center"}',
  'width': 'container',
  'container_class_name': 's_kb-container s_kb-container-1',
  'title': '',
  'rows': [{'sys_id': '8cc4f1a01b4e281016486392f54bcb8d',
    'columns': [{'sys_id': 'c4c4f1a01b4e281016486392f54bcb90',
      'size_classes': 'col-md-12 ',
      'rows': [],
      'widgets': [{'sys_id': '88c4f1a01b4e281016486392f54bcb9f',
        'widget': {'template': '<div class="s_breadcrumbs"> \n <div ng-class="c.data.showSearchBox ? \'col-md-9\' : \'col-md-12\'" class="breadcrumbs hidden-sm hidden-xs"> \n  <ul class="nav nav-pills nav-sm"> \n   <li><a ng-href="?id={{portal.homepage_dv}}">Home</a></li> \n   <li><i class="fa fa-chevron-right"></i></li> \n   <li ng-if="!c.breadcrumbs"><a href ng-bind="::page.title"></a></li> \n   <li ng-repeat-start="item in c.breadcrumbs"> <a ng-class="item.css_class" ng-

In [11]:
def findkeys(node, kv):
    """Yield every value stored under key ``kv`` anywhere inside ``node``.

    ``node`` is walked recursively through dicts and lists. For a dict, the
    value under ``kv`` (if present) is yielded before the dict's values are
    searched. Anything that is neither a dict nor a list yields nothing.
    """
    if isinstance(node, dict):
        if kv in node:
            yield node[kv]
        children = node.values()
    elif isinstance(node, list):
        children = node
    else:
        return
    for child in children:
        yield from findkeys(child, kv)

In [20]:
# findkeys() returns single-use generators. Materialise them as lists so the
# cells below can be re-run without silently getting empty results.
title = list(findkeys(a, "page_title"))
content = list(findkeys(a, "kbContentData"))
language = list(findkeys(a, "langList"))

In [21]:
list(title)

['Known Issues - Sitecore Identity Server 7.xのライセンス キーの有効期限が切れると、ログに警告メッセージが出力される']

In [27]:
language = list(language)

# `language` holds one list of language entries per "langList" found.
# any() stops at the first selected Japanese entry; the previous `break` only
# left the inner loop, so the outer loop kept scanning.
isJap = any(
    entry["language"] == "ja" and entry["selected"] is True
    for entries in language
    for entry in entries
)

print(isJap)
print(language)


True
[[{'sys_id': '92b850c81b757d14722d4042b24bcb32', 'number': 'KB1003162', 'language': 'en', 'label': 'English', 'selected': False}, {'sys_id': '40c16b8e1b7d7594722d4042b24bcb11', 'number': 'KB1003172', 'language': 'ja', 'label': 'Japanese', 'selected': True}]]


In [23]:
print(language)

[[{'sys_id': '92b850c81b757d14722d4042b24bcb32', 'number': 'KB1003162', 'language': 'en', 'label': 'English', 'selected': False}, {'sys_id': '40c16b8e1b7d7594722d4042b24bcb11', 'number': 'KB1003172', 'language': 'ja', 'label': 'Japanese', 'selected': True}]]


In [46]:
b = list(content)


In [47]:
b[0]["data"]

'<h3><a id="Description" href="#Description">Description</a></h3>\r\n<p><strong>Purpose</strong></p>\r\n<p>This article describes the eligibility, scope, service levels, and other details of Sitecore\'s support offering.</p>\r\n<p><strong>Overview</strong></p>\r\n<p>Sitecore\'s core support offerings consist of two levels of support service: <strong>Standard Support</strong> and <strong>24x7 Premium Support</strong>. These support services provide increasing levels of responsiveness, from three business days for low priority problems, down to as little as one hour for critical issues, and varying hours of coverage.</p>\r\n<p>Access to Sitecore support is typically provided to a Sitecore Solution Partner on the customer\'s behalf. This ensures that experienced and knowledgeable support personnel are available to provide direct support for the solution.</p>\r\n<p>Customers with an active Sitecore Software Subscription are also provided access to Sitecore support.</p>\r\n<p>In addition to

### Testing docsite scraping

In [23]:
# Download one docsite page and collect its <section> elements.
# requests has no default timeout, so set one; raise_for_status() makes an
# HTTP error fail here instead of parsing an error page.
response = requests.get(
    "https://doc.sitecore.com/xp/en/developers/100/managed-cloud/capacity-planning-for-sitecore-managed-cloud.html",
    timeout=30,
)
response.raise_for_status()
# NOTE(review): 'xml' selects an XML parser although the URL is an .html page —
# confirm this is intentional before switching parsers.
soup = BeautifulSoup(response.content, 'xml')
sections = soup.find_all('section')

In [24]:
sections[0]

<section class="section relative overflow-hidden" data-originid="UUID-8eaa56f7-41f1-f756-7c24-88d5f9c143e2" data-permalink="capacity-planning-for-sitecore-managed-cloud.html" data-publicationdate="04/03/23" data-relativeprefix="" data-timemodified="08/21/2019" data-topiclevel="1" dir="ltr" id="UUID-6e45c13d-259d-4b3d-e049-81fc43f0872d" lang="en" xml:lang="en"><div class="relative w-auto px-4 py-1 items-center justify-center mb-2 text-white transition-all duration-500 ease-in-out rounded-md opacity-75 align-self-start hover:opacity-100 bg-brand-violet-dark paligo-edit-link-container hidden"><a class="relative flex items-center justify-center w-full h-full focus-ring gap-x-4" href="https://sitecore.paligoapp.com/document/edit/UUID-8eaa56f7-41f1-f756-7c24-88d5f9c143e2" rel="noopener noreferrer" target="_blank"><svg aria-hidden="true" class="w-4 h-4 text-white" fill="none" stroke="currentColor" stroke-width="1.5" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M16.862 4.487

In [25]:
sections[0].find_all("h1", {"class": "title"})[0].get_text()

'Capacity planning for Sitecore Managed Cloud'

### Post-processing docsite

In [13]:

def process_docsite(data):
    """Clean the scraped page text of every docsite document in place.

    For each document, the leading " Edit in Paligo / Current version: ... /
    Abstract" header is removed, runs of newlines are collapsed to one newline
    (and leading/trailing newlines stripped), and runs of spaces and tabs are
    collapsed to a single space.

    Args:
        data: Iterable of objects with a writable ``page_content`` string.

    Returns:
        None. The documents are modified in place.
    """
    for doc in data:
        text = doc.page_content
        # Drop the boilerplate header seen at the top of scraped pages.
        text = re.sub(r'^ Edit in Paligo\n\n\n\nCurrent version: .+\n\n\n\n(Abstract\n)?', "", text)
        text = re.sub(r'\n+', '\n', text).strip('\n')
        # Collapse spaces and tabs in one pass. Replacing tabs after the
        # spaces were already collapsed left doubled spaces behind whenever a
        # tab sat next to a space.
        text = re.sub(r'[ \t]+', ' ', text)
        doc.page_content = text
        # doc.metadata["title"] = doc.metadata["title"].strip('\n')

docsites_doc_copy = load_docs_from_jsonl('docsites_without_jap.jsonl')
process_docsite(docsites_doc_copy)

# Preview the first few cleaned documents only; displaying the whole list
# floods the notebook output.
docsites_doc_copy[:10]


[Document(page_content='View all the Sitecore Cloud offerings to best target campaigns, scale your solution, and develop a disaster recovery plan that is custom-made for your organization and requirements.\nThis overview, and the following Sitecore Cloud Services overview diagram, takes you through all the Sitecore Cloud offerings, what their functionality is, what you can use them for, and how the different modules integrate with each other and the Sitecore Experience Platform (XP).\nNote\nThis topic describes how to deploy Sitecore on Microsoft Azure with Azure App Services (PaaS). If you are deploying with Docker Containers with Kubernetes, please refer to this Managed Cloud documentation version instead.\nRefer to the Module versions section to understand how all of the different Cloud modules relate to the various versions of the Sitecore platform, and use the Sitecore Cloud Status page to check how the various Cloud services are operating.\nModule versions\nSome of the Sitecore C

In [14]:
save_docs_to_jsonl(docsites_doc_copy, "docsites_without_jap2.jsonl")

### Post-processing KB

In [12]:

def process_kb(data):
    """Clean the scraped text of every KB document in place.

    For each document, runs of CRLF are collapsed to one CRLF and runs of LF to
    one LF (with leading/trailing CR/LF characters stripped). Each run of
    "\\r\\n-" is replaced by a space, and finally runs of spaces and tabs are
    collapsed to a single space.

    Args:
        data: Iterable of objects with a writable ``page_content`` string.

    Returns:
        None. The documents are modified in place.
    """
    for doc in data:
        text = doc.page_content
        text = re.sub(r'(\r\n)+', "\r\n", text).strip("\r\n")
        text = re.sub(r'\n+', "\n", text).strip("\n")
        text = re.sub(r'(\r\n-)+', ' ', text)
        # Collapse spaces and tabs last and in one pass. Doing it before the
        # "\r\n-" replacement, with tabs handled after spaces, left doubled
        # spaces in the result.
        text = re.sub(r'[ \t]+', ' ', text)
        doc.page_content = text
        

kb_doc_copy = load_docs_from_jsonl('kbs_without_jap.jsonl')
process_kb(kb_doc_copy)

# Preview the first few cleaned articles only; displaying the whole list
# floods the notebook output.
kb_doc_copy[:10]

[Document(page_content='Description\r\nThe IP address of the Solr server gets cached on the .NET level. This behavior might lead to connection issues if the solr.search connection string value is a domain name instead of a static IP address. If the DNS mapping for the domain is changed to a new IP address, the changes are not picked up by Sitecore instances until the instances have been restarted.This is a common scenario when using SearchStax Disaster Recovery. This article provides recommendations on how to reestablish connection to Solr when the DNS is updated to point to a new IP address.\r\nSolution\r\nTo reestablish connection to Solr when the DNS is updated to point to a new IP address, consider one of the following options:\r\nFor\xa0Sitecore XP 10.2.1 and later, enable the feature using the configuration file \\App_Config\\Include\\Examples\\Sitecore.ContentSearch.Solr.EnableConnectionLeaseTimeout.config.example.\r\nFor\xa0Sitecore XP 10.2.0, download and install the latest cu

In [13]:
save_docs_to_jsonl(kb_doc_copy, "kbs_without_jap2.jsonl")