In [1]:
# Notebook setup: imports, environment loading, and OpenAI client configuration.
import os
import openai
import requests
from dotenv import load_dotenv

# Load variables from ../.env into the environment; the return value is discarded.
_ = load_dotenv(dotenv_path="../.env") # read local .env file
# Configure the openai module for an Azure deployment from environment variables.
# os.environ[...] raises KeyError if any of these variables is not set.
openai.api_key  = os.environ['OPENAI_API_KEY']
openai.api_type = "azure"
openai.api_base = os.environ['OPENAI_ENDPOINT']
openai.api_version = os.environ['OPENAI_API_VERSION']

# NOTE(review): presumably needed so the loaders can run their own asyncio event
# loop inside the loop Jupyter already has running — confirm.
import nest_asyncio
nest_asyncio.apply()

from langchain.document_loaders.sitemap import SitemapLoader
import sys
# Put the parent directory on the module search path. This must run before the
# two *Extended imports below, which are presumably modules located there.
sys.path.append('../')
from SitemapLoaderExtended import SitemapLoaderExtended
from KBSitemapLoaderExtended import KBSitemapLoaderExtended
from bs4 import BeautifulSoup





### Docsite scraping

In [13]:
# Crawl the doc.sitecore.com sitemap index, keeping only URLs that contain a
# /managed-cloud/ path segment.
sitemap_loader = SitemapLoaderExtended(
    web_path="https://doc.sitecore.com/sitemap-index.xml",
    filter_urls=[r".+/managed-cloud/.+"],
)

# Rate limit: at most two requests per second.
sitemap_loader.requests_per_second = 2

# Lazy document iterable; it is consumed by the cell that follows.
docs = sitemap_loader.lazy_load()


In [14]:
# `docs` is lazy: call next() on it to pull a single page at a time, e.g.
# a = next(docs)

# To load all docs, drain the iterable into a list. The list is created first
# so that `a` stays bound even if the crawl stops part-way through.
a = []
a.extend(docs)

In [15]:
len(a)

563

In [20]:
# Spot-check one scraped page: print the content and metadata of every
# document whose source URL matches the page below.
target_url = "https://doc.sitecore.com/xp/en/developers/101/managed-cloud/configure-managed-cloud.html"
for doc in a:
    if doc.metadata["source"] != target_url:
        continue
    print(doc.page_content)
    print(doc.metadata)

 Edit in Paligo





Abstract
Learn how to configure your Managed Cloud solution to best suit the needs of your organization.




This topic describes how to scale, size, and tune your Managed Cloud container solutions. It describes how to find and work with the configuration files in the infrastructure and application repositories.







Manage the size of Azure components





You might want to customize your Azure Kubernetes Service (AKS) cluster size, SQL database, and/or Azure Container Registry (ACR) tier size.
To manage the size of Azure components:



In the Infrastructure repository, go to: .\config\resources\resources.json





Manage the AKS cluster size, SQL database, and/or ACR tier size by updating the following parameters:


Parameter

Description

windows_vm_size

The Windows VM AKS size.

windows_scaled_vm_size

The Windows Scaled VM Size. Used for more efficient scaling of Content Delivery and XDBCollection pods.

windows_node_count

The amount of Windows nodes. A mi

In [7]:
#save scraped data as json
from langchain.schema import Document
from typing import Any, Callable, Generator, Iterable, List, Optional, Iterator
import json

def save_docs_to_jsonl(array:Iterable["Document"], file_path:str)->None:
    """Write documents to ``file_path`` in JSON Lines format, one per line.

    Args:
        array: Iterable of documents. Each item must expose a ``json()``
            method returning its serialized form as a string.
        file_path: Destination path. The file is created, or overwritten if
            it already exists.
    """
    # Pin the encoding so the output does not depend on the platform's
    # default text encoding (which is not UTF-8 on every system).
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for doc in array:
            jsonl_file.write(doc.json() + '\n')

def load_docs_from_jsonl(file_path)->Iterable["Document"]:
    """Read a JSON Lines file and rebuild one ``Document`` per non-blank line.

    Args:
        file_path: Path to a file holding one JSON object per line, such as
            the output of ``Document.json()``.

    Returns:
        A list of ``Document`` objects in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    array = []
    # Explicit UTF-8 so reading does not depend on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing inside json.loads.
            if not line.strip():
                continue
            data = json.loads(line)
            array.append(Document(**data))
    return array
    
# save_docs_to_jsonl(a,'docsites.jsonl')

docsites_doc = load_docs_from_jsonl('docsites.jsonl')

In [27]:
docsites_doc

[Document(page_content=' Edit in Paligo\n\n\n\n\n\nAbstract\nView all the Sitecore Cloud offerings to best target campaigns, scale your solution, and develop a disaster recovery plan that is custom-made for your organization and requirements.\n\n\n\n\nThis overview, and the following Sitecore Cloud Services overview diagram, takes you through all the Sitecore Cloud offerings, what their functionality is, what you can use them for, and how the different modules integrate with each other and the Sitecore Experience Platform (XP).\nNote\nThis topic describes how to deploy Sitecore on Microsoft Azure with Azure App Services (PaaS). If you are deploying with Docker Containers with Kubernetes, please refer to this Managed Cloud documentation version instead.\n\nRefer to the Module versions section to understand how all of the different Cloud modules relate to the various versions of the Sitecore platform, and use the Sitecore Cloud Status page to check how the various Cloud services are oper

In [28]:
len(docsites_doc)

563

### KB scraping

In [2]:
# Loader for the support-portal knowledge-base sitemap.
kb_sitemap_loader = KBSitemapLoaderExtended(
    web_path="https://support.sitecore.com/kb_sitemap.do",
)

# Rate limit: at most two requests per second.
kb_sitemap_loader.requests_per_second = 2

# Lazy document iterable; it is consumed by the cell that follows.
kb_docs = kb_sitemap_loader.lazy_load()

In [3]:
# `kb_docs` is lazy: call next() on it to pull a single article at a time, e.g.
# kb = next(kb_docs)

# To load all docs, drain the iterable into a list of documents. The list is
# created first so that `kbs` stays bound even if the crawl stops part-way.
kbs = []
kbs.extend(kb_docs)


In [5]:
len(kbs)

2760

In [8]:
save_docs_to_jsonl(kbs,'kbs.jsonl')

In [9]:
kbs

[Document(page_content="Systems infrastructure\r\nSitecore Discover\xa0is designed to automatically scale capacity as the traffic demand increases. However, Sitecore Discover also preemptively ramps up the capacity of all services during this time of the year which includes Thanksgiving, Black Friday, Cyber Monday, Christmas, and New Year.\r\nDuring Sitecore Discover's load testing, the system was able to easily handle\xa05x the daily peak traffic. Based on this experience, the Sitecore Discover engineering and operations teams are confident that the system can comfortably handle the anticipated holiday load.\r\nPersonnel and processes\r\nIn addition to the above, Sitecore Discover will also have additional personnel on duty to monitor and assist with any issues that are reported during the peak holiday traffic. Below is a summary of the holiday support plan in place.\r\nSupport structure\r\n\r\n\r\n\r\n\r\nOn-call Support\r\n\r\n\r\nTechnical Support Solution Engineers are on call 24/

In [11]:
kb_articles = load_docs_from_jsonl('kbs.jsonl')

In [12]:
kb_articles

[Document(page_content="Systems infrastructure\r\nSitecore Discover\xa0is designed to automatically scale capacity as the traffic demand increases. However, Sitecore Discover also preemptively ramps up the capacity of all services during this time of the year which includes Thanksgiving, Black Friday, Cyber Monday, Christmas, and New Year.\r\nDuring Sitecore Discover's load testing, the system was able to easily handle\xa05x the daily peak traffic. Based on this experience, the Sitecore Discover engineering and operations teams are confident that the system can comfortably handle the anticipated holiday load.\r\nPersonnel and processes\r\nIn addition to the above, Sitecore Discover will also have additional personnel on duty to monitor and assist with any issues that are reported during the peak holiday traffic. Below is a summary of the holiday support plan in place.\r\nSupport structure\r\n\r\n\r\n\r\n\r\nOn-call Support\r\n\r\n\r\nTechnical Support Solution Engineers are on call 24/

### Testing KB article scraping

In [39]:
# Headers for the support portal's page API; a JSON response is requested.
# NOTE(review): "X-portal" looks like a portal identifier rather than a
# credential — confirm before sharing this notebook.
headers = {
    "X-portal":"45d6680fdb52220099f93691f0b8f5ad",
    "Accept":"application/json"
}

# Fetch the page definition for a single KB article.
# - The query string goes through `params` so requests handles the encoding.
# - `timeout` stops the cell from hanging forever if the server never answers
#   (requests has no timeout by default).
response = requests.get(
    "https://support.sitecore.com/api/now/sp/page",
    params={"id": "kb_article_view", "sysparm_article": "KB0463549"},
    headers=headers,
    timeout=30,
)
# Fail here on a 4xx/5xx answer rather than later when decoding the body.
response.raise_for_status()

In [40]:
a = response.json()

In [41]:
a

{'result': {'$$uiNotification': [],
  'metatags': [],
  'theme': {'footer': {}, 'header': {}},
  'containers': [{'sys_id': '0cc4f1a01b4e281016486392f54bcb8a',
    'bootstrap_alt': False,
    'subheader': True,
    'background': '{"background-size":"initial","background-position":"center center"}',
    'width': 'container',
    'container_class_name': 's_kb-container s_kb-container-1',
    'title': '',
    'rows': [{'sys_id': '8cc4f1a01b4e281016486392f54bcb8d',
      'columns': [{'sys_id': 'c4c4f1a01b4e281016486392f54bcb90',
        'size_classes': 'col-md-12 ',
        'rows': [],
        'widgets': [{'sys_id': '88c4f1a01b4e281016486392f54bcb9f',
          'widget': {'template': '<div class="s_breadcrumbs"> \n <div ng-class="c.data.showSearchBox ? \'col-md-9\' : \'col-md-12\'" class="breadcrumbs hidden-sm hidden-xs"> \n  <ul class="nav nav-pills nav-sm"> \n   <li><a ng-href="?id={{portal.homepage_dv}}">Home</a></li> \n   <li><i class="fa fa-chevron-right"></i></li> \n   <li ng-if="!c.b

In [42]:
a["result"]["containers"]

[{'sys_id': '0cc4f1a01b4e281016486392f54bcb8a',
  'bootstrap_alt': False,
  'subheader': True,
  'background': '{"background-size":"initial","background-position":"center center"}',
  'width': 'container',
  'container_class_name': 's_kb-container s_kb-container-1',
  'title': '',
  'rows': [{'sys_id': '8cc4f1a01b4e281016486392f54bcb8d',
    'columns': [{'sys_id': 'c4c4f1a01b4e281016486392f54bcb90',
      'size_classes': 'col-md-12 ',
      'rows': [],
      'widgets': [{'sys_id': '88c4f1a01b4e281016486392f54bcb9f',
        'widget': {'template': '<div class="s_breadcrumbs"> \n <div ng-class="c.data.showSearchBox ? \'col-md-9\' : \'col-md-12\'" class="breadcrumbs hidden-sm hidden-xs"> \n  <ul class="nav nav-pills nav-sm"> \n   <li><a ng-href="?id={{portal.homepage_dv}}">Home</a></li> \n   <li><i class="fa fa-chevron-right"></i></li> \n   <li ng-if="!c.breadcrumbs"><a href ng-bind="::page.title"></a></li> \n   <li ng-repeat-start="item in c.breadcrumbs"> <a ng-class="item.css_class" ng-

In [43]:
def findkeys(node, kv):
    """Recursively yield every value stored under key ``kv`` in ``node``.

    ``node`` may be any nesting of lists and dicts (e.g. decoded JSON).
    For a dict, the value at ``kv`` (when present) is yielded first, then each
    of the dict's values is searched in turn — including the matched value
    itself. Anything that is neither a list nor a dict yields nothing.
    """
    if isinstance(node, dict):
        if kv in node:
            yield node[kv]
        children = node.values()
    elif isinstance(node, list):
        children = node
    else:
        # Scalars and other types: nothing to search.
        return
    for child in children:
        yield from findkeys(child, kv)

In [44]:
title = findkeys(a, "page_title")
content = findkeys(a, "kbContentData")

In [45]:
list(title)

['Support Information - Sitecore Support Program overview']

In [46]:
b = list(content)


In [47]:
b[0]["data"]

'<h3><a id="Description" href="#Description">Description</a></h3>\r\n<p><strong>Purpose</strong></p>\r\n<p>This article describes the eligibility, scope, service levels, and other details of Sitecore\'s support offering.</p>\r\n<p><strong>Overview</strong></p>\r\n<p>Sitecore\'s core support offerings consist of two levels of support service: <strong>Standard Support</strong> and <strong>24x7 Premium Support</strong>. These support services provide increasing levels of responsiveness, from three business days for low priority problems, down to as little as one hour for critical issues, and varying hours of coverage.</p>\r\n<p>Access to Sitecore support is typically provided to a Sitecore Solution Partner on the customer\'s behalf. This ensures that experienced and knowledgeable support personnel are available to provide direct support for the solution.</p>\r\n<p>Customers with an active Sitecore Software Subscription are also provided access to Sitecore support.</p>\r\n<p>In addition to

### Testing docsite scraping

In [10]:
# Fetch one documentation page. `timeout` keeps the cell from hanging
# indefinitely, since requests has no timeout by default.
response = requests.get(
    "https://doc.sitecore.com/search/en/users/search-user-guide/introduction-to-sitecore-search.html",
    timeout=30,
)
# Fail here on a 4xx/5xx answer rather than parsing an error page.
response.raise_for_status()
# NOTE(review): the 'xml' parser is kept as-is although the target is an
# .html page — confirm this is intended.
soup = BeautifulSoup(response.content, 'xml')
# Collect every <section> element on the page.
sections = soup.find_all('section')

In [13]:
sections[0]

<section class="section relative overflow-hidden" data-originid="UUID-51d0515a-16ba-c611-06b9-c2d843819fa0" data-permalink="introduction-to-sitecore-search.html" data-publicationdate="10/11/23" data-relativeprefix="" data-timemodified="08/02/2023" data-topiclevel="1" dir="ltr" id="UUID-423a5909-d5ec-36f7-00fe-9e32a5ba23fe" lang="en" xml:lang="en"><div class="relative w-auto px-4 py-1 items-center justify-center mb-2 text-white transition-all duration-500 ease-in-out rounded-md opacity-75 align-self-start hover:opacity-100 bg-brand-violet-dark paligo-edit-link-container hidden"><a class="relative flex items-center justify-center w-full h-full focus-ring gap-x-4" href="https://sitecore.paligoapp.com/document/edit/UUID-51d0515a-16ba-c611-06b9-c2d843819fa0" rel="noopener noreferrer" target="_blank"><svg aria-hidden="true" class="w-4 h-4 text-white" fill="none" stroke="currentColor" stroke-width="1.5" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M16.862 4.487l1.687-1.688a

In [18]:
sections[0].find_all("div", {"class": "title"})[0].get_text()

'\nIntroduction to Sitecore Search\n'