In [31]:
import os
import openai
from dotenv import load_dotenv

_ = load_dotenv(dotenv_path="../.env") # read local .env file
openai.api_key  = os.environ['OPENAI_API_KEY']
openai.api_type = "azure"
openai.api_base = os.environ['OPENAI_API_BASE']
openai.api_version = os.environ['OPENAI_API_VERSION']

import sys
sys.path.append('../')
from json_module import load_docs_from_jsonl, save_docs_to_jsonl

from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, TextSplitter





### KB splitting

In [107]:
kbs = load_docs_from_jsonl("kbs_without_jap2.jsonl")

In [108]:
kbs[0]

Document(page_content='Description\r\nThe IP address of the Solr server gets cached on the .NET level. This behavior might lead to connection issues if the solr.search connection string value is a domain name instead of a static IP address. If the DNS mapping for the domain is changed to a new IP address, the changes are not picked up by Sitecore instances until the instances have been restarted.This is a common scenario when using SearchStax Disaster Recovery. This article provides recommendations on how to reestablish connection to Solr when the DNS is updated to point to a new IP address.\r\nSolution\r\nTo reestablish connection to Solr when the DNS is updated to point to a new IP address, consider one of the following options:\r\nFor\xa0Sitecore XP 10.2.1 and later, enable the feature using the configuration file \\App_Config\\Include\\Examples\\Sitecore.ContentSearch.Solr.EnableConnectionLeaseTimeout.config.example.\r\nFor\xa0Sitecore XP 10.2.0, download and install the latest cum

In [109]:
def split_kb():
    chunks = [[500,120], [300,90]]
    chunks_encoder = [200,150]

    for i in chunks:
        text_splitter = RecursiveCharacterTextSplitter(
            separators=[r"(\r\n).*(\r\n)", " ", ""],
            chunk_size = i[0],
            chunk_overlap  = i[1],
            length_function = len,
            is_separator_regex = True,
        )

        texts = text_splitter.split_documents(kbs)
        save_docs_to_jsonl(texts, "kb_recursive_split_chunk{chunksize}_chunkoverlap{chunkoverlap}.jsonl".format(chunksize=i[0], chunkoverlap=i[1]))

    for i in chunks_encoder:
        rec_text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size = i,
            chunk_overlap  = 0
        )

        rec_texts = rec_text_splitter.split_documents(kbs)
        save_docs_to_jsonl(rec_texts, "kb_recursive_split_chunk{chunksize}_tiktoken_encoder.jsonl".format(chunksize=i))
        

split_kb()



In [55]:
char_text_splitter = CharacterTextSplitter(
    separator="\r\n",
    chunk_size = 300,
    chunk_overlap  = 50,
    length_function = len,
    is_separator_regex= False,
)

char_texts = char_text_splitter.split_documents(kbs)


Created a chunk of size 412, which is longer than the specified 300
Created a chunk of size 1087, which is longer than the specified 300
Created a chunk of size 456, which is longer than the specified 300
Created a chunk of size 1565, which is longer than the specified 300
Created a chunk of size 301, which is longer than the specified 300
Created a chunk of size 369, which is longer than the specified 300
Created a chunk of size 707, which is longer than the specified 300
Created a chunk of size 325, which is longer than the specified 300
Created a chunk of size 307, which is longer than the specified 300
Created a chunk of size 310, which is longer than the specified 300
Created a chunk of size 839, which is longer than the specified 300
Created a chunk of size 365, which is longer than the specified 300
Created a chunk of size 463, which is longer than the specified 300
Created a chunk of size 407, which is longer than the specified 300
Created a chunk of size 409, which is longer t

In [56]:
char_texts[0:10]

[Document(page_content='Systems infrastructure', metadata={'source': 'https://support.sitecore.com/kb?id=kb_article_view&sysparm_article=KB1003178', 'loc': 'https://support.sitecore.com/kb?id=kb_article_view&sysparm_article=KB1003178', 'lastmod': '2023-10-17', 'title': 'Support Information - Sitecore Discover - 2023 Holiday Preparedness'}),
 Document(page_content='Sitecore Discover\xa0is designed to automatically scale capacity as the traffic demand increases. However, Sitecore Discover also preemptively ramps up the capacity of all services during this time of the year which includes Thanksgiving, Black Friday, Cyber Monday, Christmas, and New Year.', metadata={'source': 'https://support.sitecore.com/kb?id=kb_article_view&sysparm_article=KB1003178', 'loc': 'https://support.sitecore.com/kb?id=kb_article_view&sysparm_article=KB1003178', 'lastmod': '2023-10-17', 'title': 'Support Information - Sitecore Discover - 2023 Holiday Preparedness'}),
 Document(page_content="During Sitecore Disco

In [88]:
rec_text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size = 300,
    chunk_overlap  = 0
)

rec_texts = rec_text_splitter.split_documents(kbs)



In [89]:
rec_texts[5000:5020]

[Document(page_content='XPでは、データがフラッシュされていなくても、設定されたlog4netアペンダーごとに新しいログ ファイルが作成されるため、この挙動は仕様通りのものです。', metadata={'source': 'https://support.sitecore.com/kb?id=kb_article_view&sysparm_article=KB1002591', 'loc': 'https://support.sitecore.com/kb?id=kb_article_view&sysparm_article=KB1002591', 'lastmod': '2022-10-22', 'title': 'Known Issues - クリーンなSitecoreインスタンスを使用する際に、contactMergeDataUpgrade.logファイルが作成される'}),
 Document(page_content='このような挙動は無視しても問題はありませんが、空のログ ファイルを作成されないようにしたい場合、下記の解決策を参照してください。\r\n解決策\r\ncontactMergeDataUpgrade.logを生成しないようにするには、\\App_Config\\Sitecore\\Marketing.xDB\\Sitecore.Xdb.Processing.ContactMerge.configファイルを無効化（ファイル名を変更）してください。', metadata={'source': 'https://support.sitecore.com/kb?id=kb_article_view&sysparm_article=KB1002591', 'loc': 'https://support.sitecore.com/kb?id=kb_article_view&sysparm_article=KB1002591', 'lastmod': '2022-10-22', 'title': 'Known Issues - クリーンなSitecoreインスタンスを使用する際に、contactMergeDataUpgrade.logファイルが作成される'}),
 Document(page_content='説明\r\n異なるC

In [90]:
save_docs_to_jsonl(rec_texts, "recursive_split_chunk300_tiktoken_encoder.jsonl")

### Docs splitting

In [102]:
docs = load_docs_from_jsonl("docsites_without_jap2.jsonl")

In [103]:
def split_docs():
    chunks = [[500,120], [200, 50]]
    chunks_encoder = [300,100]

    for i in chunks:
        text_splitter = RecursiveCharacterTextSplitter(
            separators=[r"(\n).*(\n)", " ", ""],
            chunk_size = i[0],
            chunk_overlap  = i[1],
            length_function = len,
            is_separator_regex = True,
        )

        texts = text_splitter.split_documents(docs)
        save_docs_to_jsonl(texts, "docs_recursive_split_chunk{chunksize}_chunkoverlap{chunkoverlap}.jsonl".format(chunksize=i[0], chunkoverlap=i[1]))

    for i in chunks_encoder:
        rec_text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size = i,
            chunk_overlap  = 0
        )

        rec_texts = rec_text_splitter.split_documents(docs)
        save_docs_to_jsonl(rec_texts, "docs_recursive_split_chunk{chunksize}_tiktoken_encoder.jsonl".format(chunksize=i))
        

split_docs()



In [98]:
text_splitter = RecursiveCharacterTextSplitter(
            separators=[r"(\n).*(\n)", " ", ""],
            chunk_size = 300,
            chunk_overlap  = 70,
            length_function = len,
            is_separator_regex = True,
)

In [99]:
texts = text_splitter.split_documents(docs)

In [100]:
texts[0:20]

[Document(page_content='View all the Sitecore Cloud offerings to best target campaigns, scale your solution, and develop a disaster recovery plan that is custom-made for your organization and requirements.', metadata={'source': 'https://doc.sitecore.com/xp/en/developers/100/managed-cloud/sitecore-cloud-services-overview.html', 'loc': 'https://doc.sitecore.com/xp/en/developers/100/managed-cloud/sitecore-cloud-services-overview.html', 'lastmod': '2021-11-26', 'title': 'Sitecore Cloud Services overview'}),
 Document(page_content='This overview, and the following Sitecore Cloud Services overview diagram, takes you through all the Sitecore Cloud offerings, what their functionality is, what you can use them for, and how the different modules integrate with each other and the Sitecore Experience Platform (XP).\n\n\nNote', metadata={'source': 'https://doc.sitecore.com/xp/en/developers/100/managed-cloud/sitecore-cloud-services-overview.html', 'loc': 'https://doc.sitecore.com/xp/en/developers/10