# Loading Environment Variable

In [1]:
# Read the Hugging Face Hub token from the local `secret_key` module and
# expose it to this process through the HUGGINGFACEHUB_API_TOKEN environment
# variable, so the token itself never appears in the notebook.
# NOTE(review): no cell visible in this notebook reads the variable — confirm it is still needed.
from secret_key import hugging_facehub_key
import os
os.environ['HUGGINGFACEHUB_API_TOKEN'] = hugging_facehub_key

In [2]:
from langchain.text_splitter import CharacterTextSplitter

In [3]:
# Small chunk size and overlap used by the CharacterTextSplitter demo below
chunk_size, chunk_overlap = 26, 4

In [4]:
# Initialize the CharacterTextSplitter
# It is given a single separator (one space) together with the
# chunk_size / chunk_overlap values set in the previous cell.
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separator=' '  # split points are spaces only; NOTE(review): the demo text below contains none, which is presumably why the output is a single chunk
)

In [5]:
# Define the text: the 26-letter alphabet followed by 'abcdefg' (33 characters, no spaces)
text = 'abcdefghijklmnopqrstuvwxyzabcdefg'

In [6]:
# Split the text using the CharacterTextSplitter
chunks = c_splitter.split_text(text)

In [7]:
print(chunks)

['abcdefghijklmnopqrstuvwxyzabcdefg']


# Recursive splitting details

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, NotionDirectoryLoader

In [9]:
# Sample text for the recursive splitter demo.
# Each trailing backslash joins the next source line, so the only line breaks
# in the resulting string are the explicit "\n\n" after "Paragraphs form a document.".
# The line ending in "have a space.\" has no space before the backslash, so
# "space." runs straight into "and words" in the final string.
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

In [10]:
#some_text = """Tamarai selvan"""

In [11]:
# Chunk size and overlap for the recursive-splitter demo.
# These rebind the names first set for the CharacterTextSplitter demo.
chunk_size, chunk_overlap = 450, 0

In [12]:
# Initialize the RecursiveCharacterTextSplitter with appropriate separators
# The separators are listed from coarsest to finest: paragraph break, line
# break, space, and finally the empty string. Per the LangChain documentation
# the splitter tries them in this order until the pieces fit within chunk_size.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", " ", ""]
)

In [13]:
# Split the text using RecursiveCharacterTextSplitter
chunks = r_splitter.split_text(some_text)

In [14]:
# Optionally, you can define a CharacterTextSplitter for simpler text splitting
# NOTE: this rebinds `c_splitter`, replacing the space-separated splitter
# created earlier in the notebook.
c_splitter = CharacterTextSplitter(
    separator="\n",  # split on single newlines
    chunk_size=1000,  # Define your chunk size
    chunk_overlap=150,  # Define your overlap
)

In [15]:
# Load documents from NotionDirectoryLoader
loader = NotionDirectoryLoader("notion")

In [16]:
# Load the documents
pages = loader.load()

In [17]:
# Split the documents using the defined text splitter
docs = c_splitter.split_documents(pages)

In [18]:
# Count the loaded pages and the chunks produced from them
length_function = len
page_length, docs_length = length_function(pages), length_function(docs)

# Token splitting

In [19]:
 # ! pip install tiktoken

In [20]:
from langchain.text_splitter import TokenTextSplitter

In [21]:
# Define your text
text1 = "foo bar bazzyfoo"

In [22]:
# Initialize TokenTextSplitter with desired parameters
text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

In [23]:
# Split the text
chunks = text_splitter.split_text(text1)

In [24]:
# Access the first document
# NOTE: `docs` was produced earlier by c_splitter.split_documents(pages); the
# token-split result of the previous cell is stored in `chunks`, not `docs`.
print(docs[0])

page_content="# Blendle's Employee Handbook (1)\nThis is a living document with everything we've learned working with people while running a startup. And, of course, we continue to learn. Therefore it's a document that will continue to change. \n**Everything related to working at Blendle and the people of Blendle, made public.**\nThese are the lessons from three years of working with the people of Blendle. It contains everything from [how our leaders lead](https://www.notion.so/ecfb7e647136468a9a0a32f1771a8f52?pvs=21) to [how we increase salaries](https://www.notion.so/Salary-Review-e11b6161c6d34f5c9568bb3e83ed96b6?pvs=21), from [how we hire](https://www.notion.so/Hiring-451bbcfe8d9b49438c0633326bb7af0a?pvs=21) and [fire](https://www.notion.so/Firing-5567687a2000496b8412e53cd58eed9d?pvs=21) to [how we think people should give each other feedback](https://www.notion.so/Our-Feedback-Process-eb64f1de796b4350aeab3bc068e3801f?pvs=21) — and much more." metadata={'source': "notion\\Blendle's 

In [25]:
# Print the metadata of the first page (index 0) of `pages`, which was loaded earlier via loader.load()
print(pages[0].metadata)

{'source': "notion\\Blendle's Employee Handbook (1) be259e603b0e436590c70bfa9482e4f7.md"}


# Context aware splitting

In [26]:
from langchain.document_loaders import NotionDirectoryLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [27]:
def split_markdown_document(markdown_document, headers_to_split_on):
    """Split a Markdown string using a header-based splitter.

    Builds a ``MarkdownHeaderTextSplitter`` configured with
    ``headers_to_split_on`` and returns the result of calling its
    ``split_text`` method on ``markdown_document``.
    """
    return MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on
    ).split_text(markdown_document)

In [28]:
# Example usage:
# Sample Markdown with one H1 title, two H2 chapters and one H3 section.
# A trailing backslash joins the next source line; the "vijay" line has none,
# so a literal newline follows it in the string.
markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is sam\n\n  Hi this is ajith\n\n \
### Section \n\n \
Hi this is vijay \n\n 
## Chapter 2\n\n \
Hi this is Molly"""

In [29]:
# One (marker, metadata key) pair per heading level:
# "#" -> "Header 1", "##" -> "Header 2", "###" -> "Header 3"
headers_to_split_on = [("#" * level, f"Header {level}") for level in range(1, 4)]

In [30]:
md_header_splits = split_markdown_document(markdown_document, headers_to_split_on)

In [31]:
for split in md_header_splits:
    print(split)

page_content='Hi this is sam  \nHi this is ajith' metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'}
page_content='Hi this is vijay' metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'}
page_content='Hi this is Molly' metadata={'Header 1': 'Title', 'Header 2': 'Chapter 2'}
