#### Document splitting
- Splitting documents into smaller chunks to get semantically relevant chunks together.

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [2]:
chunk_size = 26
chunk_overlap = 4

In [3]:
r_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

c_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [4]:
text1 = 'abcdefghijklmnopqrstuvwxyz'

In [5]:
r_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [6]:
text2 = 'abcdefghijklmnopqrstuvwxyzabcdefg'

In [7]:
r_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

In [8]:
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"

In [9]:
r_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [10]:
c_splitter.split_text(text3)

['a b c d e f g h i j k l m n o p q r s t u v w x y z']

In [11]:
c_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, separator="")

In [12]:
c_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [13]:
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

In [14]:
len(some_text)

496

In [15]:
c_splitter = CharacterTextSplitter(chunk_size=150, chunk_overlap=0,separator="")

In [16]:
r_splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=0, separators=["\n\n", "\n", " ", ""])

In [17]:
c_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, cl",
 'osely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  Paragraphs are often delimited with a carriage',
 'return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
 'have a space.and words are separated by space.']

In [18]:
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [19]:
r_splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=0, separators=["\n\n", "\n", "(?<=\.)", " ", ""])

In [20]:
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

#### Splitting pdf data

In [21]:
from langchain.document_loaders import PyPDFLoader

In [22]:
loader = PyPDFLoader("./docs/MachineLearning-Lecture01.pdf")
pages = loader.load()

In [23]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=150, separator="\n", length_function=len)

In [24]:
len(pages)

22

In [25]:
pdf_docs = text_splitter.split_documents(pages)

In [26]:
len(pdf_docs)

77

#### Token splitting
- LLM have context windows that are designated by token count.

In [27]:
from langchain.text_splitter import TokenTextSplitter

In [28]:
token_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

In [29]:
text1 = "foo bar bazzyfoo"

In [30]:
token_splitter.split_text(text1)

['foo', ' bar', ' b', 'az', 'zy', 'foo']

In [31]:
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)

In [32]:
pdf_docs = token_splitter.split_documents(pages)

In [33]:
pdf_docs[0]

Document(page_content='Machine', metadata={'source': './docs/MachineLearning-Lecture01.pdf', 'page': 0})

In [34]:
pages[0].metadata

{'source': './docs/MachineLearning-Lecture01.pdf', 'page': 0}

#### Context aware splitting, adding info in metadata

In [35]:
from langchain.document_loaders import NotionDirectoryLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [36]:
markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n 
## Chapter 2\n\n \
Hi this is Molly"""

In [37]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

In [38]:
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [39]:
md_header_splits = markdown_splitter.split_text(markdown_document)

In [40]:
md_header_splits[0]

Document(page_content='Hi this is Jim  \nHi this is Joe', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'})

In [41]:
md_header_splits[1]

Document(page_content='Hi this is Lance', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'})

In [42]:
md_header_splits[2]

Document(page_content='Hi this is Molly', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 2'})