In [None]:
!pip install openai
!pip install langchain
!pip install -U langchain-community
!pip install pypdf

In [2]:
import os
import openai
import sys

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [4]:
chunk_size = 26
chunk_overlap = 4

In [5]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size = chunk_size,
    chunk_overlap = chunk_overlap
)

c_splitter = CharacterTextSplitter(
    chunk_size = chunk_size,
    chunk_overlap = chunk_overlap
)

In [6]:
text1 = "abcdefghijklmnopqrstuvwxyz"

In [7]:
# Cuz the chunk size is 26, and the length of text1 is 26
# so do not split anything
r_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [8]:
text2 = "abcdefghijklmnopqrstuvwxyzabcdefg"

In [9]:
# As the chunk overlap is 4, so the last 4 characters will move to the next chunk
r_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

In [10]:
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"

In [11]:
r_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [12]:
c_splitter.split_text(text3)

['a b c d e f g h i j k l m n o p q r s t u v w x y z']

In [16]:
c_splitter = CharacterTextSplitter(
    chunk_size = chunk_size,
    chunk_overlap = chunk_overlap,
    separator = " "
)
c_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [17]:
# RecursiveCharacterTextSplitter is recommended for generic text.
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

In [18]:
len(some_text)

496

In [25]:
c_splitter = CharacterTextSplitter(
    chunk_size = 450,
    chunk_overlap = 0,
    separator = " "
)

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 450,
    chunk_overlap = 0,
    separators = ["\n\n", "\n", " ", ""]
)

In [26]:
c_splitter.split_text(some_text)

['When writing documents, writers will use document structure to group content. This can convey to the reader, which idea\'s are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
 'have a space.and words are separated by space.']

In [27]:
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.",
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [30]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "\. ", " ", ""]
)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [31]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [34]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("/content/Growth Stock List_August 2024.pdf")
pages = loader.load()

In [35]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 100,
    chunk_overlap = 15,
    length_function = len
)

In [36]:
docs = text_splitter.split_documents(pages)



In [37]:
len(docs)

241

In [38]:
len(pages)

8

In [40]:
docs[1]

Document(metadata={'source': '/content/Growth Stock List_August 2024.pdf', 'page': 0}, page_content='30 Aug 1198 MAA 4 0.21 -15.15 0.00 -1.39 -22.4% +209.3% +260.2% True True')

### Token splitting

In [None]:
pip install tiktoken

In [41]:
from langchain.text_splitter import TokenTextSplitter

In [44]:
text_splitter = TokenTextSplitter(chunk_size = 1, chunk_overlap = 0)

In [45]:
text1 = "foo bar bazzyfoo"

In [46]:
text_splitter.split_text(text1)

['foo', ' bar', ' b', 'az', 'zy', 'foo']

In [50]:
text_splitter = TokenTextSplitter(chunk_size = 20, chunk_overlap = 0)

In [51]:
docs = text_splitter.split_documents(pages)

In [52]:
docs[0]

Document(metadata={'source': '/content/Growth Stock List_August 2024.pdf', 'page': 0}, page_content='Announced Date Symbol Short Name Quarter PB PE DY ROE Revenue QoQ Net Profit Q')

In [53]:
pages[0].metadata

{'source': '/content/Growth Stock List_August 2024.pdf', 'page': 0}

### Context aware splitting

In [54]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [57]:
markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n
## Chapter 2\n\n \
Hi this is Molly"""

print(markdown_document)

# Title

 ## Chapter 1

 Hi this is Jim

 Hi this is Joe

 ### Section 

 Hi this is Lance 

 
## Chapter 2

 Hi this is Molly


In [58]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

In [60]:
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on = headers_to_split_on
)

md_header_splits = markdown_splitter.split_text(markdown_document)

In [61]:
md_header_splits[0]

Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'}, page_content='Hi this is Jim  \nHi this is Joe')

In [62]:
md_header_splits[1]

Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'}, page_content='Hi this is Lance')