## 1. Character Text Splitter

In [1]:
from langchain_community.document_loaders import TextLoader
loader = TextLoader('mytext.txt')
docs = loader.load()
print(docs)

[Document(metadata={'source': 'mytext.txt'}, page_content='\nIt will be all the easier for us to conduct ourselves as belligerents in a high spirit of \nright and fairness because we act without animus, not in enmity toward a people or with \nthe desire to bring any injury or disadvantage upon them, but only in armed opposition to \nan irresponsible government which has thrown aside all considerations of humanity and of \nright and is running amuck. We are, let me say again, the sincere friends of the German \npeople, and shall desire nothing so much as the early reestablishment of intimate \nrelations of mutual advantage between us—however hard it may be for them, for the time \nbeing, to believe that this is spoken from our hearts.\n\n\n because of that friendship—exercising a patience and forbearance which would otherwise have been impossible. We shall, happily, still have an opportunity to prove that friendship in our daily attitude and actions toward the millions of men and women 

In [2]:
from langchain_text_splitters import CharacterTextSplitter
text_splitter = CharacterTextSplitter(separator='\n\n',chunk_size=100,chunk_overlap=20)
text_chunks = text_splitter.split_documents(docs)
print(text_chunks)

Created a chunk of size 676, which is longer than the specified 100
Created a chunk of size 908, which is longer than the specified 100
Created a chunk of size 789, which is longer than the specified 100


[Document(metadata={'source': 'mytext.txt'}, page_content='It will be all the easier for us to conduct ourselves as belligerents in a high spirit of \nright and fairness because we act without animus, not in enmity toward a people or with \nthe desire to bring any injury or disadvantage upon them, but only in armed opposition to \nan irresponsible government which has thrown aside all considerations of humanity and of \nright and is running amuck. We are, let me say again, the sincere friends of the German \npeople, and shall desire nothing so much as the early reestablishment of intimate \nrelations of mutual advantage between us—however hard it may be for them, for the time \nbeing, to believe that this is spoken from our hearts.'), Document(metadata={'source': 'mytext.txt'}, page_content='because of that friendship—exercising a patience and forbearance which would otherwise have been impossible. We shall, happily, still have an opportunity to prove that friendship in our daily attit

In [3]:
texts = [doc.page_content for doc in text_chunks]
print(texts)

['It will be all the easier for us to conduct ourselves as belligerents in a high spirit of \nright and fairness because we act without animus, not in enmity toward a people or with \nthe desire to bring any injury or disadvantage upon them, but only in armed opposition to \nan irresponsible government which has thrown aside all considerations of humanity and of \nright and is running amuck. We are, let me say again, the sincere friends of the German \npeople, and shall desire nothing so much as the early reestablishment of intimate \nrelations of mutual advantage between us—however hard it may be for them, for the time \nbeing, to believe that this is spoken from our hearts.', 'because of that friendship—exercising a patience and forbearance which would otherwise have been impossible. We shall, happily, still have an opportunity to prove that friendship in our daily attitude and actions toward the millions of men and women of German birth and native sympathy who live among us and shar

In [4]:
def create_text_chunks(texts, chunk_size=100, chunk_overlap=20):
    chunks = []
    for text in texts:
        start = 0
        
        while start < len(text):
            end = min(start + chunk_size, len(text))
            chunk = text[start:end]
            chunks.append(chunk)
            start += chunk_size - chunk_overlap  # Move forward with overlap
            
    return chunks

In [5]:
final_chunks = create_text_chunks(texts)

In [6]:
final_chunks[0]

'It will be all the easier for us to conduct ourselves as belligerents in a high spirit of \nright and'

## 2. Recursive Character Text Splitter

In [7]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader('my.pdf')
docs = loader.load()
docs

[Document(metadata={'source': 'my.pdf', 'page': 0}, page_content='Openai Pr oject s:\n1\nOpenai Projects:\nHereʼs a breakdown of each line in your code:\nimport logging\nImports the logging module for handling log messages, useful for debugging \nand tracking events during program execution.\nfrom aiogram import Bot, Dispatcher, executor, types\nImports key components from the aiogram library, including:\nBot for interacting with the Telegram Bot API.\nDispatcher for managing bot event handlers.\nexecutor for running the bot.\ntypes for defining message and event types.\nfrom dotenv import load_dotenv\nImports load_dotenv from dotenv to load environment variables from a .env file.\nimport os'),
 Document(metadata={'source': 'my.pdf', 'page': 1}, page_content='Openai Pr oject s:\n2\nImports the os module for interacting with the operating system, particularly \nto access environment variables.\nload_dotenv()\nLoads environment variables from a .env file into the scriptʼs environment.\np

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500,chunk_overlap=50)
final_documents = text_splitter.split_documents(docs)
final_documents

[Document(metadata={'source': 'my.pdf', 'page': 0}, page_content='Openai Pr oject s:\n1\nOpenai Projects:\nHereʼs a breakdown of each line in your code:\nimport logging\nImports the logging module for handling log messages, useful for debugging \nand tracking events during program execution.\nfrom aiogram import Bot, Dispatcher, executor, types\nImports key components from the aiogram library, including:\nBot for interacting with the Telegram Bot API.\nDispatcher for managing bot event handlers.\nexecutor for running the bot.\ntypes for defining message and event types.'),
 Document(metadata={'source': 'my.pdf', 'page': 0}, page_content='types for defining message and event types.\nfrom dotenv import load_dotenv\nImports load_dotenv from dotenv to load environment variables from a .env file.\nimport os'),
 Document(metadata={'source': 'my.pdf', 'page': 1}, page_content='Openai Pr oject s:\n2\nImports the os module for interacting with the operating system, particularly \nto access envi

In [9]:
text = [doc.page_content for doc in final_documents]

In [10]:
text[0]

'Openai Pr oject s:\n1\nOpenai Projects:\nHereʼs a breakdown of each line in your code:\nimport logging\nImports the logging module for handling log messages, useful for debugging \nand tracking events during program execution.\nfrom aiogram import Bot, Dispatcher, executor, types\nImports key components from the aiogram library, including:\nBot for interacting with the Telegram Bot API.\nDispatcher for managing bot event handlers.\nexecutor for running the bot.\ntypes for defining message and event types.'

In [11]:
text

['Openai Pr oject s:\n1\nOpenai Projects:\nHereʼs a breakdown of each line in your code:\nimport logging\nImports the logging module for handling log messages, useful for debugging \nand tracking events during program execution.\nfrom aiogram import Bot, Dispatcher, executor, types\nImports key components from the aiogram library, including:\nBot for interacting with the Telegram Bot API.\nDispatcher for managing bot event handlers.\nexecutor for running the bot.\ntypes for defining message and event types.',
 'types for defining message and event types.\nfrom dotenv import load_dotenv\nImports load_dotenv from dotenv to load environment variables from a .env file.\nimport os',
 'Openai Pr oject s:\n2\nImports the os module for interacting with the operating system, particularly \nto access environment variables.\nload_dotenv()\nLoads environment variables from a .env file into the scriptʼs environment.\npython\nCopy code\nTELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN")\nRetrieves

In [12]:
def create_text_chunks(texts, chunk_size=100, chunk_overlap=20):
    chunks = []
    for text in texts:
        start = 0
        
        while start < len(text):
            end = min(start + chunk_size, len(text))
            chunk = text[start:end]
            chunks.append(chunk)
            start += chunk_size - chunk_overlap  # Move forward with overlap
            
    return chunks

In [13]:
final_chunks = create_text_chunks(texts)
text[0]

'Openai Pr oject s:\n1\nOpenai Projects:\nHereʼs a breakdown of each line in your code:\nimport logging\nImports the logging module for handling log messages, useful for debugging \nand tracking events during program execution.\nfrom aiogram import Bot, Dispatcher, executor, types\nImports key components from the aiogram library, including:\nBot for interacting with the Telegram Bot API.\nDispatcher for managing bot event handlers.\nexecutor for running the bot.\ntypes for defining message and event types.'

In [14]:
final_chunks[3]

't only in armed opposition to \nan irresponsible government which has thrown aside all considerations'

## 3. Recursive JSON Splitter

In [15]:
import json
import requests
from langchain_text_splitters import RecursiveJsonSplitter

In [17]:
response = requests.get("https://api.smith.langchain.com/openapi.json")
json_data = response.json()
json_data

{'openapi': '3.1.0',
 'info': {'title': 'LangSmith', 'version': '0.1.0'},
 'paths': {'/api/v1/sessions/{session_id}': {'get': {'tags': ['tracer-sessions'],
    'summary': 'Read Tracer Session',
    'description': 'Get a specific session.',
    'operationId': 'read_tracer_session_api_v1_sessions__session_id__get',
    'security': [{'API Key': []}, {'Tenant ID': []}, {'Bearer Auth': []}],
    'parameters': [{'name': 'session_id',
      'in': 'path',
      'required': True,
      'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}},
     {'name': 'include_stats',
      'in': 'query',
      'required': False,
      'schema': {'type': 'boolean',
       'default': False,
       'title': 'Include Stats'}},
     {'name': 'accept',
      'in': 'header',
      'required': False,
      'schema': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
       'title': 'Accept'}}],
    'responses': {'200': {'description': 'Successful Response',
      'content': {'application/json': {'sch

In [18]:
with open("new.json","w") as json_file:
    json.dump(json_data,json_file,indent=2)

In [19]:
with open("new.json","r") as json_file:
    loaded_data = json.load(json_file)

In [20]:
loaded_data

{'openapi': '3.1.0',
 'info': {'title': 'LangSmith', 'version': '0.1.0'},
 'paths': {'/api/v1/sessions/{session_id}': {'get': {'tags': ['tracer-sessions'],
    'summary': 'Read Tracer Session',
    'description': 'Get a specific session.',
    'operationId': 'read_tracer_session_api_v1_sessions__session_id__get',
    'security': [{'API Key': []}, {'Tenant ID': []}, {'Bearer Auth': []}],
    'parameters': [{'name': 'session_id',
      'in': 'path',
      'required': True,
      'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}},
     {'name': 'include_stats',
      'in': 'query',
      'required': False,
      'schema': {'type': 'boolean',
       'default': False,
       'title': 'Include Stats'}},
     {'name': 'accept',
      'in': 'header',
      'required': False,
      'schema': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
       'title': 'Accept'}}],
    'responses': {'200': {'description': 'Successful Response',
      'content': {'application/json': {'sch

In [22]:
json_splitter = RecursiveJsonSplitter(max_chunk_size=300)
json_chunks = json_splitter.split_json(loaded_data)
json_chunks

[{'openapi': '3.1.0',
  'info': {'title': 'LangSmith', 'version': '0.1.0'},
  'paths': {'/api/v1/sessions/{session_id}': {'get': {'tags': ['tracer-sessions'],
     'summary': 'Read Tracer Session',
     'description': 'Get a specific session.'}}}},
 {'paths': {'/api/v1/sessions/{session_id}': {'get': {'operationId': 'read_tracer_session_api_v1_sessions__session_id__get',
     'security': [{'API Key': []}, {'Tenant ID': []}, {'Bearer Auth': []}]}}}},
 {'paths': {'/api/v1/sessions/{session_id}': {'get': {'parameters': [{'name': 'session_id',
       'in': 'path',
       'required': True,
       'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}},
      {'name': 'include_stats',
       'in': 'query',
       'required': False,
       'schema': {'type': 'boolean',
        'default': False,
        'title': 'Include Stats'}},
      {'name': 'accept',
       'in': 'header',
       'required': False,
       'schema': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
        '

In [23]:
for i, chunk in enumerate(json_chunks):
    print(f"Chunk {i+1}:\n{json.dumps(chunk,indent=2)}\n")

Chunk 1:
{
  "openapi": "3.1.0",
  "info": {
    "title": "LangSmith",
    "version": "0.1.0"
  },
  "paths": {
    "/api/v1/sessions/{session_id}": {
      "get": {
        "tags": [
          "tracer-sessions"
        ],
        "summary": "Read Tracer Session",
        "description": "Get a specific session."
      }
    }
  }
}

Chunk 2:
{
  "paths": {
    "/api/v1/sessions/{session_id}": {
      "get": {
        "operationId": "read_tracer_session_api_v1_sessions__session_id__get",
        "security": [
          {
            "API Key": []
          },
          {
            "Tenant ID": []
          },
          {
            "Bearer Auth": []
          }
        ]
      }
    }
  }
}

Chunk 3:
{
  "paths": {
    "/api/v1/sessions/{session_id}": {
      "get": {
        "parameters": [
          {
            "name": "session_id",
            "in": "path",
            "required": true,
            "schema": {
              "type": "string",
              "format": "uuid",
     

In [24]:
print("The End")

The End
