In [5]:
# How recursively split text by characters from langchain_community.document_loaders import WikipediaLoader
from langchain.retrievers import WikipediaRetriever
from langchain_community.document_loaders import WikipediaLoader
loader = WikipediaLoader(query="Machine Learning")
documents = loader.load()
documents

[Document(metadata={'title': 'Machine learning', 'summary': 'Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions. Quick progress in the field of deep learning, beginning in 2010s, allowed neural networks to surpass many previous approaches in performance.\nML finds application in many fields, including natural language processing, computer vision, speech recognition, email filtering, agriculture, and medicine. The application of ML to business problems is known as predictive analytics.\nStatistics and mathematical optimization (mathematical programming) methods comprise the foundations of machine learning. Data mining is a related field of study, focusing on exploratory data analysis (EDA) via unsupervised learning. \nFrom a theoretical viewpoint, probably approximately correct (PAC) learning pro

In [7]:
from langchain_community.document_loaders import ArxivLoader
docs=ArxivLoader(query="2111.07139",load_max_docs=4).load()
docs 

[Document(metadata={'Published': '2021-11-13', 'Title': 'Full-attention based Neural Architecture Search using Context Auto-regression', 'Authors': 'Yuan Zhou, Haiyang Wang, Shuwei Huo, Boyu Wang', 'Summary': 'Self-attention architectures have emerged as a recent advancement for\nimproving the performance of vision tasks. Manual determination of the\narchitecture for self-attention networks relies on the experience of experts\nand cannot automatically adapt to various scenarios. Meanwhile, neural\narchitecture search (NAS) has significantly advanced the automatic design of\nneural architectures. Thus, it is appropriate to consider using NAS methods to\ndiscover a better self-attention architecture automatically. However, it is\nchallenging to directly use existing NAS methods to search attention networks\nbecause of the uniform cell-based search space and the lack of long-term\ncontent dependencies. To address this issue, we propose a full-attention based\nNAS method. More specifically

In [36]:
from pathlib import Path
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Get the current working directory
script_dir = Path.cwd()

# Construct the relative path to the file
file_path = script_dir.parent / 'Data_Ingestion' / 'speech.txt'

# Open the file using the relative path
with file_path.open() as file:
    speech = file.read()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=30)
text_document = text_splitter.create_documents([speech])
print(text_document)

[Document(metadata={}, page_content='Not to be confused with Artificial general intelligence. This page focuses on statistical machine'), Document(metadata={}, page_content='on statistical machine learning AI. For other topics, see Algorithmic composition, Algorithm art,'), Document(metadata={}, page_content='composition, Algorithm art, Generative art, Procedural generation.'), Document(metadata={}, page_content='Impressionistic image of figures in a futuristic opera scene'), Document(metadata={}, page_content="ThÃ©Ã¢tre D'opÃ©ra Spatial, an image made using generative artificial intelligence"), Document(metadata={}, page_content='Part of a series on\nArtificial intelligence'), Document(metadata={}, page_content='Major goals\nApproaches\nApplications\nPhilosophy\nHistory\nGlossary\nvte'), Document(metadata={}, page_content='Generative artificial intelligence (generative AI, GenAI,[1] or GAI) is artificial intelligence'), Document(metadata={}, page_content='is artificial intelligence ca

In [37]:
print(text_document[0])
print(text_document[1])

page_content='Not to be confused with Artificial general intelligence. This page focuses on statistical machine'
page_content='on statistical machine learning AI. For other topics, see Algorithmic composition, Algorithm art,'


In [42]:
from langchain_text_splitters import CharacterTextSplitter
char_splitter=CharacterTextSplitter(separator=" ",chunk_size=500,chunk_overlap=100)
char_splitter.split_documents(text_document)

[Document(metadata={}, page_content='Not to be confused with Artificial general intelligence. This page focuses on statistical machine'),
 Document(metadata={}, page_content='on statistical machine learning AI. For other topics, see Algorithmic composition, Algorithm art,'),
 Document(metadata={}, page_content='composition, Algorithm art, Generative art, Procedural generation.'),
 Document(metadata={}, page_content='Impressionistic image of figures in a futuristic opera scene'),
 Document(metadata={}, page_content="ThÃ©Ã¢tre D'opÃ©ra Spatial, an image made using generative artificial intelligence"),
 Document(metadata={}, page_content='Part of a series on\nArtificial intelligence'),
 Document(metadata={}, page_content='Major goals\nApproaches\nApplications\nPhilosophy\nHistory\nGlossary\nvte'),
 Document(metadata={}, page_content='Generative artificial intelligence (generative AI, GenAI,[1] or GAI) is artificial intelligence'),
 Document(metadata={}, page_content='is artificial intelli

In [6]:
from pathlib import Path
from langchain_text_splitters import CharacterTextSplitter

# Get the current working directory
script_dir = Path.cwd()

# Construct the relative path to the file
file_path = script_dir.parent / 'Data_Ingestion' / 'speech.txt'

# Open the file using the relative path
with file_path.open() as file:
    speech = file.read()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
text_document = text_splitter.create_documents([speech])
print(text_document[0])
print(text_document[1])

Created a chunk of size 1208, which is longer than the specified 1000


page_content='Not to be confused with Artificial general intelligence. This page focuses on statistical machine learning AI. For other topics, see Algorithmic composition, Algorithm art, Generative art, Procedural generation.
Impressionistic image of figures in a futuristic opera scene
ThÃ©Ã¢tre D'opÃ©ra Spatial, an image made using generative artificial intelligence
Part of a series on
Artificial intelligence

Major goals
Approaches
Applications
Philosophy
History
Glossary
vte
Generative artificial intelligence (generative AI, GenAI,[1] or GAI) is artificial intelligence capable of generating text, images, videos, or other data using generative models,[2] often in response to prompts.[3][4] Generative AI models learn the patterns and structure of their input training data and then generate new data that has similar characteristics.[5][6]'
page_content='Improvements in transformer-based deep neural networks, particularly large language models (LLMs), enabled an AI boom of generative AI

# How to split by HTMLHeaderTextLoader 
* The HTMLHeaderTextSplitter in LangChain is a specialized tool designed to split HTML documents based on specified headers. This splitter is particularly useful for processing structured HTML content while preserving the context and metadata associated with different sections of the document.

* Key Features
1. Header-Based Splitting: It splits the HTML content at specified header tags"(e.g.,h1,h2,etc.)".
2. Metadata Preservation: It adds metadata for each header relevant to a given chunk, which helps in maintaining the context.
3. Flexible Configuration: You can specify which headers to split on and whether to return each element with its associated headers.

In [8]:
from langchain_text_splitters import HTMLHeaderTextSplitter
html_code ="""<!DOCTYPE html>
<html>
<head>
  <title>My Enhanced Website</title>
</head>
<body>
  <h1>Welcome to My Enhanced Website</h1>
  <nav>
    <ul>
      <li><a href="#about">About Us</a></li>
      <li><a href="#services">Services</a></li>
      <li><a href="#contact">Contact</a></li>
    </ul>
  </nav>
  <section id="about">   

    <h2>About Us</h2>
    <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit.</p>   

  </section>
  <section id="services">
    <h2>Our Services</h2>
    <ul>
      <li>Web Development</li>
      <li>Graphic Design</li>
      <li>Digital Marketing</li>
    </ul>
  </section>
  <section id="contact">
    <h2>Contact Us</h2>
    <p>Email:   
 info@example.com</p>
    <p>Phone: +1234567890</p>
  </section>
</body>
</html> """

headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on)
html_header_splits = html_splitter.split_text(html_code)
html_header_splits

[Document(metadata={'Header 1': 'Welcome to My Enhanced Website'}, page_content='About Us Services Contact'),
 Document(metadata={'Header 1': 'Welcome to My Enhanced Website', 'Header 2': 'About Us'}, page_content='Lorem ipsum dolor sit amet, consectetur adipiscing elit.'),
 Document(metadata={'Header 1': 'Welcome to My Enhanced Website', 'Header 2': 'Our Services'}, page_content='Web Development Graphic Design Digital Marketing'),
 Document(metadata={'Header 1': 'Welcome to My Enhanced Website', 'Header 2': 'Contact Us'}, page_content='Email: \xa0 info@example.com  \nPhone: +1234567890')]

In [13]:
url="https://plato.stanford.edu/entries/goedel/"
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("div-id", "navigation")
]
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on)
html_header_splits = html_splitter.split_text_from_url(url)
html_header_splits

[Document(metadata={}, page_content="Stanford Encyclopedia of Philosophy  \nMenu  \nBrowse About Support SEP  \nTable of Contents What's New Random Entry Chronological Archives  \nEditorial Information About the SEP Editorial Board How to Cite the SEP Special Characters Advanced Tools Contact  \nSupport the SEP PDFs for SEP Friends Make a Donation SEPIA for Libraries  \nEntry Navigation  \nEntry Contents Bibliography Academic Tools Friends PDF Preview Author and Citation Info Back to Top  \nKurt Gödel"),
 Document(metadata={'Header 1': 'Kurt Gödel'}, page_content='First published Tue Feb 13, 2007; substantive revision Fri Dec 11, 2015  \nKurt Friedrich Gödel (b. 1906, d. 1978) was one of the principal founders of the modern, metamathematical era in mathematical logic. He is widely known for his Incompleteness Theorems, which are among the handful of landmark theorems in twentieth century mathematics, but his work touched every field of mathematical logic, if it was not in most cases th

# How to split JSON data 
- In LangChain, the concept of a JSON splitter revolves around dividing large JSON data structures into smaller, more manageable pieces for processing. Here’s the theory in a nutshell:

1. Why Split JSON Data?
- JSON splitters are used to:
* Improve processing efficiency.
* Simplify data handling.
* Enable parallel processing of data chunks.
* Avoid memory overloads with large datasets.

2. Splitting Strategies
- Different strategies can be employed, such as:
* By Key: Splitting the JSON data based on specific keys or sections.
* By Size: Dividing data into chunks of a specified size.
* By Type: Splitting based on the type or structure of data elements (e.g., arrays vs. objects).
3. Implementation in LangChain
- LangChain can integrate a JSON splitter by:
- Reading the JSON data.
* Applying a splitting function to divide the data into smaller pieces.
* Processing each piece individually using LangChain's capabilities.

In [5]:
import json
import requests
json_data = requests.get("https://api.smith.langchain.com/openapi.json").json()
json_data


{'openapi': '3.1.0',
 'info': {'title': 'LangSmith', 'version': '0.1.0'},
 'paths': {'/api/v1/sessions/{session_id}': {'get': {'tags': ['tracer-sessions'],
    'summary': 'Read Tracer Session',
    'description': 'Get a specific session.',
    'operationId': 'read_tracer_session_api_v1_sessions__session_id__get',
    'security': [{'API Key': []}, {'Tenant ID': []}, {'Bearer Auth': []}],
    'parameters': [{'name': 'session_id',
      'in': 'path',
      'required': True,
      'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}},
     {'name': 'include_stats',
      'in': 'query',
      'required': False,
      'schema': {'type': 'boolean',
       'default': False,
       'title': 'Include Stats'}},
     {'name': 'accept',
      'in': 'header',
      'required': False,
      'schema': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
       'title': 'Accept'}}],
    'responses': {'200': {'description': 'Successful Response',
      'content': {'application/json': {'sch

In [6]:
from langchain_text_splitters import RecursiveJsonSplitter
splitter = RecursiveJsonSplitter(max_chunk_size=300)
# Recursively split json data - If you need to access/manipulate the smaller json chunks
json_chunks = splitter.split_json(json_data=json_data)

In [7]:
json_chunks

[{'openapi': '3.1.0',
  'info': {'title': 'LangSmith', 'version': '0.1.0'},
  'paths': {'/api/v1/sessions/{session_id}': {'get': {'tags': ['tracer-sessions'],
     'summary': 'Read Tracer Session',
     'description': 'Get a specific session.'}}}},
 {'paths': {'/api/v1/sessions/{session_id}': {'get': {'operationId': 'read_tracer_session_api_v1_sessions__session_id__get',
     'security': [{'API Key': []}, {'Tenant ID': []}, {'Bearer Auth': []}]}}}},
 {'paths': {'/api/v1/sessions/{session_id}': {'get': {'parameters': [{'name': 'session_id',
       'in': 'path',
       'required': True,
       'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}},
      {'name': 'include_stats',
       'in': 'query',
       'required': False,
       'schema': {'type': 'boolean',
        'default': False,
        'title': 'Include Stats'}},
      {'name': 'accept',
       'in': 'header',
       'required': False,
       'schema': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
        '

In [8]:
#printing few records 
for chunk in json_chunks[:3]:
    print(chunk)

{'openapi': '3.1.0', 'info': {'title': 'LangSmith', 'version': '0.1.0'}, 'paths': {'/api/v1/sessions/{session_id}': {'get': {'tags': ['tracer-sessions'], 'summary': 'Read Tracer Session', 'description': 'Get a specific session.'}}}}
{'paths': {'/api/v1/sessions/{session_id}': {'get': {'operationId': 'read_tracer_session_api_v1_sessions__session_id__get', 'security': [{'API Key': []}, {'Tenant ID': []}, {'Bearer Auth': []}]}}}}
{'paths': {'/api/v1/sessions/{session_id}': {'get': {'parameters': [{'name': 'session_id', 'in': 'path', 'required': True, 'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}}, {'name': 'include_stats', 'in': 'query', 'required': False, 'schema': {'type': 'boolean', 'default': False, 'title': 'Include Stats'}}, {'name': 'accept', 'in': 'header', 'required': False, 'schema': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'title': 'Accept'}}]}}}}


In [9]:
# The splitter can also output documents
docs = splitter.create_documents(texts=[json_data])

for doc in docs[:3]:
    print(doc)

page_content='{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}, "paths": {"/api/v1/sessions/{session_id}": {"get": {"tags": ["tracer-sessions"], "summary": "Read Tracer Session", "description": "Get a specific session."}}}}'
page_content='{"paths": {"/api/v1/sessions/{session_id}": {"get": {"operationId": "read_tracer_session_api_v1_sessions__session_id__get", "security": [{"API Key": []}, {"Tenant ID": []}, {"Bearer Auth": []}]}}}}'
page_content='{"paths": {"/api/v1/sessions/{session_id}": {"get": {"parameters": [{"name": "session_id", "in": "path", "required": true, "schema": {"type": "string", "format": "uuid", "title": "Session Id"}}, {"name": "include_stats", "in": "query", "required": false, "schema": {"type": "boolean", "default": false, "title": "Include Stats"}}, {"name": "accept", "in": "header", "required": false, "schema": {"anyOf": [{"type": "string"}, {"type": "null"}], "title": "Accept"}}]}}}}'


In [10]:
texts = splitter.split_text(json_data=json_data)

print(texts[0])
print(texts[1])

{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}, "paths": {"/api/v1/sessions/{session_id}": {"get": {"tags": ["tracer-sessions"], "summary": "Read Tracer Session", "description": "Get a specific session."}}}}
{"paths": {"/api/v1/sessions/{session_id}": {"get": {"operationId": "read_tracer_session_api_v1_sessions__session_id__get", "security": [{"API Key": []}, {"Tenant ID": []}, {"Bearer Auth": []}]}}}}


# How to manage chunk sizes from list content

In [11]:
print([len(text) for text in texts][:10])
print()
print(texts[3])

[232, 197, 469, 210, 213, 237, 271, 191, 232, 215]

{"paths": {"/api/v1/sessions/{session_id}": {"get": {"responses": {"200": {"description": "Successful Response", "content": {"application/json": {"schema": {"$ref": "#/components/schemas/TracerSession"}}}}}}}}}


: 