### Intro to data ingestion

In [None]:
from typing import List, Dict, Any;
import pandas as pd;


In [9]:
from langchain_core.documents import Document
from langchain_text_splitters import (RecursiveCharacterTextSplitter, CharacterTextSplitter , TokenTextSplitter )
print("Set up complete")

Set up complete


### Understanding Document structure in Langchain

In [13]:
doc = Document(page_content="This is main content that will be embedded and searched",metadata={
    "source":"example.txt",
    "page":1,
    "author":"Yash"
})

print(f"Content :{doc.page_content}")
print(f"Metadeta :{doc.metadata}")

Content :This is main content that will be embedded and searched
Metadeta :{'source': 'example.txt', 'page': 1, 'author': 'Yash'}


In [14]:
type(doc)

langchain_core.documents.base.Document

## Text Directory (.txt) - The Simplest Case {#2- text-files}

In [15]:
## create a simple file
import os 
os.makedirs("data/text_files",exist_ok=True)

In [16]:
sample_texts = {
    "data/text_files/python-intro.txt":"""Python is a high-level, 
    interpreted programming language known for its simplicity, 
    readability, and versatility. It supports multiple programming 
    paradigms, including object-oriented, procedural, and functional 
    styles. With an extensive standard library and a massive ecosystem 
    of third-party packages, Python is used in web development, 
    data analysis, machine learning, automation, and scientific computing.
      Its clean syntax allows developers to focus on solving problems 
      rather than worrying about complex code structures, making it 
      an excellent choice for both beginners and professionals.
"""
}

for filepath, content in sample_texts.items():
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)

print("Sample files created")

Sample files created


### Read Single File -  TextLoader

In [15]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("data/text_files/python-intro.txt",encoding="utf-8")

document = loader.load()

print(f"Loaded {len(document)} documents")
print(f"Conent Preview: {document[0].page_content[:100]}...")
print(f"Metadata: {document[0].metadata}")

Loaded 1 documents
Conent Preview: Python is a high-level, 
    interpreted programming language known for its simplicity, 
    readabi...
Metadata: {'source': 'data/text_files/python-intro.txt'}


### Directory Loader -  Sutable for multiple files

In [22]:
from langchain_community.document_loaders import DirectoryLoader

## load all files from directory

dir_loader = DirectoryLoader(
    "data/text_files",
    glob="**/*.txt", # path to match
    loader_cls=TextLoader, # loader class to use
    loader_kwargs={'encoding':'utf-8'},
    show_progress=True
)

documents = dir_loader.load()

print(f"Total Documents loaded {len(document)}")
for i , doc in enumerate(documents):
    print(f"\nDocument {i+1}:")
    print(f"Source: {doc.metadata}")
    print(f"Length: {len(doc.page_content)} characters")

100%|██████████| 2/2 [00:00<00:00, 1476.87it/s]

Total Documents loaded 1

Document 1:
Source: {'source': 'data/text_files/javascript-into.txt'}
Length: 640 characters

Document 2:
Source: {'source': 'data/text_files/python-intro.txt'}
Length: 643 characters





0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


#### Advantages and Disadvantages

Directory Loader Characteristics:


    - Loads multiple files at once
    - Support glob patterns
    - Progress tracking
    - Recursive directory scanning
    - All files must be of same type
    - Limited error handing per file
    - can be memory intensive for large files

In [None]:
## Different text splitting techniques

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split text when '/n' comes


char_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=0,
    length_function=len,
)

char_chunks = char_splitter.split_documents(document)

print(f"created chunks are : {len(char_chunks)}")
print(f"Chunks are : {char_chunks}")

created chunks are : 9
Chunks are : [Document(metadata={'source': 'data/text_files/python-intro.txt'}, page_content='Python is a high-level, \n    interpreted programming language known for its simplicity,'), Document(metadata={'source': 'data/text_files/python-intro.txt'}, page_content='readability, and versatility. It supports multiple programming'), Document(metadata={'source': 'data/text_files/python-intro.txt'}, page_content='paradigms, including object-oriented, procedural, and functional'), Document(metadata={'source': 'data/text_files/python-intro.txt'}, page_content='styles. With an extensive standard library and a massive ecosystem'), Document(metadata={'source': 'data/text_files/python-intro.txt'}, page_content='of third-party packages, Python is used in web development,'), Document(metadata={'source': 'data/text_files/python-intro.txt'}, page_content='data analysis, machine learning, automation, and scientific computing.'), Document(metadata={'source': 'data/text_files/pyth

In [19]:
# Token based splitter

from langchain_text_splitters import TokenTextSplitter

token_splitter = TokenTextSplitter(
    chunk_size=80,
    chunk_overlap=0
)

token_chunk = token_splitter.split_documents(document)

print(f"Created chunks are : {len(token_chunk)}")
print(f"Chunks are : {token_chunk}")

Created chunks are : 2
Chunks are : [Document(metadata={'source': 'data/text_files/python-intro.txt'}, page_content='Python is a high-level, \n    interpreted programming language known for its simplicity, \n    readability, and versatility. It supports multiple programming \n    paradigms, including object-oriented, procedural, and functional \n    styles. With an extensive standard library and a massive ecosystem \n    of third-party packages,'), Document(metadata={'source': 'data/text_files/python-intro.txt'}, page_content=' Python is used in web development, \n    data analysis, machine learning, automation, and scientific computing.\n      Its clean syntax allows developers to focus on solving problems \n      rather than worrying about complex code structures, making it \n      an excellent choice for both beginners and professionals.\n')]
