In [3]:
###Introduction to DataIngestion

In [25]:
import os
from typing import List,Dict,Any
import pandas as pd
import json

In [26]:
from langchain_core.documents import Document
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter,
)
print("All imports successful")

All imports successful


Document structure in langchain

In [28]:
doc = Document(
    page_content="This is a sample document.",
    metadata={
        "source": "sample_source.txt",
        "length": 27,
        "author": "leul tesfu",
        "language": "English",
    }
)

print("Document Structure")

print(f"content: {doc.page_content}")
print(f"metadata: {doc.metadata}")

Document Structure
content: This is a sample document.
metadata: {'source': 'sample_source.txt', 'length': 27, 'author': 'leul tesfu', 'language': 'English'}


Text files(.txt)-the simplest case

In [29]:
import os 
os.makedirs("data/text_files",exist_ok=True)

In [None]:
sample_texts={
"data/text_files/python_intro.txt": """
Python is a high-level, interpreted programming language known for its simplicity and readability. It was created by Guido van Rossum and first released in 1991. Python supports multiple programming paradigms, including procedural, object-oriented, and functional programming. It has a large standard library that provides tools suited to many tasks, making it a popular choice for web development, data analysis, artificial intelligence, scientific computing, and more. Python's syntax emphasizes code readability, allowing developers to express concepts in fewer lines of code compared to other languages. Its extensive ecosystem of third-party packages and frameworks further enhances its versatility and functionality.""",

"data/text_files/Machine_learning.txt":"""
Machine learning is a subset of artificial intelligence that focuses on the development of algorithms and statistical models that enable computers to perform tasks without explicit instructions. It relies on patterns and inference instead. Machine learning is widely used in various applications, including image and speech recognition, natural language processing, and predictive analytics. The field has gained significant traction in recent years, driven by the availability of large datasets and advancements in computing power.

"""
}

for filepath, content in sample_texts.items():
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)

print("✅Sample text files created.")

✅Sample text files created.


Textloader- Read a single file

In [36]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("data/text_files/python_intro.txt", encoding="utf-8")
documents = loader.load()

print(f"Number of documents loaded: {len(documents)}")
print(f"Content of the document:\n{documents[0].page_content[:100]}...") 
print(f"metadata of the document:\n{documents[0].metadata}")


Number of documents loaded: 1
Content of the document:

Python is a high-level, interpreted programming language known for its simplicity and readability. ...
metadata of the document:
{'source': 'data/text_files/python_intro.txt'}


DirectoryLoader- Read multiple files

In [41]:
from langchain_community.document_loaders import DirectoryLoader

dir_loader = DirectoryLoader(
    "data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True,
)

documents = dir_loader.load()

print(f"Number of documents loaded from directory: {len(documents)}")

for i, document in enumerate(documents):
    print(f"document: {i + 1}\n")
    print(f"Content: {document.page_content[:100]}...")
    print(f"Metadata: {document.metadata}\n")

100%|██████████| 2/2 [00:00<00:00, 1001.51it/s]

Number of documents loaded from directory: 2
document: 1

Content: 
Machine learning is a subset of artificial intelligence that focuses on the development of algorith...
Metadata: {'source': 'data\\text_files\\Machine_learning.txt'}

document: 2

Content: 
Python is a high-level, interpreted programming language known for its simplicity and readability. ...
Metadata: {'source': 'data\\text_files\\python_intro.txt'}






Text-Splitting Strategies

In [44]:
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter,
)

print(documents)

[Document(metadata={'source': 'data\\text_files\\Machine_learning.txt'}, page_content='\nMachine learning is a subset of artificial intelligence that focuses on the development of algorithms and statistical models that enable computers to perform tasks without explicit instructions. It relies on patterns and inference instead. Machine learning is widely used in various applications, including image and speech recognition, natural language processing, and predictive analytics. The field has gained significant traction in recent years, driven by the availability of large datasets and advancements in computing power.\n\n'), Document(metadata={'source': 'data\\text_files\\python_intro.txt'}, page_content="\nPython is a high-level, interpreted programming language known for its simplicity and readability. It was created by Guido van Rossum and first released in 1991. Python supports multiple programming paradigms, including procedural, object-oriented, and functional programming. It has a l

In [53]:
text=documents[0].page_content
#####method 1: RecursiveCharacterTextSplitter
print("recursive character splitter")
recursive_splitter=RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", "", "."],
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
)
recursive_chunks=recursive_splitter.split_text(text)
print(f"Number of chunks created by RecursiveCharacterTextSplitter: {len(recursive_chunks)}")


recursive character splitter
Number of chunks created by RecursiveCharacterTextSplitter: 3


In [54]:
print(recursive_chunks[0])
print("-----")
print(recursive_chunks[1])
print("-----")
print(recursive_chunks[2])

Machine learning is a subset of artificial intelligence that focuses on the development of algorithms and statistical models that enable computers to perform tasks without explicit instructions. It
-----
instructions. It relies on patterns and inference instead. Machine learning is widely used in various applications, including image and speech recognition, natural language processing, and predictive
-----
and predictive analytics. The field has gained significant traction in recent years, driven by the availability of large datasets and advancements in computing power.


In [51]:
#####method 2: CharacterTextSplitter
print("CharacterTextSplitter")
char_splitter=CharacterTextSplitter(
    separator=" ",
    chunk_size=200,
    chunk_overlap=20,
    length_function=len,
)

char_chunks=char_splitter.split_text(text)
print(f"Number of chunks created by CharacterTextSplitter: {len(char_chunks)}")

CharacterTextSplitter
Number of chunks created by CharacterTextSplitter: 3


In [52]:
print(char_chunks[0])
print("-----")
print(char_chunks[1])
print("-----")
print(char_chunks[2])

Machine learning is a subset of artificial intelligence that focuses on the development of algorithms and statistical models that enable computers to perform tasks without explicit instructions. It
-----
instructions. It relies on patterns and inference instead. Machine learning is widely used in various applications, including image and speech recognition, natural language processing, and predictive
-----
and predictive analytics. The field has gained significant traction in recent years, driven by the availability of large datasets and advancements in computing power.
