In [None]:
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.web import SimpleWebPageReader
from llama_index.core.node_parser import (
    SimpleFileNodeParser,
    SemanticSplitterNodeParser,
    SentenceSplitter,
)
from llama_index.readers.file import FlatReader
from llama_parse import LlamaParse
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Document
from llama_index.core.extractors import (
    TitleExtractor,
    QuestionsAnsweredExtractor,
    SummaryExtractor,
    KeywordExtractor,
)
from llama_index.core.ingestion import IngestionPipeline

from pathlib import Path
import os
from dotenv import load_dotenv
import nest_asyncio

# python-dotenv: presumably loads key/value pairs from a local .env file into
# os.environ so the API keys used by later cells are available — confirm that a
# .env file is expected next to this notebook.
load_dotenv()
# nest_asyncio: presumably needed because the notebook kernel already runs an
# event loop while the synchronous llama_index helpers start their own —
# TODO confirm it is still required.
nest_asyncio.apply()

In [None]:
# Configuration shared by the loader cells below.
#
# The previous `os.environ[KEY] = os.getenv(KEY)` did nothing when the key was
# already set and raised an opaque `TypeError: str expected, not NoneType` when
# it was missing. Check for the key instead and report the problem clearly, so
# cells that do not use LlamaParse can still run without it.
if not os.getenv("LLAMA_CLOUD_API_KEY"):
    print(
        "WARNING: LLAMA_CLOUD_API_KEY is not set (expected in the environment or "
        "a .env file); the LlamaParse cell is expected to need it."
    )

# Directory holding the sample input files read by the loader cells.
file_dir = "../data"

## SimpleDirectoryReader

In [None]:
# Load the files under `file_dir` with `recursive=True`.
docs = SimpleDirectoryReader(input_dir=file_dir, recursive=True).load_data()

# Print whatever iterating the third loaded document yields, one item per line.
for field in docs[2]:
    print(field)

In [None]:
# Reload the directory, this time without `recursive` and with
# `filename_as_id=True`, then inspect the third document again.
docs = SimpleDirectoryReader(
    input_dir=file_dir,
    filename_as_id=True,
).load_data()

for field in docs[2]:
    print(field)

In [None]:
# Show only the text content of the third loaded document.
print(docs[2].text)

## SimpleWebPageReader

In [None]:
# Fetch one article; `html_to_text=True` presumably asks the reader to convert
# the page's HTML into plain text.
article_urls = ["https://www.sciencedaily.com/releases/2025/05/250527124428.htm"]
reader = SimpleWebPageReader(html_to_text=True)
pages = reader.load_data(urls=article_urls)

# Show the text of the only page requested.
print(pages[0].text)

In [None]:
# LlamaParse: hand `.pdf` files in `file_dir` to a LlamaParse parser that is
# configured with result_type="text".
parser = LlamaParse(result_type="text")
file_extractor = {".pdf": parser}
reader = SimpleDirectoryReader(input_dir=file_dir, file_extractor=file_extractor)
docs = reader.load_data()

In [None]:
# Inspect the first document returned by the LlamaParse-backed reader.
print(docs[0])

## Node

### Simple Split

In [None]:
# Load the Markdown essay with FlatReader and split it into nodes with
# SimpleFileNodeParser.
essay_path = Path("../docs/Constraint_Essay.md")
md_docs = FlatReader().load_data(essay_path)
parser = SimpleFileNodeParser()
md_nodes = parser.get_nodes_from_documents(md_docs, show_progress=True)

# Peek at the last node produced.
print(md_nodes[-1])

### Semantic Split

In [None]:
# Print the text of the first document loaded by FlatReader.
print(md_docs[0].text)

In [None]:
# Wrap the essay text in a new Document built from the text alone.
doc = Document(text=md_docs[0].text)
embedding_model = OpenAIEmbedding()

# Semantic splitter; the two numeric settings are the knobs to tune here.
splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=85,
    embed_model=embedding_model,
)
nodes = splitter.get_nodes_from_documents([doc])

In [None]:
# Print every node produced by the semantic splitter.
for node in nodes:
    print(node)

In [None]:
# Show the metadata attached to the first semantic node.
print(nodes[0].metadata)

### Extract Metadata

In [None]:
# Reload the same essay through SimpleDirectoryReader, using an explicit file
# list instead of a directory.
reader = SimpleDirectoryReader(input_files=["../docs/Constraint_Essay.md"])
docs = reader.load_data()

In [None]:
# Re-run the semantic splitter defined in the "Semantic Split" section on the
# reloaded documents; this overwrites the earlier `nodes`.
nodes = splitter.get_nodes_from_documents(docs)

In [None]:
# Compare the metadata on the first loaded document with the metadata on the
# first node split from it.
print(docs[0].metadata)
print(nodes[0].metadata)

In [None]:
# Run the title extractor with its default settings over the current nodes;
# the results are printed in the next cell.
title_extractor = TitleExtractor()
metadata_ls = title_extractor.extract(nodes)

In [None]:
# Print each item returned by the title extractor.
for title_metadata in metadata_ls:
    print(title_metadata)

In [None]:
# Run the questions-answered extractor (configured with questions=2) over the
# nodes and print each returned item.
qa_extractor = QuestionsAnsweredExtractor(questions=2)
metadata_ls = qa_extractor.extract(nodes)

for qa_metadata in metadata_ls:
    print(qa_metadata)

In [None]:
# Run the summary extractor, requesting the "prev", "self" and "next"
# summaries, and print each returned item.
summary_kinds = ["prev", "self", "next"]
summary_extractor = SummaryExtractor(summaries=summary_kinds)
metadata_ls = summary_extractor.extract(nodes)

for summary_metadata in metadata_ls:
    print(summary_metadata)

In [None]:
# Run the keyword extractor (configured with keywords=3) over the nodes and
# print each returned item.
keyword_extractor = KeywordExtractor(keywords=3)
metadata_ls = keyword_extractor.extract(nodes)

for keyword_metadata in metadata_ls:
    print(keyword_metadata)

## Ingestion Pipeline

In [None]:
# Transformations are listed in the order split -> title metadata -> embedding.
pipeline_steps = [
    SentenceSplitter(chunk_size=40, chunk_overlap=5),
    TitleExtractor(),
    OpenAIEmbedding(),
]
pipeline = IngestionPipeline(transformations=pipeline_steps)

# Run the pipeline on the essay Document built in the "Semantic Split" section.
nodes = pipeline.run(documents=[doc])

In [None]:
# Print each node produced by the pipeline, followed by its metadata.
for node in nodes:
    print(node)
    print(node.metadata)

In [None]:
# Persist the pipeline under "pipeline_cache" (presumably its transformation
# cache) so the pipeline built in the next cell can load it.
pipeline.persist("pipeline_cache")

In [None]:
# Build a second pipeline with the same transformations, load what the first
# pipeline persisted, and run it on the same document (presumably so the run is
# served from the persisted cache — confirm against the library docs).
pipeline2_steps = [
    SentenceSplitter(chunk_size=40, chunk_overlap=5),
    TitleExtractor(),
    OpenAIEmbedding(),
]
pipeline2 = IngestionPipeline(transformations=pipeline2_steps)
pipeline2.load("pipeline_cache")
nodes = pipeline2.run(documents=[doc])

# Print each resulting node, followed by its metadata.
for node in nodes:
    print(node)
    print(node.metadata)