In [None]:
from __future__ import annotations

# Document loading via LangChain and LlamaIndex


- [LangChain](https://python.langchain.com/docs/integrations/document_loaders/)
- [LlamaIndex](https://developers.llamaindex.ai/python/framework/module_guides/loading/simpledirectoryreader/)


In [None]:
from langchain_community.document_loaders import CSVLoader, PyMuPDFLoader, TextLoader
from llama_index.core import SimpleDirectoryReader

In [None]:
# LangChain: load the sample text file, then show only the first item returned.
txt_docs = TextLoader("../data/sample.txt").load()
txt_docs[0]

In [None]:
# Show just the text payload of the first item loaded from the sample file.
first_txt_doc = TextLoader("../data/sample.txt").load()[0]
first_txt_doc.page_content

In [None]:
# LangChain: load the sample CSV file and display everything it returns.
csv_docs = CSVLoader("../data/sample.csv").load()
csv_docs

In [None]:
# LangChain: load the sample PDF through the PyMuPDF-backed loader.
pdf_docs = PyMuPDFLoader("../data/sample.pdf").load()
pdf_docs

In [None]:
# langchain
from langchain_docling import DoclingLoader

# Point the Docling loader at a remote PDF (arXiv URL) instead of a local file.
arxiv_loader = DoclingLoader("https://arxiv.org/pdf/2408.09869")
arxiv_loader.load()

In [None]:
# Same Docling loader, this time on a local Word document.
docx_loader = DoclingLoader("../data/Querying.docx")
docx_loader.load()

In [None]:
from langchain_community.document_loaders import WebBaseLoader

# Load the content of a live web page (requires network access).
web_loader = WebBaseLoader("https://deepshieldai.com")
web_loader.load()

In [None]:
import json
from pathlib import Path
from typing import Any

from langchain.docstore.document import Document


def load_notion_json(path: str) -> list[Document]:
	"""Load a Notion JSON export and turn every entry into a ``Document``.

	The file is expected to hold a JSON array of objects, each carrying a
	``"content"`` key and, optionally, a ``"title"`` key.

	Args:
		path: Location of the JSON export; also recorded as each document's
			``source`` metadata.

	Returns:
		One ``Document`` per entry, in file order. An empty export yields an
		empty list.

	Raises:
		KeyError: If an entry has no ``"content"`` key.
	"""
	with Path(path).open(encoding="utf-8") as f:
		entries = json.load(f)

	# The file is fully parsed at this point, so the documents are built
	# after the handle has been closed.
	documents = []
	for entry in entries:
		# A missing title is tolerated (empty string) so the page text and the
		# metadata agree; the metadata already treated the title as optional.
		title = entry.get("title", "")
		documents.append(
			Document(
				page_content=f"Title: {title}\n\nContent: {entry['content']}",
				metadata={"title": title, "source": path},
			)
		)
	# Return the accumulated list itself: list.append() returns None and must
	# not be used as the return value.
	return documents

In [None]:
# Run the custom Notion loader on the exported JSON and display its result.
notion_docs = load_notion_json("../data/notion_export.json")
notion_docs

In [None]:
# LlamaIndex: read the same sample text file through SimpleDirectoryReader.
txt_reader = SimpleDirectoryReader(input_files=["../data/sample.txt"])
txt_reader.load_data()

In [None]:
# LangChain: load the sample text file once more, to compare with LlamaIndex.
txt_docs_langchain = TextLoader("../data/sample.txt").load()
txt_docs_langchain

## It seems LlamaIndex has a better metadata-capturing mechanism than LangChain's `TextLoader`


In [None]:
# LlamaIndex: feed the Notion JSON export to SimpleDirectoryReader.
notion_reader = SimpleDirectoryReader(input_files=["../data/notion_export.json"])
notion_reader.load_data()