<h1>PDF loader</h1>


In [2]:
from dotenv import load_dotenv

# Load environment variables from a local .env file (python-dotenv) before any
# other cell runs. NOTE(review): none of the cells visible here read an
# environment variable — presumably this is for API keys used elsewhere; confirm.
load_dotenv()

In [None]:
# Path of the PDF read by the PyPDF and PDFPlumber loader cells; it is relative,
# so it resolves against the notebook's current working directory.
FILE_PATH = "./IMF_gfsr.pdf"

In [None]:
def show_metadata(docs):
    """Print the metadata keys and key/value pairs of the first document.

    Args:
        docs: A sequence of objects exposing a ``metadata`` mapping. Only the
            first element is inspected. An empty (or falsy) sequence prints
            nothing.

    Returns:
        None. The report is written to standard output.
    """
    if not docs:
        return

    metadata = docs[0].metadata
    print("[metadata]")
    print(list(metadata.keys()))
    print("\n[examples]")
    # default=0 keeps max() from raising ValueError when the first document
    # carries an empty metadata mapping.
    max_key_length = max((len(k) for k in metadata.keys()), default=0)
    for k, v in metadata.items():
        # Left-align keys to a common width so the values line up in a column.
        print(f"{k:<{max_key_length}} : {v}")

In [None]:
# PyPDF
# Requires the pypdf package: !pip install -qU pypdf
from langchain_community.document_loaders import PyPDFLoader

# Build a loader for the PDF at FILE_PATH.
loader = PyPDFLoader(FILE_PATH)

# Load the PDF into an indexable collection of documents
# (presumably one per page — confirm against the PyPDFLoader docs).
docs = loader.load()

# Preview the first 300 characters of the document at index 10.
# Raises IndexError when fewer than 11 documents were loaded.
print(docs[10].page_content[:300])

In [None]:
# PDFplumber
# NOTE(review): presumably requires the pdfplumber package to be installed — confirm.

from langchain_community.document_loaders import PDFPlumberLoader

# Rebinds `loader`: the same FILE_PATH is now read through PDFPlumberLoader.
loader = PDFPlumberLoader(FILE_PATH)

# Rebinds `docs`, replacing whatever an earlier cell stored under that name.
docs = loader.load()

# Preview the first 300 characters of the document at index 10 so the extracted
# text can be compared across loaders. Raises IndexError when fewer than 11
# documents were loaded.
print(docs[10].page_content[:300])

<h1>CSV loader</h1>

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# Create the CSV loader for the interest-rate file (path is relative to the
# notebook's working directory).
loader = CSVLoader(file_path="./interestrate.csv")

# Load the data; rebinds `docs`, replacing the result of any earlier loader cell.
docs = loader.load()

# Show how many documents were produced and the metadata of the first one.
# Raises IndexError if the loader returned no documents.
print(len(docs))
print(docs[0].metadata)

In [None]:
# Re-express one loaded CSV row as an XML-like string for further processing
# or transformation.
#
# Example CSV:
#   Date,US 10yr Interest Rate,KOR 10yr Interest Rate
#   2024-11-20, 3.5, 2.3
#   2024-11-19, 4.2, 2.0
#
# Example result, one <row> element per CSV row:
#   <row><Date>2024-11-20</Date><US 10yr Interest Rate>3.5</US 10yr Interest Rate><KOR 10yr Interest Rate>2.3</KOR 10yr Interest Rate></row>
#   <row><Date>2024-11-19</Date><US 10yr Interest Rate>4.2</US 10yr Interest Rate><KOR 10yr Interest Rate>2.0</KOR 10yr Interest Rate></row>

# Break the page content of the document at index 1 into its lines, one per field.
row = docs[1].page_content.split("\n")

# Cut each line at its LAST ":" — the text before it becomes the tag name, the
# stripped text after it becomes the tag body — and wrap everything in <row>...</row>.
row_str = (
    "<row>"
    + "".join(
        f"<{tag}>{text.strip()}</{tag}>"
        for tag, _sep, text in (line.rpartition(":") for line in row)
    )
    + "</row>"
)
print(row_str)

In [None]:
# Unlike CSV, XML can encode entity relationships by defining a structured, hierarchical format.

<h1>WebBaseLoader</h1>

In [None]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Scrape a single news article, keeping only its headline and body.
loader = WebBaseLoader(
    # URL(s) of the web page(s) to scrape.
    web_paths=("https://n.news.naver.com/article/437/0000378416",),

    # Options forwarded to BeautifulSoup so that only specific sections of the
    # HTML are parsed (this is the core of the example).
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            "div",
            # Class names of the <div> elements to keep; find them with the
            # browser's "Inspect" tool on the target page.
            attrs={"class": ["newsct_article _article_body", "media_end_head_title"]},
        )
    ),
    # Headers mimic a browser request to avoid getting blocked (warning) by the
    # server. The HTTP header name is "User-Agent" (hyphen): a key spelled
    # "User_Agent" is a different, unrecognised header and leaves the real
    # User-Agent unset.
    header_template={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
    },
)

# Extra keyword arguments for the underlying HTTP request. verify=True keeps
# TLS certificate verification switched on.
loader.requests_kwargs = {"verify": True}

# Load the documents; rebinds `docs`, replacing the result of any earlier cell.
docs = loader.load()
print(f"문서의 수: {len(docs)}")
docs

In [None]:
# Use a proxy to bypass network restrictions or anonymize the connection.
#
# The proxy URLs below are templates: these are plain strings, not f-strings,
# so "{username}" and "{password}" are NOT interpolated. Replace them (ideally
# from environment variables rather than hard-coded credentials) and point the
# host/port at a real proxy before running this cell.
# Credentials use the form user:password@host — a colon directly before "@"
# would become part of the password.

loader = WebBaseLoader(
    "https://www.google.com/search?q=parrots",
    proxies={
        "http": "http://{username}:{password}@proxy.service.com:6666/",
        "https": "https://{username}:{password}@proxy.service.com:6666/",
    },

)

# Fetch the page through the proxy; rebinds `docs`.
docs = loader.load()