Load CSV data with a single row per document.

In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# Build a loader for the example CSV; no csv_args are given, so parsing uses
# the loader's defaults.
loader = CSVLoader(file_path="./example_data/mlb_teams_2012.csv")

# load() returns the documents -- per the section intro, one per CSV row.
data = loader.load()



In [None]:
data

Customizing the CSV parsing and loading

In [49]:
# Same file as above, this time with explicit parsing options in csv_args.
loader = CSVLoader(
    file_path="./example_data/mlb_teams_2012.csv",
    csv_args={
        "delimiter": ",",
        "quotechar": '"',
        # NOTE(review): supplying fieldnames usually makes the CSV reader treat
        # the file's own header line as a data row -- confirm in the output below.
        "fieldnames": ["MLB Team", "Payroll in millions", "Wins"],
    },
)

# Rebinds `data`, replacing the result of the earlier default-options load.
data = loader.load()



In [None]:
data

In [None]:
import json

# Project each loaded document onto a plain dict holding its page_content and
# metadata, so the whole list is in a JSON-serializable format.
serializable_data = [
    {"page_content": doc.page_content, "metadata": doc.metadata} for doc in data
]

# Pretty-print the list with a 4-space indent.
print(json.dumps(serializable_data, indent=4))


Specify a column to identify the document source


In [None]:
# source_column names the CSV column ("Team") used to identify each document's
# source.
# NOTE(review): presumably this value ends up in each document's metadata --
# confirm in the output of the next cell.
loader = CSVLoader(file_path="./example_data/mlb_teams_2012.csv", source_column="Team")

source_data = loader.load()



In [None]:
source_data

UnstructuredCSVLoader

In [None]:
%pip install unstructured pandas


In [None]:
from langchain_community.document_loaders.csv_loader import UnstructuredCSVLoader

# Load the same example CSV through UnstructuredCSVLoader in "elements" mode.
loader = UnstructuredCSVLoader(
    file_path="example_data/mlb_teams_2012.csv", mode="elements"
)
docs = loader.load()

# Read the HTML rendering stored in the first document's metadata.
# NOTE(review): presumably "text_as_html" is only populated in "elements"
# mode -- confirm against the loader documentation.
html_content = docs[0].metadata["text_as_html"]


print(html_content)

In [None]:
from io import StringIO

import pandas as pd

# Parse the HTML table produced by the previous cell back into a DataFrame.
# read_html returns a list of tables, so take the first one. The markup is
# wrapped in StringIO because recent pandas versions deprecate passing literal
# HTML strings directly.
df = pd.read_html(StringIO(html_content))[0]

# Display the DataFrame without the index column.
print(df.to_string(index=False))

Initialization

In [None]:
%pip install -qU langchain_community jq 


In [56]:
from langchain_community.document_loaders import JSONLoader

# The jq expression selects the `content` field of every entry under the
# top-level `messages` array.
# NOTE(review): text_content=False presumably relaxes the check that the
# extracted value is a string -- confirm against the JSONLoader documentation.
loader = JSONLoader(
    file_path="./example_data/facebook_chat.json",
    jq_schema=".messages[].content",
    text_content=False,
)

In [None]:
json_docs = loader.load()
json_docs

In [None]:
json_docs[0]

In [None]:
json_docs[0].metadata


In [None]:
json_docs[0].page_content

Read from JSON Lines file


In [63]:
# json_lines=True is set for the .jsonl input; jq_schema ".content" is the
# field extracted from each record.
loader = JSONLoader(
    file_path="./example_data/facebook_chat_messages.jsonl",
    jq_schema=".content",
    text_content=False,
    json_lines=True,
)

# NOTE(review): `jsnol_docs` looks like a typo for `jsonl_docs`; the display
# cell that follows uses the same spelling, so rename both together.
jsnol_docs = loader.load()


In [None]:
jsnol_docs

Read specific content keys


In [None]:
# jq_schema "." keeps each record whole; content_key then names the field
# ("sender_name") to read from it.
# NOTE(review): presumably that field's value becomes page_content -- confirm
# in the output of the next cell.
loader = JSONLoader(
    file_path="./example_data/facebook_chat_messages.jsonl",
    jq_schema=".",
    content_key="sender_name",
    json_lines=True,
)

jsonl_key_docs = loader.load()


In [None]:
jsonl_key_docs

In [None]:
print(jsonl_key_docs[0])

In [None]:
def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy a record's sender and timestamp into its document metadata.

    Args:
        record: One extracted JSON record. Keys that are absent are stored
            as ``None``.
        metadata: Metadata dict for the document; it is updated in place.

    Returns:
        The same ``metadata`` object, now carrying ``sender_name`` and
        ``timestamp_ms`` entries.
    """
    for field in ("sender_name", "timestamp_ms"):
        metadata[field] = record.get(field)
    return metadata


# Iterate over each entry under `messages`, take its "content" field via
# content_key, and pass metadata_func so sender_name / timestamp_ms are added
# to the metadata.
loader = JSONLoader(
    file_path="./example_data/facebook_chat.json",
    jq_schema=".messages[]",
    content_key="content",
    metadata_func=metadata_func,
)

# Rebinds `docs`, which the UnstructuredCSVLoader cell assigned earlier.
docs = loader.load()
# Show the metadata of the first document, including the fields added above.
print(docs[0].metadata)