Load CSV data with a single row per document.

In [3]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# Point the loader at the example CSV. No csv_args are passed here, so the
# loader's default parsing is used.
loader = CSVLoader(file_path="./example_data/mlb_teams_2012.csv")

# load() returns a list of Document objects; in the saved output each Document's
# metadata carries the file path as 'source' plus a 'row' index.
data = loader.load()



In [4]:
data

[Document(metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 0}, page_content='Team: Nationals\n"Payroll (millions)": 81.34\n"Wins": 98'),
 Document(metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 1}, page_content='Team: Reds\n"Payroll (millions)": 82.20\n"Wins": 97'),
 Document(metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 2}, page_content='Team: Yankees\n"Payroll (millions)": 197.96\n"Wins": 95'),
 Document(metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 3}, page_content='Team: Giants\n"Payroll (millions)": 117.62\n"Wins": 94'),
 Document(metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 4}, page_content='Team: Braves\n"Payroll (millions)": 83.31\n"Wins": 94'),
 Document(metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 5}, page_content='Team: Athletics\n"Payroll (millions)": 55.37\n"Wins": 94'),
 Document(metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 6}, page_content='Team: R

Customizing the CSV parsing and loading

In [5]:
# Same CSV file, but with explicit parsing options passed through csv_args.
# NOTE: with "fieldnames" supplied, the saved output shows the file's own header
# line coming back as the first Document (row 0), labelled with the replacement
# column names — so there is one more Document than there are data rows.
loader = CSVLoader(
    file_path="./example_data/mlb_teams_2012.csv",
    csv_args={
        "delimiter": ",",
        "quotechar": '"',
        "fieldnames": ["MLB Team", "Payroll in millions", "Wins"],
    },
)

# Rebinds `data`, replacing the result of the earlier default-parsing load.
data = loader.load()



In [6]:
data

[Document(metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 0}, page_content='MLB Team: Team\nPayroll in millions: "Payroll (millions)"\nWins: "Wins"'),
 Document(metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 1}, page_content='MLB Team: Nationals\nPayroll in millions: 81.34\nWins: 98'),
 Document(metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 2}, page_content='MLB Team: Reds\nPayroll in millions: 82.20\nWins: 97'),
 Document(metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 3}, page_content='MLB Team: Yankees\nPayroll in millions: 197.96\nWins: 95'),
 Document(metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 4}, page_content='MLB Team: Giants\nPayroll in millions: 117.62\nWins: 94'),
 Document(metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 5}, page_content='MLB Team: Braves\nPayroll in millions: 83.31\nWins: 94'),
 Document(metadata={'source': './example_data/mlb_teams_2012.csv', 'row': 6}, p

In [7]:
import json

# Reduce every Document in `data` to a plain dict (content + metadata) so the
# whole list can be handed to the json module.
serializable_data = [
    {
        "page_content": doc.page_content,
        "metadata": doc.metadata,
    }
    for doc in data
]

# Emit the list as indented JSON text for easier reading.
print(json.dumps(serializable_data, indent=4))


[
    {
        "page_content": "MLB Team: Team\nPayroll in millions: \"Payroll (millions)\"\nWins: \"Wins\"",
        "metadata": {
            "source": "./example_data/mlb_teams_2012.csv",
            "row": 0
        }
    },
    {
        "page_content": "MLB Team: Nationals\nPayroll in millions: 81.34\nWins: 98",
        "metadata": {
            "source": "./example_data/mlb_teams_2012.csv",
            "row": 1
        }
    },
    {
        "page_content": "MLB Team: Reds\nPayroll in millions: 82.20\nWins: 97",
        "metadata": {
            "source": "./example_data/mlb_teams_2012.csv",
            "row": 2
        }
    },
    {
        "page_content": "MLB Team: Yankees\nPayroll in millions: 197.96\nWins: 95",
        "metadata": {
            "source": "./example_data/mlb_teams_2012.csv",
            "row": 3
        }
    },
    {
        "page_content": "MLB Team: Giants\nPayroll in millions: 117.62\nWins: 94",
        "metadata": {
            "source": "./example_da

Specify a column to identify the document source


In [8]:
# source_column="Team": in the saved output, each Document's metadata 'source'
# holds that row's Team value (e.g. 'Nationals') instead of the file path.
loader = CSVLoader(file_path="./example_data/mlb_teams_2012.csv", source_column="Team")

# Stored under its own name, so the earlier `data` list is left untouched.
source_data = loader.load()



In [9]:
source_data

[Document(metadata={'source': 'Nationals', 'row': 0}, page_content='Team: Nationals\n"Payroll (millions)": 81.34\n"Wins": 98'),
 Document(metadata={'source': 'Reds', 'row': 1}, page_content='Team: Reds\n"Payroll (millions)": 82.20\n"Wins": 97'),
 Document(metadata={'source': 'Yankees', 'row': 2}, page_content='Team: Yankees\n"Payroll (millions)": 197.96\n"Wins": 95'),
 Document(metadata={'source': 'Giants', 'row': 3}, page_content='Team: Giants\n"Payroll (millions)": 117.62\n"Wins": 94'),
 Document(metadata={'source': 'Braves', 'row': 4}, page_content='Team: Braves\n"Payroll (millions)": 83.31\n"Wins": 94'),
 Document(metadata={'source': 'Athletics', 'row': 5}, page_content='Team: Athletics\n"Payroll (millions)": 55.37\n"Wins": 94'),
 Document(metadata={'source': 'Rangers', 'row': 6}, page_content='Team: Rangers\n"Payroll (millions)": 120.51\n"Wins": 93'),
 Document(metadata={'source': 'Orioles', 'row': 7}, page_content='Team: Orioles\n"Payroll (millions)": 81.43\n"Wins": 93'),
 Docume

UnstructuredCSVLoader

In [None]:
%pip install unstructured pandas


In [10]:
from langchain_community.document_loaders.csv_loader import UnstructuredCSVLoader

# mode="elements" is requested here; the code below relies on the first returned
# document exposing a "text_as_html" entry in its metadata.
loader = UnstructuredCSVLoader(
    file_path="example_data/mlb_teams_2012.csv", mode="elements"
)
docs = loader.load()

# HTML rendering of the table, taken from the first document's metadata.
html_content = docs[0].metadata["text_as_html"]


# Print the raw markup (the saved output starts with a <table> element).
print(html_content)

<table><tr><td>Team</td><td>"Payroll (millions)"</td><td>"Wins"</td></tr><tr><td>Nationals</td><td>81.34</td><td>98</td></tr><tr><td>Reds</td><td>82.20</td><td>97</td></tr><tr><td>Yankees</td><td>197.96</td><td>95</td></tr><tr><td>Giants</td><td>117.62</td><td>94</td></tr><tr><td>Braves</td><td>83.31</td><td>94</td></tr><tr><td>Athletics</td><td>55.37</td><td>94</td></tr><tr><td>Rangers</td><td>120.51</td><td>93</td></tr><tr><td>Orioles</td><td>81.43</td><td>93</td></tr><tr><td>Rays</td><td>64.17</td><td>90</td></tr><tr><td>Angels</td><td>154.49</td><td>89</td></tr><tr><td>Tigers</td><td>132.30</td><td>88</td></tr><tr><td>Cardinals</td><td>110.30</td><td>88</td></tr><tr><td>Dodgers</td><td>95.14</td><td>86</td></tr><tr><td>White Sox</td><td>96.92</td><td>85</td></tr><tr><td>Brewers</td><td>97.65</td><td>83</td></tr><tr><td>Phillies</td><td>174.54</td><td>81</td></tr><tr><td>Diamondbacks</td><td>74.28</td><td>81</td></tr><tr><td>Pirates</td><td>63.43</td><td>79</td></tr><tr><td>Padres</

In [12]:
from io import StringIO

import pandas as pd

# Parse the HTML table held in `html_content` (taken from the
# UnstructuredCSVLoader document's "text_as_html" metadata) into a DataFrame.
# pandas deprecated passing a literal HTML string to read_html (a FutureWarning
# is emitted since pandas 2.1); wrapping the markup in StringIO hands it over as
# a file-like object, which is the supported form and silences the warning.
# read_html returns a list of DataFrames, one per table found; [0] is the first.
df = pd.read_html(StringIO(html_content))[0]

# Display the DataFrame as plain text without the index column.
print(df.to_string(index=False))

           0                    1      2
        Team "Payroll (millions)" "Wins"
   Nationals                81.34     98
        Reds                82.20     97
     Yankees               197.96     95
      Giants               117.62     94
      Braves                83.31     94
   Athletics                55.37     94
     Rangers               120.51     93
     Orioles                81.43     93
        Rays                64.17     90
      Angels               154.49     89
      Tigers               132.30     88
   Cardinals               110.30     88
     Dodgers                95.14     86
   White Sox                96.92     85
     Brewers                97.65     83
    Phillies               174.54     81
Diamondbacks                74.28     81
     Pirates                63.43     79
      Padres                55.24     76
    Mariners                81.97     75
        Mets                93.35     74
   Blue Jays                75.48     73
      Royals    

  df = pd.read_html(html_content)[0]


JSONLoader initialization

In [13]:
%pip install -qU langchain_community jq 


Note: you may need to restart the kernel to use updated packages.


In [56]:
from langchain_community.document_loaders import JSONLoader

# Rebinds `loader` (previously the UnstructuredCSVLoader) to a JSONLoader.
# jq_schema is a jq expression: ".messages[].content" selects the `content`
# field of every entry under `messages` in the chat export.
loader = JSONLoader(
    file_path="./example_data/facebook_chat.json",
    jq_schema=".messages[].content",
    text_content=False,
)

In [14]:
# NOTE(review): the saved output below shows the mlb_teams_2012.csv table (with
# 'text_as_html' metadata) rather than chat messages, and the JSONLoader cell
# above carries execution count 56 while this one is 14. It looks like this cell
# was last run while `loader` was still the UnstructuredCSVLoader — restart the
# kernel and run all cells in order to refresh this and the following outputs.
json_docs = loader.load()
json_docs

[Document(metadata={'source': 'example_data/mlb_teams_2012.csv', 'file_directory': 'example_data', 'filename': 'mlb_teams_2012.csv', 'last_modified': '2025-01-05T08:34:00', 'text_as_html': '<table><tr><td>Team</td><td>"Payroll (millions)"</td><td>"Wins"</td></tr><tr><td>Nationals</td><td>81.34</td><td>98</td></tr><tr><td>Reds</td><td>82.20</td><td>97</td></tr><tr><td>Yankees</td><td>197.96</td><td>95</td></tr><tr><td>Giants</td><td>117.62</td><td>94</td></tr><tr><td>Braves</td><td>83.31</td><td>94</td></tr><tr><td>Athletics</td><td>55.37</td><td>94</td></tr><tr><td>Rangers</td><td>120.51</td><td>93</td></tr><tr><td>Orioles</td><td>81.43</td><td>93</td></tr><tr><td>Rays</td><td>64.17</td><td>90</td></tr><tr><td>Angels</td><td>154.49</td><td>89</td></tr><tr><td>Tigers</td><td>132.30</td><td>88</td></tr><tr><td>Cardinals</td><td>110.30</td><td>88</td></tr><tr><td>Dodgers</td><td>95.14</td><td>86</td></tr><tr><td>White Sox</td><td>96.92</td><td>85</td></tr><tr><td>Brewers</td><td>97.65</td

In [15]:
json_docs[0]

Document(metadata={'source': 'example_data/mlb_teams_2012.csv', 'file_directory': 'example_data', 'filename': 'mlb_teams_2012.csv', 'last_modified': '2025-01-05T08:34:00', 'text_as_html': '<table><tr><td>Team</td><td>"Payroll (millions)"</td><td>"Wins"</td></tr><tr><td>Nationals</td><td>81.34</td><td>98</td></tr><tr><td>Reds</td><td>82.20</td><td>97</td></tr><tr><td>Yankees</td><td>197.96</td><td>95</td></tr><tr><td>Giants</td><td>117.62</td><td>94</td></tr><tr><td>Braves</td><td>83.31</td><td>94</td></tr><tr><td>Athletics</td><td>55.37</td><td>94</td></tr><tr><td>Rangers</td><td>120.51</td><td>93</td></tr><tr><td>Orioles</td><td>81.43</td><td>93</td></tr><tr><td>Rays</td><td>64.17</td><td>90</td></tr><tr><td>Angels</td><td>154.49</td><td>89</td></tr><tr><td>Tigers</td><td>132.30</td><td>88</td></tr><tr><td>Cardinals</td><td>110.30</td><td>88</td></tr><tr><td>Dodgers</td><td>95.14</td><td>86</td></tr><tr><td>White Sox</td><td>96.92</td><td>85</td></tr><tr><td>Brewers</td><td>97.65</td>

In [16]:
json_docs[0].metadata


{'source': 'example_data/mlb_teams_2012.csv',
 'file_directory': 'example_data',
 'filename': 'mlb_teams_2012.csv',
 'last_modified': '2025-01-05T08:34:00',
 'text_as_html': '<table><tr><td>Team</td><td>"Payroll (millions)"</td><td>"Wins"</td></tr><tr><td>Nationals</td><td>81.34</td><td>98</td></tr><tr><td>Reds</td><td>82.20</td><td>97</td></tr><tr><td>Yankees</td><td>197.96</td><td>95</td></tr><tr><td>Giants</td><td>117.62</td><td>94</td></tr><tr><td>Braves</td><td>83.31</td><td>94</td></tr><tr><td>Athletics</td><td>55.37</td><td>94</td></tr><tr><td>Rangers</td><td>120.51</td><td>93</td></tr><tr><td>Orioles</td><td>81.43</td><td>93</td></tr><tr><td>Rays</td><td>64.17</td><td>90</td></tr><tr><td>Angels</td><td>154.49</td><td>89</td></tr><tr><td>Tigers</td><td>132.30</td><td>88</td></tr><tr><td>Cardinals</td><td>110.30</td><td>88</td></tr><tr><td>Dodgers</td><td>95.14</td><td>86</td></tr><tr><td>White Sox</td><td>96.92</td><td>85</td></tr><tr><td>Brewers</td><td>97.65</td><td>83</td></t

In [17]:
json_docs[0].page_content

'Team "Payroll (millions)" "Wins" Nationals 81.34 98 Reds 82.20 97 Yankees 197.96 95 Giants 117.62 94 Braves 83.31 94 Athletics 55.37 94 Rangers 120.51 93 Orioles 81.43 93 Rays 64.17 90 Angels 154.49 89 Tigers 132.30 88 Cardinals 110.30 88 Dodgers 95.14 86 White Sox 96.92 85 Brewers 97.65 83 Phillies 174.54 81 Diamondbacks 74.28 81 Pirates 63.43 79 Padres 55.24 76 Mariners 81.97 75 Mets 93.35 74 Blue Jays 75.48 73 Royals 60.91 72 Marlins 118.07 69 Red Sox 173.18 69 Indians 78.43 68 Twins 94.08 66 Rockies 78.06 64 Cubs 88.19 61 Astros 60.65 55'

Read from JSON Lines file


In [20]:
from langchain_community.document_loaders import JSONLoader

# json_lines=True is set for the .jsonl input, and jq_schema=".content" picks the
# `content` field; in the saved output each resulting Document has its own seq_num.
loader = JSONLoader(
    file_path="./example_data/facebook_chat_messages.jsonl",
    jq_schema=".content",
    text_content=False,
    json_lines=True,
)

# NOTE(review): `jsnol_docs` looks like a typo for `jsonl_docs`; the name is left
# unchanged because the next cell displays it under this spelling.
jsnol_docs = loader.load()


In [21]:
jsnol_docs

[Document(metadata={'source': 'E:\\Learn2\\workspace2\\git_area\\Mastering-LangChain\\3-Integration\\1-Doc-Loader\\example_data\\facebook_chat_messages.jsonl', 'seq_num': 1}, page_content='Bye!'),
 Document(metadata={'source': 'E:\\Learn2\\workspace2\\git_area\\Mastering-LangChain\\3-Integration\\1-Doc-Loader\\example_data\\facebook_chat_messages.jsonl', 'seq_num': 2}, page_content='Oh no worries! Bye'),
 Document(metadata={'source': 'E:\\Learn2\\workspace2\\git_area\\Mastering-LangChain\\3-Integration\\1-Doc-Loader\\example_data\\facebook_chat_messages.jsonl', 'seq_num': 3}, page_content='No Im sorry it was my mistake, the blue one is not for sale')]

Read specific content keys


In [22]:
# jq_schema="." selects each JSON Lines record as a whole, and
# content_key="sender_name" names the field used as page_content
# (the saved output shows 'User 2' / 'User 1' as page_content).
loader = JSONLoader(
    file_path="./example_data/facebook_chat_messages.jsonl",
    jq_schema=".",
    content_key="sender_name",
    json_lines=True,
)

# One Document per record; displayed in the next cell.
jsonl_key_docs = loader.load()


In [23]:
jsonl_key_docs

[Document(metadata={'source': 'E:\\Learn2\\workspace2\\git_area\\Mastering-LangChain\\3-Integration\\1-Doc-Loader\\example_data\\facebook_chat_messages.jsonl', 'seq_num': 1}, page_content='User 2'),
 Document(metadata={'source': 'E:\\Learn2\\workspace2\\git_area\\Mastering-LangChain\\3-Integration\\1-Doc-Loader\\example_data\\facebook_chat_messages.jsonl', 'seq_num': 2}, page_content='User 1'),
 Document(metadata={'source': 'E:\\Learn2\\workspace2\\git_area\\Mastering-LangChain\\3-Integration\\1-Doc-Loader\\example_data\\facebook_chat_messages.jsonl', 'seq_num': 3}, page_content='User 2')]

In [None]:
print(jsonl_key_docs[0])

In [24]:
# Metadata hook passed to JSONLoader via its `metadata_func` argument.
def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy the sender and timestamp of a record into its metadata.

    Args:
        record: One extracted JSON object; missing keys are tolerated.
        metadata: Metadata dict to extend. It is modified in place.

    Returns:
        The same ``metadata`` object, now holding ``sender_name`` and
        ``timestamp_ms`` (``None`` for any key absent from ``record``).
    """
    for field in ("sender_name", "timestamp_ms"):
        metadata[field] = record.get(field)
    return metadata


# Select whole message objects (jq_schema=".messages[]"), use their "content"
# field as page_content, and pass metadata_func so extra record fields are
# added to each Document's metadata.
loader = JSONLoader(
    file_path="./example_data/facebook_chat.json",
    jq_schema=".messages[]",
    content_key="content",
    metadata_func=metadata_func,
)

# NOTE(review): `docs` was bound earlier to the UnstructuredCSVLoader result and
# is rebound here.
docs = loader.load()
# The saved output shows 'sender_name' and 'timestamp_ms' alongside 'source'
# and 'seq_num'.
print(docs[0].metadata)

{'source': 'E:\\Learn2\\workspace2\\git_area\\Mastering-LangChain\\3-Integration\\1-Doc-Loader\\example_data\\facebook_chat.json', 'seq_num': 1, 'sender_name': 'User 2', 'timestamp_ms': 1675597571851}
