In [1]:
import pandas as pd
import os
from langchain_community.document_loaders import CSVLoader, UnstructuredCSVLoader

In [3]:
os.makedirs("data/structured_file", exist_ok= True)

In [5]:
# Create sample data
data = {
    'Product': ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Webcam'],
    'Category': ['Electronics', 'Accessories', 'Accessories', 'Electronics', 'Electronics'],
    'Price': [999.99, 29.99, 79.99, 299.99, 89.99],
    'Stock': [50, 200, 150, 75, 100],
    'Description': [
        'High-performance laptop with 16GB RAM and 512GB SSD',
        'Wireless optical mouse with ergonomic design',
        'Mechanical keyboard with RGB backlighting',
        '27-inch 4K monitor with HDR support',
        '1080p webcam with noise cancellation'
    ]
}

df = pd.DataFrame(data)
df.to_csv("data/structured_file/products.csv", index= False)

# Save as Excel with multiple sheets

In [11]:
with pd.ExcelWriter("data/structured_file/inv.xlsx") as w:
    df.to_excel(w, sheet_name="products", index= False)

    # Add other sheet
    summary_data ={
        'Category': ['Electronics', 'Accessories'],
        'Total_Items': [3, 2],
        'Total_Value': [1389.97, 109.98]
        
    }
    pd.DataFrame(summary_data).to_excel(w, sheet_name="Summary", index = False)

In [25]:
from langchain_community.document_loaders import UnstructuredExcelLoader
data_excel = UnstructuredExcelLoader("data/structured_file/inv.xlsx")
data = data_excel.load()
data[0].page_content

'Product Category Price Stock Description Laptop Electronics 999.99 50 High-performance laptop with 16GB RAM and 512GB SSD Mouse Accessories 29.99 200 Wireless optical mouse with ergonomic design Keyboard Accessories 79.99 150 Mechanical keyboard with RGB backlighting Monitor Electronics 299.99 75 27-inch 4K monitor with HDR support Webcam Electronics 89.99 100 1080p webcam with noise cancellation\n\nCategory Total_Items Total_Value Electronics 3 1389.97 Accessories 2 109.98'

## CSV Processing

In [29]:
from langchain_community.document_loaders import CSVLoader, UnstructuredCSVLoader

In [45]:
csv_loader = CSVLoader(
    file_path="data/structured_file/products.csv",
    encoding='utf-8',
    csv_args={
        "delimiter": ',',
        "quotechar": '"'
    }
)

csv_data= csv_loader.load()
print(csv_data)

print(f"Loaded {len(csv_data)} documents (one per row)")
print("\nFirst document:")
print(f"Content: {csv_data[0].page_content}")
print(f"Metadata: {csv_data[0].metadata}")

[Document(metadata={'source': 'data/structured_file/products.csv', 'row': 0}, page_content='Product: Laptop\nCategory: Electronics\nPrice: 999.99\nStock: 50\nDescription: High-performance laptop with 16GB RAM and 512GB SSD'), Document(metadata={'source': 'data/structured_file/products.csv', 'row': 1}, page_content='Product: Mouse\nCategory: Accessories\nPrice: 29.99\nStock: 200\nDescription: Wireless optical mouse with ergonomic design'), Document(metadata={'source': 'data/structured_file/products.csv', 'row': 2}, page_content='Product: Keyboard\nCategory: Accessories\nPrice: 79.99\nStock: 150\nDescription: Mechanical keyboard with RGB backlighting'), Document(metadata={'source': 'data/structured_file/products.csv', 'row': 3}, page_content='Product: Monitor\nCategory: Electronics\nPrice: 299.99\nStock: 75\nDescription: 27-inch 4K monitor with HDR support'), Document(metadata={'source': 'data/structured_file/products.csv', 'row': 4}, page_content='Product: Webcam\nCategory: Electronics\

In [57]:
from typing import List
from langchain_core.documents import Document

# Custom CSV processing for better control

def process_csv(filepath: str) -> List[Document]:
    df = pd.read_csv(filepath)
    documents = []

    for idx, row in df.iterrows():
        # Create structured content
        content = f""" Product Information:
        Name: {row['Product']}
        Category: {row['Category']}
        Price: ${row['Price']}
        Stock: {row['Stock']} units
        Description: {row['Description']}"""

        doc = Document(
            page_content=content,
            metadata = {
                'source': filepath,
                'row_index': idx,
                'product_name': row['Product'],
                'category': row['Category'],
                'price': row['Price'],
                'data_type': 'product_info'
            }
        )
        documents.append(doc)
    return documents

In [75]:
process_csv("data/structured_file/products.csv")

[Document(metadata={'source': 'data/structured_file/products.csv', 'row_index': 0, 'product_name': 'Laptop', 'category': 'Electronics', 'price': 999.99, 'data_type': 'product_info'}, page_content=' Product Information:\n        Name: Laptop\n        Category: Electronics\n        Price: $999.99\n        Stock: 50 units\n        Description: High-performance laptop with 16GB RAM and 512GB SSD'),
 Document(metadata={'source': 'data/structured_file/products.csv', 'row_index': 1, 'product_name': 'Mouse', 'category': 'Accessories', 'price': 29.99, 'data_type': 'product_info'}, page_content=' Product Information:\n        Name: Mouse\n        Category: Accessories\n        Price: $29.99\n        Stock: 200 units\n        Description: Wireless optical mouse with ergonomic design'),
 Document(metadata={'source': 'data/structured_file/products.csv', 'row_index': 2, 'product_name': 'Keyboard', 'category': 'Accessories', 'price': 79.99, 'data_type': 'product_info'}, page_content=' Product Informa

In [61]:
data = pd.read_csv("data/structured_file/products.csv")
data

Unnamed: 0,Product,Category,Price,Stock,Description
0,Laptop,Electronics,999.99,50,High-performance laptop with 16GB RAM and 512G...
1,Mouse,Accessories,29.99,200,Wireless optical mouse with ergonomic design
2,Keyboard,Accessories,79.99,150,Mechanical keyboard with RGB backlighting
3,Monitor,Electronics,299.99,75,27-inch 4K monitor with HDR support
4,Webcam,Electronics,89.99,100,1080p webcam with noise cancellation


In [71]:
for idx, row in data.iterrows():
    print(row["Price"])
    

999.99
29.99
79.99
299.99
89.99


## Excel Processing

In [92]:
# Method 1 Usng pandas for full control
def process_ecxel(filepath:str) -> List[Document]:
    documents = []
    ecxel_file = pd.ExcelFile(filepath)

    for sheet_name in ecxel_file.sheet_names:
        df = pd.read_excel(filepath, sheet_name = sheet_name)

        # Create document for each sheet
        sheet_content = f"Sheet :{sheet_name}\n"
        sheet_content +=f"Column: {', '.join(df.columns)}\n"
        sheet_content += f"Rows: {len(df)}\n\n"
        sheet_content +=df.to_string(index=False)

        doc = Document(
            page_content=sheet_content,
            metadata={
                'source': filepath,
                'sheet_name': sheet_name,
                'num_rows': len(df),
                'num_columns': len(df.columns),
                'data_type': 'excel_sheet'
                
            }
        )
        documents.append(doc)
    return documents

In [94]:
excel_data= process_ecxel("data/structured_file/inv.xlsx")
print(excel_data)

[Document(metadata={'source': 'data/structured_file/inv.xlsx', 'sheet_name': 'products', 'num_rows': 5, 'num_columns': 5, 'data_type': 'excel_sheet'}, page_content='Sheet :products\nColumn: Product, Category, Price, Stock, Description\nRows: 5\n\n Product    Category  Price  Stock                                         Description\n  Laptop Electronics 999.99     50 High-performance laptop with 16GB RAM and 512GB SSD\n   Mouse Accessories  29.99    200        Wireless optical mouse with ergonomic design\nKeyboard Accessories  79.99    150           Mechanical keyboard with RGB backlighting\n Monitor Electronics 299.99     75                 27-inch 4K monitor with HDR support\n  Webcam Electronics  89.99    100                1080p webcam with noise cancellation'), Document(metadata={'source': 'data/structured_file/inv.xlsx', 'sheet_name': 'Summary', 'num_rows': 2, 'num_columns': 3, 'data_type': 'excel_sheet'}, page_content='Sheet :Summary\nColumn: Category, Total_Items, Total_Value\n

In [96]:
from langchain_community.document_loaders import UnstructuredExcelLoader

try:
    excel_data = UnstructuredExcelLoader(
        "data/structured_file/inv.xlsx",
        mode = "elements"
    )
    unstructured_data = excel_data.load()
except Exception as a:
    print(f"Error : {a}")

In [98]:
unstructured_data

[Document(metadata={'source': 'data/structured_file/inv.xlsx', 'file_directory': 'data/structured_file', 'filename': 'inv.xlsx', 'last_modified': '2025-09-18T17:00:07', 'page_name': 'products', 'page_number': 1, 'text_as_html': '<table><tr><td>Product</td><td>Category</td><td>Price</td><td>Stock</td><td>Description</td></tr><tr><td>Laptop</td><td>Electronics</td><td>999.99</td><td>50</td><td>High-performance laptop with 16GB RAM and 512GB SSD</td></tr><tr><td>Mouse</td><td>Accessories</td><td>29.99</td><td>200</td><td>Wireless optical mouse with ergonomic design</td></tr><tr><td>Keyboard</td><td>Accessories</td><td>79.99</td><td>150</td><td>Mechanical keyboard with RGB backlighting</td></tr><tr><td>Monitor</td><td>Electronics</td><td>299.99</td><td>75</td><td>27-inch 4K monitor with HDR support</td></tr><tr><td>Webcam</td><td>Electronics</td><td>89.99</td><td>100</td><td>1080p webcam with noise cancellation</td></tr></table>', 'languages': ['eng'], 'filetype': 'application/vnd.openxmlf