### CSV and Excel Files - Structured Data

In [None]:
import pandas as pd
import os

In [5]:
# Make sure the folder that will hold the sample CSV/Excel files exists;
# exist_ok=True turns re-running this cell into a no-op instead of an error.
os.makedirs(name="data/structured_files", exist_ok=True)

In [None]:
# Create sample data: five products, one list entry per row in every column.
data = {
    'Product': ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Webcam'],
    'Category': ['Electronics', 'Accessories', 'Accessories', 'Electronics', 'Electronics'],
    'Price': [999.99, 29.99, 79.99, 299.99, 89.99],
    'Stock': [50, 200, 150, 75, 100],
    'Description': [
        'High-performance laptop with 16GB RAM and 512GB SSD',
        'Wireless optical mouse with ergonomic design',
        'Mechanical keyboard with RGB backlighting',
        '27-inch 4K monitor with HDR support',
        '1080p webcam with noise cancellation'
    ]
}

# Save CSV.
# index=False keeps the DataFrame's row index out of the file. With the pandas
# default (index=True) the CSV gains an unnamed leading column, which then
# appears as an extra field in every row when the file is loaded again.
df = pd.DataFrame(data)
df.to_csv('data/structured_files/products.csv', index=False)


In [None]:
# Save as Excel with multiple sheets
# Hand-entered per-category figures that go on the second sheet.
summary_data = dict(
    Category=['Electronics', 'Accessories'],
    Total_Items=[3, 2],
    Total_Value=[1389.97, 109.98],
)

# The writer is used as a context manager so the workbook is saved and closed
# when the block ends. Neither sheet includes the DataFrame index.
with pd.ExcelWriter('data/structured_files/inventory.xlsx') as writer:
    # Sheet 1: the product table built earlier in the notebook.
    df.to_excel(writer, sheet_name='Products', index=False)
    # Sheet 2: the category summary.
    pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)

### CSV Processing

In [6]:
from langchain_community.document_loaders import CSVLoader
from langchain_community.document_loaders import UnstructuredCSVLoader

In [16]:
### Method 1: CSVLoader - each row becomes a document
print(' CSVLoader - Row-based Documents')

# Spell out the CSV dialect explicitly: comma-separated fields, double-quote quoting.
csv_loader = CSVLoader(
    file_path='data/structured_files/products.csv',
    encoding='utf-8',
    csv_args=dict(delimiter=',', quotechar='"'),
)
csv_docs = csv_loader.load()

# Dump the full list first, then a short summary and a closer look at row 0.
print(csv_docs)
print(f" Loaded {len(csv_docs)} documents (one per row)")
print("\n First document:")
print(f"Content: {csv_docs[0].page_content}")
print(f"Metadata: {csv_docs[0].metadata}")

 CSVLoader - Row-based Documents
[Document(metadata={'source': 'data/structured_files/products.csv', 'row': 0}, page_content='Product: Laptop\nCategory: Electronics\nPrice: 999.99\nStock: 50\nDescription: High-performance laptop with 16GB RAM and 512GB SSD'), Document(metadata={'source': 'data/structured_files/products.csv', 'row': 1}, page_content='Product: Mouse\nCategory: Accessories\nPrice: 29.99\nStock: 200\nDescription: Wireless optical mouse with ergonomic design'), Document(metadata={'source': 'data/structured_files/products.csv', 'row': 2}, page_content='Product: Keyboard\nCategory: Accessories\nPrice: 79.99\nStock: 150\nDescription: Mechanical keyboard with RGB backlighting'), Document(metadata={'source': 'data/structured_files/products.csv', 'row': 3}, page_content='Product: Monitor\nCategory: Electronics\nPrice: 299.99\nStock: 75\nDescription: 27-inch 4K monitor with HDR support'), Document(metadata={'source': 'data/structured_files/products.csv', 'row': 4}, page_content='P

In [23]:
from typing import List
from langchain_core.documents import Document
## Method 2: Custom CSV Processing for better control
print("\n Custom CSV Processing")


def process_csv_intelligently(filepath: str) -> List[Document]:
    """Turn every row of a product CSV into one ``Document``.

    The file is read with pandas. Each row is rendered as a small labelled
    text block (page content) and its key fields are copied into metadata.

    Args:
        filepath: Path of the CSV file. Rows are expected to provide the
            columns ``Product``, ``Category``, ``Price``, ``Stock`` and
            ``Description``.

    Returns:
        One ``Document`` per data row, in file order; an empty list when the
        CSV contains no data rows.

    Raises:
        KeyError: If a row is processed and one of the expected columns is
            missing.
    """
    df = pd.read_csv(filepath)
    documents = []

    # Strategy 1: one document per row with structured content
    for idx, row in df.iterrows():
        # Built from explicit pieces so the leading indentation and the
        # trailing ", " of each line are visible in the code and cannot be
        # lost to an editor that strips trailing whitespace. The resulting
        # text is identical to the former triple-quoted f-string.
        content = (
            "Product Information:\n"
            f"        Name: {row['Product']}\n"
            f"        Category: {row['Category']}, \n"
            f"        Price: {row['Price']}, \n"
            f"        Stock: {row['Stock']}, \n"
            f"        Description: {row['Description']}"
        )

        # Create doc with rich metadata
        doc = Document(
            page_content=content,
            metadata={
                'source': filepath,
                'row_index': idx,
                'product_name': row['Product'],
                'category': row['Category'],
                'price': row['Price'],
                'data_type': 'product_info'
            }
        )
        documents.append(doc)

    # Return only after the loop has finished. Returning from inside the loop
    # handed back just the first row and yielded None for an empty CSV.
    return documents



 Custom CSV Processing


In [35]:
# Run the custom processor on the sample CSV and show the text of the first document.
csv_docs = process_csv_intelligently(filepath='data/structured_files/products.csv')
print(csv_docs[0].page_content)

Product Information:
        Name: Laptop
        Category: Electronics, 
        Price: 999.99, 
        Stock: 50, 
        Description: High-performance laptop with 16GB RAM and 512GB SSD


### Excel Processing

In [37]:
# Method 1: Using pandas for full control

print("Pandas-based Excel Processing")

def process_excel_with_pandas(filepath: str) -> List[Document]:
    """Create one ``Document`` per worksheet of an Excel workbook.

    Each document's page content is a short header (sheet name, column names,
    row count) followed by the sheet rendered as a plain-text table without
    the index. Metadata records the source path, sheet name and sheet shape.

    Args:
        filepath: Path of the Excel workbook to read.

    Returns:
        A list with one ``Document`` per sheet, in workbook order.
    """
    documents = []

    # Open the workbook once and read every sheet from that handle. The
    # context manager closes the file when done, instead of leaving it open
    # and re-opening the workbook for each sheet.
    with pd.ExcelFile(filepath) as excel_file:
        for sheet_name in excel_file.sheet_names:
            df = pd.read_excel(excel_file, sheet_name=sheet_name)

            # Column labels are not always strings (e.g. numeric headers),
            # so convert them before joining to avoid a TypeError.
            column_names = ', '.join(str(column) for column in df.columns)

            # Create the text for this sheet
            sheet_content = f"Sheet: {sheet_name}\n"
            sheet_content += f"Columns:{column_names}\n"
            sheet_content += f"Rows: {len(df)}\n\n"
            sheet_content += df.to_string(index=False)

            doc = Document(
                page_content=sheet_content,
                metadata={
                    'source': filepath,
                    'sheet_name': sheet_name,
                    'num_rows': len(df),
                    'num_columns': len(df.columns),
                    'data_type': 'excel_sheet'
                }
            )

            documents.append(doc)

    return documents

        

Pandas-based Excel Processing


In [40]:
# Convert the sample workbook, report how many sheets were turned into documents,
# then show the text generated for the first sheet.
excel_docs = process_excel_with_pandas(filepath='data/structured_files/inventory.xlsx')
print(f"Processed {len(excel_docs)} sheets")

print(excel_docs[0].page_content)

Processed 2 sheets
Sheet: Products
Columns:Product, Category, Price, Stock, Description
Rows: 5

 Product    Category  Price  Stock                                         Description
  Laptop Electronics 999.99     50 High-performance laptop with 16GB RAM and 512GB SSD
   Mouse Accessories  29.99    200        Wireless optical mouse with ergonomic design
Keyboard Accessories  79.99    150           Mechanical keyboard with RGB backlighting
 Monitor Electronics 299.99     75                 27-inch 4K monitor with HDR support
  Webcam Electronics  89.99    100                1080p webcam with noise cancellation


In [47]:
from langchain_community.document_loaders import UnstructuredExcelLoader
# Method 2: UnstructuredExcelLoader
print("\n UnstructuredExcelLoader")

# Start from an empty result so that a failed load cannot leave the name
# unbound (fresh kernel) or pointing at documents from an earlier run.
unstructured_docs = []

try:
    excel_loader = UnstructuredExcelLoader(
        'data/structured_files/inventory.xlsx',
        mode="elements"
    )
    unstructured_docs = excel_loader.load()
    print(unstructured_docs)
except Exception as e:
    # Best-effort demo: loading can fail, e.g. when a package the loader needs
    # is not installed, so report the problem instead of stopping the notebook.
    print(f"Exception {e}")


 UnstructuredExcelLoader
Exception No module named 'msoffcrypto'
