<a href="https://colab.research.google.com/github/wesslen/data-management/blob/main/notebooks/01_input_ingestion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the repository that holds the sample documents; the converter below is
# later pointed at its data-management/data/ folder.
!git clone https://github.com/wesslen/data-management.git

Cloning into 'data-management'...
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 19 (delta 4), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (19/19), 42.41 KiB | 944.00 KiB/s, done.
Resolving deltas: 100% (4/4), done.


In [9]:
!uv pip install --system docx openpyxl python-docx

[2mUsing Python 3.10.12 environment at /usr[0m
[37m⠋[0m [2mResolving dependencies...                                                     [0m[2K[37m⠋[0m [2mResolving dependencies...                                                     [0m[2K[37m⠙[0m [2mResolving dependencies...                                                     [0m[2K[37m⠙[0m [2mdocx==0.2.4                                                                   [0m[2K[37m⠙[0m [2mopenpyxl==3.1.5                                                               [0m[2K[37m⠙[0m [2mpython-docx==1.1.2                                                            [0m[2K[37m⠙[0m [2mlxml==5.3.0                                                                   [0m[2K[37m⠙[0m [2mpillow==11.0.0                                                                [0m[2K[37m⠙[0m [2met-xmlfile==2.0.0                                                             [0m[2K[37m⠙[0m [2mtyping-extensions=

In [13]:
import os
import pandas as pd
from pathlib import Path
import openpyxl
import markdown
import csv

# python-docx is imported under the module name `docx`; when it is missing,
# Document is set to None so the DOCX converter can report the problem.
try:
    from docx import Document
except ImportError as import_error:
    Document = None
    print(f"Error importing docx: {import_error}")
    print("Please install python-docx using: pip install python-docx")

# Source filename -> (markdown title written at the top of the output, converter key).
# Insertion order is the order in which the files are processed.
FILE_MAPPINGS = dict([
    ('balance-sheet-sop.docx',
     ('# Balance Sheet Standard Operating Procedures\n\n', 'docx')),
    ('balance-sheet.xlsx',
     ('# Balance Sheet\n\n', 'xlsx')),
    ('data-management-template.csv',
     ('# Data Management Template\n\n', 'csv')),
    ('metadata-dictionary.csv',
     ('# Metadata Dictionary\n\n', 'csv')),
])

def convert_docx_to_markdown(docx_path):
    """Convert the paragraphs of a Word document to markdown text.

    Paragraphs whose style name starts with ``Heading`` become ATX headings;
    every other non-empty paragraph is emitted as plain text. Only
    ``doc.paragraphs`` is read.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        str: The markdown text, or a message starting with ``"Error"`` when
        python-docx is unavailable or the document cannot be converted
        (callers check that prefix instead of catching an exception).
    """
    if Document is None:
        return "Error: python-docx package not properly installed"

    max_heading_level = 6  # markdown ATX headings only go from '#' to '######'

    try:
        doc = Document(docx_path)
        markdown_content = []

        for paragraph in doc.paragraphs:
            style = paragraph.style.name
            text = paragraph.text.strip()

            if not text:  # Skip empty paragraphs
                continue

            if style.startswith('Heading'):
                try:
                    # The heading level is taken from the last character of the style name.
                    # Levels above 6 are clamped: seven or more '#' is not a markdown heading.
                    level = min(int(style[-1]), max_heading_level)
                    markdown_content.append(f"{'#' * level} {text}\n")
                except ValueError:
                    # Default to a level 2 heading if no number can be extracted
                    markdown_content.append(f"## {text}\n")
            else:
                # Normal paragraphs and all other styles are emitted as plain text
                markdown_content.append(f"{text}\n")

        return "\n".join(markdown_content)
    except Exception as e:
        return f"Error converting DOCX: {str(e)}"

def convert_xlsx_to_markdown(xlsx_path):
    """Convert the active sheet of an Excel workbook to a markdown table.

    The workbook is loaded with ``data_only=True``. The first row becomes the
    table header and is followed by a ``---`` separator row. Empty cells
    become empty strings, pipe characters are escaped, and line breaks inside
    a cell are replaced by spaces so each sheet row stays on one table line.

    Args:
        xlsx_path: Path of the .xlsx file to read.

    Returns:
        str: The markdown table (rows joined by newlines), or a message
        starting with ``"Error converting XLSX"`` when conversion fails.
    """
    try:
        wb = openpyxl.load_workbook(xlsx_path, data_only=True)
        sheet = wb.active
        markdown_content = []

        for row in sheet.iter_rows():
            row_values = []
            for cell in row:
                value = '' if cell.value is None else str(cell.value)
                value = value.replace('|', '\\|')  # Escape any pipe characters
                # A raw line break would end the markdown table row early
                value = value.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
                row_values.append(value)

            markdown_content.append(f"| {' | '.join(row_values)} |")

            # Add the header separator right after the first row
            if len(markdown_content) == 1:
                markdown_content.append(f"|{'|'.join(['---' for _ in row_values])}|")

        return "\n".join(markdown_content)
    except Exception as e:
        return f"Error converting XLSX: {str(e)}"

def _clean_markdown_cell(value):
    """Return ``value`` as single-line text that is safe inside a markdown table cell."""
    text = str(value).strip()
    text = text.replace('|', '\\|')  # Escape pipe characters
    return text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')


def _markdown_table(header, rows):
    """Render already-cleaned header cells and row cells as a markdown table string."""
    lines = [
        "| " + " | ".join(header) + " |",
        "|" + "|".join("---" for _ in header) + "|",
    ]
    lines.extend("| " + " | ".join(row) + " |" for row in rows)
    return "\n".join(lines) + "\n"


def convert_csv_to_markdown(csv_path):
    """Convert a UTF-8 CSV file to a markdown table.

    The file is read with pandas; if pandas raises ``ParserError`` the file is
    re-read with the standard ``csv`` module. In both paths every header and
    data cell is stripped, pipe characters are escaped and line breaks are
    replaced by spaces.

    Cells are read as text (``dtype=str``, ``keep_default_na=False``) so the
    table shows what the file contains: ``001`` stays ``001`` and a number in
    a column with blanks is not rewritten as a float. Missing values become
    empty cells.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        str: The markdown table, each line terminated by a newline, or a
        message starting with ``"Error"`` when the file cannot be converted.
    """
    try:
        # No csv.Sniffer pre-read here: its result was never used, and it made
        # the conversion fail on files pandas reads fine (e.g. a blank first line).
        df = pd.read_csv(csv_path,
                         encoding='utf-8',
                         quoting=csv.QUOTE_ALL,   # Quote all fields
                         escapechar='\\',         # Use backslash as escape character
                         on_bad_lines='warn',     # Warn about problematic lines
                         dtype=str,               # Keep cell text exactly as written
                         keep_default_na=False)   # Do not turn "NA"/"null" text into NaN

        # Short rows can still yield missing values; render them as empty cells
        df = df.fillna('')

        header = [_clean_markdown_cell(column) for column in df.columns]
        rows = [
            [_clean_markdown_cell(value) for value in record]
            for record in df.itertuples(index=False, name=None)
        ]
        return _markdown_table(header, rows)

    except pd.errors.ParserError as e:
        print(f"CSV parsing error: {str(e)}")
        print("Attempting alternative parsing method...")

        try:
            # Alternative parsing method for problematic CSVs.
            # newline='' lets the csv module handle line endings inside quoted fields.
            with open(csv_path, 'r', encoding='utf-8', newline='') as f:
                reader = csv.reader(f, quoting=csv.QUOTE_ALL, escapechar='\\')
                rows = [[_clean_markdown_cell(cell) for cell in row] for row in reader]

            if rows:
                return _markdown_table(rows[0], rows[1:])
            return "Error: No data found in CSV file"

        except Exception as e2:
            return f"Error converting CSV (both methods failed): {str(e2)}"

    except Exception as e:
        return f"Error converting CSV: {str(e)}"

def process_documents(input_folder, output_folder):
    """Convert every file listed in FILE_MAPPINGS from input_folder to markdown.

    For each mapping entry the matching converter is called and, on success,
    ``<stem>.md`` is written to ``output_folder`` containing the configured
    header followed by the converted content. Missing input files and failed
    conversions are reported with ``print`` and skipped.

    Args:
        input_folder: Directory expected to contain the mapped source files.
        output_folder: Directory for the markdown output; created if absent.

    Returns:
        int: Number of files that were converted and written.
    """
    # Create output folder if it doesn't exist
    Path(output_folder).mkdir(parents=True, exist_ok=True)

    successful_conversions = 0

    # Converter lookup keyed by the file type stored in FILE_MAPPINGS
    converter_functions = {
        'docx': convert_docx_to_markdown,
        'xlsx': convert_xlsx_to_markdown,
        'csv': convert_csv_to_markdown
    }

    # The converters signal failure by returning a message with one of these
    # prefixes. Matching them rather than the bare word "Error" keeps a
    # document whose text merely begins with "Error..." from being discarded.
    error_prefixes = ("Error converting", "Error:")

    for filename, (header, file_type) in FILE_MAPPINGS.items():
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.md")

        print(f"\nProcessing {filename}...")

        if not os.path.exists(input_path):
            print(f"Warning: {filename} not found in input folder")
            continue

        try:
            converter_func = converter_functions[file_type]
            content = converter_func(input_path)

            if content.startswith(error_prefixes):
                print(f"Error in conversion: {content}")
                continue

            # Write the title header followed by the converted body
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(header)
                f.write(content)

            print(f"Successfully created {output_path}")
            successful_conversions += 1

        except Exception as e:
            # Covers an unknown file type as well as write failures
            print(f"Error processing {filename}: {str(e)}")

    return successful_conversions

def main():
    """Prompt for the input and output folders, run the conversion and print a summary.

    Returns early, without converting anything, when the input path is not an
    existing directory.
    """
    print("Document to Markdown Converter")
    print("-" * 30)

    # Get input and output paths
    input_folder = input("Enter input folder path: ").strip()
    output_folder = input("Enter output folder path: ").strip()

    # Validate input folder: it must be a directory, not merely an existing path
    # (a regular file would otherwise pass and every mapped file be "not found").
    if not os.path.isdir(input_folder):
        print("Error: Input folder does not exist!")
        return

    # Process the documents
    print("\nStarting conversion process...")
    successful_conversions = process_documents(input_folder, output_folder)

    # Print summary
    print("\nConversion process completed!")
    print(f"Successfully converted {successful_conversions} out of {len(FILE_MAPPINGS)} files")

    if successful_conversions > 0:
        print(f"\nMarkdown files have been created in: {output_folder}")

if __name__ == "__main__":
    main()

Document to Markdown Converter
------------------------------
Enter input folder path: data-management/data/
Enter output folder path: output/

Starting conversion process...

Processing balance-sheet-sop.docx...
Successfully created output/balance-sheet-sop.md

Processing balance-sheet.xlsx...
Successfully created output/balance-sheet.md

Processing data-management-template.csv...
Successfully created output/data-management-template.md

Processing metadata-dictionary.csv...
Successfully created output/metadata-dictionary.md

Conversion process completed!
Successfully converted 4 out of 4 files

Markdown files have been created in: output/



  df = pd.read_csv(csv_path,
