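# Run Preprocessing Pipeline Steps

This notebook runs the `PreprocessingPipeline` steps one at a time with custom parameters, then runs the whole pipeline end to end.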

In [1]:
# import dependencies
import os
import sys
from pathlib import Path

# Locate the project root (parent directory of notebooks).
# A fresh kernel starts inside notebooks/, but a later cell chdirs to the
# project root, so when this cell is re-run the cwd may already BE the root.
# Only step up one level when we are actually inside notebooks/.
nb_dir = Path(os.getcwd())
project_root = nb_dir.parent if nb_dir.name == "notebooks" else nb_dir

# Make project modules importable; skip the append if the entry is already
# present so repeated runs do not stack duplicates on sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
from preprocessing import PreprocessingPipeline

2025-07-08 17:32:11,291 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/preprocessing.log


In [2]:
# Fix working directory issue - change to project root
import os
from pathlib import Path

# Get current working directory (the notebooks folder on a fresh kernel)
current_dir = Path(os.getcwd())
print(f"Current working directory: {current_dir}")

# Change to project root (parent directory of notebooks).
# Guarded on the folder name so that re-running this cell is a no-op instead
# of climbing one more directory level on every execution.
if current_dir.name == "notebooks":
    project_root = current_dir.parent
    os.chdir(project_root)
    print(f"Changed to project root: {os.getcwd()}")
else:
    project_root = current_dir
    print(f"Not inside notebooks/ - working directory left unchanged: {os.getcwd()}")

# Verify the file exists (checked once, relative to the new working directory)
candidates_found = os.path.exists('Data/conversion_candidates.csv')
print(f"conversion_candidates.csv exists: {candidates_found}")

if candidates_found:
    print("✅ conversion_candidates.csv found - ready to proceed")
else:
    print("❌ conversion_candidates.csv not found - check file location")


Current working directory: /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/notebooks
Changed to project root: /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025
conversion_candidates.csv exists: True
✅ conversion_candidates.csv found - ready to proceed


In [3]:
# Data directory is given as a relative path - presumably resolved against the
# current working directory (the project root at this point); confirm in
# PreprocessingPipeline.
DATA_DIR = "Data"
pipeline = PreprocessingPipeline(data_dir=DATA_DIR)

2025-07-08 17:32:11,349 - src.helpers - INFO - Initializing Enhanced PreprocessingPipeline...
2025-07-08 17:32:11,353 - src.helpers - INFO - Pipeline state loaded from /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/Data/pipeline_state.pkl
2025-07-08 17:32:11,354 - src.helpers - INFO - Pipeline initialized with 8 steps


Executing PreprocessingPipeline...
Function PreprocessingPipeline took 0.0061 seconds to complete.


## Run Step-by-step

In [4]:
# Steps to run individually, each paired with its custom parameter overrides.
# Entries are (step_id, params) tuples and are looked up by position.
steps = [
    ("pre_chunking_eda", dict(show_plots=True)),
    ("doc_conversion", dict(timeout=1800)),
    ("document_parsing", dict(extract_sections=True)),
    ("semantic_chunking", dict(chunk_size=300, chunk_overlap=30)),
]

In [5]:
# Step 1: EDA
step_id, params = steps[0]

# Apply the custom parameters before running the step
pipeline.update_step_parameters(step_id, params)

# Run the step. Author's note: pass force=True to run_single_step to
# regenerate outputs.
success = pipeline.run_single_step(step_id)

status_line = (
    f"✅ {step_id} completed successfully"
    if success
    else f"❌ {step_id} failed"
)
print(status_line)
# Optional follow-up, currently disabled: pipeline.generate_consolidated_reports()

2025-07-08 17:32:11,372 - src.helpers - INFO - Updated parameters for step pre_chunking_eda
2025-07-08 17:32:11,374 - src.helpers - INFO - Step pre_chunking_eda already completed, skipping


✅ pre_chunking_eda completed successfully


In [6]:
# Step 2: PDF to XML conversion
step_id = steps[1][0]
params = steps[1][1]
# Update parameters - without this call the overrides read from `steps`
# (the conversion timeout) are never handed to the pipeline; this mirrors
# what the Step 1 cell does.
pipeline.update_step_parameters(step_id, params)
# Run step
success = pipeline.run_single_step(step_id)
if success:
    print(f"✅ {step_id} completed successfully")
else:
    print(f"❌ {step_id} failed")
# pipeline.generate_consolidated_reports()

2025-07-08 17:32:11,386 - src.helpers - INFO - Step doc_conversion already completed, skipping


✅ doc_conversion completed successfully


In [None]:
# Step 3: Document Parsing
step_id = steps[2][0]
params = steps[2][1]
# Update parameters - without this call the overrides read from `steps`
# (extract_sections) are never handed to the pipeline; this mirrors what the
# Step 1 cell does.
pipeline.update_step_parameters(step_id, params)
# Run step
success = pipeline.run_single_step(step_id)
if success:
    print(f"✅ {step_id} completed successfully")
else:
    print(f"❌ {step_id} failed")

In [None]:
# Run the whole pipeline in a single call
success = pipeline.run_all()

if not success:
    print("Pipeline failed!")
    # Show the failed steps recorded on the pipeline state
    print(f"Failed steps: {pipeline.state.failed_steps}")
else:
    print("Pipeline completed successfully!")
    # Generate the consolidated reports after a successful run
    pipeline.generate_consolidated_reports()
