# Run Pre-processing Pipeline steps

In [1]:
# Import dependencies and make the project-local `preprocessing` module importable.
import os
import sys
from pathlib import Path

# Locate the project root.
# On a fresh kernel the working directory is the notebooks folder, so the root
# is its parent. A later cell chdir's into the project root; if this cell is
# re-run after that, the working directory already holds the "Data" folder and
# must be used as-is instead of stepping up a second time.
# NOTE(review): assumes the notebooks folder has no "Data" sub-folder of its own.
nb_dir = Path(os.getcwd())
project_root = nb_dir if (nb_dir / "Data").is_dir() else nb_dir.parent

# Append only once so re-running the cell does not accumulate duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from preprocessing import PreprocessingPipeline

2025-07-09 15:56:11,169 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/preprocessing.log


In [2]:
# Make the project root the working directory so relative "Data/..." paths resolve.
import os
from pathlib import Path


def resolve_project_root(start: Path, marker: str = "Data") -> Path:
    """Return the project root for a session whose working directory is `start`.

    Args:
        start: Directory to resolve from (normally the current working directory).
        marker: Name of a sub-directory whose presence identifies the project root.

    Returns:
        `start` itself when it already contains `marker` (the cell has been run
        before and we are already at the root); otherwise `start.parent`, which
        is the root when `start` is the notebooks folder.
    """
    return start if (start / marker).is_dir() else start.parent


# Where the kernel currently is (the notebooks folder on a fresh kernel).
current_dir = Path(os.getcwd())
print(f"Current working directory: {current_dir}")

# Resolving through the helper keeps this cell idempotent: re-running it no
# longer climbs one directory higher on every execution.
project_root = resolve_project_root(current_dir)
os.chdir(project_root)

print(f"Changed to project root: {os.getcwd()}")

# Check once and reuse the result for both the status line and the verdict.
candidates_found = os.path.exists('Data/conversion_candidates.csv')
print(f"conversion_candidates.csv exists: {candidates_found}")

# Verify the file exists
if candidates_found:
    print("✅ conversion_candidates.csv found - ready to proceed")
else:
    print("❌ conversion_candidates.csv not found - check file location")


Current working directory: /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/notebooks
Changed to project root: /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025
conversion_candidates.csv exists: True
✅ conversion_candidates.csv found - ready to proceed


In [3]:
# Build the pipeline object. "Data" is a relative path, so it is resolved
# against the current working directory at the time this cell runs.
pipeline = PreprocessingPipeline(
    data_dir="Data",
)

2025-07-09 15:56:11,355 - src.helpers - INFO - Initializing Enhanced PreprocessingPipeline...
2025-07-09 15:56:11,365 - src.helpers - INFO - Pipeline state loaded from /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/Data/pipeline_state.pkl
2025-07-09 15:56:11,366 - src.helpers - INFO - Pipeline initialized with 8 steps


Executing PreprocessingPipeline...
Function PreprocessingPipeline took 0.0124 seconds to complete.


## Run Step-by-step

In [4]:
# Per-step parameter overrides used by the step-by-step cells below.
# Paths are anchored at `project_root`, so they are absolute.
data_dir = os.path.join(project_root, "Data/")
doc_path = os.path.join(data_dir, "train/parsed/parsed_documents.pkl")

# Overrides for each step, named individually for readability.
eda_params = {"show_plots": True}
conversion_params = {"data_dir": data_dir}
parsing_params = {"data_dir": data_dir}
chunking_params = {"chunk_size": 300, "chunk_overlap": 30, "input_path": doc_path}

# Ordered (step_id, params) pairs; later cells index into this list by position.
steps = [
    ("pre_chunking_eda", eda_params),
    ("doc_conversion", conversion_params),
    ("document_parsing", parsing_params),
    ("semantic_chunking", chunking_params),
]

In [5]:
# Step 1: EDA
# Take the step name and its parameter overrides from the first `steps` entry.
step_id, params = steps[0]

# Register the overrides on the pipeline before running.
pipeline.update_step_parameters(step_id, params)

# Run just this step (author's note: add force=True to regenerate outputs).
success = pipeline.run_single_step(step_id)
status_line = f"✅ {step_id} completed successfully" if success else f"❌ {step_id} failed"
print(status_line)
# pipeline.generate_consolidated_reports()

2025-07-09 15:56:11,385 - src.helpers - INFO - Updated parameters for step pre_chunking_eda
2025-07-09 15:56:11,386 - src.helpers - INFO - Step pre_chunking_eda already completed, skipping


✅ pre_chunking_eda completed successfully


In [6]:
# Step 2: PDF to XML conversion
# Take the step name and its parameter overrides from the second `steps` entry.
step_id, params = steps[1]

# Register the overrides on the pipeline before running.
pipeline.update_step_parameters(step_id, params)

# Run just this step and report the outcome.
success = pipeline.run_single_step(step_id)
status_line = f"✅ {step_id} completed successfully" if success else f"❌ {step_id} failed"
print(status_line)
# pipeline.generate_consolidated_reports()

2025-07-09 15:56:11,397 - src.helpers - INFO - Updated parameters for step doc_conversion
2025-07-09 15:56:11,399 - src.helpers - INFO - Step doc_conversion already completed, skipping


✅ doc_conversion completed successfully


In [7]:
# Step 3: Document Parsing
# Take the step name and its parameter overrides from the third `steps` entry.
step_id, params = steps[2]

# The parameter update is disabled for this step, so `params` is read but not
# applied and the step runs with whatever parameters the pipeline already holds.
# pipeline.update_step_parameters(step_id, params)

# Run just this step and report the outcome.
success = pipeline.run_single_step(step_id)
status_line = f"✅ {step_id} completed successfully" if success else f"❌ {step_id} failed"
print(status_line)

2025-07-09 15:56:11,410 - src.helpers - INFO - Step document_parsing already completed, skipping


✅ document_parsing completed successfully


In [8]:
# Step 4: Semantic Chunking
# Take the step name and its parameter overrides from the fourth `steps` entry.
step_id, params = steps[3]

# Register the chunking overrides (chunk size, overlap, input path) before running.
pipeline.update_step_parameters(step_id, params)

# Run just this step and report the outcome.
success = pipeline.run_single_step(step_id)
status_line = f"✅ {step_id} completed successfully" if success else f"❌ {step_id} failed"
print(status_line)

2025-07-09 15:56:11,421 - src.helpers - INFO - Updated parameters for step semantic_chunking
2025-07-09 15:56:11,423 - src.helpers - INFO - Validating prerequisites for step semantic_chunking
2025-07-09 15:56:11,425 - src.helpers - INFO - Data directory structure validation passed
2025-07-09 15:56:11,427 - src.helpers - INFO - File dependencies validation passed for step semantic_chunking
2025-07-09 15:56:14,947 - src.helpers - INFO - Validated pickle file format for semantic_chunking
2025-07-09 15:56:14,948 - src.helpers - INFO - Input format validation passed for step semantic_chunking
2025-07-09 15:56:14,952 - src.helpers - INFO - Cleaned up temporary files: ['__pycache__/preprocessing.cpython-312.pyc']
2025-07-09 15:56:14,954 - src.helpers - INFO - Starting step: Semantic Chunking
2025-07-09 15:56:17,241 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/update_patterns.log
2025-07-09 15:56:17,241 - src.helpers - INFO - Log

Executing load_entity_patterns...


2025-07-09 15:56:18,162 - src.helpers - INFO - Loaded 1563 patterns from artifacts/entity_patterns.yaml
2025-07-09 15:56:18,162 - src.helpers - INFO - Loaded 1563 patterns from artifacts/entity_patterns.yaml
2025-07-09 15:56:18,164 - src.helpers - INFO - Default pattern DOI is in the compiled patterns. Updating it.
2025-07-09 15:56:18,164 - src.helpers - INFO - Default pattern DOI is in the compiled patterns. Updating it.
2025-07-09 15:56:18,165 - src.helpers - INFO - Default pattern PRIDE is in the compiled patterns. Updating it.
2025-07-09 15:56:18,165 - src.helpers - INFO - Default pattern PRIDE is in the compiled patterns. Updating it.
2025-07-09 15:56:18,167 - src.helpers - INFO - Default pattern WIKIDATA is in the compiled patterns. Updating it.
2025-07-09 15:56:18,167 - src.helpers - INFO - Default pattern WIKIDATA is in the compiled patterns. Updating it.
2025-07-09 15:56:18,169 - src.helpers - INFO - Default pattern OCID is in the compiled patterns. Updating it.
2025-07-09 15:

Function load_entity_patterns took 0.7863 seconds to complete.


2025-07-09 15:56:18,472 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/run_chunking_pipeline.log
2025-07-09 15:56:18,472 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/run_chunking_pipeline.log
2025-07-09 15:56:18,472 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/run_chunking_pipeline.log
2025-07-09 15:56:18,472 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/run_chunking_pipeline.log
2025-07-09 15:56:18,476 - src.helpers - INFO - Starting comprehensive semantic chunking with reporting...
2025-07-09 15:56:18,476 - src.helpers - INFO - Starting comprehensive semantic chunking with reporting...
2025-07-09 15:56:18,476 - src.helpers - INFO - Starting comprehensive semantic chunking with reporting...
2025-07-09 15:56:18,476 - src.helpers - IN

Executing run_semantic_chunking_with_reporting...
Executing run_semantic_chunking_pipeline...
Executing load_parsed_documents_for_chunking...
Function load_parsed_documents_for_chunking took 0.0132 seconds to complete.
Executing prepare_section_texts_for_chunking...
Function prepare_section_texts_for_chunking took 0.0185 seconds to complete.
Executing create_pre_chunk_entity_inventory...
Function create_pre_chunk_entity_inventory took 0.0388 seconds to complete.
Executing create_section_aware_chunks...


Chunking documents: 100%|██████████| 3/3 [00:00<00:00, 377.55it/s]
2025-07-09 15:56:18,708 - src.helpers - INFO - ✅ Created 3 chunks from 3 sections
2025-07-09 15:56:18,708 - src.helpers - INFO - ✅ Created 3 chunks from 3 sections
2025-07-09 15:56:18,708 - src.helpers - INFO - ✅ Created 3 chunks from 3 sections
2025-07-09 15:56:18,708 - src.helpers - INFO - ✅ Created 3 chunks from 3 sections
2025-07-09 15:56:18,712 - src.helpers - INFO - 
📋 Step 5: Link Adjacent Chunks
2025-07-09 15:56:18,712 - src.helpers - INFO - 
📋 Step 5: Link Adjacent Chunks
2025-07-09 15:56:18,712 - src.helpers - INFO - 
📋 Step 5: Link Adjacent Chunks
2025-07-09 15:56:18,712 - src.helpers - INFO - 
📋 Step 5: Link Adjacent Chunks
2025-07-09 15:56:18,719 - src.helpers - INFO - Linking adjacent chunks for 3 chunks
2025-07-09 15:56:18,719 - src.helpers - INFO - Linking adjacent chunks for 3 chunks
2025-07-09 15:56:18,719 - src.helpers - INFO - Linking adjacent chunks for 3 chunks
2025-07-09 15:56:18,719 - src.helpers

Function create_section_aware_chunks took 0.1279 seconds to complete.
Executing link_adjacent_chunks...
Function link_adjacent_chunks took 0.0113 seconds to complete.
Executing refine_chunk_types...
Function refine_chunk_types took 0.0171 seconds to complete.
Executing validate_chunk_integrity...
Function validate_chunk_integrity took 0.0848 seconds to complete.
Executing export_chunks_for_embedding...
Function export_chunks_for_embedding took 0.0444 seconds to complete.


2025-07-09 15:56:18,913 - src.helpers - INFO -    Chunks created: 3
2025-07-09 15:56:18,913 - src.helpers - INFO -    Chunks created: 3
2025-07-09 15:56:18,913 - src.helpers - INFO -    Chunks created: 3
2025-07-09 15:56:18,913 - src.helpers - INFO -    Chunks created: 3
2025-07-09 15:56:18,919 - src.helpers - INFO - Step 3: Quality analysis and recommendations...
2025-07-09 15:56:18,919 - src.helpers - INFO - Step 3: Quality analysis and recommendations...
2025-07-09 15:56:18,919 - src.helpers - INFO - Step 3: Quality analysis and recommendations...
2025-07-09 15:56:18,919 - src.helpers - INFO - Step 3: Quality analysis and recommendations...
2025-07-09 15:56:18,924 - src.helpers - INFO - Semantic chunking with reporting completed successfully!
2025-07-09 15:56:18,924 - src.helpers - INFO - Semantic chunking with reporting completed successfully!
2025-07-09 15:56:18,924 - src.helpers - INFO - Semantic chunking with reporting completed successfully!
2025-07-09 15:56:18,924 - src.helper

   Average tokens per chunk: 261.0
   Entity retention: 100.0%
Function run_semantic_chunking_pipeline took 0.4301 seconds to complete.
SEMANTIC CHUNKING PIPELINE COMPREHENSIVE REPORT
Generated: 2025-07-09 15:56:18
Data Directory: Data
Chunking Duration: 0:00:00.452938

EXECUTIVE SUMMARY
----------------------------------------
• Configuration Validation: PASSED
• Pipeline Success: True
• Total Chunks Created: 3
• Entity Retention Rate: 100.0%
• Quality Score: GOOD
• Total Processing Time: 0.43s
• Average Tokens per Chunk: 261.0
• Chunking Steps Completed: 3

CHUNKING PERFORMANCE
----------------------------------------
• average_processing_time: 0.43s
• total_processing_time: 0.43s
• fastest_processing: 0.4302248954772949
• slowest_processing: 0.4302248954772949
• processing_rate: 2.32 docs/sec
• average_memory_mb: 171.0MB
• peak_memory_mb: 171.0MB
• min_memory_mb: 171.0MB

QUALITY GATES STATUS
----------------------------------------
• Entity retention: ✅ PASSED
• Retention rate: 100

2025-07-09 15:56:19,966 - src.helpers - INFO - Checkpoint created for step semantic_chunking
2025-07-09 15:56:19,966 - src.helpers - INFO - Checkpoint created for step semantic_chunking
2025-07-09 15:56:19,966 - src.helpers - INFO - Checkpoint created for step semantic_chunking
2025-07-09 15:56:19,966 - src.helpers - INFO - Checkpoint created for step semantic_chunking
2025-07-09 15:56:19,971 - src.helpers - INFO - Step Semantic Chunking completed successfully in 5.00 seconds
2025-07-09 15:56:19,971 - src.helpers - INFO - Step Semantic Chunking completed successfully in 5.00 seconds
2025-07-09 15:56:19,971 - src.helpers - INFO - Step Semantic Chunking completed successfully in 5.00 seconds
2025-07-09 15:56:19,971 - src.helpers - INFO - Step Semantic Chunking completed successfully in 5.00 seconds


✅ semantic_chunking completed successfully


In [None]:
# Run all steps
# Execute the whole pipeline in one call; the return value is used as a success flag.
success = pipeline.run_all()

if not success:
    print("Pipeline failed!")
    # Show which steps the pipeline state recorded as failed.
    print(f"Failed steps: {pipeline.state.failed_steps}")
else:
    print("Pipeline completed successfully!")
    # Generate reports only after a fully successful run.
    pipeline.generate_consolidated_reports()
