In [1]:
# Fair-Price Brazilian Health Data Pipeline - Production Orchestration
# Complete extraction and standardization pipeline using our modular architecture

import sys
from pathlib import Path
import pandas as pd
import time
from datetime import datetime

# =============================================================================
# PROJECT SETUP & IMPORTS
# =============================================================================

# Locate the project root: when the working directory is the 'notebooks'
# folder the root is its parent, otherwise the working directory is the root.
if Path.cwd().name == 'notebooks':
    project_root = Path.cwd().parent
else:
    project_root = Path.cwd()

# Put <root>/src at the front of the import path so it is searched first.
src_path = project_root.joinpath('src')
sys.path.insert(0, str(src_path))

# Import our Fair-Price modules
from config.settings import get_config
from utils.logger import get_extraction_logger, get_standardization_logger
from extraction.extractors import OpenDataSUSExtractor

# Start-up banner, emitted as one multi-line print.
print(
    "🚀 FAIR-PRICE BRAZILIAN HEALTH DATA PIPELINE",
    "=" * 60,
    f"📁 Project root: {project_root}",
    "✅ Successfully imported all Fair-Price modules",
    sep="\n",
)

# =============================================================================
# CONFIGURATION & LOGGERS
# =============================================================================

# One shared settings object and one logger per pipeline stage.
config = get_config()
extraction_logger = get_extraction_logger()
standardization_logger = get_standardization_logger()

print("\n📂 DATA DIRECTORIES:")
print(
    f"   Raw data: {config.raw_data_dir}",
    f"   Processed data: {config.processed_data_dir}",
    sep="\n",
)

# Create the processed-data directory (and any missing parents) up front;
# an already existing directory is not an error.
config.processed_data_dir.mkdir(parents=True, exist_ok=True)

# =============================================================================
# STEP 1: DATA EXTRACTION
# =============================================================================

print("\n" + "=" * 60)
print("STEP 1: DATA EXTRACTION")
print("=" * 60)

# Built outside the try block: a failure to construct the extractor aborts
# the cell instead of being treated as a recoverable extraction failure.
extractor = OpenDataSUSExtractor()

print("🌐 Starting extraction from OpenDataSUS...")
extraction_start_time = time.time()

# Defined up front so the name exists even when extraction raises.
extraction_report = None

try:
    extraction_report = extractor.extract_all_years()
    extraction_time = time.time() - extraction_start_time

    print(f"\n✅ EXTRACTION COMPLETED ({extraction_time:.2f}s)")
    print(f"   Files extracted: {len(extraction_report)}")

    # List every CSV currently present in the raw-data directory
    csv_files = list(config.raw_data_dir.glob("*.csv"))
    print(f"\n📄 AVAILABLE CSV FILES ({len(csv_files)}):")
    for csv_file in sorted(csv_files):
        size_mb = csv_file.stat().st_size / (1024 * 1024)
        print(f"   {csv_file.name} ({size_mb:.1f} MB)")

except Exception as e:
    # Best-effort step: record how long the failed attempt took so that the
    # pipeline summary (which reads extraction_time) still works, then fall
    # back to whatever CSV files are already on disk.
    extraction_time = time.time() - extraction_start_time
    extraction_logger.error(f"Extraction failed: {str(e)}")
    print(f"❌ EXTRACTION FAILED: {str(e)}")
    csv_files = list(config.raw_data_dir.glob("*.csv"))
    print(f"📄 Using existing files: {len(csv_files)} CSV files found")

# =============================================================================
# STEP 2: DATA STANDARDIZATION
# =============================================================================

# Section banner for the standardization step (blank line, rule, title, rule).
print("\n" + "=" * 60, "STEP 2: DATA STANDARDIZATION", "=" * 60, sep="\n")

# Import standardization modules (correct module names from your structure)
from standardization.cleaners import clean_dataframe
from standardization.validators import validate_dataframe, get_validation_summary

# Create a simple processor that uses our existing modules  
class ProductionStandardizationProcessor:
    """Load, clean, validate and re-save raw CSV files one at a time.

    The heavy lifting is delegated to ``clean_dataframe``,
    ``validate_dataframe`` and ``get_validation_summary``; this class only
    wires them together and handles file I/O.

    Args:
        config: Settings object handed through to ``clean_dataframe``.
        logger: Logger used for progress messages (only ``info`` is called).
    """

    # (encoding, separator) combinations tried in order when reading a file.
    _READ_ATTEMPTS = (
        ('latin-1', ';'),
        ('utf-8', ';'),
        ('latin-1', ','),
        ('utf-8', ','),
    )

    def __init__(self, config, logger):
        self.config = config
        self.logger = logger

    def _log_loaded(self, file_path: Path, df: pd.DataFrame, attempt: int) -> None:
        """Log a successful load; the first attempt reports the row count."""
        if attempt == 0:
            self.logger.info(f"✅ Loaded {file_path.name}: {len(df):,} rows")
        else:
            encoding, sep = self._READ_ATTEMPTS[attempt]
            self.logger.info(f"✅ Loaded {file_path.name} with {encoding}/{sep}")

    def load_csv_with_encoding(self, file_path: Path) -> pd.DataFrame:
        """Read a CSV, trying several encoding/separator combinations.

        A parse that yields a single column is treated as a probable
        separator mismatch: ``latin-1`` can decode any byte sequence and a
        comma-delimited file read with ``sep=';'`` does not raise, it simply
        collapses every row into one column. Such a result is therefore only
        returned when no combination produces more than one column, which
        keeps genuinely single-column files loadable.

        Args:
            file_path: Path of the CSV file to read.

        Returns:
            The parsed DataFrame.

        Raises:
            Exception: If every combination fails to parse the file; the
                last pandas error is attached as the cause.
        """
        single_column_result = None  # (attempt index, frame) of first 1-column parse
        last_error = None

        for attempt, (encoding, sep) in enumerate(self._READ_ATTEMPTS):
            try:
                df = pd.read_csv(file_path, encoding=encoding, sep=sep)
            except Exception as exc:
                last_error = exc
                continue

            if len(df.columns) > 1:
                self._log_loaded(file_path, df, attempt)
                return df

            if single_column_result is None:
                single_column_result = (attempt, df)

        if single_column_result is not None:
            attempt, df = single_column_result
            self._log_loaded(file_path, df, attempt)
            return df

        raise Exception(f"Failed to load {file_path.name}") from last_error

    def process_single_file(self, input_path: Path, output_path: Path) -> dict:
        """Run one CSV file through load -> clean -> validate -> save.

        Args:
            input_path: Raw CSV file. When the data has no ``ano`` column the
                file stem must be a year (e.g. ``2024.csv``).
            output_path: Destination of the cleaned CSV (UTF-8, comma
                separated); missing parent directories are created.

        Returns:
            A report dict with the keys ``file_name``, ``processing_time``,
            ``rows_input``, ``rows_output``, ``columns_input``,
            ``columns_output``, ``quality_score``, ``high_quality_rows``,
            ``input_path`` and ``output_path``.

        Raises:
            ValueError: If an ``ano`` column has to be added but the file
                name is not a year.
            Exception: Anything raised while loading, cleaning, validating
                or writing the file.
        """
        start_time = time.time()
        file_name = input_path.name

        self.logger.info(f"🔄 Processing {file_name}...")

        # Load raw data
        df_raw = self.load_csv_with_encoding(input_path)

        # Add an 'ano' column from the file name, but only when the data does
        # not already carry one (case-insensitive check). The year is parsed
        # lazily so files that already have the column may use any name.
        if 'ano' not in {str(col).lower() for col in df_raw.columns}:
            try:
                year = int(input_path.stem)
            except ValueError as exc:
                raise ValueError(
                    f"Cannot infer year from file name '{file_name}' "
                    f"(expected e.g. '2024.csv') and no 'ano' column is present"
                ) from exc
            df_raw['ano'] = year
            self.logger.info(f"   📅 Added 'ano' column with value {year}")

        # Cleaning and validation are delegated to the project modules.
        df_cleaned = clean_dataframe(df_raw, self.config)
        validation_rules = {'required_fields': ['ano', 'uf']}
        df_validated = validate_dataframe(df_cleaned, validation_rules)

        # Quality summary; must provide 'avg_quality_score' and
        # 'high_quality_rows', which are read below.
        quality_summary = get_validation_summary(df_validated)

        # Drop helper columns (prefixed 'quality_' / 'has_valid_') before saving.
        business_columns = [
            col for col in df_validated.columns
            if not col.startswith(('quality_', 'has_valid_'))
        ]
        df_output = df_validated[business_columns]

        output_path.parent.mkdir(parents=True, exist_ok=True)
        df_output.to_csv(output_path, index=False, encoding='utf-8', sep=',')

        processing_time = time.time() - start_time

        # Return processing report
        return {
            'file_name': file_name,
            'processing_time': processing_time,
            'rows_input': len(df_raw),
            'rows_output': len(df_output),
            'columns_input': len(df_raw.columns),
            'columns_output': len(df_output.columns),
            'quality_score': quality_summary['avg_quality_score'],
            'high_quality_rows': quality_summary['high_quality_rows'],
            'input_path': str(input_path),
            'output_path': str(output_path)
        }

# Initialize processor
processor = ProductionStandardizationProcessor(config, standardization_logger)

# Defaults so the summary section further down can always read these names,
# even when there is nothing to process.
processing_reports = []
standardization_time = 0.0

# Find CSV files to process (sorted once so processing order is stable)
csv_files = sorted(config.raw_data_dir.glob("*.csv"))

if not csv_files:
    print("❌ No CSV files found to process!")
else:
    print(f"📊 Found {len(csv_files)} CSV files to standardize")

    standardization_start_time = time.time()

    for csv_file in csv_files:
        # One failing file must not stop the rest of the batch: log it,
        # report it, and move on.
        try:
            output_file = config.processed_data_dir / csv_file.name
            report = processor.process_single_file(csv_file, output_file)
            processing_reports.append(report)

            print(f"   ✅ {report['file_name']}: {report['quality_score']:.1f}% quality ({report['processing_time']:.2f}s)")

        except Exception as e:
            standardization_logger.error(f"Failed to process {csv_file.name}: {str(e)}")
            print(f"   ❌ {csv_file.name}: FAILED - {str(e)}")

    standardization_time = time.time() - standardization_start_time

# =============================================================================
# STEP 3: PIPELINE SUMMARY & RESULTS
# =============================================================================

print("\n" + "=" * 60)
print("PIPELINE EXECUTION SUMMARY")
print("=" * 60)

if processing_reports:
    # Calculate overall statistics
    total_input_rows = sum(r['rows_input'] for r in processing_reports)
    total_output_rows = sum(r['rows_output'] for r in processing_reports)
    avg_quality_score = sum(r['quality_score'] for r in processing_reports) / len(processing_reports)
    total_high_quality = sum(r['high_quality_rows'] for r in processing_reports)

    # Files that parse to zero rows would otherwise cause a ZeroDivisionError.
    if total_input_rows:
        data_preservation = (total_output_rows / total_input_rows) * 100
    else:
        data_preservation = 0.0

    print("📊 PROCESSING RESULTS:")
    print(f"   Files processed: {len(processing_reports)}/{len(csv_files)}")
    print(f"   Total input rows: {total_input_rows:,}")
    print(f"   Total output rows: {total_output_rows:,}")
    print(f"   Data preservation: {data_preservation:.1f}%")
    print(f"   Average quality score: {avg_quality_score:.1f}%")
    print(f"   High quality rows: {total_high_quality:,}")

    # extraction_time and standardization_time come from the earlier steps
    # of this cell.
    print("\n⏱️  PERFORMANCE:")
    print(f"   Extraction time: {extraction_time:.2f}s")
    print(f"   Standardization time: {standardization_time:.2f}s")
    print(f"   Total pipeline time: {extraction_time + standardization_time:.2f}s")

    print("\n📂 OUTPUT FILES:")
    processed_files = list(config.processed_data_dir.glob("*.csv"))
    for pfile in sorted(processed_files):
        size_mb = pfile.stat().st_size / (1024 * 1024)
        print(f"   {pfile.name} ({size_mb:.1f} MB)")

    # Detailed file-by-file results
    print("\n📋 DETAILED RESULTS:")
    print(f"{'File':<20} {'Rows':<10} {'Quality':<10} {'Time':<8}")
    print("-" * 50)
    for report in processing_reports:
        # Attach the unit before padding so the suffix stays next to the
        # number and the columns line up with the header.
        quality_cell = f"{report['quality_score']:.1f}%"
        time_cell = f"{report['processing_time']:.2f}s"
        print(f"{report['file_name']:<20} {report['rows_output']:<10,} {quality_cell:<10} {time_cell:<8}")

else:
    print("❌ No files were successfully processed")

print(f"\n🎯 PIPELINE STATUS: {'COMPLETED SUCCESSFULLY' if processing_reports else 'FAILED'}")
print("=" * 60)

# =============================================================================
# NEXT STEPS GUIDANCE
# =============================================================================

if processing_reports:
    # Map the average quality score onto a coarse rating.
    if avg_quality_score >= 90:
        quality_rating = 'EXCELLENT'
    elif avg_quality_score >= 75:
        quality_rating = 'GOOD'
    else:
        quality_rating = 'NEEDS IMPROVEMENT'

    print("\n📝 NEXT STEPS:")
    print(f"1. ✅ Standardized files are ready in: {config.processed_data_dir}")
    print("2. 🔄 Next phase: Consolidation (merge all files into single dataset)")
    print("3. 📊 Data ready for analysis and visualization")
    print(f"4. 🎯 Average quality score: {avg_quality_score:.1f}% - {quality_rating}")

print(f"\n🚀 Fair-Price pipeline execution completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

🚀 FAIR-PRICE BRAZILIAN HEALTH DATA PIPELINE
📁 Project root: /home/victor-jose/Documents/projetos/DGU/DGU45/fair-price
✅ Successfully imported all Fair-Price modules

📂 DATA DIRECTORIES:
   Raw data: /home/victor-jose/Documents/projetos/DGU/DGU45/fair-price/data/raw
   Processed data: /home/victor-jose/Documents/projetos/DGU/DGU45/fair-price/data/processed

STEP 1: DATA EXTRACTION
2025-06-29 14:31:58 - fair_price.extraction - INFO - 🏥 OpenDataSUS Extractor initialized
2025-06-29 14:31:58 - fair_price.extraction - INFO - 📁 Output directory: /home/victor-jose/Documents/projetos/DGU/DGU45/fair-price/data/raw
2025-06-29 14:31:58 - fair_price.extraction - INFO - 🎯 Target years: [2020, 2021, 2022, 2023, 2024]
🌐 Starting extraction from OpenDataSUS...
2025-06-29 14:31:58 - fair_price.extraction - INFO - 🚀 Starting Complete multi-year extraction
2025-06-29 14:31:58 - fair_price.extraction - INFO - 🚀 Starting extraction for years: [2020, 2021, 2022, 2023, 2024]
2025-06-29 14:31:58 - fair_price.e

2025-06-29 14:32:05 - fair_price.standardization - INFO - ✅ Loaded 2020.csv: 71,227 rows
   ✅ 2020.csv: 99.8% quality (3.40s)
2025-06-29 14:32:09 - fair_price.standardization - INFO - 🔄 Processing 2021.csv...
2025-06-29 14:32:09 - fair_price.standardization - INFO - ✅ Loaded 2021.csv: 70,893 rows
   ✅ 2021.csv: 100.0% quality (4.64s)
2025-06-29 14:32:13 - fair_price.standardization - INFO - 🔄 Processing 2022.csv...
2025-06-29 14:32:13 - fair_price.standardization - INFO - ✅ Loaded 2022.csv: 69,028 rows
   ✅ 2022.csv: 99.6% quality (2.89s)
2025-06-29 14:32:16 - fair_price.standardization - INFO - 🔄 Processing 2023.csv...
2025-06-29 14:32:16 - fair_price.standardization - INFO - ✅ Loaded 2023.csv: 37,522 rows
2025-06-29 14:32:16 - fair_price.standardization - INFO -    📅 Added 'ano' column with value 2023
   ✅ 2023.csv: 98.9% quality (2.21s)
2025-06-29 14:32:18 - fair_price.standardization - INFO - 🔄 Processing 2024.csv...
2025-06-29 14:32:18 - fair_price.standardization - INFO - ✅ Loade

In [2]:
# Fair-Price Data Consolidation - Merge All Standardized Files
# Consolidates 5 standardized CSV files into a single, clean dataset

import sys
from pathlib import Path
import pandas as pd
import time
from datetime import datetime

# =============================================================================
# PROJECT SETUP & IMPORTS
# =============================================================================

# Setup project paths: start from the working directory and step up one level
# when the notebook is being run from inside the 'notebooks' folder.
project_root = Path.cwd()
if project_root.name == 'notebooks':
    project_root = project_root.parent

# <root>/src goes first on the import path so it is searched first.
src_path = project_root / 'src'
sys.path.insert(0, str(src_path))

# Import our Fair-Price modules
from config.settings import get_config
from utils.logger import get_standardization_logger

print("🎯 FAIR-PRICE DATA CONSOLIDATION PIPELINE")
print("=" * 70)
print(f"📁 Project root: {project_root}")
print("✅ Successfully imported Fair-Price modules")

# =============================================================================
# CONFIGURATION & SETUP
# =============================================================================

config = get_config()
logger = get_standardization_logger()

# Directory paths: standardized files are read from the processed directory
# and consolidated results are written to the output directory.
input_dir = config.processed_data_dir
output_dir = config.output_data_dir

print("\n📂 CONSOLIDATION DIRECTORIES:")
print(f"   Input (processed): {input_dir}")
print(f"   Output (final): {output_dir}")

# Check input files
csv_files = list(input_dir.glob("*.csv"))
if not csv_files:
    print("❌ No standardized CSV files found!")
    print("   Please run the standardization pipeline first")
    # sys.exit instead of the interactive-only exit() helper; the non-zero
    # status signals that nothing was consolidated.
    sys.exit(1)

print(f"\n📋 FOUND {len(csv_files)} STANDARDIZED FILES:")
total_size_mb = 0
for csv_file in sorted(csv_files):
    size_mb = csv_file.stat().st_size / (1024 * 1024)
    total_size_mb += size_mb
    print(f"   {csv_file.name} ({size_mb:.1f} MB)")

print(f"📊 Total input data: {total_size_mb:.1f} MB")

# =============================================================================
# CONSOLIDATION EXECUTION
# =============================================================================

print("\n" + "=" * 70)
print("CONSOLIDATION PIPELINE EXECUTION")
print("=" * 70)

# Stays None when neither the import nor the inline fallback succeeds, so the
# workflow below can report that instead of hitting an undefined name.
consolidator = None

# Import consolidation module
try:
    from consolidation.consolidators import HealthDataConsolidator

    # Initialize consolidator
    consolidator = HealthDataConsolidator(config, logger)

    print("✅ Consolidation module imported successfully")

except ImportError as e:
    print(f"❌ Failed to import consolidation module: {e}")
    print("💡 Creating consolidator inline...")

    # Fallback: execute the module source directly in this namespace and
    # build the consolidator from the class it defines.
    # NOTE(review): this runs project source via exec; only acceptable
    # because the file comes from this repository's own src directory.
    consolidators_source = src_path / 'consolidation' / 'consolidators.py'
    if consolidators_source.exists():
        try:
            exec(compile(
                consolidators_source.read_text(encoding='utf-8'),
                str(consolidators_source),
                'exec',
            ))
            consolidator = HealthDataConsolidator(config, logger)
        except Exception as fallback_error:
            logger.error(f"Inline consolidator creation failed: {fallback_error}")
            print(f"❌ Inline consolidator creation failed: {fallback_error}")
    else:
        print(f"❌ Consolidator source not found: {consolidators_source}")

# Execute complete consolidation workflow
print("\n🚀 Starting complete consolidation workflow...")
consolidation_start_time = time.time()
consolidation_report = None

if consolidator is None:
    failure_reason = "consolidator is unavailable (import and inline fallback failed)"
    logger.error(f"Consolidation failed: {failure_reason}")
    print(f"❌ CONSOLIDATION FAILED: {failure_reason}")
else:
    try:
        # Run complete consolidation
        consolidation_report = consolidator.consolidate_all_data(
            input_dir=input_dir,
            output_dir=output_dir,
            duplicate_strategy="keep_latest"
        )
        print("\n✅ CONSOLIDATION COMPLETED SUCCESSFULLY!")

    except Exception as e:
        logger.error(f"Consolidation failed: {str(e)}")
        print(f"❌ CONSOLIDATION FAILED: {str(e)}")
        consolidation_report = None

# Elapsed time is recorded for both outcomes; the status section reads it.
consolidation_time = time.time() - consolidation_start_time

# =============================================================================
# RESULTS SUMMARY & ANALYSIS
# =============================================================================

print("\n" + "=" * 70)
print("CONSOLIDATION RESULTS SUMMARY")
print("=" * 70)

if not consolidation_report:
    print("❌ No consolidation results to display")
else:
    # Pull out the three report sections used below.
    summary = consolidation_report['consolidation_summary']
    final_validation = consolidation_report['final_validation']
    duplicate_detection = consolidation_report['duplicate_detection']

    print("📊 CONSOLIDATION METRICS:")
    print(f"   Input files: {summary['input_files']}")
    print(f"   Output records: {summary['output_records']:,}")
    print(f"   Data quality score: {summary['data_quality_score']:.1f}%")
    print(f"   Processing time: {consolidation_report['processing_time_seconds']:.2f}s")

    print("\n📈 DATA QUALITY ANALYSIS:")
    print(f"   Overall completeness: {final_validation['overall_completeness']:.1f}%")
    print(f"   Memory usage: {final_validation['memory_usage_mb']:.1f} MB")
    print(f"   Duplicate reduction: {duplicate_detection['duplicate_percentage']:.1f}%")

    # Per-year record counts with their share of the final dataset.
    print("\n📅 YEAR DISTRIBUTION:")
    year_dist = final_validation['year_distribution']
    for year in sorted(year_dist):
        count = year_dist[year]
        percentage = 100 * (count / summary['output_records'])
        print(f"   {year}: {count:,} records ({percentage:.1f}%)")

    # Only files that actually exist on disk are listed.
    print("\n📂 OUTPUT FILES GENERATED:")
    if 'saved_files' in consolidation_report:
        for format_name, file_path in consolidation_report['saved_files'].items():
            file_path_obj = Path(file_path)
            if not file_path_obj.exists():
                continue
            size_mb = file_path_obj.stat().st_size / (1024 * 1024)
            print(f"   {format_name.upper()}: {file_path_obj.name} ({size_mb:.1f} MB)")

    # Ten most complete columns, highest first.
    print("\n📋 COLUMN COMPLETENESS (Top 10 Complete):")
    completeness = final_validation['completeness_by_column']
    top_complete = sorted(completeness.items(), key=lambda item: item[1], reverse=True)[:10]
    for col, comp in top_complete:
        print(f"   {col:<30} {comp:>6.1f}%")

    # Five least complete columns, shown only when below 90% complete.
    print("\n⚠️  COLUMNS NEEDING ATTENTION (Lowest Completeness):")
    bottom_complete = sorted(completeness.items(), key=lambda item: item[1])[:5]
    for col, comp in bottom_complete:
        if comp >= 90:
            continue
        print(f"   {col:<30} {comp:>6.1f}%")

# =============================================================================
# FINAL PIPELINE STATUS
# =============================================================================

print("\n" + "=" * 70)
print("FINAL PIPELINE STATUS")
print("=" * 70)

if consolidation_report:
    final_records = consolidation_report['consolidation_summary']['output_records']
    quality_score = consolidation_report['consolidation_summary']['data_quality_score']

    # Determine quality grade
    if quality_score >= 95:
        quality_grade = "EXCELLENT ⭐⭐⭐"
    elif quality_score >= 85:
        quality_grade = "VERY GOOD ⭐⭐"
    elif quality_score >= 75:
        quality_grade = "GOOD ⭐"
    else:
        quality_grade = "NEEDS IMPROVEMENT ⚠️"

    print("🎯 CONSOLIDATION STATUS: SUCCESS ✅")
    print(f"📊 Final Dataset: {final_records:,} records")
    print(f"🏆 Data Quality: {quality_score:.1f}% - {quality_grade}")
    print(f"⏱️  Total Processing Time: {consolidation_time:.2f} seconds")

    print("\n📋 WHAT'S READY:")
    print(f"1. ✅ Consolidated dataset with {final_records:,} health procurement records")
    print("2. ✅ Clean, validated data spanning 2020-2024")
    print("3. ✅ Multiple output formats (CSV, Parquet)")
    print("4. ✅ Comprehensive data quality assessment")
    print("5. ✅ Ready for business intelligence and analysis")

    print("\n🚀 NEXT STEPS:")
    print("• Data analysis and visualization")
    print("• Business intelligence dashboard creation")
    print("• Procurement pattern analysis")
    print("• Geographic spending analysis")
    print("• Supplier performance evaluation")

    phase3_line = f"   Phase 3: ✅ Consolidation ({final_records:,} final records)"

else:
    print("🎯 CONSOLIDATION STATUS: FAILED ❌")
    print(f"⏱️  Processing Time: {consolidation_time:.2f} seconds")
    print("\n🔧 TROUBLESHOOTING:")
    print(f"• Check that standardized files exist in {input_dir}")
    print(f"• Verify file permissions for {output_dir}")
    print("• Review error logs above for specific issues")

    # Keep the same "   Phase 3: " prefix as the success case so the three
    # phase lines stay aligned.
    phase3_line = "   Phase 3: ❌ Consolidation (failed)"

# NOTE(review): the Phase 1 and Phase 2 figures below are hard-coded rather
# than computed in this cell - confirm they still match the latest run.
print("\n📊 COMPLETE FAIR-PRICE PIPELINE SUMMARY:")
print("   Phase 1: ✅ Extraction (5 CSV files, 142.2 MB)")
print("   Phase 2: ✅ Standardization (273,305 records, 99.2% quality)")
print(phase3_line)

print(f"\n🎉 Fair-Price Brazilian Health Data Pipeline completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("=" * 70)

🎯 FAIR-PRICE DATA CONSOLIDATION PIPELINE
📁 Project root: /home/victor-jose/Documents/projetos/DGU/DGU45/fair-price
✅ Successfully imported Fair-Price modules

📂 CONSOLIDATION DIRECTORIES:
   Input (processed): /home/victor-jose/Documents/projetos/DGU/DGU45/fair-price/data/processed
   Output (final): /home/victor-jose/Documents/projetos/DGU/DGU45/fair-price/data/output

📋 FOUND 5 STANDARDIZED FILES:
   2020.csv (22.8 MB)
   2021.csv (25.2 MB)
   2022.csv (22.3 MB)
   2023.csv (12.2 MB)
   2024.csv (7.7 MB)
📊 Total input data: 90.2 MB

CONSOLIDATION PIPELINE EXECUTION
✅ Consolidation module imported successfully

🚀 Starting complete consolidation workflow...
2025-06-29 14:35:46 - fair_price.standardization - INFO - 🚀 Starting complete data consolidation process...
2025-06-29 14:35:46 - fair_price.standardization - INFO - 📂 Loading standardized files from /home/victor-jose/Documents/projetos/DGU/DGU45/fair-price/data/processed
2025-06-29 14:35:47 - fair_price.standardization - INFO -    

  combined_df = pd.concat(combined_data, ignore_index=True)


2025-06-29 14:35:47 - fair_price.standardization - INFO -    📊 Total duplicates: 28,213 (10.3%)
2025-06-29 14:35:47 - fair_price.standardization - INFO -    🔀 Cross-year duplicates: 0
2025-06-29 14:35:47 - fair_price.standardization - INFO -    📅 Same-year duplicates: 0
2025-06-29 14:35:47 - fair_price.standardization - INFO - 🔧 Consolidating data with 'keep_latest' duplicate strategy...


  consolidated_df = pd.concat(all_dataframes, ignore_index=True)


2025-06-29 14:35:48 - fair_price.standardization - INFO -    📊 Consolidation complete:
2025-06-29 14:35:48 - fair_price.standardization - INFO -       Input records: 273,305
2025-06-29 14:35:48 - fair_price.standardization - INFO -       Output records: 250,789
2025-06-29 14:35:48 - fair_price.standardization - INFO -       Reduction: 8.2%
2025-06-29 14:35:48 - fair_price.standardization - INFO - ✅ Validating consolidated dataset...
2025-06-29 14:35:49 - fair_price.standardization - INFO -    📊 Final dataset: 250,789 rows × 21 columns
2025-06-29 14:35:49 - fair_price.standardization - INFO -    💾 Memory usage: 257.4 MB
2025-06-29 14:35:49 - fair_price.standardization - INFO -    ✅ Overall completeness: 78.0%
2025-06-29 14:35:49 - fair_price.standardization - INFO -    📅 Year distribution:
2025-06-29 14:35:49 - fair_price.standardization - INFO -       2020: 71,216 records
2025-06-29 14:35:49 - fair_price.standardization - INFO -       2021: 70,814 records
2025-06-29 14:35:49 - fair_pri