In [3]:
# FAIR-PRICE Data Extraction
# Brazilian Health Economics Data Pipeline - Step 1: Data Extraction

# Import our modules
import sys
import os
from pathlib import Path

# Get the project root directory (one level up from notebooks)
project_root = Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()
src_path = project_root / "src"

# Add both project root and src to Python path
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

print(f"📁 Project root: {project_root}")
print(f"📁 Source path: {src_path}")
print(f"📁 Current working directory: {Path.cwd()}")

# Now import our modules using absolute imports
try:
    # Import using src package notation
    import src.config as config_module
    import src.extraction as extraction_module
    import src.utils as utils_module
    
    # Extract what we need
    initialize_project = config_module.initialize_project
    HealthDataExtractor = extraction_module.HealthDataExtractor
    download_all_available_years = extraction_module.download_all_available_years
    StatusPrinter = utils_module.StatusPrinter
    FileUtils = utils_module.FileUtils
    
    print("✅ Modules imported successfully using src package!")
    
except ImportError as e:
    print(f"❌ Package import failed: {e}")
    print("🔧 Trying direct file loading...")
    
    # Last resort: Load each file individually and handle dependencies manually
    import importlib.util
    import types
    
    # First load config (no dependencies)
    config_spec = importlib.util.spec_from_file_location("config", src_path / "config.py")
    config_module = importlib.util.module_from_spec(config_spec)
    sys.modules['config'] = config_module
    config_spec.loader.exec_module(config_module)
    
    # Then load utils (depends on config)
    utils_spec = importlib.util.spec_from_file_location("utils", src_path / "utils.py")
    utils_module = importlib.util.module_from_spec(utils_spec)
    
    # Manually inject config dependency
    utils_module.config = config_module
    utils_module.Config = config_module.Config
    utils_module.LoggingConfig = config_module.LoggingConfig
    utils_module.FileConfig = config_module.FileConfig
    
    sys.modules['utils'] = utils_module
    utils_spec.loader.exec_module(utils_module)
    
    # Finally load extraction (depends on both config and utils)
    extraction_spec = importlib.util.spec_from_file_location("extraction", src_path / "extraction.py")
    extraction_module = importlib.util.module_from_spec(extraction_spec)
    
    # Manually inject dependencies
    extraction_module.config = config_module
    extraction_module.utils = utils_module
    extraction_module.Config = config_module.Config
    extraction_module.WebScrapingConfig = config_module.WebScrapingConfig
    extraction_module.FileConfig = config_module.FileConfig
    extraction_module.FileUtils = utils_module.FileUtils
    extraction_module.ProgressTracker = utils_module.ProgressTracker
    extraction_module.LoggerSetup = utils_module.LoggerSetup
    extraction_module.StatusPrinter = utils_module.StatusPrinter
    extraction_module.retry_on_exception = utils_module.retry_on_exception
    extraction_module.timing_decorator = utils_module.timing_decorator
    
    sys.modules['extraction'] = extraction_module
    extraction_spec.loader.exec_module(extraction_module)
    
    # Extract what we need
    initialize_project = config_module.initialize_project
    HealthDataExtractor = extraction_module.HealthDataExtractor
    download_all_available_years = extraction_module.download_all_available_years
    StatusPrinter = utils_module.StatusPrinter
    FileUtils = utils_module.FileUtils
    
    print("✅ Modules loaded via manual dependency injection!")

# Initialize the project (creates directories)
print("🚀 Initializing FAIR-PRICE Project")
initialize_project()

# Create the extractor
extractor = HealthDataExtractor()

print(f"\n📁 Files will be downloaded to: {extractor.output_dir}")
print(f"📋 Current files in directory:")

# Show current status
current_files = extractor.get_current_files()
if current_files:
    for year, file_path in sorted(current_files.items()):
        size_mb = FileUtils.get_file_size_mb(file_path)
        print(f"   📄 {year}.csv: {size_mb:.1f}MB")
else:
    print("   (No CSV files found)")

# Download all available years
print(f"\n🎯 Starting download process...")
downloaded_files = extractor.download_all_years()

# Show final status
print(f"\n📊 Final Status:")
print(f"✅ Successfully downloaded: {len(downloaded_files)} files")

if downloaded_files:
    total_size = 0
    for year, file_path in sorted(downloaded_files.items()):
        size_mb = FileUtils.get_file_size_mb(file_path)
        total_size += size_mb
        print(f"   📄 {year}.csv: {size_mb:.1f}MB")
    
    print(f"\n💾 Total size: {total_size:.1f}MB")
    print(f"📁 Location: {extractor.output_dir}")

# Quick validation
print(f"\n🔍 Quick validation of downloaded files:")
validation_results = extractor.validate_all_files(downloaded_files)

for year, is_valid in sorted(validation_results.items()):
    status = "✅ Valid" if is_valid else "❌ Invalid"
    print(f"   {year}.csv: {status}")

print(f"\n🎉 Extraction complete! Ready for exploration and standardization.")

📁 Project root: /home/victor-jose/Documents/projetos/DGU/DGU45/fair-price
📁 Source path: /home/victor-jose/Documents/projetos/DGU/DGU45/fair-price/src
📁 Current working directory: /home/victor-jose/Documents/projetos/DGU/DGU45/fair-price/notebooks
❌ Package import failed: No module named 'pandas'
🔧 Trying direct file loading...


ModuleNotFoundError: No module named 'pandas'