In [None]:
import pandas as pd
import numpy as np
import json
import hashlib
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Any, Optional
import warnings
warnings.filterwarnings('ignore')

# Import the validation helpers from the project-local petrinex.validate module (it must already be importable)
from petrinex.validate import (
    DataValidator,
    create_silver_from_bronze,
    compare_bronze_silver,
    generate_audit_report
)

In [None]:
# Define data paths. All three layers currently share the fixtures directory,
# but each is kept as its own variable so they can be pointed elsewhere.
bronze_path = Path("../fixtures")
silver_path = Path("../fixtures")  # Silver tables in fixtures
audit_path = Path("../fixtures")   # Audit files in fixtures

# Make sure every directory this notebook reads from or writes to exists.
# parents=True keeps this working if a path is later changed to a nested
# location; exist_ok=True makes the cell safe to re-run.
for _dir in (bronze_path, silver_path, audit_path):
    _dir.mkdir(parents=True, exist_ok=True)

# Define datasets to validate: dataset name -> bronze parquet file name
# (resolved relative to bronze_path).
datasets = {
    'ngl_vol': 'ngl_vol_bronze_cvx.parquet',
    'conv_vol': 'conv_vol_bronze_cvx.parquet'
}

print("📁 Validation Configuration:")
print(f"   Bronze path: {bronze_path}")
print(f"   Silver path: {silver_path}")
print(f"   Audit path: {audit_path}")
print(f"   Datasets: {list(datasets.keys())}")


In [None]:
# Data quality rules for the Petrinex bronze tables, one rule set per dataset.
# Every rule set carries the same five keys:
#   required_columns   - column names listed as required
#   data_types         - column name -> expected type label
#   ranges             - column name -> (low, high) pair for numeric columns
#   rate_columns       - columns listed as rates
#   cumulative_columns - columns listed as cumulative (none at present)

ngl_vol_rules = dict(
    required_columns=[
        'ReportingFacilityID',
        'ProductionMonth',
        'WellID',
        'GasProduction',
        'OilProduction',
        'CondensateProduction',
        'WaterProduction',
        'Hours',
    ],
    data_types={
        'ReportingFacilityID': 'string',
        'ProductionMonth': 'datetime',
        'WellID': 'string',
        'GasProduction': 'float',
        'OilProduction': 'float',
        'CondensateProduction': 'float',
        'WaterProduction': 'float',
        'Hours': 'float',
        'OperatorBAID': 'string',
        'OperatorName': 'string',
    },
    ranges={
        'GasProduction': (0, 1000000),
        'OilProduction': (0, 100000),
        'CondensateProduction': (0, 50000),
        'WaterProduction': (0, 500000),
        'Hours': (0, 744),  # Max hours in a month
    },
    rate_columns=[
        'GasProduction',
        'OilProduction',
        'CondensateProduction',
        'WaterProduction',
    ],
    cumulative_columns=[],
)

conv_vol_rules = dict(
    required_columns=[
        'ProductionMonth',
        'OperatorBAID',
        'ReportingFacilityID',
        'Volume',
        'Hours',
    ],
    data_types={
        'ProductionMonth': 'datetime',
        'OperatorBAID': 'string',
        'OperatorName': 'string',
        'ReportingFacilityID': 'string',
        'Volume': 'float',
        'Energy': 'float',
        'Hours': 'float',
    },
    ranges={
        'Volume': (0, 1000000),
        'Energy': (0, 10000000),
        'Hours': (0, 744),
    },
    rate_columns=['Volume'],
    cumulative_columns=[],
)

# Keyed by the same dataset names used in the datasets mapping.
data_quality_rules = {
    'ngl_vol': ngl_vol_rules,
    'conv_vol': conv_vol_rules,
}

# One summary line per dataset: required-column count and range-check count.
print("📋 Data Quality Rules Configured:")
for name, spec in data_quality_rules.items():
    n_required = len(spec['required_columns'])
    n_ranges = len(spec['ranges'])
    print(f"   {name}: {n_required} columns, {n_ranges} range checks")


In [None]:
# Initialize the data validator
validator = DataValidator()

print("✅ Data Validator initialized")
print(f"   Available validation methods: {len(validator.get_validation_methods())}")

# Per-dataset outcome keyed by dataset name. Each entry holds the action
# taken, bronze/silver row counts, the validation result and (when a silver
# table already existed) the bronze-vs-silver comparison result.
validation_results = {}

for dataset_name, filename in datasets.items():
    print(f"\n🔍 Processing {dataset_name.upper()}")
    print("=" * 50)

    # Load bronze data; a missing file is reported and the dataset is skipped,
    # so it will have no entry in validation_results.
    bronze_file = bronze_path / filename
    if not bronze_file.exists():
        print(f"❌ Bronze file not found: {bronze_file}")
        continue

    bronze_df = pd.read_parquet(bronze_file)
    print(f"📊 Loaded bronze data: {len(bronze_df):,} rows, {len(bronze_df.columns)} columns")

    # Validate and clean the bronze data once; both the "create" and the
    # "compare" paths below need the same result. Datasets without a rule
    # entry are validated against an empty rule set.
    rules = data_quality_rules.get(dataset_name, {})
    validation_result = validator.validate_dataframe(bronze_df, rules)
    cleaned_bronze_df = validator.apply_data_quality_fixes(bronze_df, rules)

    # Define silver file path
    silver_file = silver_path / f"{dataset_name}_silver.parquet"

    if not silver_file.exists():
        print("🆕 No silver table found - creating initial silver table")

        # Create silver table from the cleaned bronze data
        silver_df = create_silver_from_bronze(cleaned_bronze_df)
        silver_df.to_parquet(silver_file, index=False)

        action = 'created_silver'
        comparison_result = None  # nothing to compare against on first creation

        print(f"✅ Created silver table: {len(silver_df):,} rows")

    else:
        print("🔄 Silver table exists - performing change detection")

        # Load existing silver data
        silver_df = pd.read_parquet(silver_file)
        print(f"📊 Loaded silver data: {len(silver_df):,} rows")

        # Compare bronze vs silver
        comparison_result = compare_bronze_silver(cleaned_bronze_df, silver_df)

        # Update silver table if there are changes
        if comparison_result['has_changes']:
            print(f"📝 Changes detected: {comparison_result['summary']}")

            # Rebuild the silver table and rebind silver_df to the new frame so
            # the row count recorded below reflects what is now on disk rather
            # than the stale, pre-update table.
            updated_silver_df = create_silver_from_bronze(cleaned_bronze_df)
            updated_silver_df.to_parquet(silver_file, index=False)
            silver_df = updated_silver_df

            action = 'compared_and_updated'

            print(f"✅ Updated silver table: {len(updated_silver_df):,} rows")
        else:
            action = 'no_changes'
            print("✅ No changes detected - silver table up to date")

    # Record the outcome in a single place so both paths share one schema.
    validation_results[dataset_name] = {
        'action': action,
        'bronze_rows': len(bronze_df),
        'silver_rows': len(silver_df),
        'validation': validation_result,
        'changes': comparison_result
    }

    # Display validation summary
    result = validation_results[dataset_name]
    validation = result['validation']

    print(f"\n📋 Data Quality Summary for {dataset_name}:")
    print(f"   ✅ Passed checks: {validation['passed_checks']}")
    print(f"   ❌ Failed checks: {validation['failed_checks']}")
    if validation['errors']:
        print(f"   🚨 Errors: {len(validation['errors'])}")
        for error in validation['errors'][:3]:  # Show first 3 errors
            print(f"      - {error}")

print(f"\n🎉 Validation completed for {len(validation_results)} datasets")


In [None]:
# Generate comprehensive audit report
audit_report = generate_audit_report(validation_results)

# Save audit report under a timestamped name so earlier reports are not
# overwritten. Ensure the target directory exists before writing.
audit_path.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
audit_file = audit_path / f"validation_audit_{timestamp}.json"

# default=str lets json.dump serialise values it cannot encode natively by
# falling back to their string form.
with open(audit_file, 'w', encoding='utf-8') as f:
    json.dump(audit_report, f, indent=2, default=str)

print(f"📄 Audit report saved: {audit_file}")
print("📊 Report summary:")
print(f"   - Timestamp: {audit_report['metadata']['timestamp']}")
print(f"   - Datasets processed: {len(audit_report['datasets'])}")
print(f"   - Total validation checks: {audit_report['summary']['total_checks']}")
print(f"   - Passed checks: {audit_report['summary']['passed_checks']}")
print(f"   - Failed checks: {audit_report['summary']['failed_checks']}")

# Display comprehensive validation dashboard
print("\n📊 VALIDATION DASHBOARD")
print("=" * 60)

for dataset_name, result in validation_results.items():
    print(f"\n🗂️  {dataset_name.upper()}")
    print(f"   Action: {result['action']}")
    print(f"   Bronze rows: {result['bronze_rows']:,}")
    print(f"   Silver rows: {result['silver_rows']:,}")

    validation = result['validation']
    total_checks = validation['passed_checks'] + validation['failed_checks']
    # Guard against division by zero when a dataset ran no checks at all.
    success_rate = validation['passed_checks'] / total_checks * 100 if total_checks > 0 else 0

    print(f"   Data Quality: {success_rate:.1f}% ({validation['passed_checks']}/{total_checks} checks passed)")

    # 'changes' is None when the silver table was created on this run.
    if result['changes'] and result['changes']['has_changes']:
        changes = result['changes']
        print("   Changes detected:")
        print(f"     - New rows: {changes.get('new_rows', 0)}")
        print(f"     - Modified rows: {changes.get('modified_rows', 0)}")
        print(f"     - Deleted rows: {changes.get('deleted_rows', 0)}")

# Only claim overall success when every configured dataset was processed and
# none of them reported failed checks; otherwise say what went wrong.
skipped_datasets = [name for name in datasets if name not in validation_results]
failing_datasets = [
    name for name, outcome in validation_results.items()
    if outcome['validation']['failed_checks'] > 0
]

if not skipped_datasets and not failing_datasets:
    print("\n✅ All datasets validated successfully!")
else:
    print("\n⚠️ Validation finished with issues:")
    if skipped_datasets:
        print(f"   - Not processed (bronze file missing): {skipped_datasets}")
    if failing_datasets:
        print(f"   - Failed data quality checks: {failing_datasets}")

print(f"📁 Silver tables and audit reports available in: {silver_path}")
