## Data Validation 

In [1]:
import os

In [2]:
%pwd

'c:\\Users\\VenuraP\\Desktop\\Browns Data Projects\\ML Projects\\POC\\Harti-Food-Price-Prediction\\notebooks'

In [3]:
# Step up one directory; per the recorded outputs this moves the working
# directory from the notebooks/ folder to the project root.
os.chdir("../")
%pwd

'c:\\Users\\VenuraP\\Desktop\\Browns Data Projects\\ML Projects\\POC\\Harti-Food-Price-Prediction'

## Data Validation Config Entity

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings for the data validation step.

    Instances are built by ``get_data_validation_config`` from the
    ``data_validation`` section of the config and the ``COLUMNS`` section of
    the schema.
    """

    # Directory created for this step by the pipeline before validation runs.
    root_dir: Path
    # Path of the text file that receives the "Validation status: ..." line.
    STATUS_FILE: str
    # Path handed to ``pd.read_excel`` by ``validate_all_columns``; despite the
    # name, it is read as an Excel file.
    unzip_data_dir: Path
    # Schema columns; validation only uses the keys (expected column names).
    all_schema: dict

## Data Validation Configuration

In [5]:
from src.constants import *
from src.utils.common import read_yaml, create_directories

In [6]:
from pathlib import Path

# Function to load the pipeline configuration and the schema from their YAML files
def load_configuration(config_filepath: Path = CONFIG_FILE_PATH, schema_filepath: Path = SCHEMA_FILE_PATH):
    """Read the configuration and schema YAML files.

    Args:
        config_filepath: Location of the configuration YAML; defaults to
            ``CONFIG_FILE_PATH``.
        schema_filepath: Location of the schema YAML; defaults to
            ``SCHEMA_FILE_PATH``.

    Returns:
        A ``(config, schema)`` tuple holding what ``read_yaml`` returns for
        each file, in that order.
    """
    # The config file is read first, then the schema file.
    return read_yaml(config_filepath), read_yaml(schema_filepath)


# Function to get data validation configuration from the loaded config
def get_data_validation_config(config, schema) -> DataValidationConfig:
    """Build the ``DataValidationConfig`` for the validation step.

    Args:
        config: Loaded configuration exposing a ``data_validation`` attribute
            with ``root_dir``, ``unzip_data_dir`` and ``STATUS_FILE``.
        schema: Loaded schema exposing a ``COLUMNS`` attribute.

    Returns:
        DataValidationConfig: Settings copied from ``config.data_validation``
        plus ``schema.COLUMNS`` as ``all_schema``.
    """
    section = config.data_validation

    # Pull each setting out by name, then hand them over as keyword arguments.
    root_dir = section.root_dir
    data_path = section.unzip_data_dir
    status_file = section.STATUS_FILE
    columns = schema.COLUMNS

    return DataValidationConfig(
        root_dir=root_dir,
        unzip_data_dir=data_path,
        STATUS_FILE=status_file,
        all_schema=columns,
    )


# # Load config and schema
# config, schema = load_configuration()
# # Retrieve the data validation configuration from the loaded config
# data_validation_config = get_data_validation_config(config, schema)
# # Create the data validation root directory
# create_directories([data_validation_config.root_dir])

## Data Validation Components 

In [7]:
import pandas as pd

def validate_all_columns(config):
    """
    Validate that every column in the data is declared in the schema.

    The check is one-directional: a data column missing from the schema fails
    validation, while schema columns absent from the data are not reported.

    Args:
        config: DataValidationConfig instance. ``unzip_data_dir`` is the Excel
            file to read, ``all_schema`` maps expected column names to their
            definitions, and ``STATUS_FILE`` is where the result is written.

    Returns:
        bool: True if all data columns are schema keys (also True for a file
        with no columns), False otherwise.

    Raises:
        Exception: Anything raised while reading the Excel file or writing the
            status file propagates unchanged to the caller.
    """
    # Load data from the specified Excel file (explicitly specify the engine)
    data = pd.read_excel(config.unzip_data_dir, engine='openpyxl')

    # Only the schema keys (column names) matter for this check
    schema_columns = config.all_schema.keys()

    # all() stops at the first unknown column, like the original early break
    validation_status = all(col in schema_columns for col in data.columns)

    # The status file is overwritten exactly once per run, whatever the outcome
    with open(config.STATUS_FILE, 'w', encoding='utf-8') as f:
        f.write(f"Validation status: {validation_status}\n")

    return validation_status

[2024-09-25 09:22:52,250: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-09-25 09:22:52,250: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-09-25 09:22:52,255: INFO: common: created directory at: artifacts/data_validation]


## Pipeline (Data_Validation)

In [None]:
from src import logger

def data_validation_training_pipeline():
    """Run the data validation stage end to end.

    Steps: load the config and schema, build the ``DataValidationConfig``,
    create the stage's root directory, then validate the data columns against
    the schema.

    Returns:
        None. Any exception raised by a step is logged through
        ``logger.error`` and not re-raised.
    """
    try:
        # Both values are needed: the schema supplies the expected columns
        config, schema = load_configuration()

        # Build the data validation configuration from the config and schema
        data_validation_config = get_data_validation_config(config, schema)

        # Create the root directory for the data validation stage
        create_directories([data_validation_config.root_dir])

        # Check the data columns against the schema and write the status file
        validate_all_columns(data_validation_config)

    except Exception as e:
        logger.error(f"An error occurred during data validation: {e}")

# Run the pipeline only when this code is the entry point (``__name__`` is
# "__main__"), not when it is imported.
if __name__ == "__main__":

    data_validation_training_pipeline()