In [1]:
import os

In [2]:
# IPython magic: show the kernel's current working directory before it is changed.
%pwd

'c:\\DataScience\\Projects\\wine_quality\\research'

In [3]:
# Move from the notebook's folder (research/) up to the project root so that
# relative paths such as config/config.yaml and artifacts/ resolve.
# Guarded on the folder name so that re-running this cell does not climb
# another directory level each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
# IPython magic: confirm the working directory after the os.chdir call.
%pwd

'c:\\DataScience\\Projects\\wine_quality'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings for the data-validation stage.

    Instances are built by ``ConfigurationManager.get_data_validation_config``
    and read by ``DataValidation``.

    NOTE(review): the values are taken from the parsed YAML config, so the
    path fields may be plain strings at runtime rather than ``Path`` objects
    -- confirm against ``read_yaml_file``.
    """
    # Directory for this stage; ConfigurationManager creates it before
    # building the config object.
    root_dir: Path
    # Path of the text file that DataValidation writes the validation status to.
    STATUS_FILE: str
    # Passed straight to pd.read_csv by DataValidation, so despite the name
    # it has to point at a CSV file.
    unzip_data_dir: Path
    # Mapping whose keys are the expected column names (the COLUMNS section
    # of the schema YAML).
    all_schema: dict

In [6]:
from mlproject.constants import *
from mlproject.utils.common import read_yaml_file, create_directories

In [7]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load the config, params and schema YAML files.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.

        Side effects:
            Calls ``create_directories`` on the ``artifacts_root`` entry of
            the main configuration.
        """
        # Load order is kept as config -> params -> schema.
        self.config = read_yaml_file(config_filepath)
        self.params = read_yaml_file(params_filepath)
        self.schema = read_yaml_file(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Assemble the settings consumed by the data-validation stage.

        Returns:
            A ``DataValidationConfig`` populated from the ``data_validation``
            section of the main configuration and the ``COLUMNS`` section of
            the schema.

        Side effects:
            Calls ``create_directories`` on the stage's ``root_dir``.
        """
        expected_columns = self.schema.COLUMNS
        stage_cfg = self.config.data_validation

        # The stage directory is created before the config object is built.
        create_directories([stage_cfg.root_dir])

        return DataValidationConfig(
            root_dir=stage_cfg.root_dir,
            STATUS_FILE=stage_cfg.STATUS_FILE,
            unzip_data_dir=stage_cfg.unzip_data_dir,
            all_schema=expected_columns,
        )

In [8]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [9]:
class DataValidation:
    """Checks that a CSV's columns are all declared in the expected schema."""

    def __init__(self, config: "DataValidationConfig"):
        """Store the stage configuration.

        Args:
            config: Object exposing ``unzip_data_dir`` (CSV path),
                ``STATUS_FILE`` (output text file path) and ``all_schema``
                (mapping keyed by expected column name).
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """Validate the dataset's column names against the schema.

        Reads the CSV at ``config.unzip_data_dir`` and checks that every
        column in it is a key of ``config.all_schema``. The verdict is
        written once to ``config.STATUS_FILE`` as
        ``"Validation status: <bool>"``.

        Returns:
            True if every column of the CSV is declared in the schema,
            False if at least one column is not.

        Raises:
            Any exception raised by ``pd.read_csv`` or by opening/writing
            the status file propagates unchanged.

        Note:
            Only columns that are present in the data but absent from the
            schema are detected. Schema columns missing from the data are
            not flagged.
        """
        data = pd.read_csv(self.config.unzip_data_dir)
        expected_columns = set(self.config.all_schema.keys())

        # Collect every offending column. The verdict must depend on all
        # columns, not just on whichever column is inspected last.
        unexpected_columns = [
            col for col in data.columns if col not in expected_columns
        ]
        validation_status = not unexpected_columns

        # Write the final verdict exactly once.
        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status

In [10]:
# Run the data-validation stage end to end: load configuration, build the
# stage config, then validate the dataset's columns.
# No try/except wrapper: catching an exception only to re-raise it unchanged
# adds nothing, and any failure still surfaces with its full traceback.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)
# Left as the cell's last top-level expression so the notebook displays the
# returned validation status.
data_validation.validate_all_columns()

[2024-02-22 23:48:14,859: INFO:common: yaml file: config\config.yaml loaded successfully]
[2024-02-22 23:48:14,863: INFO:common: yaml file: params.yaml loaded successfully]
[2024-02-22 23:48:14,867: INFO:common: yaml file: schema.yaml loaded successfully]
[2024-02-22 23:48:14,867: INFO:common: createD Directory at:artifacts]
[2024-02-22 23:48:14,867: INFO:common: createD Directory at:artifacts/data_validation]
