In [1]:
import os

In [2]:
%pwd

'/home/tejas/MLProj/Thyroid-Disease-Prediction/research'

In [3]:
# Step up to the project root so the relative config paths used below resolve.
# Guarded so that re-running this cell is a no-op: an unconditional
# os.chdir('../') climbs one directory further on every execution.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [4]:
%pwd

'/home/tejas/MLProj/Thyroid-Disease-Prediction'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings for the data-validation stage.

    Instances are built by ConfigurationManager.get_data_validation_config
    and consumed by DataValidation.
    """
    # Directory for this stage's artifacts; created by
    # ConfigurationManager.get_data_validation_config.
    root_dir: Path
    # Path of the text file that validate_all_columns writes the status to.
    STATUS_FILE: str
    # Passed straight to pd.read_csv, so despite the name it must point at
    # the CSV file itself.
    unzip_data_dir: Path
    # Mapping of column name -> expected dtype; each column's pandas dtype is
    # compared against the value stored here.
    all_schema: dict

In [6]:
from ThyroidProject.constants import *
from ThyroidProject.utils.common import read_yaml , create_directories

In [7]:
class ConfigurationManager:
    """Load the project's YAML files and build per-stage config objects."""

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        """
        Read the three YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.

        Note:
            Loading is delegated to read_yaml; whatever it raises for a
            missing or malformed file propagates to the caller unchanged.
        """
        # Values are read with attribute access below, so read_yaml is
        # presumably returning a ConfigBox-like object -- TODO confirm.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Top-level artifacts directory named in the main config.
        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """
        Assemble the settings for the data-validation stage.

        Reads the `data_validation` section of the main config and the
        `COLUMNS` section of the schema, and passes the stage's root
        directory to create_directories before building the result.

        Returns:
            DataValidationConfig: Paths from the config section plus the
            column schema.
        """
        stage_settings = self.config.data_validation
        column_schema = self.schema.COLUMNS

        create_directories([stage_settings.root_dir])

        return DataValidationConfig(
            root_dir=stage_settings.root_dir,
            STATUS_FILE=stage_settings.STATUS_FILE,
            unzip_data_dir=stage_settings.unzip_data_dir,
            all_schema=column_schema,
        )

In [8]:
import pandas as pd

In [9]:
class DataValidation:
    """Check the raw thyroid dataset's columns against the configured schema."""

    # Column names applied to the raw CSV, in file order. These strings are
    # compared against the schema keys, so they are kept exactly as they were
    # (including the spelling of "on_antihyroid_meds").
    COLUMN_NAMES = (
        "age", "sex", "on_thyroxine", "query_on_thyroxine", "on_antihyroid_meds", "sick", "pregnant", "thyroid_surgery", "I131_treatment", "query_hypothyroid", "query_hyperthyroid", "lithium", "goitre",
        "tumor", "hypopituitary", "psych", "TSH_measured", "TSH", "T3_measured", "T3", "TT4_measured", "TT4", "T4U_measured", "T4U", "FTI_measured", "FTI", "TBG_measured", "TBG", "referral_source", "target",
    )

    # The annotation is quoted so it is resolved lazily; the class can then be
    # defined without the config class already being in scope.
    def __init__(self, config: "DataValidationConfig"):
        """Store the validation settings.

        Args:
            config: Supplies `unzip_data_dir` (the CSV to read),
                `all_schema` (column name -> expected dtype) and
                `STATUS_FILE` (where the result is written).
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """
        Validate every column of the dataset against the schema.

        The CSV at `config.unzip_data_dir` is read with COLUMN_NAMES as its
        column names. A column is valid when it appears in
        `config.all_schema` and its pandas dtype compares equal to the dtype
        recorded there. The overall status is True only if *every* column is
        valid; a single failing column makes it False, regardless of where in
        the file that column sits.

        The result is written once to `config.STATUS_FILE` as
        ``Validation status: <True|False>``.

        Returns:
            bool: True if all columns are valid, False otherwise.

        Raises:
            Exception: Any error from reading the CSV or writing the status
                file propagates unchanged.
        """
        # Read the raw data, naming the columns explicitly
        data = pd.read_csv(self.config.unzip_data_dir, names=list(self.COLUMN_NAMES))

        schema = self.config.all_schema

        # Aggregate over all columns instead of letting the last column
        # checked overwrite the outcome of the earlier ones.
        validation_status = all(
            col in schema and data[col].dtype == schema[col]
            for col in data.columns
        )

        # Write the final status a single time, once it is known
        with open(self.config.STATUS_FILE, "w") as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status

In [10]:
# Run the data-validation stage end to end. Any exception propagates with its
# original traceback, so no catch-and-re-raise wrapper is needed.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)
# Left as the cell's last expression so the resulting status is displayed
# rather than silently discarded.
data_validation.validate_all_columns()

[2024-02-01 00:13:31,405:INFO:common:yaml file: config/config.yaml loaded successfully]
[2024-02-01 00:13:31,409:INFO:common:yaml file: params.yaml loaded successfully]
[2024-02-01 00:13:31,413:INFO:common:yaml file: schema.yaml loaded successfully]
[2024-02-01 00:13:31,414:INFO:common:created directory at :artifacts]
[2024-02-01 00:13:31,416:INFO:common:created directory at :artifacts/data_validation]
