In [1]:
import os
import pandas as pd
import numpy as np


In [2]:
# Print the working directory, move one level up, then print it again to confirm the change.
# NOTE(review): os.chdir("../") is relative to the current directory, so re-running this
# cell climbs one more level each time; run it exactly once per kernel session.
%pwd
os.chdir("../")
%pwd

'g:\\success_analytics_courses\\internship_project\\pulsar_project'

In [3]:
#step - 1 : config.yaml updated
#step - 2 : constant file updated
#step - 3 : entity

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfiguration:
    """Immutable bundle of settings describing the data-ingestion stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Name of the ingestion stage's root directory.
    root_dir_name: Path
    # URL the dataset archive is fetched from.
    dataset_download_url: str
    # Directory for the zipped download.
    zip_data_dir_name: Path
    # Directory for the extracted files.
    unzip_data_dir_name: Path
    
@dataclass(frozen=True)
class DataValidationConfiguration:
    """Immutable bundle of settings describing the data-validation stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Name of the validation stage's root directory.
    validated_root_dir_name: Path
    # Directory that receives the validated training data.
    validated_train_dir: Path
    # Directory that receives the validated test data.
    validated_test_dir: Path
    # Status report file the validation checks write to.
    validated_status_report_file_name: str
    # Files the validation stage requires.
    validated_required_files: list

In [5]:
#step - 4 : configuration manager in src config
from pulsarclassification.constants import *
from pulsarclassification.logging import logging
from pulsarclassification.utils.common import read_yaml,create_directories

class ConfigurationManager:
    """Read the project YAML config and build the per-stage configuration objects.

    Constructing the manager and calling either ``get_*`` method has a
    filesystem side effect: the relevant artifact directories are created via
    ``create_directories``.
    """

    def __init__(self, config_file_path: str = CONFIG_FILE_PATH):
        """Load the YAML configuration and create the artifacts directory.

        Args:
            config_file_path: Path of the YAML configuration file to read.
                Defaults to ``CONFIG_FILE_PATH``.

        Raises:
            Whatever ``read_yaml`` or ``create_directories`` raise; errors are
            propagated unchanged.
        """
        # Read from the path the caller passed in (not the module constant) so
        # that an alternative configuration file can actually be used.
        self.config = read_yaml(config_file_path)
        create_directories(self.config.artifacts_dir_name)
        logging.info(f" Artifacts directory created at : {self.config.artifacts_dir_name} ")

    def get_data_ingestion_config(self) -> DataIngestionConfiguration:
        """Create the ingestion directories and return the ingestion settings.

        Returns:
            A ``DataIngestionConfiguration`` whose zip/unzip directories are
            joined under ``<artifacts>/<root_dir_name>``. ``root_dir_name``
            itself is passed through exactly as written in the config (not
            joined with the artifacts directory).
        """
        artifact_dir = self.config.artifacts_dir_name
        config = self.config.data_ingestion_config

        data_ingestion_dir = os.path.join(artifact_dir, config.root_dir_name)
        create_directories(data_ingestion_dir)

        raw_data_dir = os.path.join(data_ingestion_dir, config.zip_data_dir_name)
        create_directories(raw_data_dir)

        ingested_csv_data_dir = os.path.join(data_ingestion_dir, config.unzip_data_dir_name)
        create_directories(ingested_csv_data_dir)

        data_ingestion_config = DataIngestionConfiguration(
            root_dir_name=config.root_dir_name,
            dataset_download_url=config.dataset_download_url,
            zip_data_dir_name=raw_data_dir,
            unzip_data_dir_name=ingested_csv_data_dir,
        )

        logging.info(f" Data ingestion configuration: {data_ingestion_config}")

        return data_ingestion_config

    def get_data_validation_configuration(self) -> DataValidationConfiguration:
        """Create the validation directories and return the validation settings.

        Returns:
            A ``DataValidationConfiguration`` whose train/test directories and
            status report path are joined under
            ``<artifacts>/<validated_root_dir_name>``.
            ``validated_root_dir_name`` and ``validated_required_files`` are
            passed through exactly as written in the config.
        """
        artifact_dir = self.config.artifacts_dir_name
        config = self.config.data_validation_config

        data_validation_dir = os.path.join(artifact_dir, config.validated_root_dir_name)
        create_directories(data_validation_dir)

        data_validation_train_dir = os.path.join(data_validation_dir, config.validated_train_dir)
        create_directories(data_validation_train_dir)

        data_validation_test_dir = os.path.join(data_validation_dir, config.validated_test_dir)
        create_directories(data_validation_test_dir)

        data_validation_config = DataValidationConfiguration(
            validated_root_dir_name=config.validated_root_dir_name,
            validated_train_dir=data_validation_train_dir,
            validated_test_dir=data_validation_test_dir,
            validated_status_report_file_name=os.path.join(
                data_validation_dir, config.validated_status_report_file_name
            ),
            validated_required_files=config.validated_required_files,
        )

        logging.info(f" Data validation configuration: {data_validation_config}")

        return data_validation_config

In [62]:
#stage - 6 : updating components

import os
from pathlib import Path
from pulsarclassification.logging import logging
from pulsarclassification.constants import *
from pulsarclassification.utils.common import read_yaml,create_directories,get_file_size
from pulsarclassification.entity import DataIngestionConfiguration,DataValidationConfiguration

class DataValidation:
    """Validate the ingested train and industrial-test CSV files against the schema.

    Every check writes a section to the status report file named by
    ``validation_config.validated_status_report_file_name``.
    ``file_exist_validation`` opens the report with mode ``'w'`` (starting it
    afresh) and every other check appends, so it should be run first.

    Each boolean check returns ``True`` only when *all* of its sub-checks
    passed.
    """

    def __init__(self, ingestion_config: "DataIngestionConfiguration",
                 validation_config: "DataValidationConfiguration"):
        """Store both stage configurations and load the schema.

        Args:
            ingestion_config: Ingestion settings; ``unzip_data_dir_name`` is
                the directory the CSV files are read from.
            validation_config: Validation settings (report path, required
                files, output directories).

        Raises:
            Whatever ``read_yaml`` raises while reading ``SCHEMA_FILE_PATH``.
        """
        self.ingestion_config = ingestion_config
        self.validation_config = validation_config
        self.schema = read_yaml(SCHEMA_FILE_PATH)

    def _load_ingested_frames(self):
        """Read the ingested train and test CSV files and return ``(df_train, df_test)``."""
        data_dir = self.ingestion_config.unzip_data_dir_name
        df_train = pd.read_csv(os.path.join(data_dir, INGESTED_TRAIN_FILE_NAME))
        df_test = pd.read_csv(os.path.join(data_dir, INGESTED_TEST_FILE_NAME))
        return df_train, df_test

    def file_exist_validation(self) -> bool:
        """Check that every required file exists in the ingested-data directory.

        The required names come from
        ``validation_config.validated_required_files`` and are compared with
        the entries of ``ingestion_config.unzip_data_dir_name``.

        Returns:
            ``True`` when every required file is present, ``False`` otherwise
            (``True`` as well when no required files are configured).
        """
        present_files = set(os.listdir(self.ingestion_config.unzip_data_dir_name))
        # assumes the entries are bare file names relative to the ingested-data
        # directory — TODO confirm against config.yaml
        required_files = self.validation_config.validated_required_files or []

        overall_status = True
        with open(self.validation_config.validated_status_report_file_name, 'w') as f:
            f.write(f">>>>>>>>>>>>file exist validation<<<<<<<<<<<<<\n\n")
            # Iterate over the *required* files; looping over the directory
            # listing and testing membership in that same listing can never fail.
            for file in required_files:
                file_exist_status = file in present_files
                if not file_exist_status:
                    f.write(f"Validation status: {file_exist_status}------->{file} not present\n\n")
                else:
                    f.write(f"Validation status: {file_exist_status}------->{file} is present\n\n")
                overall_status = overall_status and file_exist_status
        logging.info(f"Validation status updated: {self.validation_config.validated_status_report_file_name}")
        return overall_status

    def number_of_columns_validation(self) -> bool:
        """Check the feature-column count of the train and test files.

        The target column is dropped from the train frame before counting;
        both counts are compared with ``schema.number_of_feature_columns``.

        Returns:
            ``True`` only when both files have the expected number of columns.
        """
        df_train, df_test = self._load_ingested_frames()
        expected_count = self.schema.number_of_feature_columns

        # Non-inplace drop: keep the loaded frame intact and count features only.
        train_feature_count = df_train.drop(columns=self.schema.target_column).shape[1]
        test_feature_count = df_test.shape[1]

        train_ok = train_feature_count == expected_count
        test_ok = test_feature_count == expected_count

        with open(self.validation_config.validated_status_report_file_name, 'a') as f:
            f.write(f">>>>>>>>>>>>number of column validation<<<<<<<<<<<<<\n\n")
            f.write(f"Validation status:{train_ok}-------> Training file has {train_feature_count} columns\n\n")
            # Report the test file's own column count (not the train file's).
            f.write(f"Validation status:{test_ok}-------> Industrial test file has {test_feature_count} columns\n\n")
        logging.info(f"Validation status updated: {self.validation_config.validated_status_report_file_name}")
        return train_ok and test_ok

    def datatype_of_columns_validation(self) -> bool:
        """Check column dtypes of both files against ``schema.datatype_of_columns``.

        Every column of the test file must exist in the train file and have
        the schema's dtype in both; the target column's dtype is checked in
        the train file only.

        Returns:
            ``True`` only when every feature and the target column passed.
        """
        df_train, df_test = self._load_ingested_frames()
        expected_dtypes = self.schema.datatype_of_columns
        target_column = self.schema.target_column

        overall_status = True
        with open(self.validation_config.validated_status_report_file_name, 'a') as f:
            f.write(f">>>>>>>>>>>>datatype of column validation<<<<<<<<<<<<<\n\n")
            for feature in df_test.columns.to_list():
                expected = expected_dtypes[feature]
                # A feature missing from the train file is reported as a failed
                # check instead of aborting the whole report with a KeyError.
                vs = bool(
                    feature in df_train.columns
                    and df_train[feature].dtype == expected
                    and df_test[feature].dtype == expected
                )
                if vs:
                    f.write(f"Validation status:{vs}-------> The {feature} is present in Training file and Industrial test file has datatype {expected} \n\n")
                else:
                    f.write(f"Validation status:{vs}-------> The {feature} is not present in Training file and Industrial test file. Please check this {feature} \n\n")
                overall_status = overall_status and vs

            vs = bool(df_train[target_column].dtype == expected_dtypes[target_column])
            if vs:
                f.write(f"Validation status:{vs}-------> The target column i.e {target_column} is present in train file\n\n")
            else:
                f.write(f"Validation status:{vs}-------> The target column i.e {target_column} is not present in train file\n\n")
            overall_status = overall_status and vs
        logging.info(f"Validation status updated: {self.validation_config.validated_status_report_file_name}")
        return overall_status

    def null_value_of_columns_validation(self) -> bool:
        """Check both files for missing values and list the affected columns.

        Returns:
            ``True`` when neither file contains a null value.
        """
        df_train, df_test = self._load_ingested_frames()
        train_null_features = [feature for feature in df_train.columns if df_train[feature].isna().any()]
        test_null_features = [feature for feature in df_test.columns if df_test[feature].isna().any()]
        vs = not train_null_features and not test_null_features

        with open(self.validation_config.validated_status_report_file_name, 'a') as f:
            f.write(f">>>>>>>>>>>>null value of column validation<<<<<<<<<<<<<\n\n")
            if vs:
                f.write(f"Validation status:{vs}-------> The is no null value in train and industrial test data\n\n")
            # Independent checks: nulls in the test file are reported even when
            # the train file has nulls too.
            if train_null_features:
                f.write(f"Validation status:{vs}-------> These features {train_null_features} have null value in train file \n\n")
            if test_null_features:
                f.write(f"Validation status:{vs}-------> These features {test_null_features} have null value in industrial test file \n\n")
        logging.info(f"Validation status updated: {self.validation_config.validated_status_report_file_name}")
        return vs

    def unique_value_of_columns_validation(self):
        """Append the number of unique values of every column of both files to the report."""
        df_train, df_test = self._load_ingested_frames()

        with open(self.validation_config.validated_status_report_file_name, 'a') as f:
            f.write(f">>>>>>>>>>>>unique value of each column<<<<<<<<<<<<<\n\n")
            f.write(f">>>>>>>>>>>>unique value of each column in train data<<<<<<<<<<<<<\n\n")
            for feature in df_train.columns:
                f.write(f"{feature} has {df_train[feature].nunique()} unique values \n\n")

            f.write(f">>>>>>>>>>>>unique value of each column in industrial test data<<<<<<<<<<<<<\n\n")
            for feature in df_test.columns:
                f.write(f"{feature} has {df_test[feature].nunique()} unique values \n\n")
        logging.info(f"Validation status updated: {self.validation_config.validated_status_report_file_name}")

    def saving_validated_data(self):
        """Copy the ingested train and test CSV files into the validated-data directories.

        The files are re-read and written without the index column. This
        method does not look at the results of the validation checks; gating
        on them is left to the caller.
        """
        df_train, df_test = self._load_ingested_frames()

        validated_training_data_file_path = os.path.join(self.validation_config.validated_train_dir, VALIDATED_DATA_FILE_NAME_FOR_MODEL_TRAIN)
        validated_industrial_test_data_file_path = os.path.join(self.validation_config.validated_test_dir, VALIDATED_INDUSTRIALDATA_FILE_NAME)

        df_train.to_csv(validated_training_data_file_path, index=False)
        df_test.to_csv(validated_industrial_test_data_file_path, index=False)

        logging.info(f"Validated train data saved in : {validated_training_data_file_path}")
        logging.info(f"Validated industrial test data saved in : {validated_industrial_test_data_file_path}")
        

In [63]:
#from pulsarclassification.config.configuration import ConfigurationManager
#from pulsarclassification.components.data_validation import DataValidation

# Build both stage configurations, then run the validation stage end to end.
# Any exception simply propagates to the notebook.
config = ConfigurationManager()
data_ingestion_configuration = config.get_data_ingestion_config()
data_validation_config = config.get_data_validation_configuration()
data_validation = DataValidation(
    ingestion_config=data_ingestion_configuration,
    validation_config=data_validation_config,
)

# file_exist_validation opens the status report with mode 'w' and the later
# checks append to it, so the order of these calls matters.
data_validation.file_exist_validation()
data_validation.number_of_columns_validation()
data_validation.datatype_of_columns_validation()
data_validation.null_value_of_columns_validation()
data_validation.unique_value_of_columns_validation()
data_validation.saving_validated_data()

[20-08-2023 22:22:04: INFO: common:  yaml file from this path config\config.yaml read succesfully]
[20-08-2023 22:22:04: INFO: common:  Directory already present: artifacts ]
[20-08-2023 22:22:04: INFO: 3402709211:  Artifacts directory created at : artifacts ]
[20-08-2023 22:22:04: INFO: common:  Directory already present: artifacts\data_ingestion ]
[20-08-2023 22:22:04: INFO: common:  Directory already present: artifacts\data_ingestion\raw_data ]
[20-08-2023 22:22:04: INFO: common:  Directory already present: artifacts\data_ingestion\ingested_data ]
[20-08-2023 22:22:04: INFO: 3402709211:  Data ingestion configuration: DataIngestionConfiguration(root_dir_name='data_ingestion', dataset_download_url='https://github.com/sumit-1492/datasets/raw/main/playground-series-s3e10.zip', zip_data_dir_name='artifacts\\data_ingestion\\raw_data', unzip_data_dir_name='artifacts\\data_ingestion\\ingested_data')]
[20-08-2023 22:22:04: INFO: common:  Directory already present: artifacts\data_validation ]

[20-08-2023 22:22:04: INFO: 3360515675: Validation status updated: artifacts\data_validation\status.txt]
[20-08-2023 22:22:05: INFO: 3360515675: Validation status updated: artifacts\data_validation\status.txt]
[20-08-2023 22:22:05: INFO: 3360515675: Validation status updated: artifacts\data_validation\status.txt]
[20-08-2023 22:22:06: INFO: 3360515675: Validation status updated: artifacts\data_validation\status.txt]
[20-08-2023 22:22:09: INFO: 3360515675: Validated train data saved in : artifacts\data_validation\training_data_for_model\pulsar.csv]
[20-08-2023 22:22:09: INFO: 3360515675: Validated industrial test data saved in : artifacts\data_validation\industrial_test_data\Industrial_pulsar_data.csv]
