In [1]:
import os
# Move the kernel's working directory one level up from wherever the notebook
# was started; the path echoed by %pwd below is the resulting directory.
# NOTE(review): presumably this is meant to land on the project root so that
# relative config/schema paths resolve - confirm.
# The path is relative, so re-running this cell moves up one more level each
# time; run it once per kernel session.
os.chdir("../")
%pwd

'g:\\sumit\\WORK\\Phisingurl\\phising_url_tabular_data\\phising_url_end_to_end'

In [2]:
##01 config.yaml updated
##02 constants updated
##03 entity

from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataValidationConfiguration:
    """Filesystem locations used by the data-validation stage.

    All three fields are directory paths resolved under the artifacts
    directory by ``ConfigurationManager.get_data_validation_configuration``.
    """

    # Root directory for this stage's outputs.
    root_dir_name: Path
    # Directory created for the validated data.
    validate_data_path: Path
    # Directory that holds the validation status file (consumers join a file
    # name onto this path).
    status_file_path: Path

In [3]:
##04 updating ConfigurationManager in src config

from phising_project.constants import *
from phising_project.utils.common import *

class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage configuration objects.

    Constructing an instance reads the config, schema and parameter YAML files
    and creates the top-level artifacts directory named in the config.
    """

    def __init__(self,
                 config_file_path = CONFIG_FILE_PATH,
                 schema_file_path = SCHEMA_FILE_PATH,
                 parameter_file_path = PARAMETER_FILE_PATH):
        """Read the three YAML files and create the artifacts directory.

        Args:
            config_file_path: Location of the main configuration YAML.
            schema_file_path: Location of the schema YAML.
            parameter_file_path: Location of the parameters YAML.

        Raises:
            Exception: Anything raised by ``read_yaml_file`` or
                ``create_directories`` propagates unchanged.
        """
        self.config = read_yaml_file(config_file_path)
        self.schema = read_yaml_file(schema_file_path)
        self.params = read_yaml_file(parameter_file_path)
        self.artifacts_dir_name = self.config.artifacts_dir_name

        # Always hand create_directories a list so every call site uses the
        # same argument shape. NOTE(review): the helper presumably iterates
        # over its argument, in which case a bare string would be walked
        # character by character - confirm against utils.common.
        create_directories([self.artifacts_dir_name])

        logger.info(f"Artifacts directory created at : {self.config.artifacts_dir_name}")

    def _make_artifact_subdir(self, relative_path) -> Path:
        """Create ``relative_path`` under the artifacts directory and return it as a Path."""
        directory = Path(os.path.join(self.artifacts_dir_name, relative_path))
        create_directories([directory])
        return directory

    def get_data_validation_configuration(self) -> DataValidationConfiguration:
        """Create the data-validation directories and describe them.

        Returns:
            DataValidationConfiguration: The root, validated-data and status
            directories, each resolved under the artifacts directory.

        Raises:
            Exception: Anything raised while reading the config section or
                creating the directories propagates unchanged.
        """
        config = self.config.data_validation_config

        # Directories are created in the same order as before: root,
        # validated data, then status.
        data_validation_config = DataValidationConfiguration(
            root_dir_name = self._make_artifact_subdir(config.root_dir_name),
            validate_data_path = self._make_artifact_subdir(config.validate_data_path),
            status_file_path = self._make_artifact_subdir(config.status_file_path)
        )
        logger.info(f"Data validation configuration updated: {data_validation_config}")

        return data_validation_config

In [None]:
##05 updating components

import os
import pandas as pd
import numpy as np
from phising_project.logging import logger
from phising_project.constants import *
from phising_project.utils.common import *
from phising_project.entity import DataIngestionConfiguration
from phising_project.config.configuration import ConfigurationManager

class DataValidation:
    """Validates the ingested CSV files against the project schema.

    Each check records its outcome in a status file located inside
    ``config.status_file_path``.
    """

    def __init__(self,
                 ingestion_config:DataIngestionConfiguration,
                 config:DataValidationConfiguration):
        """Store both configurations and load the expected column names.

        Args:
            ingestion_config: Ingestion settings; ``unzip_data_path`` is the
                directory searched for the CSV files.
            config: Validation settings; ``status_file_path`` is the directory
                in which the status file is written.

        Raises:
            Exception: Anything raised while building ``ConfigurationManager``
                or reading the schema propagates unchanged.
        """
        self.ingestion_config = ingestion_config
        self.config = config
        self.validation_status_file_name = os.path.join(self.config.status_file_path,VALIDATION_STATUS_FILE_NAME)

        # A fresh ConfigurationManager is built only to obtain the schema; its
        # constructor also re-reads the YAML files and re-creates the artifacts
        # directory.
        cfgm = ConfigurationManager()
        self.schema = cfgm.schema
        self.non_feature_column = self.schema.NON_FEATURE_COLUMN
        self.feature_columns = self.schema.FEATURE_COLUMNS
        self.target_column = self.schema.TARGET_COLUMN
        # Each schema section is a mapping; only the keys (column names) are needed here.
        self.total_columns = (
            list(self.non_feature_column.keys())
            + list(self.feature_columns.keys())
            + list(self.target_column.keys())
        )

    def file_exist_validation(self):
        """Check that both required CSV files exist, record the result, and load them.

        The status file is opened in write mode, so every call starts a new
        report.

        Returns:
            tuple: ``(malicious_file_data, non_malicious_file_data)`` as pandas
            DataFrames.

        Raises:
            SystemExit: With exit code 1 when either required file is missing.
            Exception: Filesystem or CSV-parsing errors propagate unchanged.
        """
        # Imported locally because no import visible in this cell brings `sys` into scope.
        import sys

        unzip_dir = self.ingestion_config.unzip_data_path
        all_files = os.listdir(unzip_dir)
        require_file_status = (MALICIOUS_URL_FILE_NAME in all_files) and (NON_MALICIOUS_URL_FILE_NAME in all_files)

        # The with-block closes the file; no explicit close() is needed afterwards.
        with open(self.validation_status_file_name,"w") as f:
            f.write(">>>>>>>>>>>>>>>>>>>>>>>file exist validation<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n")
            if require_file_status:
                f.write(f"validation status : {require_file_status} ----------------> {MALICIOUS_URL_FILE_NAME} and {NON_MALICIOUS_URL_FILE_NAME} are present\n\n")
            else:
                f.write(f"validation status : {require_file_status} ----------------> require file for training are not present\n\n")

        if not require_file_status:
            logger.info("file exist validation failed")
            sys.exit(1)

        non_malicious_file_data = pd.read_csv(os.path.join(unzip_dir,NON_MALICIOUS_URL_FILE_NAME))
        malicious_file_data = pd.read_csv(os.path.join(unzip_dir,MALICIOUS_URL_FILE_NAME))
        logger.info(f"Validation status updted in : {self.validation_status_file_name}")

        return malicious_file_data,non_malicious_file_data

    def number_of_columns_validation(self):
        """Compare the column count of both CSV files with the schema's column count.

        Runs ``file_exist_validation`` first (which rewrites the status file),
        then appends the outcome of this check to the same file.

        Returns:
            bool: True when the schema, the malicious file and the
            non-malicious file all have the same number of distinct columns.

        Raises:
            SystemExit: Propagated from ``file_exist_validation`` when a
                required file is missing.
        """
        malicious_file,non_malicious_file = self.file_exist_validation()

        # Only the counts of distinct names are compared; the names themselves are not.
        schema_column_count = len(set(self.total_columns))
        malicious_column_count = len(set(malicious_file.columns))
        non_malicious_column_count = len(set(non_malicious_file.columns))
        number_of_column_status = schema_column_count == malicious_column_count == non_malicious_column_count

        with open(self.validation_status_file_name,"a") as f:
            f.write(">>>>>>>>>>>>>>>>>>>>>>>number of columns validation<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n")
            if number_of_column_status:
                f.write(f"validation status : {number_of_column_status} ----------------> total number of columns are : {schema_column_count}\n\n")
            else:
                # Adjacent literals are concatenated; a single quoted f-string
                # cannot contain a raw line break in its text.
                f.write(
                    f"validation status : {number_of_column_status} ----------------> "
                    f"num_of_schema_column:{schema_column_count}, "
                    f"num_of_malicious_column:{malicious_column_count}, "
                    f"num_of_nonmalicious_column:{non_malicious_column_count}\n\n"
                )

        # TODO: a column-name match check appears to have been planned after a
        # successful count check, but it was never implemented.
        return number_of_column_status




                    
            


            