### Imports

In [1]:
from dataclasses import dataclass
from pathlib import Path
from Phishing_Detector.utils import *
from Phishing_Detector.constants import *
from urllib import request
from scipy.io import arff
import pandas as pd

In [3]:
# Move the process working directory one level up and display the result.
# NOTE(review): a relative chdir is not idempotent - re-running this cell
# moves up one more directory each time.
# NOTE(review): `os` has no explicit import in the visible import cell;
# presumably it is exposed by one of the star imports - confirm.
os.chdir('../')
#os.chdir('../DS_Projects/Phishing-Domain-Detection/')
os.getcwd()

'c:\\DS_Projects\\Phishing-Domain-Detection'

### Config Entity

In [11]:
@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings for the data-validation step."""

    # Validation schema; DataValidation.validate_data reads its 'columns',
    # 'no_of_cols' and 'no_of_rows' entries.
    schema: dict
    # Location of the processed dataset, loaded with pandas.read_csv.
    processed_data_file_path: Path


### Configuration

In [12]:
class ConfigurationManager:
    """Load the YAML configuration files and build per-step config entities."""

    def __init__(
            self, 
            config_path = CONFIG_FILE_PATH,
            params_path = PARAMS_FILE_PATH
    ):
        """Read both YAML files and hand the artifacts root to create_directories.

        Args:
            config_path: Location of the main configuration YAML.
            params_path: Location of the parameters YAML.
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)
        # presumably creates the artifacts root if it is missing - the
        # behaviour lives in the project helper `create_directories`.
        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the DataValidationConfig from the `data_validation` section.

        Returns:
            A DataValidationConfig holding the section's schema and the
            processed-data path as a `Path` object.
        """
        section = self.config.data_validation

        return DataValidationConfig(
            schema=section.schema,
            # The entity annotates this field as Path, so convert the raw
            # YAML value instead of storing it unchanged.
            processed_data_file_path=Path(section.processed_data_file_path),
        )



In [13]:
class DataNotValid(Exception):
    """Signal that a dataset failed validation.

    The text passed as ``message`` is kept on the ``message`` attribute and
    is also the exception's string representation.
    """

    def __init__(self, message = "Data is not valid"):
        super().__init__(message)
        self.message = message

In [24]:
class DataValidation:
    """Validate the processed dataset against the configured schema.

    The schema is read from ``config.schema`` and must provide three entries:
    ``columns`` (mapping of column name to expected dtype), ``no_of_cols``
    and ``no_of_rows``.
    """

    def __init__(self, config:DataValidationConfig):
        self.config = config

    @staticmethod
    def _validate_columns(schema, df):
        """Check that every column of ``df`` is declared in ``schema`` with the same dtype.

        Every offending column is reported, not only the first one.

        Returns:
            True when ``df`` has at least one column and all of them match.
        """
        print("Validating columns.")
        # A frame without any column cannot satisfy the schema.
        validation_status = len(df.columns) > 0

        for column in df.columns:
            if column not in schema:
                validation_status = False
                print(f"Invalid columns found - {column}.")
            elif df[column].dtype != schema[column]:
                validation_status = False
                print(f"Invalid data type found - {column} : {df[column].dtype}.")

        if validation_status:
            print("All Columns are valid.")
        else:
            print("Failed to validate columns.")

        return validation_status

    @staticmethod
    def _validate_count(kind, expected, actual):
        """Compare an actual row/column count with the schema value.

        Args:
            kind: Either ``"columns"`` or ``"rows"``; only used in messages.
            expected: Count declared in the schema.
            actual: Count found in the loaded frame.

        Returns:
            True when both counts are equal.
        """
        print(f"Validating number of {kind}.")

        if expected == actual:
            print(f"Valid number of {kind} found.")
            return True

        print(f"Invalid no of {kind} - {actual}.")
        print(f"Failed to validate total number of {kind}.")
        return False

    def validate_data(self):
        """Load the processed CSV and run all schema checks on it.

        Raises:
            DataNotValid: When at least one check fails; the message names
                the failed checks.

        Errors raised while reading the schema or the CSV file propagate
        unchanged.
        """
        print(f"{'-'*30} Validating the dataset {'-'*30}")
        print("Reding validation schema.")
        schema = self.config.schema
        schema_columns = schema['columns']
        schema_total_cols = schema['no_of_cols']
        schema_total_rows = schema['no_of_rows']
        print("Loading the data.")
        df = pd.read_csv(Path(self.config.processed_data_file_path))

        # Run every check before deciding, so a single run reports all
        # problems instead of stopping at the first failed check.
        results = {
            "columns": self._validate_columns(schema_columns, df),
            "number of columns": self._validate_count(
                "columns", schema_total_cols, df.shape[1]
            ),
            "number of rows": self._validate_count(
                "rows", schema_total_rows, df.shape[0]
            ),
        }

        failed_checks = [name for name, passed in results.items() if not passed]
        if failed_checks:
            raise DataNotValid(
                f"Data is not valid: failed checks - {', '.join(failed_checks)}"
            )


        


In [25]:
# Run the validation step end to end: load the configuration, build the
# data-validation config entity, then validate the processed dataset.
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config = data_validation_config)
    data_validation.validate_data()
except Exception as e:
    # Re-raised unchanged, so any failure is shown in the cell output.
    raise e

------------------------------ Validating the dataset ------------------------------
Reding validation schema.
Loading the data.
Validating columns.
All Columns are valid.
Validating number of columns.
Valid number of columns found.
Validating number of rows.
Valid number of rows found.
