In [1]:
import os

In [2]:
%pwd

'/home/tanush-reddy/workspace/Wine-Grade-Estimator/research'

In [3]:
# Move from the notebook folder (research/) up to the project root so the
# relative paths used further down (config/, artifacts/) resolve.
# Deriving the root from the current directory avoids a machine-specific
# absolute path, and the guard makes re-running this cell a no-op.
if os.path.basename(os.getcwd()) == "research":
    os.chdir(os.pardir)

In [4]:
%pwd

'/home/tanush-reddy/workspace/Wine-Grade-Estimator'

In [5]:
import pandas as pd


In [8]:
# Load the ingested red-wine CSV (path is relative to the project root)
# and preview its first rows.
df=pd.read_csv("artifacts/data_ingestion/winequality-red.csv")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [9]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [10]:
# Count missing values per column.
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [12]:
# (rows, columns) of the loaded frame.
df.shape

(1599, 12)

In [15]:
# List the column labels of the loaded frame.
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [16]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings consumed by the data-validation stage.

    Instances are built by ``ConfigurationManager.get_data_validation_config``
    and read by ``DataValidation``.
    """

    # Working directory of the validation stage; created before this object is built.
    root_dir: Path
    # Path of the text file that receives the "passed"/"failed" status message.
    STATUS_FILE: str
    # Despite the name, this is handed straight to ``pd.read_csv``, i.e. it is
    # used as the path of the CSV file to validate.
    unzip_data_dir: Path
    # Mapping taken from the schema's COLUMNS section; its keys are the
    # column names the data is checked against.
    all_schema: dict

In [18]:
from src.constants import *
from src.utils.common import read_yaml, create_directories


In [29]:
class ConfigurationManager:
    """Load the project's YAML settings and hand out per-stage config objects.

    Constructing the manager reads the config, params and schema YAML files
    and ensures the top-level artifacts directory exists.
    """

    def __init__(
        self,
        config_file_path=CONFIG_FILE_PATH,
        params_file_path=PARAMS_FILE_PATH,
        schema_file_path=SCHEMA_FILE_PATH,
    ):
        """Remember the three YAML paths, parse them, and create the artifacts root.

        Args:
            config_file_path: Location of the main configuration YAML.
            params_file_path: Location of the parameters YAML.
            schema_file_path: Location of the schema YAML.
        """
        self.config_file_path = config_file_path
        self.params_file_path = params_file_path
        self.schema_file_path = schema_file_path

        # Parsed in the order config -> params -> schema.
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        self.schema = read_yaml(schema_file_path)

        artifacts_dir = self.config.artifacts_root
        create_directories([artifacts_dir])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data-validation stage.

        Reads the ``data_validation`` section of the main config and the
        ``COLUMNS`` section of the schema, creates the stage's root
        directory, and packs everything into a ``DataValidationConfig``.

        Returns:
            DataValidationConfig: The populated, frozen stage configuration.
        """
        stage_cfg = self.config.data_validation
        column_schema = self.schema.COLUMNS.to_dict()

        # The stage directory must exist before anything is written into it.
        create_directories([stage_cfg.root_dir])

        return DataValidationConfig(
            root_dir=Path(stage_cfg.root_dir),
            STATUS_FILE=stage_cfg.STATUS_FILE,
            unzip_data_dir=Path(stage_cfg.unzip_data_dir),
            all_schema=column_schema,
        )

In [30]:
import os
from src import logger
from src.exception import CustomException
import sys

In [33]:
class DataValidation:
    """Check that the columns of the ingested CSV match the expected schema."""

    def __init__(self, config: "DataValidationConfig"):
        """Store the stage configuration.

        Args:
            config: Object exposing ``unzip_data_dir`` (CSV path),
                ``STATUS_FILE`` (where the verdict is written) and
                ``all_schema`` (mapping whose keys are the expected columns).
        """
        self.config = config

    def validate_data(self) -> bool:
        """Compare the CSV's columns with the schema and record the verdict.

        The data is valid only when every column in the file is listed in the
        schema AND every schema column is present in the file. The verdict is
        computed over all columns first and written to ``STATUS_FILE`` exactly
        once, so a later matching column can no longer overwrite an earlier
        failure.

        Returns:
            bool: ``True`` if the column sets match, ``False`` otherwise.

        Raises:
            CustomException: Wraps any error raised while reading the CSV or
                writing the status file.
        """
        try:
            df = pd.read_csv(self.config.unzip_data_dir)
            logger.info("Data loaded successfully")
            logger.info("Validating data")

            data_columns = list(df.columns)
            schema_columns = list(self.config.all_schema.keys())
            known = set(schema_columns)
            present = set(data_columns)

            # Columns in the file that the schema does not know about.
            unexpected = []
            for col in data_columns:
                if col in known:
                    logger.info(f"Column {col} is in schema")
                else:
                    logger.error(f"Column {col} is not in schema")
                    unexpected.append(col)

            # Columns the schema requires but the file does not contain.
            missing = [col for col in schema_columns if col not in present]
            for col in missing:
                logger.error(f"Column {col} is missing from data")

            validation_status = not unexpected and not missing

            # Write the final verdict once, after all columns were inspected.
            message = "Data validation passed" if validation_status else "Data validation failed"
            with open(self.config.STATUS_FILE, 'w') as f:
                f.write(message)

            return validation_status
        except Exception as e:
            raise CustomException(e, sys) from e

In [34]:
# Driver cell: build the configuration, create the validator, and run the
# column check. Any failure is re-raised as the project's CustomException.
try:
    config=ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)
    # The boolean result is not captured here; the verdict is read from the
    # status file written by validate_data().
    data_validation.validate_data()
except Exception as e:
    raise CustomException(e, sys) from e

[2025-07-28 23:36:34,161] 28 WineGradeEstimator - INFO - YAML file config/config.yaml read successfully.
[2025-07-28 23:36:34,162] 28 WineGradeEstimator - INFO - YAML file params.yaml read successfully.
[2025-07-28 23:36:34,163] 28 WineGradeEstimator - INFO - YAML file schema.yaml read successfully.
[2025-07-28 23:36:34,164] 46 WineGradeEstimator - INFO - Directory created: artifacts
[2025-07-28 23:36:34,165] 46 WineGradeEstimator - INFO - Directory created: artifacts/data_validation
[2025-07-28 23:36:34,168] 10 WineGradeEstimator - INFO - Data loaded successfully
[2025-07-28 23:36:34,169] 11 WineGradeEstimator - INFO - Validating data
[2025-07-28 23:36:34,169] 23 WineGradeEstimator - INFO - Column fixed acidity is in schema
[2025-07-28 23:36:34,170] 23 WineGradeEstimator - INFO - Column volatile acidity is in schema
[2025-07-28 23:36:34,170] 23 WineGradeEstimator - INFO - Column citric acid is in schema
[2025-07-28 23:36:34,171] 23 WineGradeEstimator - INFO - Column residual sugar is 