For data validation we define a schema file that describes the data: column names, feature datatypes, the target feature, domain values, etc.

How to retrieve the column names and their datatypes dynamically:

In [1]:
import pandas as pd
import numpy as np
from collections import namedtuple

In [2]:
# Load the raw housing data produced by the data-ingestion step.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact directory exists.
housing_df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\User\Machine-Learning-Project\housing\artifact\data_ingestion\2022-09-23-15-25-32\raw_data\housing.csv"
)

In [3]:
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


Retrieving column names

In [4]:
# Column labels of the raw housing frame (a pandas Index, not a plain list).
col_name = housing_df.columns

In [5]:
col_name

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

Retrieving datatypes

In [6]:
# Dtype of every column rendered as its string name, in column order.
d_types = list(map(str, housing_df.dtypes))

In [7]:
d_types

['float64',
 'float64',
 'float64',
 'float64',
 'float64',
 'float64',
 'float64',
 'float64',
 'float64',
 'object']

Creating a dictionary of column names and their datatypes

In [8]:
# Pair each column name with its dtype string, preserving column order.
column_info = {name: dtype_name for name, dtype_name in zip(col_name, d_types)}

In [9]:
column_info

{'longitude': 'float64',
 'latitude': 'float64',
 'housing_median_age': 'float64',
 'total_rooms': 'float64',
 'total_bedrooms': 'float64',
 'population': 'float64',
 'households': 'float64',
 'median_income': 'float64',
 'median_house_value': 'float64',
 'ocean_proximity': 'object'}

Creating Domain Value dictionary

In [10]:
# Allowed values for the categorical column: its distinct values in sorted order.
ocean_proximity_values = housing_df['ocean_proximity'].unique()
dom = {'ocean_proximity': np.sort(ocean_proximity_values).tolist()}

In [11]:
dom

{'ocean_proximity': ['<1H OCEAN',
  'INLAND',
  'ISLAND',
  'NEAR BAY',
  'NEAR OCEAN']}

In [12]:
# Assemble the schema: per-column dtypes, the target column, and the allowed
# values of the categorical column. Key order matches the displayed output.
schema = dict([
    ("columns", column_info),
    ("target-column", "median_house_value"),
    ("domain_value", dom),
])

In [13]:
schema

{'columns': {'longitude': 'float64',
  'latitude': 'float64',
  'housing_median_age': 'float64',
  'total_rooms': 'float64',
  'total_bedrooms': 'float64',
  'population': 'float64',
  'households': 'float64',
  'median_income': 'float64',
  'median_house_value': 'float64',
  'ocean_proximity': 'object'},
 'target-column': 'median_house_value',
 'domain_value': {'ocean_proximity': ['<1H OCEAN',
   'INLAND',
   'ISLAND',
   'NEAR BAY',
   'NEAR OCEAN']}}

# Validating data through schema file

1. Number of columns
2. Check the values of `ocean_proximity` against the acceptable values:
     - <1H OCEAN
     - INLAND
     - ISLAND
     - NEAR BAY
     - NEAR OCEAN
3. Check column names

In [15]:
import yaml

In [17]:
# Load the schema definition from YAML. The encoding is passed explicitly so
# decoding does not depend on the platform's locale default.
# NOTE(review): absolute, machine-specific path; the cell only runs where this
# exact file exists.
with open(r'C:\Users\User\Machine-Learning-Project\config\schema.yaml', 'r', encoding='utf-8') as file:
    schema_file = yaml.safe_load(file)


In [20]:
schema_file

{'columns': {'longitude': 'float',
  'latitude': 'float',
  'housing_median_age': 'float',
  'total_rooms': 'float',
  'total_bedrooms': 'float',
  'population': 'float',
  'households': 'float',
  'median_income': 'float',
  'median_house_value': 'float',
  'ocean_proximity': 'category'},
 'target-column': 'median_house_value',
 'domain_value': {'ocean_proximity': ['<1H OCEAN',
   'INLAND',
   'ISLAND',
   'NEAR BAY',
   'NEAR OCEAN']}}

In [23]:
schema_file['columns'].keys()

dict_keys(['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity'])


In [26]:
# View over the allowed-value lists stored under the schema file's 'domain_value' key.
dom_values = schema_file['domain_value'].values()

In [35]:
# Print each column's list of allowed values on its own line.
for allowed_values in dom_values:
    print(allowed_values)

['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']


In [42]:
list(np.sort(housing_df['ocean_proximity'].unique()))

['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']

In [43]:
list(np.sort(housing_df.columns))

['households',
 'housing_median_age',
 'latitude',
 'longitude',
 'median_house_value',
 'median_income',
 'ocean_proximity',
 'population',
 'total_bedrooms',
 'total_rooms']

In [45]:
len(schema_file['columns'].keys())

10

In [None]:
# Domain values recorded in the loaded schema file. The original cell indexed
# with DOMAIN_KEY, a name this notebook never defines; the key present in the
# loaded schema is 'domain_value'.
schema_file['domain_value']

Comparing train data information and schema file information for validation

In [53]:
# Load the ingested train split and summarise it for comparison with the schema.
# NOTE(review): absolute, machine-specific path; the cell only runs where this
# exact directory exists.
train_df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\User\Machine-Learning-Project\housing\artifact\data_ingestion\2022-09-23-15-25-32\ingested_data\train\housing.csv"
)
train_no_of_col = train_df.shape[1]  ## train data no of columns
train_col_names = np.sort(train_df.columns).tolist()
train_dom_values = np.sort(train_df['ocean_proximity'].unique()).tolist()

In [108]:
# Summarise the schema in the same shape as the train-data summary: column
# count, sorted column names, and sorted domain values.
# NOTE(review): this reads the in-memory `schema` built from the raw data, not
# `schema_file` loaded from schema.yaml; confirm which one validation should use.
schema_no_of_col = len(schema['columns'])
schema_col_names = sorted(schema['columns'])
# Flatten the per-column lists of allowed values into one sorted list.
schema_dom_values = sorted(
    value
    for allowed_values in schema['domain_value'].values()
    for value in allowed_values
)

In [111]:
schema_dom_values

['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']

In [109]:
def validate_train_schema()->bool:
    """Compare the train-data summary with the schema summary.

    Reads the notebook-level names train_col_names, train_no_of_col,
    train_dom_values and their schema_* counterparts.

    Returns:
        True when column names, column counts and domain values all match.
        Otherwise prints the three train/schema pairs and returns False.
    """
    everything_matches = (
        train_col_names == schema_col_names
        and train_no_of_col == schema_no_of_col
        and train_dom_values == schema_dom_values
    )
    if not everything_matches:
        # Report every pair, not only the failing one, so the mismatch can be located.
        report_lines = (
            f"Train col names - {train_col_names} : schema col names- {schema_col_names}",
            f"Train col length - {train_no_of_col} : schema col length- {schema_no_of_col}",
            f"Train domain values - {train_dom_values} : schema domain values - {schema_dom_values}",
        )
        for line in report_lines:
            print(line)
        return False
    return True

In [110]:
validate_train_schema()

True

In [105]:
# Scratch check: dict.values() yields the values in insertion order.
dic = dict(q=1, w=2, f=4, s=5, c=9)
dic.values()

dict_values([1, 2, 4, 5, 9])