## Data Preprocessing 

In [6]:
import os

In [7]:
%pwd

'c:\\Users\\VenuraP\\Desktop\\Browns Data Projects\\ML Projects\\POC\\Harti-Food-Price-Prediction\\notebooks'

In [8]:
# Step up from the notebooks/ folder to the project root so that the `src`
# package and the relative config/artifact paths used below resolve.
# NOTE(review): a relative chdir is not idempotent — re-running this cell
# moves the working directory up one more level each time.
os.chdir("../")
%pwd

'c:\\Users\\VenuraP\\Desktop\\Browns Data Projects\\ML Projects\\POC\\Harti-Food-Price-Prediction'

## Data Preprocessing Entity


In [9]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable bundle of the paths used by the data-preprocessing stage.

    The annotations are not enforced at runtime; get_data_preprocessing_config
    passes the values through exactly as they appear in the loaded config.
    """
    # Stage directory; the pipeline hands it to create_directories.
    root_dir: Path
    # Passed to pd.read_excel as the input to read (despite the "_dir" suffix).
    unzip_data_dir: Path
    # Destination passed to DataFrame.to_excel for the preprocessed output.
    local_data_file: Path

## Data Preprocessing Configuration

In [10]:
from src.constants import *
from src.utils.common import read_yaml, create_directories

In [11]:
from pathlib import Path

# Load the project configuration from a YAML file
def load_configuration(config_filepath: Path = CONFIG_FILE_PATH):
    """Read the YAML configuration and return it unchanged.

    Args:
        config_filepath: Location of the YAML file; defaults to CONFIG_FILE_PATH.

    Returns:
        Whatever read_yaml produces for that file. No directories are
        created by this function.
    """
    return read_yaml(config_filepath)

# Build the data preprocessing configuration from the loaded config
def get_data_preprocessing_config(config) -> DataPreprocessingConfig:
    """Assemble a DataPreprocessingConfig from the `data_preprocessing` section.

    Args:
        config: Loaded configuration exposing a `data_preprocessing` attribute
            with `root_dir`, `unzip_data_dir` and `local_data_file`.

    Returns:
        DataPreprocessingConfig holding those three values exactly as read
        (no type conversion is applied).
    """
    # Pull out the section that belongs to this stage
    section = config.data_preprocessing

    stage_root = section.root_dir
    source_file = section.unzip_data_dir
    output_file = section.local_data_file

    return DataPreprocessingConfig(
        root_dir=stage_root,
        unzip_data_dir=source_file,
        local_data_file=output_file,
    )

## Data Preprocessing Components

In [12]:
import pandas as pd
from box import ConfigBox 

def data_read_clean_missing_values(config):
    """Load the raw Excel data and fill its missing values.

    Args:
        config: Object whose `unzip_data_dir` attribute is passed to
            pd.read_excel as the file to read.

    Returns:
        DataFrame in which missing 'items' entries are replaced by
        'Rice (Rs/kg)_Nadu 2' and gaps in 'pettah_average' are linearly
        interpolated in both directions.
    """
    raw_frame = pd.read_excel(config.unzip_data_dir)

    # Missing item labels get the fixed placeholder label
    filled_items = raw_frame['items'].fillna('Rice (Rs/kg)_Nadu 2')

    # limit_direction='both' also fills gaps at the start and end of the series
    smoothed_average = raw_frame['pettah_average'].interpolate(method='linear', limit_direction='both')

    # assign() overwrites the two existing columns, keeping the column order
    return raw_frame.assign(items=filled_items, pettah_average=smoothed_average)

# Function to drop specified columns
def drop_unnecessary_columns(df):
    """Return a copy of `df` without the columns listed in `columns_to_drop`.

    The input frame is left untouched, so the call can be repeated on the
    same frame (e.g. when the notebook cell is re-run) without raising.

    Args:
        df: DataFrame containing every column listed in `columns_to_drop`.

    Returns:
        A new DataFrame with those columns removed.

    Raises:
        KeyError: If any of the listed columns is missing from `df`.
    """
    columns_to_drop = [
        'items', 'pettah_min_value', 'pettah_max_value',
        'food_inflation_Base_2013', 'percipitation',
        'Bankrupt', 'pettah_range', 'pettah_midpoint'
    ]

    # Non-inplace drop: the previous inplace=True version mutated the caller's
    # frame, which made a second call on the same frame fail with KeyError.
    return df.drop(columns=columns_to_drop)


def save_preprocessed_excel(config: ConfigBox, df: pd.DataFrame):
    """Write `df` to the Excel file named by `config.local_data_file`.

    Args:
        config: Object exposing `local_data_file`; only that attribute is read.
            The pipeline in this file passes a DataPreprocessingConfig here.
        df: DataFrame to persist; the index is not written.
    """
    output_path = config.local_data_file

    # index=False keeps the row index out of the spreadsheet
    df.to_excel(excel_writer=output_path, index=False)

    print(f"File saved at: {config.local_data_file}")


## Data Preprocessing Pipeline

In [17]:
from src import logger

def data_preprocessing_training_pipeline():
    """Run the data-preprocessing stage end to end.

    Steps: load the YAML config, build the stage config, create the stage
    root directory, read and clean the Excel data, drop the unused columns,
    and save the result as an Excel file.

    Raises:
        Exception: Any failure is logged and then re-raised, so the caller
            (or the notebook cell) sees the error instead of a silent success.
    """
    try:
        # Load the project configuration
        config = load_configuration()

        # Build the data preprocessing configuration from the loaded config
        data_preprocessing_config = get_data_preprocessing_config(config)

        # Make sure the stage's root directory exists before writing into it
        create_directories([data_preprocessing_config.root_dir])

        cleaned_data = data_read_clean_missing_values(data_preprocessing_config)

        reduced_data = drop_unnecessary_columns(cleaned_data)

        save_preprocessed_excel(data_preprocessing_config, reduced_data)

    except Exception as e:
        # logger.exception records the traceback as well as the message
        # (assumes `logger` is a standard logging.Logger — TODO confirm).
        logger.exception(f"An error occurred during data preprocessing: {e}")
        # Re-raise so a failed stage is not mistaken for a successful run
        raise

# Run the preprocessing stage when this code is executed as the main module
if __name__ == "__main__":
    data_preprocessing_training_pipeline()

[2024-09-25 13:50:42,094: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-09-25 13:50:42,096: INFO: common: created directory at: artifacts/data_preprocessing]
File saved at: artifacts/data_preprocessing/Lstm_data_preprocessed.xlsx
