## Data Transformation

In [1]:
import os

In [2]:
%pwd

'c:\\Users\\VenuraP\\Desktop\\Browns Data Projects\\ML Projects\\POC\\Harti-Food-Price-Prediction\\notebooks'

In [3]:
# Step up to the project root only while the kernel is still inside the
# notebooks/ folder, so re-running this cell does not keep climbing the tree.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\VenuraP\\Desktop\\Browns Data Projects\\ML Projects\\POC\\Harti-Food-Price-Prediction'

## Entity

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data transformation stage."""
    # Directory the pipeline creates before any output is written
    root_dir: Path
    # Excel file that is read as the input of the transformation
    data_path: Path
    # Excel file the transformed data is written to
    local_data_file: Path

In [6]:
from src.constants import *
from src.utils.common import read_yaml, create_directories

## Configuration Manager

In [7]:
# Function to read the configuration and schema YAML files
def load_configuration(config_filepath: Path = CONFIG_FILE_PATH, schema_filepath: Path = SCHEMA_FILE_PATH):
    """Load the project configuration and schema.

    Args:
        config_filepath: Location of the configuration YAML file.
        schema_filepath: Location of the schema YAML file.

    Returns:
        A ``(config, schema)`` tuple holding what ``read_yaml`` returned for
        each file, in that order. No directories are created here.
    """
    # Tuple elements are evaluated left to right: config first, then schema
    return read_yaml(config_filepath), read_yaml(schema_filepath)

# Function to build the data transformation configuration from the loaded config
def get_data_transformation_config(config) -> DataTransformationConfig:
    """Build a ``DataTransformationConfig`` from the loaded configuration.

    Args:
        config: Loaded configuration exposing a ``data_transformation``
            section with ``root_dir``, ``data_path`` and ``local_data_file``.

    Returns:
        A ``DataTransformationConfig`` holding those three values exactly as
        they appear in the config (no conversion is applied).
    """
    # Pull out the section that belongs to the data transformation stage
    section = config.data_transformation

    settings = {
        "root_dir": section.root_dir,
        "data_path": section.data_path,
        "local_data_file": section.local_data_file,
    }
    return DataTransformationConfig(**settings)

## Data Transformation Component

In [8]:
import pandas as pd
from scipy.stats import boxcox
from sklearn.preprocessing import MinMaxScaler
from box import ConfigBox

def box_cox_transformation(config):
    """Load the input Excel file and Box-Cox transform ``pettah_average``.

    Args:
        config: Object exposing ``data_path``, the Excel file to read.

    Returns:
        The loaded DataFrame with the ``pettah_average`` column replaced by
        its Box-Cox transformed values. All other columns are untouched.
    """
    target_column = 'pettah_average'

    # Load the raw data from the configured Excel file
    frame = pd.read_excel(config.data_path)

    # scipy's boxcox requires positive input, so exact zeros are swapped for 0.01
    positive_values = frame[target_column].replace(0, 0.01)

    # boxcox also returns the fitted lambda; it is not used by this function
    transformed_values, _fitted_lambda = boxcox(positive_values)
    frame[target_column] = transformed_values

    return frame

def min_max_scale(df):
    """Min-max scale every column of ``df``, using ``date`` as the index.

    The input DataFrame is not modified, so the function can be called again
    on the same frame (for example when a notebook cell is re-run).

    Args:
        df: DataFrame with a ``date`` column; every remaining column is
            passed to ``MinMaxScaler``.

    Returns:
        A new DataFrame indexed by ``date`` with the same column names as the
        remaining columns and the scaler's output as values.

    Raises:
        KeyError: If ``df`` has no ``date`` column.
    """
    # set_index without inplace returns a new frame and leaves the caller's
    # DataFrame (and its 'date' column) untouched
    indexed_df = df.set_index('date')

    # Fit the scaler on the data and transform it in one step
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(indexed_df)

    # Re-attach the column names and the date index to the scaled values
    return pd.DataFrame(scaled_values, columns=indexed_df.columns, index=indexed_df.index)

def remove_zeros_in_df(df):
    """Drop every row that contains at least one value equal to 0.

    Args:
        df: DataFrame to filter.

    Returns:
        The rows of ``df`` in which no column equals 0; the input frame is
        left as it was.
    """
    # True for rows in which no column is exactly 0
    zero_free = ~df.eq(0).any(axis=1)

    return df[zero_free]

def save_preprocessed_excel(config: ConfigBox, df: pd.DataFrame):
    """Write ``df`` to the Excel file named by ``config.local_data_file``.

    The DataFrame index is not written (``index=False``). A confirmation
    line with the destination path is printed afterwards.

    Args:
        config: Object exposing ``local_data_file``, the destination path.
        df: DataFrame to persist.
    """
    destination = config.local_data_file

    # Persist the frame without its index, then report where it went
    df.to_excel(destination, index=False)
    print(f"File saved at: {destination}")


## Data Transformation Pipeline

In [9]:
from src import logger

def data_transformation_training_pipeline():
    """Run the data transformation stage end to end.

    Steps, in order: load the configuration, build the data transformation
    config, create its root directory, Box-Cox transform the input data,
    min-max scale it, drop rows containing zeros and save the result to Excel.

    Any exception raised by a step is logged together with its traceback and
    is not re-raised, so the function returns ``None`` whether or not the
    stage succeeded.
    """
    try:
        # Load config and schema; the schema is not needed by this stage
        config, _ = load_configuration()

        # Retrieve the data transformation configuration from the loaded config
        data_transformation_config = get_data_transformation_config(config)

        # Create the output directory of the data transformation stage
        create_directories([data_transformation_config.root_dir])

        # Box-Cox transform
        box_cox_df = box_cox_transformation(data_transformation_config)

        # Min-max scale
        scaled_df = min_max_scale(box_cox_df)

        # Remove rows containing zeros
        # NOTE(review): this runs after scaling, so it presumably also drops the
        # rows holding each column's minimum (scaled to 0) — confirm this is intended
        cleaned_df = remove_zeros_in_df(scaled_df)

        # Save the transformed data
        save_preprocessed_excel(data_transformation_config, cleaned_df)

    except Exception as e:
        # logger.exception logs at ERROR level like logger.error, but also
        # records the traceback so the failing step can be identified
        logger.exception(f"An error occurred during data preprocessing: {e}")

# Run the data transformation stage when this code is executed directly
if __name__ == "__main__":
    data_transformation_training_pipeline()

[2024-10-02 09:57:59,912: INFO: common: yaml file: config\config.yaml loaded successfully]


[2024-10-02 09:57:59,918: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-10-02 09:57:59,922: INFO: common: created directory at: artifacts/data_transformation]
File saved at: artifacts/data_transformation/lstm_transformed.xlsx
