# Data Transformation

In [1]:
import os

In [2]:
os.chdir("../")

In [3]:
%pwd

'c:\\Users\\VenuraP\\Desktop\\Browns Data Projects\\ML Projects\\POC\\Harti-Food-Price-Prediction'

## Data Class for Configuration Entity

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    local_data_path: Path
    train_y_data_file: Path
    train_x_data_file: Path
    test_y_data_file: Path
    test_x_data_file: Path
    model_checkpoint_path: Path
    model_name: str
    n_units_layer1: int
    n_units_layer2: int
    n_units_layer3: int
    dropout_rate: float
    sequence_length: int
    optimizer: str
    loss_function: str
    epochs: int
    batch_size: int
    validation_split: float
    target_column: str
    

In [5]:
from src.constants import *
from src.utils.common import read_yaml, create_directories

## Configuration Manager

In [6]:
# Function to initialize configuration by reading YAML and creating directories
def load_configuration(config_filepath: Path = CONFIG_FILE_PATH, schema_filepath: Path = SCHEMA_FILE_PATH):

    config = read_yaml(config_filepath)
    schema = read_yaml(schema_filepath)

    return config, schema

# Function to get data ingestion configuration from the loaded config
def get_model_trainer_config(config,schema) -> ModelTrainerConfig:

    # Extract data ingestion settings from the config
    model_trainer = config.model_trainer

    # Create and return a DataIngestionConfig instance
    return ModelTrainerConfig(
        root_dir=model_trainer.root_dir,
        local_data_path=model_trainer.local_data_path,
        train_x_data_file=model_trainer.train_x_data_file,
        train_y_data_file=model_trainer.train_y_data_file,
        test_x_data_file=model_trainer.test_x_data_file,
        test_y_data_file=model_trainer.test_y_data_file,
        model_checkpoint_path=model_trainer.model_checkpoint_path,
        model_name=model_trainer.model_name,
        n_units_layer1=model_trainer.n_units_layer1,
        n_units_layer2=model_trainer.n_units_layer2,
        n_units_layer3=model_trainer.n_units_layer3,
        dropout_rate=model_trainer.dropout_rate,
        sequence_length=model_trainer.sequence_length,
        optimizer=model_trainer.optimizer,
        loss_function=model_trainer.loss_function,
        epochs=model_trainer.epochs,
        batch_size=model_trainer.batch_size,
        validation_split=model_trainer.validation_split,
        target_column = schema.TARGET_COLUMN
    )

## Model Trainer Component

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np

def sequence_creation_train_test_split(config):
    # Read the Excel file into a DataFrame
    df = pd.read_excel(config.local_data_path)
    
    # Define sequence length and number of features
    sequence_length = config.sequence_length 
    num_features = len(df.columns)

    # Create sequences and corresponding labels
    sequences = []
    labels = []

    # Dynamically set target column based on config.target_column
    target_col_idx = df.columns.get_loc(config.target_column)  # Get index of target column

    for i in range(len(df) - sequence_length):
        seq = df.iloc[i:i+sequence_length, :]
        label = df.iloc[i+sequence_length, target_col_idx]  # since target is the first column
        sequences.append(seq)
        labels.append(label)

    # Convert to numpy arrays
    sequences = np.array(sequences)
    labels = np.array(labels)

    # Split into train and test sets
    train_size = int(0.8 * len(sequences)) 
    train_x, test_x = sequences[:train_size], sequences[train_size:]
    train_y, test_y = labels[:train_size], labels[train_size:]

    return train_x, test_x, train_y, test_y


def save_train_test_data_to_excel(train_x, test_x, train_y, test_y, config):

    # Convert train_x, train_y, test_x, test_y to DataFrames
    train_x_df = pd.DataFrame(train_x.reshape(train_x.shape[0], -1))  # Flatten 3D array to 2D
    train_y_df = pd.DataFrame(train_y, columns=[config.target_column])

    test_x_df = pd.DataFrame(test_x.reshape(test_x.shape[0], -1))  # Flatten 3D array to 2D
    test_y_df = pd.DataFrame(test_y, columns=[config.target_column])

    # Save DataFrames to Excel files defined in config
    train_x_path = config.train_x_data_file
    train_y_path = config.train_y_data_file
    test_x_path = config.test_x_data_file
    test_y_path = config.test_y_data_file

    train_x_df.to_excel(train_x_path, index=False)
    train_y_df.to_excel(train_y_path, index=False)

    test_x_df.to_excel(test_x_path, index=False)
    test_y_df.to_excel(test_y_path, index=False)

    print(f"Train and test data saved to Excel files:\n"
          f"Train X: {train_x_path}\nTrain Y: {train_y_path}\n"
          f"Test X: {test_x_path}\nTest Y: {test_y_path}")


def lstm_model_trainer(train_x, train_y, config):

    # Create the LSTM model using the provided config
    model = Sequential()

    # Add Bidirectional LSTM layers with dropout
    model.add(Bidirectional(LSTM(units=config.n_units_layer1, return_sequences=True), 
                            input_shape=(train_x.shape[1], train_x.shape[2])))
    model.add(Dropout(config.dropout_rate))

    model.add(Bidirectional(LSTM(units=config.n_units_layer2, return_sequences=True)))
    model.add(Dropout(config.dropout_rate))

    model.add(Bidirectional(LSTM(units=config.n_units_layer3, return_sequences=False)))
    model.add(Dropout(config.dropout_rate))

    # Add a dense output layer
    model.add(Dense(units=1))

    # Compile the model using the config
    model.compile(optimizer=config.optimizer, loss=config.loss_function)

    # Create a checkpoint callback for saving the best model
    model_checkpoint = ModelCheckpoint(filepath=str(config.model_checkpoint_path), 
                                       monitor='val_loss', 
                                       save_best_only=True)
    # Train the model
    history = model.fit(
        train_x, train_y,
        epochs=config.epochs,
        batch_size=config.batch_size,
        validation_split=config.validation_split,
        callbacks=[model_checkpoint]
    )

    return model, history

# Model Trainer Pipeline

In [8]:
from src import logger

def model_training_pipeline():
    try:
        # Load config and schema
        config,schema = load_configuration()

        # Retrieve the data ingestion configuration from the loaded config
        model_trainer_config = get_model_trainer_config(config,schema)

        # Create directories related to data ingestion (root directory)
        create_directories([model_trainer_config.root_dir])

        # Sequence creation
        train_x, test_x, train_y, test_y = sequence_creation_train_test_split(model_trainer_config)
        
        # Saving data to an excel from numpy array (3D to 2D)
        save_train_test_data_to_excel(train_x, test_x, train_y, test_y, config)

        # Train the model
        lstm_model_trainer(train_x, train_y, config)

    except Exception as e:
        logger.error(f"An error occurred in model trainer : {e}")

if __name__ == "__main__":

 model_training_pipeline()

[2024-10-03 17:35:13,636: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-03 17:35:13,637: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-10-03 17:35:13,637: INFO: common: created directory at: artifacts/model_trainer]
[2024-10-03 17:35:14,051: ERROR: 3964945739: An error occurred in model trainer : {'name': 'pettah_average'}]
