In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Move the working directory one level up so the relative paths used by the
# cells below resolve from the parent directory; the %pwd calls show the
# directory before and after the change.
# NOTE: this cell is not idempotent - every re-run climbs one more level.
# Restart the kernel before running it again.
%pwd
os.chdir("../")
%pwd

'g:\\success_analytics_courses\\internship_project\\pulsar_project'

In [3]:
## step - 1 : config.yaml completed
## step - 2 : params.yaml completed (required in model trainer stage)
## step - 3 : constant completed
## step - 4 : entity

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfiguration:
    """Read-only settings for the data-ingestion stage.

    Instances are built by ``ConfigurationManager.get_data_ingestion_config``.
    The annotations are not enforced at runtime; that builder currently passes
    ``os.path.join`` results (plain strings) for the directory fields.
    """
    # Stage directory name exactly as read from the config file (not joined
    # with the artifacts directory by the builder).
    root_dir_name: Path
    # Value of ``dataset_download_url`` from the config file.
    dataset_download_url: str
    # Directory created under <artifacts>/<root_dir_name> from the configured
    # ``zip_data_dir_name``.
    zip_data_dir_name: Path
    # Directory created under <artifacts>/<root_dir_name> from the configured
    # ``unzip_data_dir_name``.
    unzip_data_dir_name: Path
    
@dataclass(frozen=True)
class DataValidationConfiguration:
    """Read-only settings for the data-validation stage.

    Instances are built by
    ``ConfigurationManager.get_data_validation_configuration``. Annotations
    are not enforced at runtime.
    """
    # Stage directory name exactly as read from the config file (not joined
    # with the artifacts directory by the builder).
    validated_root_dir_name: Path
    # Directory created under the validation directory for the train split.
    validated_train_dir: Path
    # Directory created under the validation directory for the test split.
    validated_test_dir: Path
    # Path of the status report: the configured file name joined onto the
    # validation directory.
    validated_status_report_file_name: str
    # Value of ``validated_required_files`` from the config file.
    validated_required_files: list

@dataclass(frozen=True)
class DataTransformationConfiguration:
    """Read-only settings for the data-transformation stage.

    Instances are built by
    ``ConfigurationManager.get_data_transformation_configuration``, which
    creates every directory listed here before constructing the object.
    Annotations are not enforced at runtime.
    """
    # <artifacts>/<configured transformed_root_dir_name>.
    transformed_root_dir_name: Path
    # Sub-directory of the transformation root for the training data;
    # ModelTrainer reads its training CSV from here.
    transformed_train_dir: Path
    # Sub-directory of the transformation root for the test data.
    transformed_test_dir: Path
    # Sub-directory of the transformation root configured as
    # ``transformed_industrial_data_dir``.
    transformed_industrial_data_dir: Path
    # Sub-directory of the transformation root configured as
    # ``transformed_preprocess_dir``.
    transformed_preprocess_dir: Path

@dataclass(frozen=True)
class ModelTrainerConfiguration:
    """Read-only settings for the model-trainer stage.

    Instances are built by
    ``ConfigurationManager.get_model_trainer_configuration``. Annotations are
    not enforced at runtime.
    """

    # <artifacts>/<configured trained_model_root_dir_name>.
    trained_model_root_dir_name: Path
    # YAML file (inside the trainer directory) that ModelTrainer.save_model
    # writes the saved-model paths to.
    trained_model_path_yaml_file: Path
    # The four numeric values below are copied unchanged from the config file.
    # NOTE(review): presumably acceptance thresholds used by a later
    # evaluation stage - confirm against the consumer.
    trained_model_base_accuracy: float
    trained_model_overfit_value: float
    trained_model_FPR: float
    trained_model_RECALL: float
    # Mapping of selection entry -> model description, taken from the model
    # parameter YAML. ModelTrainer.save_model iterates it with ``.items()``
    # and indexes it by key, so it is a mapping (it was previously annotated
    # as ``str``).
    trained_model_selection: dict


In [4]:
## step - 5 : configuration manager in src config

from pulsarclassification.constants import *
from pulsarclassification.logging import logging
from pulsarclassification.utils.common import read_yaml,create_directories

class ConfigurationManager:
    """Builds the per-stage configuration objects from the project config file.

    The parsed config (as returned by ``read_yaml``) is kept on ``self.config``
    and is accessed by attribute, and in one place by key. Every ``get_*``
    method creates the directories its stage needs as a side effect, logs the
    resulting configuration, and returns a frozen dataclass instance.

    Errors raised by ``read_yaml``, ``create_directories`` or by missing
    config entries propagate to the caller unchanged.
    """

    def __init__(self, config_file_path: str = CONFIG_FILE_PATH):
        """Load the config file and make sure the artifacts directory exists.

        Args:
            config_file_path: Path of the YAML config file to read. Defaults
                to ``CONFIG_FILE_PATH``.
        """
        # Read the path the caller supplied. The argument used to be ignored
        # and CONFIG_FILE_PATH was always read instead.
        self.config = read_yaml(config_file_path)
        create_directories(self.config.artifacts_dir_name)
        logging.info(f" Artifacts directory created at : {self.config.artifacts_dir_name} ")

    @staticmethod
    def _make_dir(parent, name):
        """Join ``name`` onto ``parent``, create that directory, return the joined path."""
        directory = os.path.join(parent, name)
        create_directories(directory)
        return directory

    def get_data_ingestion_config(self) -> DataIngestionConfiguration:
        """Create the ingestion directories and return the ingestion settings.

        Returns:
            DataIngestionConfiguration whose ``zip_data_dir_name`` and
            ``unzip_data_dir_name`` are the created directories.
            ``root_dir_name`` is the raw config value, not the joined path.
        """
        stage = self.config.data_ingestion_config

        ingestion_dir = self._make_dir(self.config.artifacts_dir_name, stage.root_dir_name)
        zip_dir = self._make_dir(ingestion_dir, stage.zip_data_dir_name)
        unzip_dir = self._make_dir(ingestion_dir, stage.unzip_data_dir_name)

        data_ingestion_config = DataIngestionConfiguration(
            # Kept as the raw config value for backward compatibility; the
            # transformation/trainer builders pass the joined directory here.
            root_dir_name=stage.root_dir_name,
            dataset_download_url=stage.dataset_download_url,
            zip_data_dir_name=zip_dir,
            unzip_data_dir_name=unzip_dir,
        )
        logging.info(f" Data ingestion configuration: {data_ingestion_config}")
        return data_ingestion_config

    def get_data_validation_configuration(self) -> DataValidationConfiguration:
        """Create the validation directories and return the validation settings.

        Returns:
            DataValidationConfiguration whose train/test directories have been
            created and whose status-report entry is the report file name
            joined onto the validation directory. ``validated_root_dir_name``
            is the raw config value, not the joined path.
        """
        stage = self.config.data_validation_config

        validation_dir = self._make_dir(self.config.artifacts_dir_name, stage.validated_root_dir_name)
        train_dir = self._make_dir(validation_dir, stage.validated_train_dir)
        test_dir = self._make_dir(validation_dir, stage.validated_test_dir)

        data_validation_config = DataValidationConfiguration(
            # Kept as the raw config value for backward compatibility.
            validated_root_dir_name=stage.validated_root_dir_name,
            validated_train_dir=train_dir,
            validated_test_dir=test_dir,
            # Only the path is computed here; the report file is not created.
            validated_status_report_file_name=os.path.join(
                validation_dir, stage.validated_status_report_file_name
            ),
            validated_required_files=stage.validated_required_files,
        )
        logging.info(f" Data validation configuration: {data_validation_config}")
        return data_validation_config

    def get_data_transformation_configuration(self) -> DataTransformationConfiguration:
        """Create the transformation directories and return their paths.

        Returns:
            DataTransformationConfiguration in which every field is a
            directory that has been created under the artifacts directory.
        """
        stage = self.config.data_transformation_config

        transformation_dir = self._make_dir(self.config.artifacts_dir_name, stage.transformed_root_dir_name)
        train_dir = self._make_dir(transformation_dir, stage.transformed_train_dir)
        test_dir = self._make_dir(transformation_dir, stage.transformed_test_dir)
        industrial_dir = self._make_dir(transformation_dir, stage.transformed_industrial_data_dir)
        preprocess_dir = self._make_dir(transformation_dir, stage.transformed_preprocess_dir)

        data_transformation_config = DataTransformationConfiguration(
            transformed_root_dir_name=transformation_dir,
            transformed_train_dir=train_dir,
            transformed_test_dir=test_dir,
            transformed_industrial_data_dir=industrial_dir,
            transformed_preprocess_dir=preprocess_dir,
        )
        logging.info(f" Data transformation configuration: {data_transformation_config}")
        return data_transformation_config

    def get_model_trainer_configuration(self) -> ModelTrainerConfiguration:
        """Create the trainer directory and return the model-trainer settings.

        The model selection is read from the separate parameter file at
        ``MODEL_PARAMETER_FILE_PATH``; everything else comes from the main
        config file.

        Returns:
            ModelTrainerConfiguration with the created trainer directory, the
            path of the model-paths YAML file inside it, the numeric values
            copied from the config, and the model-selection mapping.
        """
        stage = self.config.model_trainer_config
        # Read before any directory is created so that a broken parameter
        # file fails without leaving a new directory behind.
        param_config = read_yaml(MODEL_PARAMETER_FILE_PATH)

        trainer_dir = self._make_dir(self.config.artifacts_dir_name, stage.trained_model_root_dir_name)
        # Only the path is computed here; ModelTrainer.save_model writes the file.
        paths_yaml_file = Path(os.path.join(trainer_dir, stage[MODEL_TRAINER_YAML_FILE_NAME_KEY]))

        model_trainer_config = ModelTrainerConfiguration(
            trained_model_root_dir_name=trainer_dir,
            trained_model_path_yaml_file=paths_yaml_file,
            trained_model_base_accuracy=stage.trained_model_base_accuracy,
            trained_model_overfit_value=stage.trained_model_overfit_value,
            trained_model_FPR=stage.trained_model_FPR,
            trained_model_RECALL=stage.trained_model_RECALL,
            trained_model_selection=param_config[MODEL_SELECTION_KEY],
        )
        logging.info(f" Model trainer configuration: {model_trainer_config}")
        return model_trainer_config

In [7]:
## step - 6 : updating components

import os
import importlib
from pathlib import Path
from sklearn.model_selection import StratifiedShuffleSplit
from pulsarclassification.logging import logging
from pulsarclassification.constants import *
from pulsarclassification.utils.common import read_yaml,create_directories,get_file_size,pickle_file_saving,write_yaml
from pulsarclassification.entity import DataIngestionConfiguration,DataValidationConfiguration,DataTransformationConfiguration

class ModelTrainer:
    """Fits every classifier named in the trainer configuration and pickles it.

    The training data is read from the transformation stage's training
    directory. Each fitted model is saved in its own folder, and the saved
    paths are written to the YAML file named in the trainer configuration.

    Errors from file reading, model import/fitting or saving propagate to the
    caller unchanged.
    """

    def __init__(self,
                 transformation_config: DataTransformationConfiguration,
                 modeltrainer_config: ModelTrainerConfiguration):
        """Store both stage configurations and load the schema file.

        Args:
            transformation_config: Supplies ``transformed_train_dir``, where
                the training CSV is read from.
            modeltrainer_config: Supplies the output directory, the
                model-paths YAML file and the model selection.
        """
        self.transformation_config = transformation_config
        self.modeltrainer_config = modeltrainer_config
        # Only ``target_column`` is read from the schema in this class.
        self.schema = read_yaml(SCHEMA_FILE_PATH)

    def get_data_for_training(self):
        """Load the training CSV and split it into inputs and target.

        The test CSV is no longer read here: its contents were never used,
        so the read only cost I/O (and required the file to exist).

        Returns:
            Tuple ``(input_features, output_features)``: the training frame
            without the schema's target column, and the target column itself.
        """
        train_file_path = os.path.join(
            self.transformation_config.transformed_train_dir,
            TRANSFORMED_MODEL_TRAIN_FILE_NAME,
        )
        train_data = pd.read_csv(train_file_path)

        input_features = train_data.drop(self.schema.target_column, axis=1)
        logging.info(f"Input features extracted from {TRANSFORMED_MODEL_TRAIN_FILE_NAME} having shape : {input_features.shape} ")
        output_features = train_data[self.schema.target_column]
        logging.info(f"Output feature extracted from {TRANSFORMED_MODEL_TRAIN_FILE_NAME} having shape : {output_features.shape} ")

        return input_features, output_features

    def get_model(self, modellibrary, classificationmodel, modelparameters, inputfeatures, outputfeatures):
        """Import a classifier class by name, instantiate it and fit it.

        Args:
            modellibrary: Dotted module name passed to ``importlib.import_module``.
            classificationmodel: Attribute name of the classifier inside that module.
            modelparameters: Keyword arguments for the classifier constructor.
            inputfeatures: Training inputs passed to ``fit``.
            outputfeatures: Training target passed to ``fit``.

        Returns:
            The fitted model instance.
        """
        # The module and class names come from the parameter file, so that
        # file decides which code is imported and run - keep it trusted.
        model_class = getattr(importlib.import_module(modellibrary), classificationmodel)
        model = model_class(**modelparameters)
        model.fit(inputfeatures, outputfeatures)
        return model

    def save_model(self):
        """Train each selected model, pickle it and record where it was saved.

        For every entry of ``trained_model_selection`` the classifier is
        fitted on the training data and pickled into
        ``<trainer dir>/<SAVED_MODEL_FOLDER_KEY>/<classifier name>/``.
        The collected paths are then written to the model-paths YAML file.
        """
        saved_model_artifacts = {SAVED_MODEL_ARTIFACTS_KEY: {}}
        X, y = self.get_data_for_training()  # X = input features, y = output feature

        model_saving_folder_name = os.path.join(
            self.modeltrainer_config.trained_model_root_dir_name,
            SAVED_MODEL_FOLDER_KEY,
        )
        create_directories(model_saving_folder_name)

        model_selections = self.modeltrainer_config.trained_model_selection
        selection_keys = list(model_selections.keys())
        logging.info(f"Number of model to train : {len(selection_keys)}")

        for i, selection_key in enumerate(selection_keys):
            model_selection = model_selections[selection_key]
            classifier_name = model_selection[MODEL_CLASSIFIER_KEY]

            logging.info(f"{classifier_name} training started")

            trained_model = self.get_model(
                model_selection[MODEL_CLASSIFIER_MODULE_KEY],
                classifier_name,
                model_selection[MODEL_CLASSIFIER_PARAMETER_KEY],
                X, y,
            )

            # The folder is named after the classifier, so two entries that
            # use the same classifier name share (and overwrite) one folder.
            trained_model_saving_path = os.path.join(model_saving_folder_name, classifier_name)
            create_directories(trained_model_saving_path)
            pickle_file_saving(trained_model, trained_model_saving_path, TRAINED_MODEL_FILE_NAME)

            trained_model_path = os.path.join(trained_model_saving_path, TRAINED_MODEL_FILE_NAME)
            # NOTE(review): a pathlib.Path object (not a str) is handed to
            # write_yaml below - confirm its serializer and every reader of
            # the YAML file handle that.
            saved_model_artifacts[SAVED_MODEL_ARTIFACTS_KEY][
                f"model_{i}_path_{CURRENT_DATE_STAMP}"
            ] = Path(trained_model_path)

            logging.info(f"{classifier_name} training completed")

        write_yaml(self.modeltrainer_config.trained_model_path_yaml_file, saved_model_artifacts)
        logging.info(f"Model paths updated in yaml file: {self.modeltrainer_config.trained_model_path_yaml_file}")

In [8]:
# Run the model-trainer stage: build both stage configurations, then train and
# save every selected model. Any exception propagates unchanged.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_configuration()
model_trainer_config = config.get_model_trainer_configuration()
model_trainer = ModelTrainer(
    transformation_config=data_transformation_config,
    modeltrainer_config=model_trainer_config,
)
model_trainer.save_model()

[03-09-2023 09:43:23: INFO: common:  yaml file from this path g:\success_analytics_courses\internship_project\pulsar_project\config\config.yaml read succesfully]
[03-09-2023 09:43:23: INFO: common:  yaml file from this path g:\success_analytics_courses\internship_project\pulsar_project\config\params.yaml read succesfully]
[03-09-2023 09:43:23: INFO: common:  Directory already present: artifacts ]
[03-09-2023 09:43:23: INFO: 2280857492:  Artifacts directory created at : artifacts ]
[03-09-2023 09:43:23: INFO: common:  Directory already present: artifacts\data_transformation ]
[03-09-2023 09:43:23: INFO: common:  Directory already present: artifacts\data_transformation\training_data_for_model ]
[03-09-2023 09:43:23: INFO: common:  Directory already present: artifacts\data_transformation\test_data_for_model ]
[03-09-2023 09:43:23: INFO: common:  Directory already present: artifacts\data_transformation\industrial_test_data ]
[03-09-2023 09:43:23: INFO: common:  Directory already present: a

[03-09-2023 09:43:23: INFO: 3737684590: Input features extracted from pulsar_train_data.csv having shape : (105807, 8) ]
[03-09-2023 09:43:23: INFO: 3737684590: Output feature extracted from pulsar_train_data.csv having shape : (105807,) ]
[03-09-2023 09:43:23: INFO: common:  Directory already present: artifacts\trained_model\model_03092023 ]
[03-09-2023 09:43:23: INFO: 3737684590: Number of model to train : 2]
[03-09-2023 09:43:23: INFO: 3737684590: LGBMClassifier training started]
[LightGBM] [Info] Number of positive: 9870, number of negative: 95937
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 105807, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.093283 -> initscore=-2.274192
[LightGBM] [Info] Start training from score -2.274192
[03-09-2023 09:43:24: INFO: common:  Directory already present: artifacts\trained_model\model_03092023\LGBMClassifier ]
[03