In [1]:
import os
import pandas as pd
import numpy as np


In [2]:
# Show the current working directory, move one level up, then show it again.
# NOTE(review): os.chdir("../") is relative, so re-running this cell without
# restarting the kernel moves up one more level each time.
%pwd
os.chdir("../")
%pwd

'g:\\success_analytics_courses\\internship_project\\pulsar_project'

In [3]:
#step - 1 : config.yaml updated
#step - 2 : constant file updated
#step - 3 : entity

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfiguration:
    """Immutable settings for the data-ingestion stage.

    Instances are built by ``ConfigurationManager.get_data_ingestion_config``.
    NOTE(review): the path fields are annotated ``Path`` but the manager fills
    them with ``os.path.join`` results (presumably ``str``) — confirm.
    """

    # Ingestion root directory name exactly as read from the config file
    # (not joined with the artifacts directory).
    root_dir_name: Path
    # `dataset_download_url` value from the config file; presumably the URL the
    # dataset is fetched from — the download itself is not part of this notebook.
    dataset_download_url: str
    # <artifacts>/<root_dir_name>/<zip_data_dir_name> as joined by the manager.
    zip_data_dir_name: Path
    # <artifacts>/<root_dir_name>/<unzip_data_dir_name> as joined by the manager.
    unzip_data_dir_name: Path
    
@dataclass(frozen=True)
class DataValidationConfiguration:
    """Immutable settings for the data-validation stage.

    Instances are built by
    ``ConfigurationManager.get_data_validation_configuration``.
    """

    # Validation root directory name exactly as read from the config file
    # (not joined with the artifacts directory).
    validated_root_dir_name: Path
    # <artifacts>/<validated_root_dir_name>/<validated_train_dir>; DataTransformation
    # reads the model-training CSV from here.
    validated_train_dir: Path
    # <artifacts>/<validated_root_dir_name>/<validated_test_dir>; DataTransformation
    # reads the industrial-data CSV from here.
    validated_test_dir: Path
    # Status report file name joined onto the validation directory.
    validated_status_report_file_name: str
    # `validated_required_files` value passed through from the config file.
    validated_required_files:list

@dataclass(frozen=True)
class DataTransformationConfiguration:
    """Immutable settings for the data-transformation stage.

    Instances are built by
    ``ConfigurationManager.get_data_transformation_configuration``, which joins
    every field below onto the artifacts directory before storing it.
    """

    # <artifacts>/<transformed_root_dir_name>.
    transformed_root_dir_name: Path
    # Directory the transformed model-training CSV is written to.
    transformed_train_dir: Path
    # Directory the transformed model-test CSV is written to.
    transformed_test_dir: Path
    # Directory the transformed industrial-data CSV is written to.
    transformed_industrial_data_dir: Path
    # Sub-directory created for `transformed_preprocess_dir`; not used elsewhere
    # in this notebook.
    transformed_preprocess_dir: Path

In [4]:
#step - 4 : configuration manager in src config
from pulsarclassification.constants import *
from pulsarclassification.logging import logging
from pulsarclassification.utils.common import read_yaml,create_directories

class ConfigurationManager:
    """Build the per-stage configuration entities from the YAML config file.

    Construction reads the YAML file and calls ``create_directories`` for the
    top-level artifacts directory. Each ``get_*`` method then calls
    ``create_directories`` for that stage's directories under the artifacts
    directory and returns the matching frozen dataclass.
    """

    def __init__(self, config_file_path: str = CONFIG_FILE_PATH):
        """Read the configuration and prepare the artifacts directory.

        Args:
            config_file_path: Path of the YAML configuration file. Defaults to
                ``CONFIG_FILE_PATH``.

        Raises:
            Exception: Whatever ``read_yaml`` or ``create_directories`` raise
                is propagated unchanged.
        """
        # Honour the caller-supplied path; it used to be ignored in favour of
        # the CONFIG_FILE_PATH constant.
        self.config = read_yaml(config_file_path)
        create_directories(self.config.artifacts_dir_name)
        logging.info(f" Artifacts directory created at : {self.config.artifacts_dir_name} ")

    def get_data_ingestion_config(self) -> DataIngestionConfiguration:
        """Prepare the ingestion directories and return their configuration.

        Returns:
            DataIngestionConfiguration: ``root_dir_name`` and
            ``dataset_download_url`` are taken verbatim from the config file;
            the zip/unzip fields hold paths joined under
            ``<artifacts>/<root_dir_name>``.
        """
        artifact_dir = self.config.artifacts_dir_name
        config = self.config.data_ingestion_config

        data_ingestion_dir = os.path.join(artifact_dir, config.root_dir_name)
        create_directories(data_ingestion_dir)

        raw_data_dir = os.path.join(data_ingestion_dir, config.zip_data_dir_name)
        create_directories(raw_data_dir)

        ingested_csv_data_dir = os.path.join(data_ingestion_dir, config.unzip_data_dir_name)
        create_directories(ingested_csv_data_dir)

        data_ingestion_config = DataIngestionConfiguration(
            root_dir_name=config.root_dir_name,
            dataset_download_url=config.dataset_download_url,
            zip_data_dir_name=raw_data_dir,
            unzip_data_dir_name=ingested_csv_data_dir,
        )

        logging.info(f" Data ingestion configuration: {data_ingestion_config}")

        return data_ingestion_config

    def get_data_validation_configuration(self) -> DataValidationConfiguration:
        """Prepare the validation directories and return their configuration.

        Returns:
            DataValidationConfiguration: ``validated_root_dir_name`` and
            ``validated_required_files`` are taken verbatim from the config
            file; the train/test directories and the status report file are
            paths joined under ``<artifacts>/<validated_root_dir_name>``.
        """
        artifact_dir = self.config.artifacts_dir_name
        config = self.config.data_validation_config

        data_validation_dir = os.path.join(artifact_dir, config.validated_root_dir_name)
        create_directories(data_validation_dir)

        data_validation_train_dir = os.path.join(data_validation_dir, config.validated_train_dir)
        create_directories(data_validation_train_dir)

        data_validation_test_dir = os.path.join(data_validation_dir, config.validated_test_dir)
        create_directories(data_validation_test_dir)

        data_validation_config = DataValidationConfiguration(
            validated_root_dir_name=config.validated_root_dir_name,
            validated_train_dir=data_validation_train_dir,
            validated_test_dir=data_validation_test_dir,
            validated_status_report_file_name=os.path.join(
                data_validation_dir, config.validated_status_report_file_name
            ),
            validated_required_files=config.validated_required_files,
        )

        logging.info(f" Data validation configuration: {data_validation_config}")

        return data_validation_config

    def get_data_transformation_configuration(self) -> DataTransformationConfiguration:
        """Prepare the transformation directories and return their configuration.

        Returns:
            DataTransformationConfiguration: every field is a path joined under
            ``<artifacts>/<transformed_root_dir_name>`` (the root field is that
            directory itself).
        """
        artifact_dir = self.config.artifacts_dir_name
        config = self.config.data_transformation_config

        data_transformation_dir = os.path.join(artifact_dir, config.transformed_root_dir_name)
        create_directories(data_transformation_dir)

        data_transformation_train_dir = os.path.join(data_transformation_dir, config.transformed_train_dir)
        create_directories(data_transformation_train_dir)

        data_transformation_test_dir = os.path.join(data_transformation_dir, config.transformed_test_dir)
        create_directories(data_transformation_test_dir)

        data_transformation_industrial_data_dir = os.path.join(
            data_transformation_dir, config.transformed_industrial_data_dir
        )
        create_directories(data_transformation_industrial_data_dir)

        data_transformation_preprocess_data_dir = os.path.join(
            data_transformation_dir, config.transformed_preprocess_dir
        )
        create_directories(data_transformation_preprocess_data_dir)

        data_transformation_config = DataTransformationConfiguration(
            transformed_root_dir_name=data_transformation_dir,
            transformed_train_dir=data_transformation_train_dir,
            transformed_test_dir=data_transformation_test_dir,
            transformed_industrial_data_dir=data_transformation_industrial_data_dir,
            transformed_preprocess_dir=data_transformation_preprocess_data_dir,
        )

        logging.info(f" Data transformation configuration: {data_transformation_config}")

        return data_transformation_config

In [6]:
#step - 6 : updating components

import os
from pathlib import Path
from sklearn.model_selection import StratifiedShuffleSplit
from pulsarclassification.logging import logging
from pulsarclassification.constants import *
from pulsarclassification.utils.common import read_yaml,create_directories,get_file_size
from pulsarclassification.entity import DataIngestionConfiguration,DataValidationConfiguration

class DataTransformation:
    """Select the schema columns from the validated CSVs and save the splits.

    The validated model data is split into stratified train/test sets and
    written, together with the industrial data, to the directories named in
    the transformation configuration.
    """

    def __init__(self, 
                 validation_config:DataValidationConfiguration,
                 transformation_config: DataTransformationConfiguration):
        """Store both stage configurations and load the schema file.

        Args:
            validation_config: Supplies the directories the validated CSVs are
                read from.
            transformation_config: Supplies the directories the transformed
                CSVs are written to.
        """
        self.validation_config = validation_config
        self.transformation_config = transformation_config
        self.schema = read_yaml(SCHEMA_FILE_PATH)

    def _save_csv(self, frame, directory, file_name):
        """Write ``frame`` to ``directory/file_name`` without the index and log it."""
        frame.to_csv(os.path.join(directory, file_name), index=False)
        logging.info(f"{file_name} saved in {directory}")

    def file_transformation_saving(self):
        """Transform the validated data and write the three output CSVs.

        Steps:
            1. Read the model-training and industrial CSVs from the validation
               directories.
            2. Keep the schema's columns minus ``"id"`` for the model data, and
               additionally minus the target column for the industrial data.
            3. Split the model data 90/10, stratified on the target column,
               with a fixed random state.
            4. Save train, test and industrial frames and log each save.

        Returns:
            None. The results are the files written to disk.
        """
        model_data_file_path = os.path.join(self.validation_config.validated_train_dir,
                                            VALIDATED_DATA_FILE_NAME_FOR_MODEL_TRAIN)
        industrial_data_file = os.path.join(self.validation_config.validated_test_dir,
                                            VALIDATED_INDUSTRIALDATA_FILE_NAME)

        model_data = pd.read_csv(model_data_file_path)
        industrial_data = pd.read_csv(industrial_data_file)

        # `numeriacl_columns` is the key spelling used by the schema file, so
        # it must stay as is. Build fresh lists instead of mutating one list.
        target_column = self.schema.target_column
        model_columns = [name for name in self.schema.numeriacl_columns.split(" ") if name != "id"]
        feature_columns = [name for name in model_columns if name != target_column]

        model_data = model_data[model_columns]
        industrial_data = industrial_data[feature_columns]

        # Stratify on the schema's target column rather than a hard-coded name.
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
        train_positions, test_positions = next(
            splitter.split(model_data, model_data[target_column])
        )

        # split() yields positional indices, so select with .iloc; .loc would
        # only be equivalent while the frame has a default RangeIndex.
        model_train_set = model_data.iloc[train_positions].reset_index(drop=True)
        model_test_set = model_data.iloc[test_positions].reset_index(drop=True)

        self._save_csv(model_train_set,
                       self.transformation_config.transformed_train_dir,
                       TRANSFORMED_MODEL_TRAIN_FILE_NAME)
        self._save_csv(model_test_set,
                       self.transformation_config.transformed_test_dir,
                       TRANSFORMED_MODEL_TEST_FILE_NAME)
        self._save_csv(industrial_data,
                       self.transformation_config.transformed_industrial_data_dir,
                       TRANSFORMED_INDUSTRIALDATA_FILE_NAME)
        
    
                

In [7]:
#from pulsarclassification.config.configuration import ConfigurationManager
#from pulsarclassification.components.data_validation import DataValidation

# Run the data-transformation stage: build the validation and transformation
# configurations, then transform and save the data. Exceptions propagate as
# they are; the former `try/except Exception as e: raise e` wrapper only
# re-raised the same exception.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_configuration()
data_transformation_config = config.get_data_transformation_configuration()
data_transformation = DataTransformation(validation_config=data_validation_config,
                                         transformation_config=data_transformation_config)
data_transformation.file_transformation_saving()

[24-08-2023 07:37:08: INFO: common:  yaml file from this path config\config.yaml read succesfully]
[24-08-2023 07:37:08: INFO: common:  Directory already present: artifacts ]
[24-08-2023 07:37:08: INFO: 4025342101:  Artifacts directory created at : artifacts ]
[24-08-2023 07:37:08: INFO: common:  Directory already present: artifacts\data_validation ]
[24-08-2023 07:37:08: INFO: common:  Directory already present: artifacts\data_validation\training_data_for_model ]
[24-08-2023 07:37:08: INFO: common:  Directory already present: artifacts\data_validation\industrial_test_data ]
[24-08-2023 07:37:08: INFO: 4025342101:  Data validation configuration: DataValidationConfiguration(validated_root_dir_name='data_validation', validated_train_dir='artifacts\\data_validation\\training_data_for_model', validated_test_dir='artifacts\\data_validation\\industrial_test_data', validated_status_report_file_name='artifacts\\data_validation\\status.txt', validated_required_files=BoxList(['train', 'test', 's