In [1]:
import os
# Display the kernel's current working directory.
%pwd

'g:\\success_analytics_courses\\internship_project\\pulsar_project\\research'

In [2]:
# Move one level up from the notebook's folder so project-relative paths resolve.
# NOTE(review): the path is relative, so re-running this cell climbs one more
# level each time — run it exactly once per kernel session.
os.chdir("../")
%pwd

'g:\\success_analytics_courses\\internship_project\\pulsar_project'

In [3]:
# Step 1: update the YAML config
# Step 2: update the constants
# Step 4: update the entity

from dataclasses import dataclass 
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfiguration:
    """Immutable bundle of settings consumed by the data-ingestion stage."""

    # Name of the data-ingestion directory.
    root_dir_name: Path
    # URL the dataset archive is downloaded from.
    dataset_download_url: str
    # Directory that receives the downloaded zip archive.
    zip_data_dir_name: Path
    # Directory the archive contents are extracted into.
    unzip_data_dir_name: Path


In [10]:
# Step 5: update the configuration manager

import sys
from pulsarclassification.logging import logging
from pulsarclassification.exception import PulsarException
from box.exceptions import BoxValueError
from pulsarclassification.constants import *
from pulsarclassification.utils.common import read_yaml,create_directories
#from pulsarclassification.entity import DataIngestionConfiguration

class ConfigurationManager:
    """Load the project YAML configuration and build per-stage config objects.

    Constructing the manager reads the configuration file and creates the
    top-level artifacts directory named in it.
    """

    def __init__(self, config_file_path: str = CONFIG_FILE_PATH):
        """Read the configuration file and create the artifacts directory.

        Args:
            config_file_path: Path of the YAML configuration file to load.
                Defaults to ``CONFIG_FILE_PATH``.

        Raises:
            PulsarException: Wraps any error raised while reading the file or
                creating the directory.
        """
        try:
            # Read the path the caller passed in. Previously the argument was
            # ignored and CONFIG_FILE_PATH was always loaded.
            self.config = read_yaml(config_file_path)
            create_directories(self.config.artifacts_dir_name)
            logging.info(f" Artifacts directory created at : {self.config.artifacts_dir_name} ")

        except Exception as e:
            raise PulsarException(e,sys)

    def get_data_ingestion_config(self) -> DataIngestionConfiguration:
        """Create the data-ingestion directories and return their configuration.

        The ingestion root is placed under the artifacts directory, and the
        zip and unzip directories are placed under the ingestion root.

        Returns:
            DataIngestionConfiguration: The download URL plus the zip and
            unzip directory paths built here.

        Raises:
            PulsarException: Wraps any error raised while building the paths
                or creating the directories.
        """
        try:
            artifact_dir = self.config.artifacts_dir_name
            config = self.config.data_ingestion_config

            data_ingestion_dir = os.path.join(artifact_dir,config.root_dir_name)
            create_directories(data_ingestion_dir)

            raw_data_dir = os.path.join(data_ingestion_dir,config.zip_data_dir_name)
            create_directories(raw_data_dir)

            ingested_csv_data_dir = os.path.join(data_ingestion_dir,config.unzip_data_dir_name)
            create_directories(ingested_csv_data_dir)

            # NOTE(review): root_dir_name carries the bare name from the YAML,
            # while the two *_data_dir_name fields carry joined paths. Left
            # unchanged; confirm whether data_ingestion_dir was intended.
            data_ingestion_config = DataIngestionConfiguration(
                root_dir_name  = config.root_dir_name,
                dataset_download_url = config.dataset_download_url,
                zip_data_dir_name = raw_data_dir,
                unzip_data_dir_name = ingested_csv_data_dir
            )

            logging.info(f" Data ingestion configuration: {data_ingestion_config}")

            return data_ingestion_config

        except Exception as e:
            raise PulsarException(e,sys)
        


In [11]:
# Step 6: update the components

import os
import urllib.request as request
import zipfile
from pulsarclassification.logging import logging
from pulsarclassification.utils.common import get_file_size
#from pulsarclassification.entity import DataIngestionConfiguration

class DataIngestion:
    """Download the dataset archive and extract it into the configured directories."""

    def __init__(self, config : DataIngestionConfiguration):
        """Store the data-ingestion configuration.

        Args:
            config: Supplies ``dataset_download_url``, ``zip_data_dir_name``
                and ``unzip_data_dir_name``.
        """
        self.config = config

    def _resolve_zip_data_path(self) -> str:
        """Return the local archive path: zip directory joined with the URL's last segment."""
        zip_file_name = os.path.basename(self.config.dataset_download_url)
        return os.path.join(self.config.zip_data_dir_name,zip_file_name)

    def zip_file_downloader(self):
        """Download the dataset archive unless it is already on disk.

        The resolved archive path is also stored on ``self.zip_data_path_``.

        Raises:
            PulsarException: Wraps any error raised while downloading or
                inspecting the file.
        """
        try:
            zip_data_path = self._resolve_zip_data_path()
            self.zip_data_path_ = zip_data_path
            if not os.path.exists(zip_data_path):
                try:
                    filename, headers = request.urlretrieve(url = self.config.dataset_download_url,filename=zip_data_path)
                except Exception:
                    # A failed transfer can leave a truncated file behind. Remove it
                    # so the next run retries instead of reporting "already exists".
                    if os.path.exists(zip_data_path):
                        os.remove(zip_data_path)
                    raise
                logging.info(f"{filename} download! with following info: \n{headers}")
            else:
                logging.info(f"File already exists of size: {get_file_size(Path(zip_data_path))}")
        except Exception as e:
            raise PulsarException(e,sys)

    def zip_file_extractor(self):
        """Extract the downloaded archive into the unzip directory.

        Uses ``self.zip_data_path_`` when the downloader has set it. Otherwise
        the path is derived from the configuration, so this method no longer
        requires ``zip_file_downloader`` to have run on the same instance.

        Raises:
            PulsarException: Wraps any error raised while opening or
                extracting the archive.
        """
        try:
            unzip_data_path = self.config.unzip_data_dir_name
            zip_data_path = getattr(self, "zip_data_path_", None) or self._resolve_zip_data_path()
            with zipfile.ZipFile(zip_data_path, 'r') as zip_file:
                zip_file.extractall(unzip_data_path)
            logging.info(f"Data unzipped in: {unzip_data_path}")
        except Exception as e:
            raise PulsarException(e,sys)



In [12]:
# Update the pipeline

from pulsarclassification.config.configuration import ConfigurationManager
from pulsarclassification.components.data_ingestion import DataIngestion

# Run the ingestion stage end to end: build the configuration, download the
# archive, then extract it.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()

    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.zip_file_downloader()
    data_ingestion.zip_file_extractor()
except Exception as e:
    # Surface any failure as the project's exception type.
    raise PulsarException(e,sys)

[16-08-2023 21:38:49: INFO: common:  yaml file from this path config\config.yaml read succesfully]
[16-08-2023 21:38:49: INFO: common:  Directory created in this: artifacts ]
[16-08-2023 21:38:49: INFO: 2394603619:  Artifacts directory created at : artifacts ]
[16-08-2023 21:38:49: INFO: common:  Directory created in this: artifacts\data_ingestion ]
[16-08-2023 21:38:49: INFO: common:  Directory created in this: artifacts\data_ingestion\raw_data ]
[16-08-2023 21:38:49: INFO: common:  Directory created in this: artifacts\data_ingestion\ingested_data ]
[16-08-2023 21:38:49: INFO: 2394603619:  Data ingestion configuration: DataIngestionConfiguration(root_dir_name='data_ingestion', dataset_download_url='https://github.com/sumit-1492/datasets/raw/main/playground-series-s3e10.zip', zip_data_dir_name='artifacts\\data_ingestion\\raw_data', unzip_data_dir_name='artifacts\\data_ingestion\\ingested_data')]
[16-08-2023 21:38:53: INFO: 4093054242: artifacts\data_ingestion\raw_data\playground-series