In [1]:
import os

In [2]:
%pwd

'g:\\success_analytics_courses\\Customer_personality_analysis_project\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'g:\\success_analytics_courses\\Customer_personality_analysis_project'

step - 1 config.yaml completed

In [5]:
## step - 2 updating entity
## it will return the data types all data_ingestion requirements

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfiguration:

    root_dir: Path
    dataset_download_url: str
    zip_data_dir: Path
    unzip_data_dir: Path

In [6]:
# updating configuration manager in src config

from customerpersonality.logging import logging
from customerpersonality.constants import *
from customerpersonality.utils.common import read_yaml, create_directories
from customerpersonality.entity import DataIngestionConfiguration

class ConfigurationManager:

    def __init__(self, config_file_path: str = CONFIG_FILE_PATH):

        try:

            #self.config_file_path = CONFIG_FILE_PATH

            self.config = read_yaml(config_file_path) ## read config.yaml file

            create_directories([self.config.artifacts_dir])

            logging.info(f"artifacts directory created: {self.config.artifacts_dir}")

        except Exception as e:

            raise e

        
    def get_data_ingestion_config(self) -> DataIngestionConfiguration:

        try:

            artifacts_dir = self.config.artifacts_dir
            config = self.config.data_ingestion_config
            
            data_ingestion_dir = os.path.join(artifacts_dir,config.root_dir)
            create_directories([data_ingestion_dir])

            raw_data_dir = os.path.join(data_ingestion_dir,config.zip_data_dir)
            create_directories([raw_data_dir])

            ingested_unzip_csv_file_dir = os.path.join(data_ingestion_dir,config.unzip_data_dir)
            create_directories([ingested_unzip_csv_file_dir])

            data_ingestion_config = DataIngestionConfiguration( 
                                        root_dir = config.root_dir,
                                        dataset_download_url = config.dataset_download_url,
                                        zip_data_dir = raw_data_dir,
                                        unzip_data_dir = ingested_unzip_csv_file_dir,
                                    )
            logging.info(f" Data ingestion configuration: {data_ingestion_config}")

            return data_ingestion_config
        
        except Exception as e:
            raise e

In [7]:
## updating components

import os
import urllib.request as request
import zipfile
from customerpersonality.logging import logging
from customerpersonality.utils.common import get_file_size
from customerpersonality.entity import DataIngestionConfiguration

class DataIngestion:

    def __init__(self, config : DataIngestionConfiguration):

        try:
            self.config = config
        except Exception as e:
            raise e
        
    def download_zip_file(self):

        try:
            zip_file_name = os.path.basename(self.config.dataset_download_url)
            zip_data_path = os.path.join(self.config.zip_data_dir,zip_file_name)
            self.zip_data_path_ = zip_data_path
            if not os.path.exists(zip_data_path):
                filename, headers = request.urlretrieve(
                    url = self.config.dataset_download_url,
                    filename = zip_data_path
                )
                logging.info(f"{filename} download! with following info: \n{headers}")
            else:
                logging.info(f"File already exists of size: {get_file_size(Path(zip_data_path))}") 

        except Exception as e:
            raise e
        
    def extract_zip_file(self):

        try:
            unzip_data_path = self.config.unzip_data_dir
            with zipfile.ZipFile(self.zip_data_path_, 'r') as zip_file:
                zip_file.extractall(unzip_data_path)
            
            logging.info(f"Data unzipped in: {unzip_data_path}")

        except Exception as e:
            raise e
        

In [8]:
## updating pipeline

try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_zip_file()
    data_ingestion.extract_zip_file()
except Exception as e:
    raise e

[2023-07-27 07:51:27,610: INFO: common: yaml file: g:\success_analytics_courses\Customer_personality_analysis_project\config\config.yaml read successfully]
[2023-07-27 07:51:27,614: INFO: common: Directory created at:artifacts]
[2023-07-27 07:51:27,619: INFO: 3622309105: artifacts directory created: artifacts]
[2023-07-27 07:51:27,622: INFO: common: Directory created at:artifacts\data_ingestion]
[2023-07-27 07:51:27,628: INFO: common: Directory created at:artifacts\data_ingestion\raw_data]
[2023-07-27 07:51:27,631: INFO: common: Directory created at:artifacts\data_ingestion\ingested_data]
[2023-07-27 07:51:27,635: INFO: 3622309105:  Data ingestion configuration: DataIngestionConfiguration(root_dir='data_ingestion', dataset_download_url='https://github.com/sumit-1492/datasets/raw/main/marketing_campaign.zip', zip_data_dir='artifacts\\data_ingestion\\raw_data', unzip_data_dir='artifacts\\data_ingestion\\ingested_data')]
[2023-07-27 07:51:34,801: INFO: 2152330188: artifacts\data_ingestion