# Data ingestion stage

In [1]:
from collections import namedtuple
import os

In [2]:
DataIngestionConfig = namedtuple("DataIngestionConfig",[
    "root_dir",
    "source_URL",
    "local_data_file",
    "unzip_dir"
])

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir       :Path
    source_URL     :str
    local_data_file:Path
    unzip_dir      :Path

In [4]:
%pwd

'c:\\Users\\Admin\\Projects\\CNN_Dogs_vs_Cats\\CNN_Dogs_vs_Cats\\research'

## Configuration manager class

In [5]:
%cd C:\Users\Admin\Projects\CNN_Dogs_vs_Cats\CNN_Dogs_vs_Cats\src

C:\Users\Admin\Projects\CNN_Dogs_vs_Cats\CNN_Dogs_vs_Cats\src


In [6]:
from CNN_Classifier.constants import *
from CNN_Classifier.utils import create_directories,read_yaml

In [7]:
print(CONFIG_FILE_PATH)

C:\Users\Admin\Projects\CNN_Dogs_vs_Cats\CNN_Dogs_vs_Cats\config\config.yaml


In [8]:
os.chdir('../')

In [9]:
%pwd

'C:\\Users\\Admin\\Projects\\CNN_Dogs_vs_Cats\\CNN_Dogs_vs_Cats'

In [10]:
class ConfigurationManager:
        def __init__(self,config_filepath = CONFIG_FILE_PATH,params_filepath = PARAMS_FILE_PATH):
                
                self.config = read_yaml(config_filepath)
                self.params = read_yaml(params_filepath)
                
        
                create_directories([self.config.artifacts_root]) # creating artifacts

        def get_data_ingestion_config(self) ->DataIngestionConfig: 
                config = self.config.data_ingestion

                create_directories([config.root_dir])
                
                data_ingestion_config  = DataIngestionConfig(
                        root_dir       = config.root_dir,
                        source_URL     = self.config.data_ingestion.source_URL,
                        local_data_file= config.local_data_file,
                        unzip_dir      = config.unzip_dir
                )
                return data_ingestion_config  

In [11]:
import os
import  urllib.request as request
from zipfile import ZipFile

In [14]:
class DataIngestion:
    def __init__(self, config: DataIngestionConfig):
        self.config = config

    
    def download_file(self):
        if not os.path.exists(self.config.local_data_file):
            filename, headers = request.urlretrieve(
                url = self.config.source_URL,
                filename = self.config.local_data_file
            )

    
    def _get_updated_list_of_files(self, list_of_files):
        return [f for f in list_of_files if f.endswith(".jpg") and ("Cat" in f or "Dog" in f)]
    
    
    def _preprocess(self, zf: ZipFile, f: str, working_dir: str):
        target_filepath = os.path.join(working_dir, f)
        if not os.path.exists(target_filepath):
            zf.extract(f, working_dir)
        
        if os.path.getsize(target_filepath) == 0:
            os.remove(target_filepath)

    

    
    def unzip_and_clean(self):
        with ZipFile(file=self.config.local_data_file, mode="r") as zf:
            list_of_files = zf.namelist()
            updated_list_of_files = self._get_updated_list_of_files(list_of_files)
            for f in updated_list_of_files:
                self._preprocess(zf, f, self.config.unzip_dir)
                

In [16]:
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.unzip_and_clean()
except Exception as e:
    raise e

[2023-11-30 00:21:08,950:INFO:common:yaml file: C:\Users\Admin\Projects\CNN_Dogs_vs_Cats\CNN_Dogs_vs_Cats\config\config.yaml loaded successfully]
Till here ok
[2023-11-30 00:21:08,950:INFO:common:yaml file: params.yaml loaded successfully]
Till here ok
[2023-11-30 00:21:08,964:INFO:common:created directory at: artifacts]
[2023-11-30 00:21:08,965:INFO:common:created directory at: artifacts/data_ingestion]


BadZipFile: File is not a zip file