In [1]:
import os

In [2]:
%pwd

'e:\\AI DA Portfolio\\Potato-Disease\\research'

In [3]:
# Step up from the notebook folder to the project root exactly once.
# An unconditional chdir("../") climbs one more level every time this cell
# is re-run, which silently breaks every relative path used further down.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'e:\\AI DA Portfolio\\Potato-Disease'

In [7]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings bundle for the data-ingestion stage.

    One source URL, one local archive file and one extraction directory are
    declared for each of the three dataset splits (train, val, test), plus a
    root directory for the stage. ``frozen=True`` makes instances read-only
    after construction.
    """

    # Root folder of the ingestion stage.
    root_dir: Path
    # Where each split's archive is downloaded from.
    train_data_source_URL: str
    val_data_source_URL: str
    test_data_source_URL: str
    # Where each downloaded archive is written on disk.
    local_train_data_file: Path
    local_val_data_file: Path
    local_test_data_file: Path
    # Where each archive is extracted to.
    unzip_train_dir: Path
    unzip_val_dir: Path
    unzip_test_dir: Path

In [8]:
from PotatoDisease.constants import *
from PotatoDisease.utils.common import read_yaml, create_directories

In [9]:
class ConfigurationManager:
    """Load the project's YAML settings and hand out per-stage config objects.

    The constructor reads the config and params files and makes sure the
    artifacts root directory exists; each ``get_*_config`` method then builds
    a typed config object for a single pipeline stage.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH
    ):
        """Read both YAML files and create the artifacts root.

        Args:
            config_filepath: Location of the main config YAML file.
            params_filepath: Location of the params YAML file.
        """
        self.config = read_yaml(Path(config_filepath))
        self.params = read_yaml(Path(params_filepath))

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        The stage root and the three extraction directories are created as a
        side effect, so downstream code can write into them immediately.

        Returns:
            A ``DataIngestionConfig`` whose filesystem fields are ``Path``
            objects and whose URL fields are left as read from the YAML.
        """
        ingestion = self.config.data_ingestion

        create_directories([
            ingestion.root_dir,
            ingestion.unzip_train_dir,
            ingestion.unzip_val_dir,
            ingestion.unzip_test_dir
        ])

        # DataIngestionConfig annotates every location as Path; wrap the raw
        # YAML values so the objects really match their declared types.
        return DataIngestionConfig(
            root_dir=Path(ingestion.root_dir),
            train_data_source_URL=ingestion.train_data_source_URL,
            val_data_source_URL=ingestion.val_data_source_URL,
            test_data_source_URL=ingestion.test_data_source_URL,
            local_train_data_file=Path(ingestion.local_train_data_file),
            local_val_data_file=Path(ingestion.local_val_data_file),
            local_test_data_file=Path(ingestion.local_test_data_file),
            unzip_train_dir=Path(ingestion.unzip_train_dir),
            unzip_val_dir=Path(ingestion.unzip_val_dir),
            unzip_test_dir=Path(ingestion.unzip_test_dir)
        )
    

In [11]:
import os
import urllib.request as request
import zipfile
from PotatoDisease.logging import logger
from PotatoDisease.utils.common import get_size
from tqdm import tqdm

In [12]:
class DataIngestion:
    """Download and unpack the train/val/test dataset archives.

    Every URL and filesystem location is read from the config object given to
    the constructor. The splits are addressed by the strings ``"train"``,
    ``"val"`` and ``"test"``.
    """

    # The only split names the config object defines attributes for.
    _DATASET_TYPES = ("train", "val", "test")

    def __init__(self, config):
        """Keep a reference to the ingestion settings.

        Args:
            config: Object exposing ``<split>_data_source_URL``,
                ``local_<split>_data_file`` and ``unzip_<split>_dir``
                attributes for each split.
        """
        self.config = config

    def _config_value(self, dataset_type, attr_template):
        """Return the config attribute named by ``attr_template`` for one split.

        Args:
            dataset_type: ``"train"``, ``"val"`` or ``"test"``.
            attr_template: Attribute name with ``{}`` where the split goes,
                e.g. ``"local_{}_data_file"``.

        Raises:
            ValueError: If ``dataset_type`` is not one of the three splits.
        """
        if dataset_type not in self._DATASET_TYPES:
            raise ValueError("Invalid dataset type. Choose from 'train', 'val', or 'test'.")
        return getattr(self.config, attr_template.format(dataset_type))

    def download_file(self, dataset_type):
        """Download one split's archive to its configured local file.

        Args:
            dataset_type: ``"train"``, ``"val"`` or ``"test"``.

        Raises:
            ValueError: If ``dataset_type`` is not one of the three splits.
        """
        url = self._config_value(dataset_type, "{}_data_source_URL")
        local_data_file = self._config_value(dataset_type, "local_{}_data_file")

        # urlretrieve opens the target for writing and fails when its folder
        # is missing, so create it first. An empty dirname means the current
        # directory, which needs no creation.
        target_dir = os.path.dirname(local_data_file)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        logger.info(f"Downloading {dataset_type} dataset from {url}...")
        request.urlretrieve(url, local_data_file)
        logger.info(f"Downloaded {dataset_type} dataset to {local_data_file}.")

    def extract_zip_file(self, dataset_type):
        """Extract one split's downloaded archive into its configured directory.

        The target directory is created if needed, before the archive is
        validated.

        Args:
            dataset_type: ``"train"``, ``"val"`` or ``"test"``.

        Raises:
            ValueError: If ``dataset_type`` is not one of the three splits.
            zipfile.BadZipFile: If the local file is not a valid zip archive.
        """
        local_data_file = self._config_value(dataset_type, "local_{}_data_file")
        unzip_path = self._config_value(dataset_type, "unzip_{}_dir")

        os.makedirs(unzip_path, exist_ok=True)

        # Reject anything that is not a zip (e.g. an HTML error page saved
        # under a .zip name) before trying to open it.
        if not zipfile.is_zipfile(local_data_file):
            raise zipfile.BadZipFile("The file is not a valid zip file.")

        logger.info(f"Extracting {dataset_type} dataset...")
        with zipfile.ZipFile(local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)
        logger.info(f"Extracted {dataset_type} dataset to {unzip_path}.")


In [13]:
try:
    # Build the ingestion stage from the YAML-backed configuration.
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)

    # Fetch, then unpack, each split in turn.
    for dataset_type in ("train", "val", "test"):
        data_ingestion.download_file(dataset_type)
        data_ingestion.extract_zip_file(dataset_type)

except Exception as exc:
    # Record the failure in the project log, then let the notebook surface it.
    logger.error(f"An error occurred: {exc}")
    raise

[2024-08-12 05:48:28,668: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-08-12 05:48:28,676: INFO: common: yaml file: params.yaml loaded successfully]
[2024-08-12 05:48:28,677: INFO: common: created directory at: artifacts]
[2024-08-12 05:48:28,680: INFO: common: created directory at: artifacts/data_ingestion]
[2024-08-12 05:48:28,681: INFO: common: created directory at: artifacts/data_ingestion/train]
[2024-08-12 05:48:28,682: INFO: common: created directory at: artifacts/data_ingestion/val]
[2024-08-12 05:48:28,683: INFO: common: created directory at: artifacts/data_ingestion/test]
[2024-08-12 05:48:28,684: INFO: 2068823624: Downloading train dataset from https://www.dropbox.com/scl/fi/yygtj4pn20onqebi9zbeu/train_data.zip?rlkey=xejca3z1s880obr1y7rks3e8z&st=jslqcrbr&dl=1...]
[2024-08-12 05:48:32,753: INFO: 2068823624: Downloaded train dataset to artifacts/data_ingestion/train_data.zip.]
[2024-08-12 05:48:32,760: INFO: 2068823624: Extracting train dataset...]
[2