In [12]:
import os
import requests
import zipfile
from pathlib import Path
from dataclasses import dataclass
from typing import Dict
import yaml

In [13]:
%pwd

'e:\\AI DA Portfolio\\Multiple-Disease-Prediction'

In [4]:
os.chdir("../")

In [5]:
%pwd

'e:\\AI DA Portfolio\\Multiple-Disease-Prediction'

In [14]:
@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path

@dataclass(frozen=True)
class MultiAlgorithmDataIngestionConfig:
    algorithms: Dict[str, DataIngestionConfig]

In [18]:
# Load the config.yaml file
with open("config/config.yaml", "r") as f:
    config = yaml.safe_load(f)

# Extract the root directory and algorithm configurations
artifacts_root = Path(config["artifacts_root"])

data_ingestion_config = MultiAlgorithmDataIngestionConfig(
    algorithms={
        key: DataIngestionConfig(
            root_dir=artifacts_root / Path(algorithm["root_dir"]),
            source_URL=algorithm["source_URL"],
            local_data_file=artifacts_root / Path(algorithm["local_data_file"]),
            unzip_dir=artifacts_root / Path(algorithm["unzip_dir"])
        )
        for key, algorithm in config["data_ingestion"].items()
    }
)

In [19]:
def download_data(source_url: str, destination: Path) -> None:
    """Downloads the data from a given URL to the specified destination."""
    response = requests.get(source_url)
    response.raise_for_status()  # Ensure the request was successful
    with open(destination, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded data to {destination}")

def unzip_data(zip_file: Path, extract_to: Path) -> None:
    """Unzips the data to the specified directory."""
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"Unzipped data to {extract_to}")

def prepare_data(config: DataIngestionConfig) -> None:
    """Handles the entire data ingestion process for a single algorithm."""
    config.unzip_dir.mkdir(parents=True, exist_ok=True)
    if not config.local_data_file.exists():
        download_data(config.source_URL, config.local_data_file)
    if not any(config.unzip_dir.iterdir()):
        unzip_data(config.local_data_file, config.unzip_dir)
    print(f"Data prepared at {config.unzip_dir}")


In [20]:
class DataIngestion:
    def __init__(self, config: DataIngestionConfig):
        self.config = config


    
    def download_file(self):
        if not os.path.exists(self.config.local_data_file):
            filename, headers = request.urlretrieve(
                url = self.config.source_URL,
                filename = self.config.local_data_file
            )
            logger.info(f"{filename} download! with following info: \n{headers}")
        else:
            logger.info(f"File already exists of size: {get_size(Path(self.config.local_data_file))}")  

        
    
    def extract_zip_file(self):
        """
        zip_file_path: str
        Extracts the zip file into the data directory
        Function returns None
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

In [21]:
for algorithm_name, config in data_ingestion_config.algorithms.items():
    print(f"\nPreparing data for {algorithm_name}...")
    prepare_data(config)


Preparing data for heart_disease...
Downloaded data to artifacts\${artifacts_root}\data_ingestion\heart_disease_data.zip
Unzipped data to artifacts\${artifacts_root}\data_ingestion\heart_disease
Data prepared at artifacts\${artifacts_root}\data_ingestion\heart_disease

Preparing data for parkinsons...
Downloaded data to artifacts\${artifacts_root}\data_ingestion\parkinsons_data.zip
Unzipped data to artifacts\${artifacts_root}\data_ingestion\parkinsons
Data prepared at artifacts\${artifacts_root}\data_ingestion\parkinsons

Preparing data for diabetes...
Downloaded data to artifacts\${artifacts_root}\data_ingestion\diabetes_data.zip
Unzipped data to artifacts\${artifacts_root}\data_ingestion\diabetes
Data prepared at artifacts\${artifacts_root}\data_ingestion\diabetes
