In [1]:
import os
%pwd

'c:\\Users\\VenuraP\\Desktop\\Browns Data Projects\\ML Projects\\POC\\Harti-Food-Price-Prediction\\notebooks'

In [2]:
os.chdir("../")

## Update the Entity

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path

In [4]:
from src.constants import * #import constants (paths to yaml files) inside here
from src.utils.common import read_yaml, create_directories 

## Configuration Manager(configuration.py)

In [5]:
# Function to initialize configuration by reading YAML and creating directories
def initialize_configuration(config_filepath: Path = CONFIG_FILE_PATH, schema_filepath: Path = SCHEMA_FILE_PATH):
    """
    Reads the config and schema YAML files and creates the necessary directories.

    Args:
        config_filepath (Path): The path to the configuration YAML file.
        schema_filepath (Path): The path to the schema YAML file.

    Returns:
        config: The loaded configuration from YAML.
        schema: The loaded schema from YAML.
    """
    config = read_yaml(config_filepath)
    schema = read_yaml(schema_filepath)

    return config, schema


# Function to get data ingestion configuration from the loaded config
def get_data_ingestion_config(config) -> DataIngestionConfig:
    """
    Retrieves data ingestion configuration from the config and creates necessary directories.

    Args:
        config: The loaded configuration from YAML.

    Returns:
        DataIngestionConfig: A dataclass containing the data ingestion configuration.
    """
    # Extract data ingestion settings from the config
    data_ingestion = config.data_ingestion

    # Create and return a DataIngestionConfig instance
    return DataIngestionConfig(
        root_dir=data_ingestion.root_dir,
        source_URL=data_ingestion.source_URL,
        local_data_file=data_ingestion.local_data_file,
        unzip_dir=data_ingestion.unzip_dir
    )

# load config and schema
config, schema = initialize_configuration()
# function to create directories
create_directories([config.artifacts_root])
# Step 4: Retrieve the data ingestion configuration from the loaded config
data_ingestion_config = get_data_ingestion_config(config)
# Step 5: Create the directories related to data ingestion (root directory)
create_directories([data_ingestion_config.root_dir])

[2024-09-24 15:26:20,918: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-09-24 15:26:20,923: INFO: common: yaml file: schema.yaml loaded successfully]
{'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/venura-Pussella/Nadu_2_Price_Prediction/blob/main/Lstm_Dates.xlsx', 'local_data_file': 'artifacts/data_ingestion/Lstm_Dates.xlsx', 'unzip_dir': 'artifacts/data_ingestion'}, 'data_validation': {'root_dir': 'artifacts/data_validation', 'unzip_data_dir': 'artifacts/data_ingestion/Lstm_Dates.xlsx', 'STATUS_FILE': 'artifacts/data_validation/status.txt'}, 'data_preprocessing': {'root_dir': 'artifacts/data_preprocessing', 'unzip_data_dir': 'artifacts/data_ingestion/Lstm_Dates.xlsx', 'STATUS_FILE': 'artifacts/data_preprocessing/status.txt'}, 'data_transformation': {'root_dir': 'artifacts/data_transformation', 'data_path': 'artifacts/data_preprocessing/Lstm_Dates.xlsx'}, 'model_trainer': {'root_dir':

## Data Ingestion Components

In [6]:
import os
from pathlib import Path
from urllib import request
from box import ConfigBox
from src import logger

def download_file(config: ConfigBox):
    """Download a file from the source URL to the local data file path."""
    if not os.path.exists(config.local_data_file):
        filename, headers = request.urlretrieve(
            url=config.source_URL,
            filename=config.local_data_file
        )
        logger.info(f"{filename} downloaded! with the following info: \n{headers}")
    else:
        logger.info(f"File already exists of size: {get_size(Path(config.local_data_file))}")

def get_size(path: Path) -> str:
    """Get size of the file in KB."""
    size_in_kb = round(os.path.getsize(path) / 1024)
    return f"~ {size_in_kb} KB"

# Step 5: Download the file using the data ingestion configuration
lstm_data=download_file(data_ingestion_config)

[2024-09-24 15:29:06,009: INFO: 1022477913: artifacts/data_ingestion/Lstm_Dates.xlsx downloaded! with the following info: 
Server: GitHub.com
Date: Tue, 24 Sep 2024 09:59:03 GMT
Content-Type: text/html; charset=utf-8
Vary: X-PJAX, X-PJAX-Container, Turbo-Visit, Turbo-Frame, Accept-Encoding, Accept, X-Requested-With
ETag: W/"de174c521fef02f985587bb5608b2df5"
Cache-Control: max-age=0, private, must-revalidate
Strict-Transport-Security: max-age=31536000; includeSubdomains; preload
X-Frame-Options: deny
X-Content-Type-Options: nosniff
X-XSS-Protection: 0
Referrer-Policy: no-referrer-when-downgrade
Content-Security-Policy: default-src 'none'; base-uri 'self'; child-src github.com/assets-cdn/worker/ github.com/webpack/ github.com/assets/ gist.github.com/assets-cdn/worker/; connect-src 'self' uploads.github.com www.githubstatus.com collector.github.com raw.githubusercontent.com api.github.com github-cloud.s3.amazonaws.com github-production-repository-file-5c1aeb.s3.amazonaws.com github-produc

## Data Ingestion Pipeline

In [28]:
# src/pipeline.py
def data_ingestion_training_pipeline():
    """Runs the data ingestion pipeline."""
    try:
        # Load config and schema
        config , _ = initialize_configuration()

        # Create directories for artifacts root
        create_directories([config.artifacts_root])

        # Retrieve the data ingestion configuration from the loaded config
        data_ingestion_config = get_data_ingestion_config(config)

        # Create directories related to data ingestion (root directory)
        create_directories([data_ingestion_config.root_dir])

        # Download the data file as part of the data ingestion process
        download_file(data_ingestion_config)

    except Exception as e:
        logger.error(f"An error occurred during data ingestion: {e}")

data_ingestion_training_pipeline()

[2024-09-24 11:14:15,740: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-09-24 11:14:15,746: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-09-24 11:14:15,749: INFO: common: created directory at: artifacts]
[2024-09-24 11:14:15,751: INFO: common: created directory at: artifacts/data_ingestion]
[2024-09-24 11:14:16,769: INFO: 535566467: artifacts/data_ingestion/Lstm_Dates.xlsx downloaded! with the following info: 
Server: GitHub.com
Date: Tue, 24 Sep 2024 05:44:13 GMT
Content-Type: text/html; charset=utf-8
Vary: X-PJAX, X-PJAX-Container, Turbo-Visit, Turbo-Frame, Accept-Encoding, Accept, X-Requested-With
ETag: W/"0ea54ab76c6f6a52b6ce4af0fd724268"
Cache-Control: max-age=0, private, must-revalidate
Strict-Transport-Security: max-age=31536000; includeSubdomains; preload
X-Frame-Options: deny
X-Content-Type-Options: nosniff
X-XSS-Protection: 0
Referrer-Policy: no-referrer-when-downgrade
Content-Security-Policy: default-src 'none'; base-uri 'self'; ch