In [0]:
from pyspark.sql import Row
import datetime
import json
import os


In [0]:
%sql
-- Schema for pipeline bookkeeping; this notebook creates pipeline.bronze_stats in it.
-- IF NOT EXISTS keeps the cell safe to re-run.
CREATE SCHEMA IF NOT EXISTS pipeline

In [0]:
%sql
-- Delta table for per-file ingestion run statistics.
-- The column names match the keys that the ingestion loop in this notebook passes to
-- log_pipeline_stats(..., table_name="pipeline.bronze_stats"). That helper is defined in
-- another notebook; presumably it writes one row per call -- TODO confirm.
CREATE TABLE IF NOT EXISTS pipeline.bronze_stats (
    -- Pipeline stage label; the ingestion loop passes "ingestion".
    stage STRING,
    file_name STRING,
    -- Wall-clock marks taken before the read, after the read, and after the Delta write.
    start_time TIMESTAMP,
    read_end_time TIMESTAMP,
    end_time TIMESTAMP,
    -- Source size summed from dbutils.fs.ls, divided by 1024*1024 and rounded to two decimals.
    file_size_mb DOUBLE,
    num_records_ingested LONG,
    -- (read_end_time - start_time) and (end_time - read_end_time), in seconds.
    read_duration_seconds DOUBLE,
    write_duration_seconds DOUBLE,
    -- error_message and timestamp are only supplied by the loop's failure path.
    error_message STRING,
    timestamp TIMESTAMP
) USING DELTA;

In [0]:
import json
import logging
from pathlib import Path

# Root logger setup: INFO and above, emitted through a stream handler, with
# timestamp, level and originating module in front of every message.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s - %(levelname)s - %(module)s - %(message)s",
    level=logging.INFO,
)

def load_ingestion_config(file_path: str) -> dict:
    """
    Load the ingestion configuration from a JSON file.

    Args:
        file_path: Path to the JSON configuration file.

    Returns:
        The parsed JSON document exactly as produced by ``json.load``. The
        ingestion loop in this notebook iterates over the result and calls
        ``.get`` on each element, so it appears to be a list of per-file
        mappings despite the ``dict`` annotation.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
        ValueError: If the file content is not valid JSON; chained from the
            underlying ``json.JSONDecodeError``.
    """
    config_path = Path(file_path)

    # Validate file existence. The failure is logged exactly once here; the
    # exception is not caught and re-logged further down.
    if not config_path.exists():
        error_message = f"File not found at path: {file_path}"
        logging.error(error_message)
        raise FileNotFoundError(error_message)

    logging.info(f"Loading ingestion configuration from file: {file_path}")
    try:
        # JSON interchange text is UTF-8; do not depend on the locale default.
        with config_path.open("r", encoding="utf-8") as f:
            config = json.load(f)
    except json.JSONDecodeError as json_error:
        error_message = f"Invalid JSON in file: {file_path}. Details: {json_error}"
        logging.error(error_message)
        raise ValueError(error_message) from json_error

    logging.info("Ingestion configuration loaded successfully.")
    return config

if __name__ == "__main__":
    # Workspace location of the JSON file that lists the files to ingest.
    json_file_path = "/Workspace/Users/tesfamit03@gmail.com/code/config/files_to_ingest.json"
    try:
        # `files_to_ingest` is read by the ingestion loop later in this notebook.
        files_to_ingest = load_ingestion_config(json_file_path)
        logging.info(f"Ingestion configuration: {files_to_ingest}")
    except Exception as e:
        logging.error(f"Failed to load ingestion configuration. Error: {e}")
        # Stop here instead of swallowing the error: otherwise `files_to_ingest`
        # stays undefined (or stale from an earlier run) and the ingestion cell
        # fails later with an unrelated NameError or silently uses old state.
        raise

In [0]:
%run /Workspace/Users/tesfamit03@gmail.com/code/utils/logging_utils

In [0]:
%sql
-- Schema for the ingested Delta tables; the recorded output of this notebook
-- lists targets such as bronze.clubs. IF NOT EXISTS keeps the cell safe to re-run.
CREATE SCHEMA IF NOT EXISTS bronze

In [0]:
import logging
import datetime

# Configure structured logging.
# logging.basicConfig does nothing when the root logger already has handlers,
# so this repeat of the notebook's earlier identical call is normally a no-op.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(module)s - %(message)s",
    handlers=[logging.StreamHandler()]
)

# Ingest every configured file into its Delta table and record run statistics.
# Each entry of `files_to_ingest` is read with .get() for "file_name",
# "file_path" and "table_name". The first failure aborts the loop by raising
# RuntimeError, chained from the underlying error.
for file_info in files_to_ingest:
    file_name = file_info.get("file_name")
    file_path = file_info.get("file_path")
    table_name = file_info.get("table_name")

    try:
        # Validate inputs. Raised inside the try so the failure is logged once
        # and recorded by the handler below.
        if not all([file_name, file_path, table_name]):
            raise ValueError(
                f"Missing required fields for file: {file_name}. File info: {file_info}"
            )

        logging.info(f"Starting ingestion process for file: {file_name} into table: {table_name}")

        # Start time
        start_time = datetime.datetime.now()

        # Read the file. Spark reads are lazy, so the interval up to
        # read_end_time covers building the DataFrame (including the schema
        # inference pass), not a full materialisation of the data.
        logging.info(f"Reading file: {file_path}")
        df = spark.read.format("csv") \
            .option("header", True) \
            .option("inferSchema", True) \
            .load(file_path)

        read_end_time = datetime.datetime.now()

        # Write to the Bronze Layer, replacing any previous table contents.
        logging.info(f"Writing file {file_name} to Delta table: {table_name}")
        df.write.format("delta") \
            .mode("overwrite") \
            .option("mergeSchema", "true") \
            .saveAsTable(table_name)

        end_time = datetime.datetime.now()

        # Calculate file size in MiB from the listing of the source path.
        file_size_mb = round(sum([file.size for file in dbutils.fs.ls(file_path)]) / (1024 * 1024), 2)

        # `df` is still the lazy CSV reader, so this count scans the source
        # file again after the write has finished.
        num_records_ingested = df.count()

        # Log metadata
        log_pipeline_stats(
            stage="ingestion",
            stats={
                "file_name": file_name,
                "start_time": start_time,
                "read_end_time": read_end_time,
                "end_time": end_time,
                "file_size_mb": file_size_mb,
                "num_records_ingested": num_records_ingested,
                "read_duration_seconds": (read_end_time - start_time).total_seconds(),
                "write_duration_seconds": (end_time - read_end_time).total_seconds()
            },
            table_name="pipeline.bronze_stats"
        )

        logging.info(f"Successfully ingested {file_name} into {table_name}.")
        print(f"Successfully ingested {file_name} into {table_name}.")
    except Exception as e:
        # Log the failure once; the message already carries the error details.
        error_message = f"Failed to ingest file: {file_name}. Error: {str(e)}"
        logging.error(error_message)

        # Recording the failure is best-effort: if the stats write itself fails,
        # it must not replace the ingestion error that is raised below.
        try:
            log_pipeline_stats(
                stage="ingestion",
                stats={
                    "file_name": file_name,
                    "error_message": str(e),
                    # NOTE(review): the success path passes datetime objects while
                    # this passes an ISO string; confirm log_pipeline_stats accepts both.
                    "timestamp": datetime.datetime.now().isoformat()
                },
                table_name="pipeline.bronze_stats"
            )
        except Exception as stats_error:
            logging.error(
                f"Could not record failure stats for file: {file_name}. Error: {stats_error}"
            )

        raise RuntimeError(error_message) from e

Successfully ingested appearance.csv into bronze.appearance.
Successfully ingested club_games.csv into bronze.club_games.
Successfully ingested clubs.csv into bronze.clubs.
Successfully ingested competitions.csv into bronze.competitions.
Successfully ingested game_events.csv into bronze.game_events.
Successfully ingested game_lineups.csv into bronze.game_lineups.
Successfully ingested games.csv into bronze.games.
Successfully ingested players.csv into bronze.players.
Successfully ingested player_valuations.csv into bronze.player_valuations.
Successfully ingested transfers.csv into bronze.transfers.
