In [None]:
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Data Loader for ETL Pipeline
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# This script is responsible for loading transformed data into SQL tables in the Load phase of our ETL pipeline.
# It handles the bulk insertion of data while ensuring data integrity through careful transaction management and dependency handling.

# Key Features:
# 1. **Bulk Insertion**:
#    - Uses the `bulk_insert` function to insert DataFrame rows into SQL tables efficiently.
#    - Dynamically generates SQL `INSERT` statements based on DataFrame columns.
#
# 2. **Transaction Management**:
#    - Wraps all insertions in a single transaction, ensuring atomicity (all-or-nothing behavior).
#    - Rolls back changes if any insertion fails, preserving database integrity.
#
# 3. **Dependency-Aware Insertion**:
#    - Inserts data in the correct order using the `insertion_order` list to respect foreign key relationships.
#
# 4. **Logging**:
#    - Provides detailed logs of the insertion process, including start and end times, per-table progress, and error reporting.
#
# 5. **Scalable Design**:
#    - Can handle large datasets using efficient bulk operations and supports a variety of table schemas.

# Notes:
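# - The script assumes that `insertion_data` maps each table name to a dict whose "dataframe" entry is a pre-validated DataFrame,
#   and that both `insertion_order` and `insertion_data` are already defined before this script runs.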
# - Requires a valid `.env` file with the database connection string defined as `CONNECT_STR`.


In [None]:
import os
import logging
import pandas as pd
import json
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError, IntegrityError
from dotenv import load_dotenv
from datetime import datetime
import time

# Pull settings from the .env file into the process environment
load_dotenv()

# Send INFO-and-above records to both the console and a log file
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(
    format=LOG_FORMAT,
    level=logging.INFO,
    handlers=[logging.StreamHandler(), logging.FileHandler("api_script.log")],
)

# Database connection string (None when CONNECT_STR is not defined)
connection_string = os.environ.get("CONNECT_STR")



# Define the bulk insert function
def bulk_insert(table_name, df, connection):
    """
    Insert every row of a DataFrame into a SQL table with one parameterized statement.

    Parameters:
    ----------
    table_name : str
        The name of the SQL table to insert data into. It is interpolated directly into the
        SQL text, so it must come from trusted code, never from user input.
    df : pandas.DataFrame
        The DataFrame containing the data to be inserted. Each column name is used both as the
        target column in the SQL table and as the bind-parameter name.
    connection : sqlalchemy.engine.Connection
        The active database connection for executing the bulk insert.

    Raises:
    ------
    SQLAlchemyError
        Logged and re-raised if the SQL execution fails, so the caller's transaction
        handling can roll back.

    Notes:
    ------
    A single `INSERT` statement with one named placeholder per DataFrame column is built and
    executed with the list of row dictionaries. SQLAlchemy runs this through the driver's
    `executemany`, which is considerably faster than issuing one statement per row.

    An empty DataFrame is skipped: executing the statement with an empty parameter list would
    leave the named placeholders without values and fail.
    """
    if df.empty:
        logging.info("No rows to insert into %s; skipping.", table_name)
        return

    column_list = ", ".join(df.columns)
    placeholder_list = ", ".join(f":{col}" for col in df.columns)
    insert_sql = f"INSERT INTO {table_name} ({column_list}) VALUES ({placeholder_list})"

    # NOTE(review): NaN/NaT values are passed to the driver unchanged; confirm they were
    # converted to None upstream if they are expected to become SQL NULLs.
    rows = df.to_dict(orient="records")

    try:
        # A list of dicts makes SQLAlchemy execute the statement once per row via executemany
        connection.execute(text(insert_sql), rows)
        logging.info("Inserted %d rows into %s.", len(rows), table_name)
    except SQLAlchemyError as e:
        logging.error("Error during bulk insert for %s: %s", table_name, e)
        raise  # Re-raise the exception for main transaction handling

# Start logging the data insertion process
start_time = datetime.now()
logging.info(f"Starting data insertion process at {start_time}.")

# Fail early with a clear message instead of an opaque create_engine() error
# when the connection string is missing from the environment.
if not connection_string:
    raise RuntimeError("CONNECT_STR is not set; define it in the environment or the .env file.")

# Establish database connection and begin one transaction covering all insertions
engine = create_engine(connection_string)
with engine.connect() as conn:
    transaction = conn.begin()
    try:
        # Insert tables in the order given by insertion_order (foreign-key dependencies)
        for table_name in insertion_order:
            if table_name not in insertion_data:
                logging.warning(f"No data found for table '{table_name}'; skipping.")
                continue

            logging.info(f"Starting insertion for table '{table_name}'.")
            table_start = time.perf_counter()
            bulk_insert(table_name, insertion_data[table_name]["dataframe"], conn)
            elapsed = time.perf_counter() - table_start
            logging.info(f"Insertion completed for table '{table_name}' in {elapsed:.2f} seconds.")

        # Commit all insertions as a single transaction
        transaction.commit()
        logging.info("All data insertions committed successfully.")

    except Exception as e:
        # Catch every failure, not only SQLAlchemyError: a KeyError or TypeError raised while
        # preparing a table must also roll back and be reported, never logged as a success.
        logging.error(f"Data insertion process failed, rolling back transaction: {e}")
        if transaction.is_active:
            transaction.rollback()
        raise  # Reraise the exception for further handling

# Only reached when the load succeeded (failures are re-raised above)
end_time = datetime.now()
logging.info(f"Pipeline load process completed at {end_time}. Duration: {end_time - start_time}.")