In [7]:
import pandas as pd
from pprint import pprint

In [None]:
def dim_location_dataframe(**dataframes) -> pd.DataFrame:
    """
    Create the location dimension DataFrame from the extracted raw address data.

    Parameters:
    -----------
    dataframes : dict
        Keyword arguments mapping source-table names to raw DataFrames.
        Must contain an 'address' DataFrame.

    Returns:
    --------
    pd.DataFrame
        A new DataFrame representing the location dimension with columns:
        'location_id' (renamed from 'address_id'), 'address_line_1',
        'address_line_2', 'district', 'city', 'postal_code', 'country',
        and 'phone'. Fully duplicated rows are dropped; the surviving rows
        keep their original index labels. The input DataFrame is not modified.

    Raises:
    -------
    ValueError
        If the 'address' table is missing or any error occurs during
        transformation (for example, a required column is absent). The
        underlying exception is attached as ``__cause__``.
    """

    required_keys = ["address"]

    for key in required_keys:
        if key not in dataframes:
            raise ValueError(f"Error: Missing required dataframe '{key}'.")

    address_df = dataframes["address"]

    # Columns carried over from the raw address table, in output order.
    location_columns = [
        "address_id",
        "address_line_1",
        "address_line_2",
        "district",
        "city",
        "postal_code",
        "country",
        "phone",
    ]

    try:
        # Chain non-mutating operations instead of renaming in place, so no
        # intermediate frame is modified after it has been created.
        return (
            address_df[location_columns]
            .drop_duplicates()
            .rename(columns={"address_id": "location_id"})
        )
    except Exception as e:
        # Keep the original ValueError contract, but chain the root cause so
        # the underlying pandas error stays visible in the traceback.
        raise ValueError(f"Error creating dim_location: {e}") from e

In [9]:
# Read the address seed file (parquet, path relative to this notebook's
# working directory) into a DataFrame for use by the cells below.
address_df = pd.read_parquet(
    "../sql_local_tests/seed_data/address_2022-11-3_14-20-49_962000.parquet"
)
# Prints the entire frame as plain text.
# NOTE(review): consider showing only `address_df.head()` to keep output short.
pprint(address_df)

    address_id           address_line_1    address_line_2         district  \
0            1          6826 Herzog Via              None             Avon   
1            2        179 Alexie Cliffs              None             None   
2            3         148 Sincere Fort              None             None   
3            4       6102 Rogahn Skyway              None     Bedfordshire   
4            5        34177 Upton Track              None             None   
5            6        846 Kailey Island              None             None   
6            7     75653 Ernestine Ways              None  Buckinghamshire   
7            8       0579 Durgan Common              None             None   
8            9        644 Edward Garden              None          Borders   
9           10        49967 Kaylah Flat  Tremaine Circles     Bedfordshire   
10          11      249 Bernier Mission              None  Buckinghamshire   
11          12  6461 Ernesto Expressway              None       

In [12]:
# Build the location dimension from the address frame loaded earlier.
# Depends on `address_df` and `dim_location_dataframe` already being defined
# in the kernel by the preceding cells.
dim_location_df = dim_location_dataframe(address=address_df)
# Prints the entire resulting frame as plain text.
pprint(dim_location_df)

    location_id           address_line_1    address_line_2         district  \
0             1          6826 Herzog Via              None             Avon   
1             2        179 Alexie Cliffs              None             None   
2             3         148 Sincere Fort              None             None   
3             4       6102 Rogahn Skyway              None     Bedfordshire   
4             5        34177 Upton Track              None             None   
5             6        846 Kailey Island              None             None   
6             7     75653 Ernestine Ways              None  Buckinghamshire   
7             8       0579 Durgan Common              None             None   
8             9        644 Edward Garden              None          Borders   
9            10        49967 Kaylah Flat  Tremaine Circles     Bedfordshire   
10           11      249 Bernier Mission              None  Buckinghamshire   
11           12  6461 Ernesto Expressway            