Connecting to the Database 

In [None]:
import sqlalchemy as sa

# Build the connection string from environment variables so that no
# credentials are stored in the notebook itself.
#   LAYEREDDB_USER      database user name (required)
#   LAYEREDDB_PASSWORD  database password  (required)
# A missing variable raises KeyError naming the variable that is not set.
import os
from urllib.parse import quote_plus

# quote_plus escapes characters such as '@', ':' or '/' that would otherwise
# break the URL syntax when they appear in a user name or password.
db_user = quote_plus(os.environ["LAYEREDDB_USER"])
db_password = quote_plus(os.environ["LAYEREDDB_PASSWORD"])
db_uri = f"postgresql+psycopg2://{db_user}:{db_password}@localhost:5433/layereddb"

# Create the Engine object; pool_pre_ping checks a pooled connection before use
engine = sa.create_engine(db_uri, pool_pre_ping=True)

print("✅ Connection engine for the local 'layereddb' is ready.")

✅ Connection engine for the local 'layereddb' is ready.


Checking existing schemas

In [27]:
from sqlalchemy import inspect
# Build an Inspector on top of the engine to query database metadata
inspector = inspect(engine)

# Ask the inspector for the list of schema names
schemas = inspector.get_schema_names()

# Heading and list are written on separate lines
print("Available schemas in the database:", schemas, sep="\n")

Available schemas in the database:
['berlin_labels', 'berlin_recommender', 'berlin_source_data', 'dashboard_data', 'information_schema', 'public']


Creating a table 'post_offices' in 'berlin_source_data' schema

In [28]:
from sqlalchemy import text

# DDL script: drop any previous version of the table, then recreate it.
# Both statements are sent to the server in a single execute() call.
create_table_sql = """
DROP TABLE IF EXISTS berlin_source_data.post_offices;
CREATE TABLE berlin_source_data.post_offices (
    id VARCHAR(20) PRIMARY KEY,
    district_id VARCHAR(20),
    neighborhood_id VARCHAR(20),
    zip_code VARCHAR(10),
    city VARCHAR(20),
    street VARCHAR(200),
    house_no VARCHAR(20),
    location_type VARCHAR(200),
    location_name VARCHAR(200),
    closure_periods VARCHAR(400),
    opening_hours VARCHAR(400),
    latitude DECIMAL(9,6),
    longitude DECIMAL(9,6)
);
"""

# Run the script on a connection checked out from the engine
try:
    with engine.connect() as conn:
        conn.execute(text(create_table_sql))
        # Explicit commit: without it the DDL is rolled back when the connection closes
        conn.commit()
except Exception as exc:
    print(f"An error occurred during table creation: {exc}")
else:
    print("Table 'post_offices' created successfully using the SQLAlchemy engine.")

Table 'post_offices' created successfully using the SQLAlchemy engine.


Preparing the data for upload

In [29]:
import pandas as pd
from pathlib import Path
import io

# --- 1. Read the cleaned post-office CSV into a DataFrame ---
file_to_load = Path('../clean/deutschepost_clean_with_distr.csv')
df = pd.read_csv(file_to_load)

# --- 2. Column order, matching the column order of the CREATE TABLE statement ---
sql_column_order = [
    'id', 'district_id', 'neighborhood_id',
    'zip_code', 'city', 'street', 'house_no',
    'location_type', 'location_name',
    'closure_periods', 'opening_hours',
    'latitude', 'longitude',
]

# --- 3. Select exactly those columns, in that order, for the upload ---
# A column missing from the CSV raises KeyError here.
df_for_upload = df.loc[:, sql_column_order]

print("DataFrame has been prepared for upload with the correct column order.")
df_for_upload.info()

DataFrame has been prepared for upload with the correct column order.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               244 non-null    int64  
 1   district_id      244 non-null    int64  
 2   neighborhood_id  244 non-null    int64  
 3   zip_code         244 non-null    int64  
 4   city             244 non-null    object 
 5   street           244 non-null    object 
 6   house_no         244 non-null    object 
 7   location_type    244 non-null    object 
 8   location_name    234 non-null    object 
 9   closure_periods  244 non-null    object 
 10  opening_hours    244 non-null    object 
 11  latitude         244 non-null    float64
 12  longitude        244 non-null    float64
dtypes: float64(2), int64(4), object(7)
memory usage: 24.9+ KB


Insert Data into Table

In [30]:
# Bulk-load df_for_upload into berlin_source_data.post_offices with COPY,
# using the low-level DBAPI connection and cursor behind the engine.
raw_conn = None
cursor = None
try:
    # Get a single, low-level connection from the engine
    raw_conn = engine.raw_connection()
    cursor = raw_conn.cursor()

    # --- Serialise the DataFrame to an in-memory CSV ---
    # The header row is written because the COPY command below uses HEADER TRUE,
    # which makes the server skip the first line.
    buffer = io.StringIO()
    df_for_upload.to_csv(buffer, index=False, header=True)
    buffer.seek(0)

    # Name the target columns explicitly so the load does not depend on the
    # physical column order of the table. Identifiers are double-quoted, with
    # embedded quotes doubled.
    column_list = ", ".join(
        '"' + str(col).replace('"', '""') + '"' for col in df_for_upload.columns
    )

    # The table is schema-qualified instead of issuing "SET search_path":
    # this connection goes back to the pool on close(), and a committed SET
    # would stay in effect for whoever checks the connection out next.
    # FORMAT CSV makes COPY honour quoting, so commas inside values are safe.
    copy_sql = f"""
        COPY berlin_source_data.post_offices ({column_list}) FROM STDIN WITH
            (FORMAT CSV, HEADER TRUE)
    """
    cursor.copy_expert(sql=copy_sql, file=buffer)

    # Commit the entire transaction
    raw_conn.commit()
    print(f"✅ Transaction committed successfully. {len(df_for_upload)} rows were copied.")

except Exception as e:
    print(f"❌ An error occurred: {e}")
    if raw_conn:
        raw_conn.rollback()
finally:
    # Release the cursor first, then hand the connection back to the pool
    if cursor is not None:
        cursor.close()
    if raw_conn:
        raw_conn.close()

✅ Transaction committed successfully. 244 rows were copied.


Adding the Foreign Key Constraint

In [31]:
# DDL linking post_offices.district_id to districts.district_id.
# ON DELETE RESTRICT / ON UPDATE CASCADE are set on the constraint.
add_foreign_key_sql = """
ALTER TABLE berlin_source_data.post_offices
ADD CONSTRAINT district_id_fk FOREIGN KEY (district_id)
REFERENCES berlin_source_data.districts(district_id)
ON DELETE RESTRICT
ON UPDATE CASCADE;
"""

# Apply the constraint on a connection from the engine and commit it
try:
    with engine.connect() as conn:
        conn.execute(text(add_foreign_key_sql))
        conn.commit()
except Exception as exc:
    print(f"An error occurred while adding the foreign key: {exc}")
else:
    print("✅ Foreign key constraint 'district_id_fk' added successfully.")

✅ Foreign key constraint 'district_id_fk' added successfully.


In [32]:
# Sanity check: read a five-row sample back from the newly loaded table
try:
    check_df = pd.read_sql(
        sql="SELECT * FROM berlin_source_data.post_offices LIMIT 5",
        con=engine,
    )
except Exception as exc:
    print(f"An error occurred during verification: {exc}")
else:
    print("--- Verification: First 5 rows from the database ---")
    print(check_df)

--- Verification: First 5 rows from the database ---
        id district_id neighborhood_id zip_code    city                street  \
0  4340626    11001001             101    10178  Berlin        Spandauer Str.   
1     6730    11001001             101    10178  Berlin           Rathausstr.   
2  4307374    11001001             101    10178  Berlin  Karl-Liebknecht-Str.   
3  4125530    11001001             101    10179  Berlin            Grunerstr.   
4  4326999    11001001             101    10179  Berlin           Brückenstr.   

  house_no            location_type                        location_name  \
0        2            RETAIL_OUTLET                            City Shop   
1        5  POSTBANK_FINANCE_CENTER                     Postbank Filiale   
2       13            RETAIL_OUTLET                     Lotto Post Tabak   
3       20            RETAIL_OUTLET  GECO im ALEXA, Untergeschoss/Baseme   
4       1a            RETAIL_OUTLET              Lotto-Post-Schreibwaren   

   

Final Validation via SQL

This section executes several SQL queries directly against the database to perform final validation on the newly loaded post_offices table. These checks verify the total row count, ensure all coordinates fall within the expected geographical boundaries, and confirm the referential integrity of district_id by comparing the IDs in the main table against the reference districts table.

In [33]:
# --- Query 1: number of rows now stored in post_offices ---
query1 = "SELECT COUNT(*) AS total_rows FROM berlin_source_data.post_offices;"
df1 = pd.read_sql(sql=query1, con=engine)
print("--- 1. Total row count in 'post_offices' ---", df1, sep="\n")

--- 1. Total row count in 'post_offices' ---
   total_rows
0         244


In [34]:
# --- Query 2: Count of locations with coordinates outside Berlin ---
# The bounding box is latitude 52.3..52.7 and longitude 13.0..13.8.
# "IS NOT TRUE" is used instead of "NOT (...)": for a row with a NULL
# latitude or longitude the box test evaluates to NULL, and "NOT NULL" is
# still NULL, so such a row would be silently left out of the count. With
# "IS NOT TRUE" rows with missing coordinates are reported as outliers too.
query2 = """
    SELECT COUNT(*) AS outliers
    FROM berlin_source_data.post_offices
    WHERE (latitude BETWEEN 52.3 AND 52.7
           AND longitude BETWEEN 13.0 AND 13.8) IS NOT TRUE;
"""
df2 = pd.read_sql(query2, engine)
print("\n--- 2. Count of locations outside Berlin's bounding box ---")
print(df2)


--- 2. Count of locations outside Berlin's bounding box ---
   outliers
0         0


In [35]:
# --- Query 3: every district_id that occurs in post_offices, sorted ---
query3 = "SELECT DISTINCT district_id FROM berlin_source_data.post_offices ORDER BY 1;"
df3 = pd.read_sql(sql=query3, con=engine)
print("\n--- 3. Distinct district_id's found in the post_offices table ---", df3, sep="\n")


--- 3. Distinct district_id's found in the post_offices table ---
   district_id
0     11001001
1     11002002
2     11003003
3     11004004
4     11005005
5     11006006
6     11007007
7     11008008
8     11009009
9     11010010
10    11011011
11    11012012


In [36]:
# --- Query 4: every district_id in the districts lookup table, sorted ---
# Meant to be compared by eye with the result of query 3.
query4 = "SELECT DISTINCT district_id FROM berlin_source_data.districts ORDER BY 1;"
df4 = pd.read_sql(sql=query4, con=engine)
print("\n--- 4. Distinct district_id's from the reference 'districts' table ---", df4, sep="\n")


--- 4. Distinct district_id's from the reference 'districts' table ---
   district_id
0     11001001
1     11002002
2     11003003
3     11004004
4     11005005
5     11006006
6     11007007
7     11008008
8     11009009
9     11010010
10    11011011
11    11012012


In [37]:
# --- Query 5: primary key uniqueness ---
# The number of distinct ids should equal the total row count from query 1.
query5 = "SELECT COUNT (DISTINCT id) FROM berlin_source_data.post_offices;"
df5 = pd.read_sql(sql=query5, con=engine)
print("\n--- 5. Number of distinct id in the post_offices table ---", df5, sep="\n")


--- 5. Number of distinct id in the post_offices table ---
   count
0    244


In [None]:
# Query 6: post offices whose district_id has no counterpart in districts.
# The LEFT JOIN keeps every post office; rows where the districts side is
# NULL are the ones without a match. An empty result means all ids are valid.
query6 = (
    "SELECT po.id, po.district_id "
    "FROM berlin_source_data.post_offices po "
    "LEFT JOIN berlin_source_data.districts d ON po.district_id = d.district_id "
    "WHERE d.district_id IS NULL; "
    "-- This finds rows in post_offices that have no match in districts"
)
df6 = pd.read_sql(sql=query6, con=engine)
print(
    "\n--- 6. Post offices with an invalid district_id (no match in the districts table) ---",
    df6,
    sep="\n",
)


--- 6. Post offices with an invalid district_id (no match in the districts table) ---
Empty DataFrame
Columns: [id, district_id]
Index: []


In [None]:
# --- 1. Define the SQL query to get the table schema ---
# Reads name, data type and nullability of every column of
# berlin_source_data.post_offices from information_schema.columns.
# ORDER BY ordinal_position lists the columns in table-definition order;
# without it the row order of the result is unspecified, which makes
# before/after comparisons of this report unreliable.
query_schema = """
SELECT
    column_name,
    data_type,
    is_nullable
FROM
    information_schema.columns
WHERE
    table_schema = 'berlin_source_data' AND table_name = 'post_offices'
ORDER BY
    ordinal_position;
"""

# --- 2. Execute the query and print the result ---
try:
    print("\n--- Checking the schema of the 'post_offices' table ---")

    # Execute the query and load the result into a DataFrame
    df_schema = pd.read_sql(query_schema, engine)

    # to_string() prints every row and column without truncation
    print(df_schema.to_string())

except Exception as e:
    print(f"\n❌ An error occurred while executing the query: {e}")


--- Checking the schema of the 'post_offices' table ---
        column_name          data_type is_nullable
0                id  character varying          NO
1       district_id  character varying         YES
2   neighborhood_id  character varying         YES
3          zip_code  character varying         YES
4              city  character varying         YES
5            street  character varying         YES
6          house_no  character varying         YES
7     location_type  character varying         YES
8     location_name  character varying         YES
9   closure_periods  character varying         YES
10    opening_hours  character varying         YES
11         latitude            numeric         YES
12        longitude            numeric         YES


Add NOT NULL constraints to the required columns of the table. The columns location_name, closure_periods, and opening_hours remain nullable.

In [None]:
# --- Columns that must always hold a value ---
_required_columns = (
    "district_id",
    "neighborhood_id",
    "zip_code",
    "city",
    "street",
    "house_no",
    "location_type",
    "latitude",
    "longitude",
)

# --- One "ALTER COLUMN ... SET NOT NULL" statement per required column ---
sql_commands = [
    f"ALTER TABLE berlin_source_data.post_offices ALTER COLUMN {column} SET NOT NULL;"
    for column in _required_columns
]

# --- Run all statements inside a single transaction ---
print("\n--- Applying NOT NULL constraints  ---")
try:
    with engine.connect() as conn:
        # The begin() block commits on normal exit and rolls back on an exception
        with conn.begin():
            for command in sql_commands:
                print(f"Executing: {command.strip()}")
                conn.execute(text(command))

        print("\n✅ Constraints applied successfully.")

except Exception as exc:
    print(f"\n❌ An error occurred while applying constraints: {exc}")
    print("\nNOTE: This error can occur if one of the columns already contains NULL values.")


--- Applying NOT NULL constraints  ---
Executing: ALTER TABLE berlin_source_data.post_offices ALTER COLUMN district_id SET NOT NULL;
Executing: ALTER TABLE berlin_source_data.post_offices ALTER COLUMN neighborhood_id SET NOT NULL;
Executing: ALTER TABLE berlin_source_data.post_offices ALTER COLUMN zip_code SET NOT NULL;
Executing: ALTER TABLE berlin_source_data.post_offices ALTER COLUMN city SET NOT NULL;
Executing: ALTER TABLE berlin_source_data.post_offices ALTER COLUMN street SET NOT NULL;
Executing: ALTER TABLE berlin_source_data.post_offices ALTER COLUMN house_no SET NOT NULL;
Executing: ALTER TABLE berlin_source_data.post_offices ALTER COLUMN location_type SET NOT NULL;
Executing: ALTER TABLE berlin_source_data.post_offices ALTER COLUMN latitude SET NOT NULL;
Executing: ALTER TABLE berlin_source_data.post_offices ALTER COLUMN longitude SET NOT NULL;

✅ Constraints applied successfully.


Final check

In [45]:
# Re-run the schema report; relies on query_schema defined in an earlier cell
try:
    print("\n--- Checking the schema of the 'post_offices' table ---")

    # Fetch the column metadata into a DataFrame
    df_schema = pd.read_sql(sql=query_schema, con=engine)

    # to_string() renders the complete frame without truncation
    print(df_schema.to_string())

except Exception as exc:
    print(f"\n❌ An error occurred while executing the query: {exc}")


--- Checking the schema of the 'post_offices' table ---
        column_name          data_type is_nullable
0                id  character varying          NO
1       district_id  character varying          NO
2   neighborhood_id  character varying          NO
3          zip_code  character varying          NO
4              city  character varying          NO
5            street  character varying          NO
6          house_no  character varying          NO
7     location_type  character varying          NO
8     location_name  character varying         YES
9   closure_periods  character varying         YES
10    opening_hours  character varying         YES
11         latitude            numeric          NO
12        longitude            numeric          NO
