Connecting to the Database 

In [None]:
import sqlalchemy as sa
import io

import os

# Credentials are read from the environment so they never have to be typed
# into (or scrubbed out of) the notebook. If a variable is unset, the original
# placeholder is kept, so authentication fails until real values are supplied.
db_user = os.environ.get("NEON_DB_USER", "<USERNAME>")
db_password = os.environ.get("NEON_DB_PASSWORD", "<PASSWORD>")

# Assemble the URL from its parts instead of formatting a string by hand:
# URL.create escapes characters such as '@', ':' or '/' in the password that
# would otherwise corrupt the connection string.
db_url = sa.engine.URL.create(
    drivername="postgresql+psycopg2",
    username=db_user,
    password=db_password,
    host="ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech",
    port=5432,
    database="neondb",
    query={"sslmode": "require"},
)

# Keep `db_uri` a plain string, as before. It contains the password, so do not
# print or display it.
db_uri = db_url.render_as_string(hide_password=False)

# Create the Engine object; pool_pre_ping tests each pooled connection before
# use so stale connections are replaced transparently.
engine = sa.create_engine(db_uri, pool_pre_ping=True)

Checking existing schemas

In [2]:
from sqlalchemy import inspect
# Build a schema inspector bound to the engine
inspector = inspect(engine)

# Ask the database which schemas exist
schemas = inspector.get_schema_names()

# Heading line first, then the list of schema names on the next line
print("Available schemas in the database:", schemas, sep="\n")

Available schemas in the database:
['dependency_example', 'information_schema', 'nyc_schools', 'public', 'test_berlin_data']


Creating the table 'post_offices_test' in the 'test_berlin_data' schema

In [3]:
from sqlalchemy import text

# DDL for the target table. Any previous copy is dropped first, so this cell
# can be re-run and always starts from an empty table.
create_table_sql = """
DROP TABLE IF EXISTS test_berlin_data.post_offices_test;
CREATE TABLE test_berlin_data.post_offices_test (
    id VARCHAR(20) PRIMARY KEY,
    district_id VARCHAR(20),
    neighborhood_id VARCHAR(20),
    zip_code VARCHAR(10),
    city VARCHAR(20),
    street VARCHAR(200),
    house_no VARCHAR(20),
    location_type VARCHAR(200),
    location_name VARCHAR(200),
    closure_periods VARCHAR(400),
    opening_hours VARCHAR(400),
    latitude DECIMAL(9,6),
    longitude DECIMAL(9,6)
);
"""

# Run the DDL on a connection borrowed from the engine. The explicit commit
# makes the change permanent; any failure is reported instead of raised so the
# notebook keeps running.
try:
    with engine.connect() as conn:
        conn.execute(text(create_table_sql))
        conn.commit()
except Exception as exc:
    print(f"An error occurred during table creation: {exc}")
else:
    print("Table 'post_offices_test' created successfully using the SQLAlchemy engine.")

Table 'post_offices_test' created successfully using the SQLAlchemy engine.


Preparing the data for upload

In [4]:
import pandas as pd
from pathlib import Path
import io

# --- 1. Read the cleaned CSV into a DataFrame ---
file_to_load = Path('../clean/deutschepost_clean_with_distr.csv')
df = pd.read_csv(file_to_load)

# --- 2. Column order, mirroring the CREATE TABLE statement ---
sql_column_order = [
    'id', 'district_id', 'neighborhood_id',
    'zip_code', 'city', 'street', 'house_no',
    'location_type', 'location_name',
    'closure_periods', 'opening_hours',
    'latitude', 'longitude',
]

# --- 3. Select exactly those columns, in that order, for the upload ---
df_for_upload = df.loc[:, sql_column_order]

print("DataFrame has been prepared for upload with the correct column order.")
df_for_upload.info()

DataFrame has been prepared for upload with the correct column order.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               244 non-null    int64  
 1   district_id      244 non-null    int64  
 2   neighborhood_id  244 non-null    int64  
 3   zip_code         244 non-null    int64  
 4   city             244 non-null    object 
 5   street           244 non-null    object 
 6   house_no         244 non-null    object 
 7   location_type    244 non-null    object 
 8   location_name    234 non-null    object 
 9   closure_periods  244 non-null    object 
 10  opening_hours    244 non-null    object 
 11  latitude         244 non-null    float64
 12  longitude        244 non-null    float64
dtypes: float64(2), int64(4), object(7)
memory usage: 24.9+ KB


Insert Data into Table

In [5]:
# Bulk-load df_for_upload into test_berlin_data.post_offices_test with COPY.
#
# The target table is schema-qualified, like every other statement in this
# notebook, instead of issuing `SET search_path`. A committed SET is
# session-level, so it would stay on the connection after it is handed back
# to the engine's pool and affect whoever borrows it next.
raw_conn = None
try:
    # COPY ... FROM STDIN needs the driver-level cursor API, so borrow one
    # low-level DBAPI connection from the engine.
    raw_conn = engine.raw_connection()

    # Serialise the frame to in-memory CSV text. The header row is written
    # because the COPY command below is told to skip one (HEADER TRUE).
    buffer = io.StringIO()
    df_for_upload.to_csv(buffer, index=False, header=True)
    buffer.seek(0)

    # Name the destination columns explicitly, in the same order as the CSV,
    # so the load does not depend on the table's physical column order.
    column_list = ", ".join(f'"{name}"' for name in df_for_upload.columns)

    # FORMAT CSV makes COPY honour quoting, so commas inside values are safe.
    copy_sql = f"""
        COPY test_berlin_data.post_offices_test ({column_list}) FROM STDIN WITH
            (FORMAT CSV, HEADER TRUE)
    """

    # The context manager closes the cursor even if the COPY fails.
    with raw_conn.cursor() as cursor:
        cursor.copy_expert(sql=copy_sql, file=buffer)

    # Commit the entire transaction
    raw_conn.commit()
    print(f"✅ Transaction committed successfully. {len(df_for_upload)} rows were copied.")

except Exception as e:
    # Report the failure and undo any partial work; the notebook keeps running.
    print(f"❌ An error occurred: {e}")
    if raw_conn:
        raw_conn.rollback()
finally:
    # Always hand the connection back to the pool.
    if raw_conn:
        raw_conn.close()

✅ Transaction committed successfully. 244 rows were copied.


Adding the Foreign Key Constraint

In [6]:
# DDL that links post_offices_test.district_id to the districts table.
# Referenced districts cannot be deleted (RESTRICT); updates to their key are
# propagated to this table (CASCADE).
add_foreign_key_sql = """
ALTER TABLE test_berlin_data.post_offices_test
ADD CONSTRAINT district_id_fk FOREIGN KEY (district_id)
REFERENCES test_berlin_data.districts(district_id)
ON DELETE RESTRICT
ON UPDATE CASCADE;
"""

# Apply the constraint on a connection borrowed from the engine and commit it;
# a failure is reported rather than raised.
try:
    with engine.connect() as conn:
        conn.execute(text(add_foreign_key_sql))
        conn.commit()
except Exception as exc:
    print(f"An error occurred while adding the foreign key: {exc}")
else:
    print("✅ Foreign key constraint 'district_id_fk' added successfully.")

✅ Foreign key constraint 'district_id_fk' added successfully.


In [7]:
# Sanity check: pull five rows back out of the freshly loaded table
try:
    check_df = pd.read_sql(
        sql="SELECT * FROM test_berlin_data.post_offices_test LIMIT 5",
        con=engine,
    )
except Exception as exc:
    print(f"An error occurred during verification: {exc}")
else:
    print("--- Verification: First 5 rows from the database ---")
    print(check_df)

--- Verification: First 5 rows from the database ---
        id district_id neighborhood_id zip_code    city                street  \
0  4340626    11001001             101    10178  Berlin        Spandauer Str.   
1     6730    11001001             101    10178  Berlin           Rathausstr.   
2  4307374    11001001             101    10178  Berlin  Karl-Liebknecht-Str.   
3  4125530    11001001             101    10179  Berlin            Grunerstr.   
4  4326999    11001001             101    10179  Berlin           Brückenstr.   

  house_no            location_type                        location_name  \
0        2            RETAIL_OUTLET                            City Shop   
1        5  POSTBANK_FINANCE_CENTER                     Postbank Filiale   
2       13            RETAIL_OUTLET                     Lotto Post Tabak   
3       20            RETAIL_OUTLET  GECO im ALEXA, Untergeschoss/Baseme   
4       1a            RETAIL_OUTLET              Lotto-Post-Schreibwaren   

   

Final Validation via SQL

This section executes several SQL queries directly against the database to perform final validation on the newly loaded post_offices_test table. These checks verify the total row count, ensure all coordinates fall within the expected geographical boundaries, and confirm the referential integrity of the district_id by comparing the IDs in the main table against the reference districts table.

In [8]:
# --- Query 1: how many rows were loaded in total ---
query1 = "SELECT COUNT(*) AS total_rows FROM test_berlin_data.post_offices_test;"
df1 = pd.read_sql(sql=query1, con=engine)

# Heading, then the one-row result frame on the following lines
print("--- 1. Total row count in 'post_offices_test' ---", df1, sep="\n")

--- 1. Total row count in 'post_offices_test' ---
   total_rows
0         244


In [9]:
# --- Query 2: rows whose coordinates fall outside the Berlin bounding box ---
# The box is latitude 52.3-52.7 and longitude 13.0-13.8; a row counts as an
# outlier when it is not inside both ranges.
query2 = """
    SELECT COUNT(*) AS outliers
    FROM test_berlin_data.post_offices_test
    WHERE NOT (latitude BETWEEN 52.3 AND 52.7 AND longitude BETWEEN 13.0 AND 13.8);
"""
df2 = pd.read_sql(sql=query2, con=engine)

# Blank line + heading, then the one-row result frame
print("\n--- 2. Count of locations outside Berlin's bounding box ---", df2, sep="\n")


--- 2. Count of locations outside Berlin's bounding box ---
   outliers
0         0


In [12]:
# --- Query 3: the set of district_id values present in the loaded table ---
query3 = "SELECT DISTINCT district_id FROM test_berlin_data.post_offices_test ORDER BY 1;"
df3 = pd.read_sql(sql=query3, con=engine)

# Blank line + heading, then the sorted list of ids
print("\n--- 3. Distinct district_id's found in the post_offices_test table ---", df3, sep="\n")


--- 3. Distinct district_id's found in the post_offices_test table ---
   district_id
0     11001001
1     11002002
2     11003003
3     11004004
4     11005005
5     11006006
6     11007007
7     11008008
8     11009009
9     11010010
10    11011011
11    11012012


In [13]:
# --- Query 4: the set of district_id values in the reference table ---
# Printed for side-by-side comparison with the ids found in post_offices_test.
query4 = "SELECT DISTINCT district_id FROM test_berlin_data.districts ORDER BY 1;"
df4 = pd.read_sql(sql=query4, con=engine)

# Blank line + heading, then the sorted list of ids
print("\n--- 4. Distinct district_id's from the reference 'districts' table ---", df4, sep="\n")


--- 4. Distinct district_id's from the reference 'districts' table ---
   district_id
0     11001001
1     11002002
2     11003003
3     11004004
4     11005005
5     11006006
6     11007007
7     11008008
8     11009009
9     11010010
10    11011011
11    11012012
