Connecting to the Database 

In [None]:
import os

import sqlalchemy as sa

# Credentials are read from the environment so they never live in the notebook.
# Export DB_USER and DB_PASSWORD before starting the kernel; a missing variable
# raises KeyError here instead of failing later with an opaque login error.
# URL.create() also escapes special characters (e.g. '@' or ':') in the
# password, which a hand-formatted URI string would not.
db_uri = sa.engine.URL.create(
    drivername="postgresql+psycopg2",
    username=os.environ["DB_USER"],
    password=os.environ["DB_PASSWORD"],
    host="localhost",
    port=5433,
    database="layereddb",
)

# Create the Engine object; pool_pre_ping checks a pooled connection before
# handing it out. The engine is lazy: no connection is opened by this cell.
engine = sa.create_engine(db_uri, pool_pre_ping=True)

print("✅ Connection engine for the local 'layereddb' is ready.")

✅ Connection engine for the local 'layereddb' is ready.


Checking existing schemas

In [3]:
from sqlalchemy import inspect
# The inspector exposes database metadata through the existing engine
inspector = inspect(engine)

# Every schema visible to the connected role
schemas = inspector.get_schema_names()

print("Available schemas in the database:", schemas, sep="\n")

Available schemas in the database:
['berlin_labels', 'berlin_recommender', 'berlin_source_data', 'dashboard_data', 'information_schema', 'public']


Creating a table 'night_clubs' in 'berlin_source_data' schema

In [4]:
from sqlalchemy import text, create_engine

# --- DDL for berlin_source_data.night_clubs ---
# WARNING: the script starts with DROP TABLE, so re-running this cell discards
# any rows already loaded into night_clubs.
# The upload cell further down orders its DataFrame columns to match this
# definition.
create_table_sql = """
-- Drop the table if it already exists to start fresh
DROP TABLE IF EXISTS berlin_source_data.night_clubs;

-- Create the new table
CREATE TABLE berlin_source_data.night_clubs (
    id VARCHAR(30) PRIMARY KEY,
    district_id VARCHAR(20) NOT NULL,
    neighborhood_id VARCHAR(20) NOT NULL,
    club_name VARCHAR(255),
    city VARCHAR(50),
    postcode VARCHAR(10),
    street VARCHAR(255),
    house_num VARCHAR(30),
    phone VARCHAR(50),
    email VARCHAR(255),
    website VARCHAR(500),
    opening_hours VARCHAR(500),
    wheelchair VARCHAR(30),
    toilets_wheelchair VARCHAR(30),
    wheelchair_description VARCHAR(500),
    live_music VARCHAR(30),
    longitude DECIMAL(9,6) NOT NULL,
    latitude DECIMAL(9,6) NOT NULL
);
"""

# --- Connect and Execute ---
# engine.begin() opens a connection inside a transaction that is committed when
# the block exits normally and rolled back if it raises, so the DROP and the
# CREATE succeed or fail together.
try:
    with engine.begin() as connection:
        connection.execute(text(create_table_sql))

    print("Table 'berlin_source_data.night_clubs' created successfully. ✅")
except Exception as e:
    print(f"An error occurred during table creation: {e}")
    # Re-raise so "Run All" stops here instead of continuing to load data
    # into a table that was never created.
    raise


Table 'berlin_source_data.night_clubs' created successfully. ✅


Preparing the data for upload

In [9]:
import pandas as pd
from pathlib import Path
import io

# Cleaned night-club export, addressed relative to this notebook
file_to_load = Path('../clean/night_clubs_clean_with_distr.csv')

# Read the postcode and the district/neighborhood IDs as text so pandas does
# not infer a numeric dtype for them.
df = pd.read_csv(
    file_to_load,
    dtype={column: str for column in ('postcode', 'district_id', 'neighborhood_id')},
)

# Column order of the 'night_clubs' CREATE TABLE statement
sql_column_order = [
    'id', 'district_id', 'neighborhood_id',
    'club_name', 'city', 'postcode', 'street', 'house_num',
    'phone', 'email', 'website', 'opening_hours',
    'wheelchair', 'toilets_wheelchair', 'wheelchair_description',
    'live_music',
    'longitude', 'latitude',
]

# Select exactly those columns, in that order, for the upload
df_for_upload = df.loc[:, sql_column_order]

# Summarise what is about to be uploaded
print("DataFrame has been prepared for upload with the new column order.")
df_for_upload.info()

DataFrame has been prepared for upload with the new column order.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141 entries, 0 to 140
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      141 non-null    object 
 1   district_id             141 non-null    object 
 2   neighborhood_id         141 non-null    object 
 3   club_name               141 non-null    object 
 4   city                    141 non-null    object 
 5   postcode                139 non-null    object 
 6   street                  140 non-null    object 
 7   house_num               108 non-null    object 
 8   phone                   59 non-null     object 
 9   email                   18 non-null     object 
 10  website                 100 non-null    object 
 11  opening_hours           53 non-null     object 
 12  wheelchair              89 non-null     object 
 13  toilets_wheelchair      36 no

Insert Data into Table

In [10]:
# Bulk-load df_for_upload into berlin_source_data.night_clubs via COPY.
raw_conn = None
try:
    # Get a single, low-level DBAPI connection from the engine's pool
    raw_conn = engine.raw_connection()

    # Serialise the DataFrame to an in-memory CSV. The header row is written
    # only so that COPY ... HEADER TRUE can skip it. Missing values become
    # empty fields, which COPY's CSV format reads as NULL.
    buffer = io.StringIO()
    df_for_upload.to_csv(buffer, index=False, header=True)
    buffer.seek(0)

    # Name the target columns explicitly so the load does not depend on the
    # physical column order of the table matching the DataFrame.
    column_list = ", ".join(f'"{name}"' for name in df_for_upload.columns)

    # The table is schema-qualified instead of issuing "SET search_path":
    # a plain SET lasts for the whole session once committed, and closing
    # raw_conn returns that session to the pool for later cells to reuse.
    copy_sql = f"""
        COPY berlin_source_data.night_clubs ({column_list}) FROM STDIN WITH
            (FORMAT CSV, HEADER TRUE)
    """

    # The cursor is closed as soon as the COPY has finished
    with raw_conn.cursor() as cursor:
        cursor.copy_expert(sql=copy_sql, file=buffer)

    # Commit the entire transaction
    raw_conn.commit()
    print(f"✅ Transaction committed successfully. {len(df_for_upload)} rows were copied.")

except Exception as e:
    print(f"❌ An error occurred: {e}")
    if raw_conn:
        raw_conn.rollback()
    # Re-raise so "Run All" stops instead of validating an empty table
    raise
finally:
    # Always hand the connection back to the pool
    if raw_conn:
        raw_conn.close()

✅ Transaction committed successfully. 141 rows were copied.


Adding the Foreign Key Constraint

In [11]:
# Tie night_clubs.district_id to the districts lookup table.
# ON DELETE RESTRICT / ON UPDATE CASCADE are part of the constraint definition.
add_foreign_key_sql = """
ALTER TABLE berlin_source_data.night_clubs
ADD CONSTRAINT district_id_fk FOREIGN KEY (district_id)
REFERENCES berlin_source_data.districts(district_id)
ON DELETE RESTRICT
ON UPDATE CASCADE;
"""

# Run the ALTER TABLE in one transaction; it is committed when the block exits
try:
    with engine.begin() as connection:
        connection.execute(text(add_foreign_key_sql))
    print("✅ Foreign key constraint 'district_id_fk' added successfully.")
except Exception as e:
    print(f"An error occurred while adding the foreign key: {e}")

✅ Foreign key constraint 'district_id_fk' added successfully.


In [12]:
# Sanity check: pull five rows back out of the freshly loaded table
try:
    check_df = pd.read_sql(
        sql="SELECT * FROM berlin_source_data.night_clubs LIMIT 5",
        con=engine,
    )
    print("--- Verification: First 5 rows from the database ---", check_df, sep="\n")
except Exception as e:
    print(f"An error occurred during verification: {e}")

--- Verification: First 5 rows from the database ---
             id district_id neighborhood_id                      club_name  \
0  way/23278633    11001001             101  Roadrunners Rock & Motor Club   
1  way/24248500    11001001             101                          Werk9   
2  way/24283864    11002002             201                Pride Warehouse   
3  way/36908987    11002002             202                       Gretchen   
4  way/41474936    11003003             301                        Duncker   

     city postcode            street house_num            phone  \
0  Berlin    10117  Unter den Linden      None  +49 30 78082991   
1  Berlin    10117  Unter den Linden      None  +49 30 20165823   
2  Berlin    10247       Colbestraße        26             None   
3  Berlin    10963   Obentrautstraße     19-21  +49 30 25922702   
4  Berlin    10439     Dunckerstraße        64   +49 30 4459509   

                       email                             website  \
0      

Final Validation via SQL

This section executes several SQL queries directly against the database to perform final validation on the newly loaded night_clubs table. These checks verify the total row count, ensure all coordinates fall within the expected geographical boundaries, and confirm the referential integrity of district_id by comparing the IDs in the main table against the reference districts table.

In [14]:
# --- Query 1: how many rows the table holds ---
query1 = "SELECT COUNT(*) AS total_rows FROM berlin_source_data.night_clubs;"
df1 = pd.read_sql(sql=query1, con=engine)
print("Total row count in 'night_clubs'", df1, sep="\n")

Total row count in 'night_clubs'
   total_rows
0         141


In [16]:
# --- Query 2: rows whose coordinates fall outside the Berlin bounding box ---
# The box used is latitude 52.3-52.7 and longitude 13.0-13.8.
query2 = """
    SELECT COUNT(*) AS outliers
    FROM berlin_source_data.night_clubs
    WHERE NOT (latitude BETWEEN 52.3 AND 52.7 AND longitude BETWEEN 13.0 AND 13.8);
"""
df2 = pd.read_sql(sql=query2, con=engine)
print("\nCount of locations outside Berlin's bounding box", df2, sep="\n")


Count of locations outside Berlin's bounding box
   outliers
0         0


In [17]:
# --- Query 3: which district_id values occur in night_clubs ---
query3 = "SELECT DISTINCT district_id FROM berlin_source_data.night_clubs ORDER BY 1;"
df3 = pd.read_sql(sql=query3, con=engine)
print("\n Distinct district_id's found in the night_clubs table", df3, sep="\n")


 Distinct district_id's found in the night_clubs table
  district_id
0    11001001
1    11002002
2    11003003
3    11004004
4    11005005
5    11007007
6    11008008
7    11009009
8    11011011
9    11012012


In [24]:
# --- Query 4: which district_id values the districts lookup table defines ---
query4 = "SELECT DISTINCT district_id FROM berlin_source_data.districts ORDER BY 1;"
df4 = pd.read_sql(sql=query4, con=engine)
print("\n Distinct district_id's from the reference 'districts' table", df4, sep="\n")


 Distinct district_id's from the reference 'districts' table
   district_id
0     11001001
1     11002002
2     11003003
3     11004004
4     11005005
5     11006006
6     11007007
7     11008008
8     11009009
9     11010010
10    11011011
11    11012012


In [25]:
# --- Query 5: number of distinct primary-key values ---
# Compare this with the total row count from Query 1.
query5 = "SELECT COUNT (DISTINCT id) FROM berlin_source_data.night_clubs;"
df5 = pd.read_sql(sql=query5, con=engine)
print("\n Number of distinct id in the night_clubs table", df5, sep="\n")


 Number of distinct id in the night_clubs table
   count
0    141


In [26]:
# --- Query 6: night clubs whose district_id has no match in the districts table ---
# The LEFT JOIN keeps every night_clubs row; filtering on "d.district_id IS NULL"
# leaves only the rows for which no districts row was found.
# An empty result means every district_id is valid.
query6 = "SELECT nc.id, nc.district_id FROM berlin_source_data.night_clubs nc LEFT JOIN berlin_source_data.districts d ON nc.district_id = d.district_id WHERE d.district_id IS NULL; "
df6 = pd.read_sql(query6, engine)
print("\n--- 6. Night clubs with an invalid district_id (no match in the districts table) ---")
print(df6)


--- 6. Night cluns with an invalid district_id (no match in the districts table) ---
Empty DataFrame
Columns: [id, district_id]
Index: []


In [27]:
# --- 1. Define the SQL query that describes the table's columns ---
# ORDER BY ordinal_position returns the columns in their table-definition
# order; without it the row order of the result is not guaranteed.
query_schema = """
SELECT
    column_name,
    data_type,
    is_nullable
FROM
    information_schema.columns
WHERE
    table_schema = 'berlin_source_data' AND table_name = 'night_clubs'
ORDER BY
    ordinal_position;
"""

# --- 2. Execute the query and print the result ---
try:
    print("\nChecking the schema of the 'night_clubs' table")

    # Execute the query and load the result into a DataFrame
    df_schema = pd.read_sql(query_schema, engine)

    # to_string() prints every row without pandas truncating the frame
    print(df_schema.to_string())

except Exception as e:
    print(f"\n❌ An error occurred while executing the query: {e}")


Checking the schema of the 'night_clubs' table
               column_name          data_type is_nullable
0                       id  character varying          NO
1              district_id  character varying          NO
2          neighborhood_id  character varying          NO
3                club_name  character varying         YES
4                     city  character varying         YES
5                 postcode  character varying         YES
6                   street  character varying         YES
7                house_num  character varying         YES
8                    phone  character varying         YES
9                    email  character varying         YES
10                 website  character varying         YES
11           opening_hours  character varying         YES
12              wheelchair  character varying         YES
13      toilets_wheelchair  character varying         YES
14  wheelchair_description  character varying         YES
15              live_mus

Add a NOT NULL constraint to the required club_name column of the night_clubs table.

In [28]:
# --- Statements to run, in order ---
sql_commands = [
    "ALTER TABLE berlin_source_data.night_clubs ALTER COLUMN club_name SET NOT NULL;",
]

# --- Run all statements in a single transaction ---
print("\n--- Applying NOT NULL constraints  ---")
try:
    # The transaction is committed when this block exits without an error
    with engine.begin() as connection:
        for statement in sql_commands:
            print(f"Executing: {statement.strip()}")
            connection.execute(text(statement))

    print("\n✅ Constraints applied successfully.")

except Exception as e:
    print(f"\n❌ An error occurred while applying constraints: {e}")
    print("\nNOTE: This error can occur if one of the columns already contains NULL values.")


--- Applying NOT NULL constraints  ---
Executing: ALTER TABLE berlin_source_data.night_clubs ALTER COLUMN club_name SET NOT NULL;

✅ Constraints applied successfully.


Final check

In [29]:
# Re-run the schema query defined earlier to see the updated nullability flags
try:
    print("\n Checking the schema of the 'night_clubs' table ")

    # Fetch the column metadata again
    df_schema = pd.read_sql(sql=query_schema, con=engine)

    # Render the full frame as text and print it
    schema_report = df_schema.to_string()
    print(schema_report)

except Exception as e:
    print(f"\n❌ An error occurred while executing the query: {e}")


 Checking the schema of the 'night_clubs' table 
               column_name          data_type is_nullable
0                       id  character varying          NO
1              district_id  character varying          NO
2          neighborhood_id  character varying          NO
3                club_name  character varying          NO
4                     city  character varying         YES
5                 postcode  character varying         YES
6                   street  character varying         YES
7                house_num  character varying         YES
8                    phone  character varying         YES
9                    email  character varying         YES
10                 website  character varying         YES
11           opening_hours  character varying         YES
12              wheelchair  character varying         YES
13      toilets_wheelchair  character varying         YES
14  wheelchair_description  character varying         YES
15              live_m