In [None]:
import os

import sqlalchemy as sa

# Credentials come from the environment so they are never stored in the saved
# notebook. Export LAYEREDDB_USER and LAYEREDDB_PASSWORD before starting the kernel.
db_user = os.environ.get("LAYEREDDB_USER")
db_password = os.environ.get("LAYEREDDB_PASSWORD")
if not db_user or db_password is None:
    raise RuntimeError(
        "Set the LAYEREDDB_USER and LAYEREDDB_PASSWORD environment variables "
        "before creating the database engine."
    )

# Build the URL from its parts instead of formatting a string: URL.create()
# takes care of escaping, so a password containing '@', ':' or '/' cannot
# corrupt the connection string.
db_uri = sa.engine.URL.create(
    drivername="postgresql+psycopg2",
    username=db_user,
    password=db_password,
    host="localhost",
    port=5433,
    database="layereddb",
)

# Create the Engine object, keeping pool_pre_ping for reliability
engine = sa.create_engine(db_uri, pool_pre_ping=True)

print("✅ Connection engine for the local 'layereddb' is ready.")

✅ Connection engine for the local 'layereddb' is ready.


In [3]:
from sqlalchemy import inspect

# Build an inspector on top of the engine to query database metadata
inspector = inspect(engine)

# Ask the inspector for the schema names
schemas = inspector.get_schema_names()

# Header and list are printed on separate lines
print("Available schemas in the database:", schemas, sep="\n")

Available schemas in the database:
['berlin_labels', 'berlin_recommender', 'berlin_source_data', 'dashboard_data', 'information_schema', 'public']


We are working in the 'berlin_labels' schema.

In [4]:
# Ask the inspector which tables exist in the 'berlin_labels' schema
tables_in_schema = inspector.get_table_names(schema='berlin_labels')

# Header (preceded by a blank line) and table list go on separate lines
print("\nTables in schema 'berlin_labels':", tables_in_schema, sep="\n")


Tables in schema 'berlin_labels':
['district_attributes', 'district_labels_new', 'district_features', 'district_labels', 'neighborhood_labels']


Since the 'district_features' table contains all the necessary counts and the 'district_attributes' table contains the calculated scaling coefficients, I will use these two tables for the labeling process.

In [5]:
import pandas as pd

# Schema-qualified name of the table to read
table_name = 'berlin_labels.district_features'

# Select every column and row of that table
query = "SELECT * FROM " + table_name

# pd.read_sql runs the query through the engine and returns the rows as a DataFrame
district_features = pd.read_sql(query, engine)

print(district_features)

   district_id  bus_tram_stop_count  uban_station_count  sbahn_station_count  \
0     11001001                  222                  32                   45   
1     11002002                  120                  15                   20   
2     11003003                  292                   3                   37   
3     11004004                  264                  23                   37   
4     11005005                  283                   5                    3   
5     11006006                  313                   7                   38   
6     11007007                  138                  15                   33   
7     11008008                  261                  13                    8   
8     11009009                  314                   0                   49   
9     11010010                  222                   7                   25   
10    11011011                  216                   2                   20   
11    11012012                  254     

In [6]:
# Schema-qualified name of the table to read
table_name = 'berlin_labels.district_attributes'

# Select every column and row of that table
query = "SELECT * FROM " + table_name

# pd.read_sql runs the query through the engine and returns the rows as a DataFrame
district_attributes = pd.read_sql(query, engine)

print(district_attributes)

   district_id  area_sq_km  inhabitants  area_coefficient  \
0     11004004   64.662978       343081          0.871208   
1     11002002   20.389118       293454          0.274704   
2     11011011   52.091363       311881          0.701830   
3     11010010   61.782422       291948          0.832398   
4     11001001   39.379173       397134          0.530558   
5     11008008   44.907902       330017          0.605047   
6     11003003  103.162091       424307          1.389909   
7     11012012   89.280780       268792          1.202885   
8     11005005   91.836013       257091          1.237312   
9     11006006  102.514181       310446          1.381179   
10    11007007   53.023264       355868          0.714385   
11    11009009  167.637176       294081          2.258585   

    population_coefficient  
0                 1.061595  
1                 0.908034  
2                 0.965053  
3                 0.903374  
4                 1.228851  
5                 1.021171  
6  

Let's move on to assigning the nightlife tags. A district receives a tag when its venue count exceeds the mean count across all districts multiplied by that district's area coefficient.

In [7]:
# Mean night club count over all rows of district_features
mean_night_club = district_features.loc[:, 'night_club_count'].mean()
print(mean_night_club)

11.75


In [8]:
# Mean count of venues open after 11pm over all rows of district_features
mean_late_venue = district_features.loc[:, 'late_venue_count_after_11pm'].mean()
print(mean_late_venue)

89.5


In [9]:
# Mean count of venues open 9pm-11pm over all rows of district_features
mean_evening_venue = district_features.loc[:, 'evening_venue_count_9pm_11pm'].mean()
print(mean_evening_venue)

181.08333333333334


In [10]:
import numpy as np

# Attach each district's area coefficient to its feature counts.
# validate='one_to_one' makes pandas raise a MergeError when district_id is
# duplicated on either side, instead of silently multiplying rows.
analysis_df = pd.merge(
    district_features,
    district_attributes[['district_id', 'area_coefficient']],
    on='district_id',
    how='left',
    validate='one_to_one',
)

# With a left join, a district missing from district_attributes gets a NaN
# coefficient. Every comparison against NaN below is False, so such a district
# would silently end up untagged. Fail loudly instead.
missing_coefficient = analysis_df['area_coefficient'].isna()
if missing_coefficient.any():
    raise ValueError(
        "No area_coefficient available for district_id(s): "
        f"{analysis_df.loc[missing_coefficient, 'district_id'].tolist()}"
    )

# Calculate "Hub" statuses:
# compare the actual count to the mean count adjusted by the area coefficient
has_many_clubs = analysis_df['night_club_count'] > (mean_night_club * analysis_df['area_coefficient'])
has_many_late_venues = analysis_df['late_venue_count_after_11pm'] > (mean_late_venue * analysis_df['area_coefficient'])
has_many_evening_venues = analysis_df['evening_venue_count_9pm_11pm'] > (mean_evening_venue * analysis_df['area_coefficient'])

# Hierarchy for the primary nightlife tag.
# np.select uses the first condition that is True, so the order of the list
# encodes the priority of the tags.
conditions_primary = [
    has_many_clubs & has_many_late_venues,  # -> #active_nightlife
    has_many_clubs,                         # -> #many_clubs (only if the above is False)
    has_many_late_venues,                   # -> #many_late_venues (only if both above are False)
]
choices_primary = [
    '#active_nightlife',
    '#many_clubs',
    '#many_late_venues',
]
analysis_df['primary_nightlife_tag'] = np.select(conditions_primary, choices_primary, default=None)

# The evening tag is independent of the primary tag
analysis_df['evening_tag'] = np.where(has_many_evening_venues, '#many_evening_venues', None)


def combine_tags(row):
    """Join the non-missing nightlife tags of one row into a single label.

    Reads row['primary_nightlife_tag'] and row['evening_tag'].
    Returns the present tags joined by ', ', or 'Not Tagged' when both are
    missing (the sentinel is used to filter untagged districts afterwards).
    """
    tags = [tag for tag in (row['primary_nightlife_tag'], row['evening_tag']) if pd.notna(tag)]
    return ', '.join(tags) if tags else 'Not Tagged'


analysis_df['final_nightlife_label'] = analysis_df.apply(combine_tags, axis=1)

# Keep only tagged districts and shape the result as (district_id, category, label).
# Built as a chain that returns a new frame, so re-running the cell is safe.
is_tagged = analysis_df['final_nightlife_label'] != 'Not Tagged'
nightlife_tags_df = (
    analysis_df.loc[is_tagged, ['district_id', 'final_nightlife_label']]
    .rename(columns={'final_nightlife_label': 'label'})
    .assign(category='Community & Lifestyle')
    .loc[:, ['district_id', 'category', 'label']]
)

print("--- Results for Nightlife Tags (Based on Adjusted Mean Count) ---")
print(nightlife_tags_df)

--- Results for Nightlife Tags (Based on Adjusted Mean Count) ---
  district_id               category                                    label
0    11001001  Community & Lifestyle  #active_nightlife, #many_evening_venues
1    11002002  Community & Lifestyle  #active_nightlife, #many_evening_venues
3    11004004  Community & Lifestyle  #active_nightlife, #many_evening_venues
6    11007007  Community & Lifestyle  #many_late_venues, #many_evening_venues
7    11008008  Community & Lifestyle  #active_nightlife, #many_evening_venues


Let's move on to assigning the dining & drinks tags (#dining_and_drinks_hub, #good_venue_selection, #many_restaurants, #many_bars, #many_cafes). The assignment logic compares each count against the mean multiplied by the area coefficient.

In [11]:
# Mean restaurant count over all rows of district_features
mean_restaurant = district_features.loc[:, 'restaurant_count'].mean()
print(mean_restaurant)

385.8333333333333


In [12]:
# Mean bar count over all rows of district_features
mean_bar = district_features.loc[:, 'bar_count'].mean()
print(mean_bar)

78.25


In [13]:
# Mean cafe count over all rows of district_features
mean_cafe = district_features.loc[:, 'cafe_count'].mean()
print(mean_cafe)

205.5


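I decided to use a three-tier hierarchical logic (all three thresholds exceeded → hub tag, exactly two → #good_venue_selection, exactly one → the tag of that venue type) and a multiplier of 1.5 to make the conditions stricter.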

In [14]:
import numpy as np

# Factor applied on top of the area-adjusted mean to tighten the thresholds
threshold_multiplier = 1.5

# One boolean flag per venue type: the district's count exceeds
# mean * area coefficient * multiplier
dining_means = {
    'restaurant_count': mean_restaurant,
    'bar_count': mean_bar,
    'cafe_count': mean_cafe,
}
dining_flags = {
    count_column: analysis_df[count_column] > (column_mean * analysis_df['area_coefficient'] * threshold_multiplier)
    for count_column, column_mean in dining_means.items()
}
has_many_restaurants = dining_flags['restaurant_count']
has_many_bars = dining_flags['bar_count']
has_many_cafes = dining_flags['cafe_count']

# Number of thresholds each district exceeds (0 to 3)
analysis_df['hub_condition_count'] = sum(
    flag.astype(int) for flag in (has_many_restaurants, has_many_bars, has_many_cafes)
)

# np.select picks the first True condition, so the single-venue conditions
# are only reached when the count is exactly 1
exceeded = analysis_df['hub_condition_count']
conditions_dining_3tier = [
    exceeded == 3,         # all three thresholds exceeded
    exceeded == 2,         # exactly two thresholds exceeded
    has_many_restaurants,  # only restaurants
    has_many_bars,         # only bars
    has_many_cafes,        # only cafes
]
choices_dining_3tier = [
    '#dining_and_drinks_hub',
    '#good_venue_selection',
    '#many_restaurants',
    '#many_bars',
    '#many_cafes',
]
analysis_df['final_community_label'] = np.select(
    conditions_dining_3tier, choices_dining_3tier, default='Not Tagged'
)

# Keep tagged districts only, shaped as (district_id, category, label)
tagged_rows = analysis_df['final_community_label'] != 'Not Tagged'
community_tags_df = (
    analysis_df.loc[tagged_rows, ['district_id', 'final_community_label']]
    .assign(category='Community & Lifestyle')
    .rename(columns={'final_community_label': 'label'})
    .loc[:, ['district_id', 'category', 'label']]
)

print(f"--- Results for Community & Lifestyle Tags (Hierarchical Logic, Multiplier: {threshold_multiplier}) ---")
print(community_tags_df.to_string())

--- Results for Community & Lifestyle Tags (Hierarchical Logic, Multiplier: 1.5) ---
  district_id               category                   label
0    11001001  Community & Lifestyle  #dining_and_drinks_hub
1    11002002  Community & Lifestyle  #dining_and_drinks_hub
3    11004004  Community & Lifestyle   #good_venue_selection
6    11007007  Community & Lifestyle             #many_cafes
7    11008008  Community & Lifestyle   #good_venue_selection


In [16]:
from sqlalchemy import bindparam, text

print("\n Preparing data for DB load")

# Nightlife labels may hold several tags in one string ('#tag1, #tag2');
# split them so that every row carries exactly one tag.
if not nightlife_tags_df.empty:
    nightlife_tags_processed_df = (
        nightlife_tags_df
        .assign(label=nightlife_tags_df['label'].str.split(', '))
        .explode('label')
        .reset_index(drop=True)
    )
    print(f"Processed {len(nightlife_tags_processed_df)} individual nightlife tags.")
else:
    # Copy so the processed frame never aliases the (empty) input frame
    nightlife_tags_processed_df = nightlife_tags_df.copy()
    print("No nightlife tags to process.")

# 'community_tags_df' already contains one tag per row, so it can be stacked
# directly; duplicates are dropped without mutating a frame in place.
all_new_labels_df = pd.concat(
    [nightlife_tags_processed_df, community_tags_df],
    ignore_index=True
).drop_duplicates()

print(f"Total tags to load: {len(all_new_labels_df)}")
if len(all_new_labels_df) > 0:
    print("Example of combined tags:")
    print(all_new_labels_df.head())
else:
    print("No tags were generated for loading.")

# Load data into the Database

target_schema = "berlin_labels"
target_table = "district_labels_new"
category_to_update = "Community & Lifestyle"

# Tags managed by this notebook. Only these tags are deleted before the insert,
# leaving any other tags in the same category untouched.
managed_tags_list = [
    # From 'nightlife_tags_df'
    '#active_nightlife',
    '#many_clubs',
    '#many_late_venues',
    '#many_evening_venues',
    # From 'community_tags_df'
    '#dining_and_drinks_hub',
    '#good_venue_selection',
    '#many_restaurants',
    '#many_bars',
    '#many_cafes'
]

# Immutable copy of the list, kept under its original name
managed_tags_tuple = tuple(managed_tags_list)

if not all_new_labels_df.empty:
    try:
        # engine.begin() wraps the delete and the insert in one transaction
        with engine.begin() as connection:
            print(f"Connecting to DB to update table {target_schema}.{target_table}...")

            # Delete only the tags managed by this script
            print(f"Deleting existing managed tags in category: '{category_to_update}'")

            # An expanding bind parameter lets SQLAlchemy render one placeholder
            # per list element for the IN clause, so the statement does not
            # depend on the DB driver's own handling of Python tuples.
            delete_sql = text(
                f"""
                DELETE FROM {target_schema}.{target_table}
                WHERE
                    category = :category_name
                    AND label IN :managed_tags
                """
            ).bindparams(bindparam("managed_tags", expanding=True))

            result = connection.execute(
                delete_sql,
                {
                    "category_name": category_to_update,
                    "managed_tags": managed_tags_list
                }
            )
            print(f"Deleted {result.rowcount} old rows (matching managed tags).")

            # Append (insert) the new data
            print(f"Inserting {len(all_new_labels_df)} new rows...")
            all_new_labels_df.to_sql(
                name=target_table,      # Table name
                con=connection,         # Reuse the transaction's connection
                schema=target_schema,   # Schema
                if_exists='append',     # Append, since only the managed rows were deleted
                index=False             # Do not write the pandas index to the DB
            )
            print("New tags inserted successfully.")

        print("Database Update Complete")

    except Exception as e:
        # The error is reported instead of raised so the rest of the notebook
        # keeps running.
        print("\n ERROR")
        print(f"An error occurred during the DB operation: {e}")
        print("Data was NOT saved to the database.")
else:
    print("Skipping DB insertion as no tags were generated.")


 Preparing data for DB load
Processed 10 individual nightlife tags.
Total tags to load: 15
Example of combined tags:
  district_id               category                 label
0    11001001  Community & Lifestyle     #active_nightlife
1    11001001  Community & Lifestyle  #many_evening_venues
2    11002002  Community & Lifestyle     #active_nightlife
3    11002002  Community & Lifestyle  #many_evening_venues
4    11004004  Community & Lifestyle     #active_nightlife
Connecting to DB to update table berlin_labels.district_labels_new...
Deleting existing managed tags in category: 'Community & Lifestyle'
Deleted 0 old rows (matching managed tags).
Inserting 15 new rows...
New tags inserted successfully.
Database Update Complete


Final check

In [22]:
# Verifying Data in Database

# Imported here so the cell does not depend on an import made in an earlier cell
from sqlalchemy import bindparam, text

# Tags written by this notebook
managed_tags_tuple = (
    '#active_nightlife', '#many_clubs', '#many_late_venues',
    '#many_evening_venues', '#dining_and_drinks_hub',
    '#good_venue_selection', '#many_restaurants', '#many_bars', '#many_cafes'
)

# Query parameters; the tags are passed as a list for the expanding IN parameter
verify_params = {
    'category_name': 'Community & Lifestyle',
    'managed_tags': list(managed_tags_tuple)
}

# Count, per managed tag, how many districts carry it.
# The expanding bind parameter makes SQLAlchemy render one placeholder per
# list element, so the IN clause does not rely on the DB driver's own handling
# of Python tuples.
verify_sql = text("""
    SELECT
        label,
        COUNT(district_id) AS total_districts
    FROM
        berlin_labels.district_labels_new
    WHERE
        category = :category_name
        AND label IN :managed_tags
    GROUP BY
        label
    ORDER BY
        label;
""").bindparams(bindparam("managed_tags", expanding=True))

# Execute the verification query
try:
    # The text() object is passed as the first argument of pd.read_sql_query
    verification_df = pd.read_sql_query(
        verify_sql,
        con=engine,
        params=verify_params
    )

    print("Verification Successful")
    print(verification_df.to_string())

except Exception as e:
    print("ERROR during verification")
    print(f"An error occurred while trying to read data back: {e}")

Verification Successful
                    label  total_districts
0       #active_nightlife                4
1  #dining_and_drinks_hub                2
2   #good_venue_selection                2
3             #many_cafes                1
4    #many_evening_venues                5
5       #many_late_venues                1
